зеркало из https://github.com/mozilla/gecko-dev.git
Bug 1620471 - Update libdav1d to 0.6.0 r=dminor
Differential Revision: https://phabricator.services.mozilla.com/D67511 --HG-- extra : moz-landing-system : lando
This commit is contained in:
Родитель
05300c66f1
Коммит
4ef4d2b585
|
@ -2,7 +2,7 @@ This directory contains build files for dav1d. The actual library
|
|||
source is in $TOPSRCDIR/third_party/dav1d/
|
||||
|
||||
Any patches or additional configuration to be applied to the
|
||||
upstream source should be kept here in the media/libaom
|
||||
upstream source should be kept here in the media/libdav1d
|
||||
directory.
|
||||
|
||||
To update the library source and build config files, execute
|
||||
|
@ -13,8 +13,35 @@ To update to a specific upstream git tag or commit, use
|
|||
|
||||
./mach vendor dav1d -r <commit>
|
||||
|
||||
The upstream git repository is https://aomedia.googlesource.com/aom
|
||||
The upstream git repository is https://code.videolan.org/videolan/dav1d
|
||||
|
||||
To update to a fork, use
|
||||
|
||||
./mach vendor dav1d --repo <repository url> [-r <commit>]
|
||||
|
||||
|
||||
The rough steps are:
|
||||
- Execute ./mach vendor dav1d -r {tag-name} # ex: ./mach vendor dav1d -r 0.6.0
|
||||
- Update moz.build with the new files, check the
|
||||
third_party/dav1d/src/meson.build (confirm with the diff) (note the
|
||||
empty .asm file in x86_64)
|
||||
- Build a stand-alone libdav1d following the steps here:
|
||||
https://code.videolan.org/videolan/dav1d#compile
|
||||
- Copy vcs_version.h from the local build/include/vcs_version.h
|
||||
to media/libdav1d/vcs_version.h
|
||||
- Copy version.h from local build/include/dav1di/version.h to
|
||||
media/libdav1d/version.h
|
||||
- Update dav1d.rc:
|
||||
- update the API_VERSION_NUMBER, API_VERSION_NUMBER_STR, defines to
|
||||
match the 'dav1d_soname_version' field in
|
||||
third_party/dav1d/meson.build.
|
||||
- update the PROJECT_VERSION_NUMBER, PROJECT_VERSION_NUMBER_STR
|
||||
defines to match the new project versions from the git tag (or from
|
||||
the project version found in third_party/dav1d/meson.build).
|
||||
- Add new options, if any, in moz.build or config.h
|
||||
|
||||
Tips:
|
||||
- If you see build failures in build-linux64-base-toolchains (or
|
||||
similar jobs) dav1d may now require a higher minimum nasm version
|
||||
than our base toolchains currently support. A bug updating the
|
||||
minimum nasm version will probably be necessary.
|
||||
|
|
|
@ -88,11 +88,13 @@ if CONFIG['CPU_ARCH'] in ('x86', 'x86_64'):
|
|||
'../../../third_party/dav1d/src/x86/loopfilter.asm',
|
||||
'../../../third_party/dav1d/src/x86/looprestoration.asm',
|
||||
'../../../third_party/dav1d/src/x86/mc.asm',
|
||||
'../../../third_party/dav1d/src/x86/msac_init.c',
|
||||
]
|
||||
|
||||
SOURCES += [
|
||||
'../../../third_party/dav1d/src/x86/cdef_sse.asm',
|
||||
'../../../third_party/dav1d/src/x86/cpuid.asm',
|
||||
'../../../third_party/dav1d/src/x86/film_grain_ssse3.asm',
|
||||
'../../../third_party/dav1d/src/x86/ipred_ssse3.asm',
|
||||
'../../../third_party/dav1d/src/x86/itx_ssse3.asm',
|
||||
'../../../third_party/dav1d/src/x86/loopfilter_ssse3.asm',
|
||||
|
@ -192,11 +194,18 @@ elif CONFIG['CPU_ARCH'] == 'arm' or CONFIG['CPU_ARCH'] == 'aarch64':
|
|||
if CONFIG['CPU_ARCH'] == 'aarch64':
|
||||
SOURCES += [
|
||||
'../../../third_party/dav1d/src/arm/64/cdef.S',
|
||||
'../../../third_party/dav1d/src/arm/64/cdef16.S',
|
||||
'../../../third_party/dav1d/src/arm/64/cdef_tmpl.S',
|
||||
'../../../third_party/dav1d/src/arm/64/ipred.S',
|
||||
'../../../third_party/dav1d/src/arm/64/itx.S',
|
||||
'../../../third_party/dav1d/src/arm/64/loopfilter.S',
|
||||
'../../../third_party/dav1d/src/arm/64/loopfilter16.S',
|
||||
'../../../third_party/dav1d/src/arm/64/looprestoration.S',
|
||||
'../../../third_party/dav1d/src/arm/64/looprestoration16.S',
|
||||
'../../../third_party/dav1d/src/arm/64/looprestoration_common.S',
|
||||
'../../../third_party/dav1d/src/arm/64/looprestoration_tmpl.S',
|
||||
'../../../third_party/dav1d/src/arm/64/mc.S',
|
||||
'../../../third_party/dav1d/src/arm/64/mc16.S',
|
||||
'../../../third_party/dav1d/src/arm/64/msac.S',
|
||||
]
|
||||
elif CONFIG['CPU_ARCH'] == 'arm':
|
||||
|
|
|
@ -1,7 +1,7 @@
|
|||
#define API_VERSION_NUMBER 3,1,0,0
|
||||
#define API_VERSION_NUMBER_STR "3.1.0"
|
||||
#define PROJECT_VERSION_NUMBER 0,5,2,0
|
||||
#define PROJECT_VERSION_NUMBER_STR "0.5.2"
|
||||
#define API_VERSION_NUMBER 4,0,0,0
|
||||
#define API_VERSION_NUMBER_STR "4.0.0"
|
||||
#define PROJECT_VERSION_NUMBER 0,6,0,0
|
||||
#define PROJECT_VERSION_NUMBER_STR "0.6.0"
|
||||
|
||||
#include <windows.h>
|
||||
|
||||
|
|
|
@ -79,6 +79,7 @@ SOURCES += [
|
|||
'../../third_party/dav1d/src/dequant_tables.c',
|
||||
'../../third_party/dav1d/src/getbits.c',
|
||||
'../../third_party/dav1d/src/intra_edge.c',
|
||||
'../../third_party/dav1d/src/itx_1d.c',
|
||||
'../../third_party/dav1d/src/lf_mask.c',
|
||||
'../../third_party/dav1d/src/log.c',
|
||||
'../../third_party/dav1d/src/msac.c',
|
||||
|
@ -167,6 +168,7 @@ EXPORTS.dav1d.src += [
|
|||
'../../third_party/dav1d/src/ipred.h',
|
||||
'../../third_party/dav1d/src/ipred_prepare.h',
|
||||
'../../third_party/dav1d/src/itx.h',
|
||||
'../../third_party/dav1d/src/itx_1d.h',
|
||||
'../../third_party/dav1d/src/lf_apply.h',
|
||||
'../../third_party/dav1d/src/loopfilter.h',
|
||||
'../../third_party/dav1d/src/looprestoration.h',
|
||||
|
|
|
@ -20,7 +20,7 @@ origin:
|
|||
|
||||
# Human-readable identifier for this version/release
|
||||
# Generally "version NNN", "tag SSS", "bookmark SSS"
|
||||
release: commit 39667c751d427e447cbe8be783cfecd296659e24 (2019-12-02T18:19:06.000+01:00).
|
||||
release: commit efd9e5518e0ed5114f8b4579debd7ee6dbede21f (2020-03-06T00:16:53.000+01:00).
|
||||
|
||||
# The package's license, where possible using the mnemonic from
|
||||
# https://spdx.org/licenses/
|
||||
|
|
|
@ -1,2 +1,2 @@
|
|||
/* auto-generated, do not edit */
|
||||
#define DAV1D_VERSION "0.5.2-0-g39667c7"
|
||||
#define DAV1D_VERSION "0.6.0-0-gefd9e55"
|
||||
|
|
|
@ -27,8 +27,8 @@
|
|||
#ifndef DAV1D_VERSION_H
|
||||
#define DAV1D_VERSION_H
|
||||
|
||||
#define DAV1D_API_VERSION_MAJOR 3
|
||||
#define DAV1D_API_VERSION_MINOR 1
|
||||
#define DAV1D_API_VERSION_MAJOR 4
|
||||
#define DAV1D_API_VERSION_MINOR 0
|
||||
#define DAV1D_API_VERSION_PATCH 0
|
||||
|
||||
#endif /* DAV1D_VERSION_H */
|
||||
|
|
|
@ -38,7 +38,7 @@ build-debian:
|
|||
image: registry.videolan.org:5000/dav1d-debian-unstable:20190215130514
|
||||
stage: build
|
||||
tags:
|
||||
- debian
|
||||
- avx2
|
||||
- amd64
|
||||
script:
|
||||
- meson build --buildtype release --werror
|
||||
|
@ -173,7 +173,7 @@ build-win-arm64:
|
|||
|
||||
build-debian-aarch64:
|
||||
stage: build
|
||||
image: registry.videolan.org:5000/dav1d-debian-unstable-aarch64:20181122182457
|
||||
image: registry.videolan.org/dav1d-debian-buster-aarch64:20200218203017
|
||||
tags:
|
||||
- aarch64
|
||||
- debian
|
||||
|
@ -184,7 +184,7 @@ build-debian-aarch64:
|
|||
|
||||
build-debian-aarch64-clang-5:
|
||||
stage: build
|
||||
image: registry.videolan.org:5000/dav1d-debian-unstable-aarch64:20181122182457
|
||||
image: registry.videolan.org/dav1d-debian-buster-aarch64:20200218203017
|
||||
tags:
|
||||
- aarch64
|
||||
- debian
|
||||
|
@ -203,7 +203,7 @@ build-macos:
|
|||
- cd build && meson test -v
|
||||
|
||||
build-debian-werror:
|
||||
image: registry.videolan.org:5000/dav1d-debian-unstable-aarch64:20181122182457
|
||||
image: registry.videolan.org/dav1d-debian-buster-aarch64:20200218203017
|
||||
stage: build
|
||||
tags:
|
||||
- aarch64
|
||||
|
@ -219,7 +219,7 @@ build-debian-armv7:
|
|||
- armv7
|
||||
- debian
|
||||
script:
|
||||
- meson build --buildtype debugoptimized --werror
|
||||
- linux32 meson build --buildtype debugoptimized --werror
|
||||
- ninja -C build
|
||||
- cd build && meson test -v
|
||||
|
||||
|
@ -230,13 +230,13 @@ build-debian-armv7-clang-5:
|
|||
- armv7
|
||||
- debian
|
||||
script:
|
||||
- env CC=clang-5.0 CFLAGS='-integrated-as' meson build --buildtype release
|
||||
- env CC=clang-5.0 CFLAGS='-integrated-as' linux32 meson build --buildtype release
|
||||
- ninja -C build
|
||||
- cd build && meson test -v
|
||||
|
||||
build-ubuntu-snap:
|
||||
stage: build
|
||||
image: registry.videolan.org:5000/dav1d-ubuntu-bionic:20190221154127
|
||||
image: registry.videolan.org/dav1d-ubuntu-bionic:20200121182340
|
||||
tags:
|
||||
- debian
|
||||
- amd64
|
||||
|
@ -292,7 +292,7 @@ test-debian-unaligned-stack:
|
|||
stage: test
|
||||
needs: ["build-debian"]
|
||||
tags:
|
||||
- debian
|
||||
- avx2
|
||||
- amd64
|
||||
cache:
|
||||
key: testdata.git-20190215
|
||||
|
@ -382,7 +382,7 @@ test-win64:
|
|||
stage: test
|
||||
needs: ["build-win64"]
|
||||
tags:
|
||||
- debian
|
||||
- avx2
|
||||
- amd64
|
||||
cache:
|
||||
key: testdata.git-20190215
|
||||
|
@ -403,7 +403,7 @@ test-win64:
|
|||
dependencies: []
|
||||
|
||||
test-debian-aarch64:
|
||||
image: registry.videolan.org:5000/dav1d-debian-unstable-aarch64:20181122182457
|
||||
image: registry.videolan.org/dav1d-debian-buster-aarch64:20200218203017
|
||||
stage: test
|
||||
needs: ["build-debian-aarch64"]
|
||||
tags:
|
||||
|
@ -464,7 +464,7 @@ test-debian-armv7-clang-5:
|
|||
- test -d cache/dav1d-test-data.git && GIT_DIR=cache/dav1d-test-data.git git fetch --refmap=refs/heads/master:refs/heads/master origin master
|
||||
- test -d cache/dav1d-test-data.git || git clone --bare https://code.videolan.org/videolan/dav1d-test-data.git cache/dav1d-test-data.git
|
||||
- git clone cache/dav1d-test-data.git tests/dav1d-test-data
|
||||
- env CC=clang-5.0 CFLAGS='-integrated-as' meson build --buildtype release
|
||||
- env CC=clang-5.0 CFLAGS='-integrated-as' linux32 meson build --buildtype release
|
||||
-Dtestdata_tests=true
|
||||
-Dlogging=false
|
||||
- ninja -C build
|
||||
|
|
|
@ -1,3 +1,26 @@
|
|||
Changes for 0.6.0 'Gyrfalcon':
|
||||
------------------------------
|
||||
|
||||
0.6.0 is a major release for dav1d:
|
||||
- New ARM64 optimizations for the 10/12bit depth:
|
||||
- mc_avg, mc_w_avg, mc_mask
|
||||
- mc_put/mc_prep 8tap/bilin
|
||||
- mc_warp_8x8
|
||||
- mc_w_mask
|
||||
- mc_blend
|
||||
- wiener
|
||||
- SGR
|
||||
- loopfilter
|
||||
- cdef
|
||||
- New AVX-512 optimizations for prep_bilin, prep_8tap, cdef_filter, mc_avg/w_avg/mask
|
||||
- New SSSE3 optimizations for film grain
|
||||
- New AVX2 optimizations for msac_adapt16
|
||||
- Fix rare mismatches against the reference decoder, notably because of clipping
|
||||
- Improvements on ARM64 on msac, cdef and looprestoration optimizations
|
||||
- Improvements on AVX2 optimizations for cdef_filter
|
||||
- Improvements in the C version for itxfm, cdef_filter
|
||||
|
||||
|
||||
Changes for 0.5.2 'Asiatic Cheetah':
|
||||
------------------------------------
|
||||
|
||||
|
@ -32,7 +55,7 @@ and improving speed significantly:
|
|||
- NEON optimizations for CDEF and warp on ARM32
|
||||
- SSE2 optimizations for MSAC hi_tok decoding
|
||||
- SSSE3 optimizations for deblocking loopfilters and warp_affine
|
||||
- AVX-2 optimizations for film grain and ipred_z2
|
||||
- AVX2 optimizations for film grain and ipred_z2
|
||||
- SSE4 optimizations for warp_affine
|
||||
- VSX optimizations for wiener
|
||||
- Fix inverse transform overflows in x86 and NEON asm
|
||||
|
@ -81,7 +104,7 @@ Changes for 0.2.2 (0.3.0-rc) 'Antelope':
|
|||
-----------------------------
|
||||
|
||||
- Large improvement on MSAC decoding with SSE, bringing 4-6% speed increase
|
||||
The impact is important on SSSE3, SSE4 and AVX-2 cpus
|
||||
The impact is important on SSSE3, SSE4 and AVX2 cpus
|
||||
- SSSE3 optimizations for all blocks size in itx
|
||||
- SSSE3 optimizations for ipred_paeth and ipred_cfl (420, 422 and 444)
|
||||
- Speed improvements on CDEF for SSE4 CPUs
|
||||
|
@ -93,7 +116,7 @@ Changes for 0.2.1 'Antelope':
|
|||
----------------------------
|
||||
|
||||
- SSSE3 optimization for cdef_dir
|
||||
- AVX-2 improvements of the existing CDEF optimizations
|
||||
- AVX2 improvements of the existing CDEF optimizations
|
||||
- NEON improvements of the existing CDEF and wiener optimizations
|
||||
- Clarification about the numbering/versionning scheme
|
||||
|
||||
|
@ -103,7 +126,7 @@ Changes for 0.2.0 'Antelope':
|
|||
|
||||
- ARM64 and ARM optimizations using NEON instructions
|
||||
- SSSE3 optimizations for both 32 and 64bits
|
||||
- More AVX-2 assembly, reaching almost completion
|
||||
- More AVX2 assembly, reaching almost completion
|
||||
- Fix installation of includes
|
||||
- Rewrite inverse transforms to avoid overflows
|
||||
- Snap packaging for Linux
|
||||
|
@ -118,6 +141,6 @@ Initial release of dav1d, the fast and small AV1 decoder.
|
|||
- Support for all features of the AV1 bitstream
|
||||
- Support for all bitdepth, 8, 10 and 12bits
|
||||
- Support for all chroma subsamplings 4:2:0, 4:2:2, 4:4:4 *and* grayscale
|
||||
- Full acceleration for AVX-2 64bits processors, making it the fastest decoder
|
||||
- Full acceleration for AVX2 64bits processors, making it the fastest decoder
|
||||
- Partial acceleration for SSSE3 processors
|
||||
- Partial acceleration for NEON processors
|
||||
|
|
|
@ -73,7 +73,7 @@ The [VideoLAN Code of Conduct](https://wiki.videolan.org/CoC) applies to this pr
|
|||
|
||||
# Compile
|
||||
|
||||
1. Install [Meson](https://mesonbuild.com/) (0.47 or higher), [Ninja](https://ninja-build.org/), and, for x86\* targets, [nasm](https://nasm.us/) (2.13.02 or higher)
|
||||
1. Install [Meson](https://mesonbuild.com/) (0.47 or higher), [Ninja](https://ninja-build.org/), and, for x86\* targets, [nasm](https://nasm.us/) (2.14 or higher)
|
||||
2. Run `mkdir build && cd build` to create a build directory and enter it
|
||||
3. Run `meson ..` to configure meson, add `--default-library=static` if static linking is desired
|
||||
4. Run `ninja` to compile
|
||||
|
|
|
@ -43,15 +43,18 @@
|
|||
#endif
|
||||
|
||||
#if ARCH_X86_64
|
||||
/* x86-64 needs 32-byte alignment for AVX2. */
|
||||
/* x86-64 needs 32- and 64-byte alignment for AVX2 and AVX-512. */
|
||||
#define ALIGN_64_VAL 64
|
||||
#define ALIGN_32_VAL 32
|
||||
#define ALIGN_16_VAL 16
|
||||
#elif ARCH_X86_32 || ARCH_ARM || ARCH_AARCH64 || ARCH_PPC64LE
|
||||
/* ARM doesn't benefit from anything more than 16-byte alignment. */
|
||||
#define ALIGN_64_VAL 16
|
||||
#define ALIGN_32_VAL 16
|
||||
#define ALIGN_16_VAL 16
|
||||
#else
|
||||
/* No need for extra alignment on platforms without assembly. */
|
||||
#define ALIGN_64_VAL 8
|
||||
#define ALIGN_32_VAL 8
|
||||
#define ALIGN_16_VAL 8
|
||||
#endif
|
||||
|
@ -76,9 +79,10 @@
|
|||
* becomes:
|
||||
* ALIGN_STK_$align(uint8_t, var, 1, [2][3][4])
|
||||
*/
|
||||
#define ALIGN_STK_64(type, var, sz1d, sznd) \
|
||||
ALIGN(type var[sz1d]sznd, ALIGN_64_VAL)
|
||||
#define ALIGN_STK_32(type, var, sz1d, sznd) \
|
||||
ALIGN(type var[sz1d]sznd, ALIGN_32_VAL)
|
||||
// as long as stack is itself 16-byte aligned, this works (win64, gcc)
|
||||
#define ALIGN_STK_16(type, var, sz1d, sznd) \
|
||||
ALIGN(type var[sz1d]sznd, ALIGN_16_VAL)
|
||||
|
||||
|
@ -92,6 +96,12 @@
|
|||
#define NOINLINE __attribute__((noinline))
|
||||
#endif /* !_MSC_VER */
|
||||
|
||||
#ifdef __clang__
|
||||
#define NO_SANITIZE(x) __attribute__((no_sanitize(x)))
|
||||
#else
|
||||
#define NO_SANITIZE(x)
|
||||
#endif
|
||||
|
||||
#if defined(NDEBUG) && (defined(__GNUC__) || defined(__clang__))
|
||||
#define assert(x) do { if (!(x)) __builtin_unreachable(); } while (0)
|
||||
#elif defined(NDEBUG) && defined(_MSC_VER)
|
||||
|
|
|
@ -31,6 +31,8 @@
|
|||
#include <stdint.h>
|
||||
#include <string.h>
|
||||
|
||||
#include "common/attributes.h"
|
||||
|
||||
#if !defined(BITDEPTH)
|
||||
typedef void pixel;
|
||||
typedef void coef;
|
||||
|
@ -47,12 +49,14 @@ typedef int16_t coef;
|
|||
#define iclip_pixel iclip_u8
|
||||
#define PIX_HEX_FMT "%02x"
|
||||
#define bitfn(x) x##_8bpc
|
||||
#define PXSTRIDE(x) x
|
||||
#define BF(x, suffix) x##_8bpc_##suffix
|
||||
#define PXSTRIDE(x) (x)
|
||||
#define highbd_only(x)
|
||||
#define HIGHBD_DECL_SUFFIX /* nothing */
|
||||
#define HIGHBD_CALL_SUFFIX /* nothing */
|
||||
#define HIGHBD_TAIL_SUFFIX /* nothing */
|
||||
#define bitdepth_from_max(x) 8
|
||||
#define BITDEPTH_MAX 0xff
|
||||
#elif BITDEPTH == 16
|
||||
typedef uint16_t pixel;
|
||||
typedef int32_t coef;
|
||||
|
@ -69,8 +73,13 @@ static inline void pixel_set(pixel *const dst, const int val, const int num) {
|
|||
#define HIGHBD_CALL_SUFFIX , f->bitdepth_max
|
||||
#define HIGHBD_TAIL_SUFFIX , bitdepth_max
|
||||
#define bitdepth_from_max(bitdepth_max) (32 - clz(bitdepth_max))
|
||||
#define BITDEPTH_MAX bitdepth_max
|
||||
#define bitfn(x) x##_16bpc
|
||||
#define PXSTRIDE(x) (x >> 1)
|
||||
#define BF(x, suffix) x##_16bpc_##suffix
|
||||
static inline ptrdiff_t PXSTRIDE(const ptrdiff_t x) {
|
||||
assert(!(x & 1));
|
||||
return x >> 1;
|
||||
}
|
||||
#define highbd_only(x) x
|
||||
#else
|
||||
#error invalid value for bitdepth
|
||||
|
|
|
@ -318,8 +318,8 @@ typedef struct Dav1dFilmGrainData {
|
|||
int scaling_shift;
|
||||
int ar_coeff_lag;
|
||||
int8_t ar_coeffs_y[24];
|
||||
int8_t ar_coeffs_uv[2][25];
|
||||
int ar_coeff_shift;
|
||||
int8_t ar_coeffs_uv[2][25 + 3 /* padding for alignment purposes */];
|
||||
uint64_t ar_coeff_shift;
|
||||
int grain_scale_shift;
|
||||
int uv_mult[2];
|
||||
int uv_luma_mult[2];
|
||||
|
@ -329,13 +329,13 @@ typedef struct Dav1dFilmGrainData {
|
|||
} Dav1dFilmGrainData;
|
||||
|
||||
typedef struct Dav1dFrameHeader {
|
||||
struct {
|
||||
Dav1dFilmGrainData data;
|
||||
int present, update;
|
||||
} film_grain; ///< film grain parameters
|
||||
enum Dav1dFrameType frame_type; ///< type of the picture
|
||||
int width[2 /* { coded_width, superresolution_upscaled_width } */], height;
|
||||
int frame_offset; ///< frame number
|
||||
struct {
|
||||
int present, update;
|
||||
Dav1dFilmGrainData data;
|
||||
} film_grain; ///< film grain parameters
|
||||
int temporal_id, spatial_id; ///< spatial and temporal id of the frame for SVC
|
||||
|
||||
int show_existing_frame;
|
||||
|
|
|
@ -23,14 +23,14 @@
|
|||
# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
project('dav1d', ['c'],
|
||||
version: '0.5.2',
|
||||
version: '0.6.0',
|
||||
default_options: ['c_std=c99',
|
||||
'warning_level=2',
|
||||
'buildtype=release',
|
||||
'b_ndebug=if-release'],
|
||||
meson_version: '>= 0.47.0')
|
||||
|
||||
dav1d_soname_version = '3.1.0'
|
||||
dav1d_soname_version = '4.0.0'
|
||||
dav1d_api_version_array = dav1d_soname_version.split('.')
|
||||
dav1d_api_version_major = dav1d_api_version_array[0]
|
||||
dav1d_api_version_minor = dav1d_api_version_array[1]
|
||||
|
@ -84,13 +84,15 @@ test_args = []
|
|||
|
||||
optional_arguments = []
|
||||
|
||||
# Define _POSIX_C_SOURCE to POSIX.1–2001 (IEEE Std 1003.1-2001)
|
||||
test_args += '-D_POSIX_C_SOURCE=200112L'
|
||||
add_project_arguments('-D_POSIX_C_SOURCE=200112L', language: 'c')
|
||||
|
||||
if host_machine.system() == 'darwin'
|
||||
if host_machine.system() == 'linux'
|
||||
test_args += '-D_GNU_SOURCE'
|
||||
add_project_arguments('-D_GNU_SOURCE', language: 'c')
|
||||
elif host_machine.system() == 'darwin'
|
||||
test_args += '-D_DARWIN_C_SOURCE'
|
||||
add_project_arguments('-D_DARWIN_C_SOURCE', language: 'c')
|
||||
else
|
||||
test_args += '-D_POSIX_C_SOURCE=200112L'
|
||||
add_project_arguments('-D_POSIX_C_SOURCE=200112L', language: 'c')
|
||||
endif
|
||||
|
||||
if host_machine.system() == 'windows'
|
||||
|
@ -131,6 +133,15 @@ else
|
|||
endif
|
||||
endif
|
||||
|
||||
libdl_dependency = []
|
||||
if host_machine.system() == 'linux'
|
||||
libdl_dependency = cc.find_library('dl', required : false)
|
||||
if cc.has_function('dlsym', prefix : '#include <dlfcn.h>', args : test_args, dependencies : libdl_dependency)
|
||||
cdata.set('HAVE_DLSYM', 1)
|
||||
endif
|
||||
endif
|
||||
|
||||
|
||||
# Header checks
|
||||
|
||||
stdatomic_dependency = []
|
||||
|
@ -257,12 +268,12 @@ if host_machine.cpu_family().startswith('x86')
|
|||
if get_option('stack_alignment') > 0
|
||||
stack_alignment = get_option('stack_alignment')
|
||||
elif host_machine.cpu_family() == 'x86_64'
|
||||
if cc.has_argument('-mpreferred-stack-boundary=5')
|
||||
stackalign_flag = ['-mpreferred-stack-boundary=5']
|
||||
if cc.has_argument('-mpreferred-stack-boundary=6')
|
||||
stackalign_flag = ['-mpreferred-stack-boundary=6']
|
||||
stackrealign_flag = ['-mincoming-stack-boundary=4']
|
||||
stack_alignment = 32
|
||||
elif cc.has_argument('-mstack-alignment=32')
|
||||
stackalign_flag = ['-mstack-alignment=32']
|
||||
elif cc.has_argument('-mstack-alignment=64')
|
||||
stackalign_flag = ['-mstack-alignment=64']
|
||||
stackrealign_flag = ['-mstackrealign']
|
||||
stack_alignment = 32
|
||||
else
|
||||
|
@ -364,8 +375,8 @@ if is_asm_enabled and host_machine.cpu_family().startswith('x86')
|
|||
|
||||
out = nasm_r.stdout().strip().split()
|
||||
if out[1].to_lower() == 'version'
|
||||
if out[2].version_compare('<2.13.02')
|
||||
error('nasm 2.13.02 or later is required, found nasm @0@'.format(out[2]))
|
||||
if out[2].version_compare('<2.14')
|
||||
error('nasm 2.14 or later is required, found nasm @0@'.format(out[2]))
|
||||
endif
|
||||
else
|
||||
error('unexpected nasm version string: @0@'.format(nasm_r.stdout()))
|
||||
|
@ -390,7 +401,7 @@ if is_asm_enabled and host_machine.cpu_family().startswith('x86')
|
|||
depfile: '@BASENAME@.obj.ndep',
|
||||
arguments: [
|
||||
'-f', nasm_format,
|
||||
'-I', '@SOURCE_DIR@/src/',
|
||||
'-I', '@0@/src/'.format(meson.current_source_dir()),
|
||||
'-I', '@0@/'.format(meson.current_build_dir()),
|
||||
'-MQ', '@OUTPUT@', '-MF', '@DEPFILE@',
|
||||
'@EXTRA_ARGS@',
|
||||
|
|
|
@ -148,20 +148,22 @@
|
|||
.endif
|
||||
.endm
|
||||
|
||||
// void dav1d_cdef_paddingX_neon(uint16_t *tmp, const pixel *src,
|
||||
// ptrdiff_t src_stride, const pixel (*left)[2],
|
||||
// /*const*/ pixel *const top[2], int h,
|
||||
// enum CdefEdgeFlags edges);
|
||||
// void dav1d_cdef_paddingX_8bpc_neon(uint16_t *tmp, const pixel *src,
|
||||
// ptrdiff_t src_stride, const pixel (*left)[2],
|
||||
// const pixel *const top, int h,
|
||||
// enum CdefEdgeFlags edges);
|
||||
|
||||
// n1 = s0/d0
|
||||
// w1 = d0/q0
|
||||
// n2 = s4/d2
|
||||
// w2 = d2/q1
|
||||
.macro padding_func w, stride, n1, w1, n2, w2, align
|
||||
function cdef_padding\w\()_neon, export=1
|
||||
function cdef_padding\w\()_8bpc_neon, export=1
|
||||
push {r4-r7,lr}
|
||||
ldrd r4, r5, [sp, #20]
|
||||
ldr r6, [sp, #28]
|
||||
cmp r6, #0xf // fully edged
|
||||
beq cdef_padding\w\()_edged_8bpc_neon
|
||||
vmov.i16 q3, #0x8000
|
||||
tst r6, #4 // CDEF_HAVE_TOP
|
||||
bne 1f
|
||||
|
@ -175,10 +177,9 @@ function cdef_padding\w\()_neon, export=1
|
|||
b 3f
|
||||
1:
|
||||
// CDEF_HAVE_TOP
|
||||
ldr r7, [r4]
|
||||
ldr lr, [r4, #4]
|
||||
add r7, r4, r2
|
||||
sub r0, r0, #2*(2*\stride)
|
||||
pad_top_bottom r7, lr, \w, \stride, \n1, \w1, \n2, \w2, \align, 0
|
||||
pad_top_bottom r4, r7, \w, \stride, \n1, \w1, \n2, \w2, \align, 0
|
||||
|
||||
// Middle section
|
||||
3:
|
||||
|
@ -267,6 +268,65 @@ endfunc
|
|||
padding_func 8, 16, d0, q0, d2, q1, 128
|
||||
padding_func 4, 8, s0, d0, s4, d2, 64
|
||||
|
||||
// void cdef_paddingX_edged_8bpc_neon(uint16_t *tmp, const pixel *src,
|
||||
// ptrdiff_t src_stride, const pixel (*left)[2],
|
||||
// const pixel *const top, int h,
|
||||
// enum CdefEdgeFlags edges);
|
||||
|
||||
.macro padding_func_edged w, stride, reg, align
|
||||
function cdef_padding\w\()_edged_8bpc_neon
|
||||
sub r0, r0, #(2*\stride)
|
||||
|
||||
ldrh r12, [r4, #-2]
|
||||
vldr \reg, [r4]
|
||||
add r7, r4, r2
|
||||
strh r12, [r0, #-2]
|
||||
ldrh r12, [r4, #\w]
|
||||
vstr \reg, [r0]
|
||||
strh r12, [r0, #\w]
|
||||
|
||||
ldrh r12, [r7, #-2]
|
||||
vldr \reg, [r7]
|
||||
strh r12, [r0, #\stride-2]
|
||||
ldrh r12, [r7, #\w]
|
||||
vstr \reg, [r0, #\stride]
|
||||
strh r12, [r0, #\stride+\w]
|
||||
add r0, r0, #2*\stride
|
||||
|
||||
0:
|
||||
ldrh r12, [r3], #2
|
||||
vldr \reg, [r1]
|
||||
str r12, [r0, #-2]
|
||||
ldrh r12, [r1, #\w]
|
||||
add r1, r1, r2
|
||||
subs r5, r5, #1
|
||||
vstr \reg, [r0]
|
||||
str r12, [r0, #\w]
|
||||
add r0, r0, #\stride
|
||||
bgt 0b
|
||||
|
||||
ldrh r12, [r1, #-2]
|
||||
vldr \reg, [r1]
|
||||
add r7, r1, r2
|
||||
strh r12, [r0, #-2]
|
||||
ldrh r12, [r1, #\w]
|
||||
vstr \reg, [r0]
|
||||
strh r12, [r0, #\w]
|
||||
|
||||
ldrh r12, [r7, #-2]
|
||||
vldr \reg, [r7]
|
||||
strh r12, [r0, #\stride-2]
|
||||
ldrh r12, [r7, #\w]
|
||||
vstr \reg, [r0, #\stride]
|
||||
strh r12, [r0, #\stride+\w]
|
||||
|
||||
pop {r4-r7,pc}
|
||||
endfunc
|
||||
.endm
|
||||
|
||||
padding_func_edged 8, 16, d0, 64
|
||||
padding_func_edged 4, 8, s0, 32
|
||||
|
||||
.macro dir_table w, stride
|
||||
const directions\w
|
||||
.byte -1 * \stride + 1, -2 * \stride + 2
|
||||
|
@ -311,14 +371,13 @@ endconst
|
|||
vld1.16 {\d22}, [r9] // p1
|
||||
.endif
|
||||
.endm
|
||||
.macro handle_pixel s1, s2, threshold, thresh_vec, shift, tap
|
||||
cmp \threshold, #0
|
||||
.macro handle_pixel s1, s2, thresh_vec, shift, tap, min
|
||||
.if \min
|
||||
vmin.u16 q2, q2, \s1
|
||||
vmax.s16 q3, q3, \s1
|
||||
vmin.u16 q2, q2, \s2
|
||||
vmax.s16 q3, q3, \s2
|
||||
|
||||
beq 3f
|
||||
.endif
|
||||
vabd.u16 q8, q0, \s1 // abs(diff)
|
||||
vabd.u16 q11, q0, \s2 // abs(diff)
|
||||
vshl.u16 q9, q8, \shift // abs(diff) >> shift
|
||||
|
@ -326,7 +385,7 @@ endconst
|
|||
vqsub.u16 q9, \thresh_vec, q9 // clip = imax(0, threshold - (abs(diff) >> shift))
|
||||
vqsub.u16 q12, \thresh_vec, q12// clip = imax(0, threshold - (abs(diff) >> shift))
|
||||
vsub.i16 q10, \s1, q0 // diff = p0 - px
|
||||
vsub.u16 q13, \s2, q0 // diff = p1 - px
|
||||
vsub.i16 q13, \s2, q0 // diff = p1 - px
|
||||
vneg.s16 q8, q9 // -clip
|
||||
vneg.s16 q11, q12 // -clip
|
||||
vmin.s16 q10, q10, q9 // imin(diff, clip)
|
||||
|
@ -336,36 +395,44 @@ endconst
|
|||
vmax.s16 q13, q13, q11 // constrain() = imax(imin(diff, clip), -clip)
|
||||
vmla.i16 q1, q10, q9 // sum += taps[k] * constrain()
|
||||
vmla.i16 q1, q13, q9 // sum += taps[k] * constrain()
|
||||
3:
|
||||
.endm
|
||||
|
||||
// void dav1d_cdef_filterX_neon(pixel *dst, ptrdiff_t dst_stride,
|
||||
// const uint16_t *tmp, int pri_strength,
|
||||
// int sec_strength, int dir, int damping, int h);
|
||||
.macro filter w
|
||||
function cdef_filter\w\()_neon, export=1
|
||||
push {r4-r9,lr}
|
||||
vpush {q4-q7}
|
||||
ldrd r4, r5, [sp, #92]
|
||||
ldrd r6, r7, [sp, #100]
|
||||
// void dav1d_cdef_filterX_8bpc_neon(pixel *dst, ptrdiff_t dst_stride,
|
||||
// const uint16_t *tmp, int pri_strength,
|
||||
// int sec_strength, int dir, int damping,
|
||||
// int h, size_t edges);
|
||||
.macro filter_func w, pri, sec, min, suffix
|
||||
function cdef_filter\w\suffix\()_neon
|
||||
cmp r8, #0xf
|
||||
beq cdef_filter\w\suffix\()_edged_neon
|
||||
.if \pri
|
||||
movrel_local r8, pri_taps
|
||||
and r9, r3, #1
|
||||
add r8, r8, r9, lsl #1
|
||||
.endif
|
||||
movrel_local r9, directions\w
|
||||
add r5, r9, r5, lsl #1
|
||||
vmov.u16 d17, #15
|
||||
vdup.16 d16, r6 // damping
|
||||
|
||||
.if \pri
|
||||
vdup.16 q5, r3 // threshold
|
||||
.endif
|
||||
.if \sec
|
||||
vdup.16 q7, r4 // threshold
|
||||
.endif
|
||||
vmov.16 d8[0], r3
|
||||
vmov.16 d8[1], r4
|
||||
vclz.i16 d8, d8 // clz(threshold)
|
||||
vsub.i16 d8, d17, d8 // ulog2(threshold)
|
||||
vqsub.u16 d8, d16, d8 // shift = imax(0, damping - ulog2(threshold))
|
||||
vneg.s16 d8, d8 // -shift
|
||||
.if \sec
|
||||
vdup.16 q6, d8[1]
|
||||
.endif
|
||||
.if \pri
|
||||
vdup.16 q4, d8[0]
|
||||
.endif
|
||||
|
||||
1:
|
||||
.if \w == 8
|
||||
|
@ -377,47 +444,64 @@ function cdef_filter\w\()_neon, export=1
|
|||
.endif
|
||||
|
||||
vmov.u16 q1, #0 // sum
|
||||
.if \min
|
||||
vmov.u16 q2, q0 // min
|
||||
vmov.u16 q3, q0 // max
|
||||
.endif
|
||||
|
||||
// Instead of loading sec_taps 2, 1 from memory, just set it
|
||||
// to 2 initially and decrease for the second round.
|
||||
// This is also used as loop counter.
|
||||
mov lr, #2 // sec_taps[0]
|
||||
|
||||
2:
|
||||
.if \pri
|
||||
ldrsb r9, [r5] // off1
|
||||
|
||||
load_px d28, d29, d30, d31, \w
|
||||
.endif
|
||||
|
||||
.if \sec
|
||||
add r5, r5, #4 // +2*2
|
||||
ldrsb r9, [r5] // off2
|
||||
.endif
|
||||
|
||||
.if \pri
|
||||
ldrb r12, [r8] // *pri_taps
|
||||
|
||||
handle_pixel q14, q15, r3, q5, q4, r12
|
||||
handle_pixel q14, q15, q5, q4, r12, \min
|
||||
.endif
|
||||
|
||||
.if \sec
|
||||
load_px d28, d29, d30, d31, \w
|
||||
|
||||
add r5, r5, #8 // +2*4
|
||||
ldrsb r9, [r5] // off3
|
||||
|
||||
handle_pixel q14, q15, r4, q7, q6, lr
|
||||
handle_pixel q14, q15, q7, q6, lr, \min
|
||||
|
||||
load_px d28, d29, d30, d31, \w
|
||||
|
||||
handle_pixel q14, q15, r4, q7, q6, lr
|
||||
handle_pixel q14, q15, q7, q6, lr, \min
|
||||
|
||||
sub r5, r5, #11 // x8 -= 2*(2+4); x8 += 1;
|
||||
sub r5, r5, #11 // r5 -= 2*(2+4); r5 += 1;
|
||||
.else
|
||||
add r5, r5, #1 // r5 += 1
|
||||
.endif
|
||||
subs lr, lr, #1 // sec_tap-- (value)
|
||||
.if \pri
|
||||
add r8, r8, #1 // pri_taps++ (pointer)
|
||||
.endif
|
||||
bne 2b
|
||||
|
||||
vshr.s16 q14, q1, #15 // -(sum < 0)
|
||||
vadd.i16 q1, q1, q14 // sum - (sum < 0)
|
||||
vrshr.s16 q1, q1, #4 // (8 + sum - (sum < 0)) >> 4
|
||||
vadd.i16 q0, q0, q1 // px + (8 + sum ...) >> 4
|
||||
.if \min
|
||||
vmin.s16 q0, q0, q3
|
||||
vmax.s16 q0, q0, q2 // iclip(px + .., min, max)
|
||||
.endif
|
||||
vmovn.u16 d0, q0
|
||||
.if \w == 8
|
||||
add r2, r2, #2*16 // tmp += tmp_stride
|
||||
|
@ -430,9 +514,11 @@ function cdef_filter\w\()_neon, export=1
|
|||
vst1.32 {d0[1]}, [r0, :32], r1
|
||||
.endif
|
||||
|
||||
// Reset pri_taps/sec_taps back to the original point
|
||||
// Reset pri_taps and directions back to the original point
|
||||
sub r5, r5, #2
|
||||
.if \pri
|
||||
sub r8, r8, #2
|
||||
.endif
|
||||
|
||||
bgt 1b
|
||||
vpop {q4-q7}
|
||||
|
@ -440,9 +526,237 @@ function cdef_filter\w\()_neon, export=1
|
|||
endfunc
|
||||
.endm
|
||||
|
||||
.macro filter w
|
||||
filter_func \w, pri=1, sec=0, min=0, suffix=_pri
|
||||
filter_func \w, pri=0, sec=1, min=0, suffix=_sec
|
||||
filter_func \w, pri=1, sec=1, min=1, suffix=_pri_sec
|
||||
|
||||
function cdef_filter\w\()_8bpc_neon, export=1
|
||||
push {r4-r9,lr}
|
||||
vpush {q4-q7}
|
||||
ldrd r4, r5, [sp, #92]
|
||||
ldrd r6, r7, [sp, #100]
|
||||
ldr r8, [sp, #108]
|
||||
cmp r3, #0 // pri_strength
|
||||
bne 1f
|
||||
b cdef_filter\w\()_sec_neon // only sec
|
||||
1:
|
||||
cmp r4, #0 // sec_strength
|
||||
bne 1f
|
||||
b cdef_filter\w\()_pri_neon // only pri
|
||||
1:
|
||||
b cdef_filter\w\()_pri_sec_neon // both pri and sec
|
||||
endfunc
|
||||
.endm
|
||||
|
||||
filter 8
|
||||
filter 4
|
||||
|
||||
.macro load_px_8 d11, d12, d21, d22, w
|
||||
.if \w == 8
|
||||
add r6, r2, r9 // x + off
|
||||
sub r9, r2, r9 // x - off
|
||||
vld1.8 {\d11}, [r6] // p0
|
||||
add r6, r6, #16 // += stride
|
||||
vld1.8 {\d21}, [r9] // p1
|
||||
add r9, r9, #16 // += stride
|
||||
vld1.8 {\d12}, [r6] // p0
|
||||
vld1.8 {\d22}, [r9] // p1
|
||||
.else
|
||||
add r6, r2, r9 // x + off
|
||||
sub r9, r2, r9 // x - off
|
||||
vld1.32 {\d11[0]}, [r6] // p0
|
||||
add r6, r6, #8 // += stride
|
||||
vld1.32 {\d21[0]}, [r9] // p1
|
||||
add r9, r9, #8 // += stride
|
||||
vld1.32 {\d11[1]}, [r6] // p0
|
||||
add r6, r6, #8 // += stride
|
||||
vld1.32 {\d21[1]}, [r9] // p1
|
||||
add r9, r9, #8 // += stride
|
||||
vld1.32 {\d12[0]}, [r6] // p0
|
||||
add r6, r6, #8 // += stride
|
||||
vld1.32 {\d22[0]}, [r9] // p1
|
||||
add r9, r9, #8 // += stride
|
||||
vld1.32 {\d12[1]}, [r6] // p0
|
||||
vld1.32 {\d22[1]}, [r9] // p1
|
||||
.endif
|
||||
.endm
|
||||
.macro handle_pixel_8 s1, s2, thresh_vec, shift, tap, min
|
||||
.if \min
|
||||
vmin.u8 q3, q3, \s1
|
||||
vmax.u8 q4, q4, \s1
|
||||
vmin.u8 q3, q3, \s2
|
||||
vmax.u8 q4, q4, \s2
|
||||
.endif
|
||||
vabd.u8 q8, q0, \s1 // abs(diff)
|
||||
vabd.u8 q11, q0, \s2 // abs(diff)
|
||||
vshl.u8 q9, q8, \shift // abs(diff) >> shift
|
||||
vshl.u8 q12, q11, \shift // abs(diff) >> shift
|
||||
vqsub.u8 q9, \thresh_vec, q9 // clip = imax(0, threshold - (abs(diff) >> shift))
|
||||
vqsub.u8 q12, \thresh_vec, q12// clip = imax(0, threshold - (abs(diff) >> shift))
|
||||
vcgt.u8 q10, q0, \s1 // px > p0
|
||||
vcgt.u8 q13, q0, \s2 // px > p1
|
||||
vmin.u8 q9, q9, q8 // imin(abs(diff), clip)
|
||||
vmin.u8 q12, q12, q11 // imin(abs(diff), clip)
|
||||
vneg.s8 q8, q9 // -imin()
|
||||
vneg.s8 q11, q12 // -imin()
|
||||
vbsl q10, q8, q9 // constrain() = imax(imin(diff, clip), -clip)
|
||||
vdup.8 d18, \tap // taps[k]
|
||||
vbsl q13, q11, q12 // constrain() = imax(imin(diff, clip), -clip)
|
||||
vmlal.s8 q1, d20, d18 // sum += taps[k] * constrain()
|
||||
vmlal.s8 q1, d26, d18 // sum += taps[k] * constrain()
|
||||
vmlal.s8 q2, d21, d18 // sum += taps[k] * constrain()
|
||||
vmlal.s8 q2, d27, d18 // sum += taps[k] * constrain()
|
||||
.endm
|
||||
|
||||
// void cdef_filterX_edged_neon(pixel *dst, ptrdiff_t dst_stride,
|
||||
// const uint16_t *tmp, int pri_strength,
|
||||
// int sec_strength, int dir, int damping,
|
||||
// int h, size_t edges);
|
||||
.macro filter_func_8 w, pri, sec, min, suffix
|
||||
function cdef_filter\w\suffix\()_edged_neon
|
||||
.if \pri
|
||||
movrel_local r8, pri_taps
|
||||
and r9, r3, #1
|
||||
add r8, r8, r9, lsl #1
|
||||
.endif
|
||||
movrel_local r9, directions\w
|
||||
add r5, r9, r5, lsl #1
|
||||
vmov.u8 d17, #7
|
||||
vdup.8 d16, r6 // damping
|
||||
|
||||
vmov.8 d8[0], r3
|
||||
vmov.8 d8[1], r4
|
||||
vclz.i8 d8, d8 // clz(threshold)
|
||||
vsub.i8 d8, d17, d8 // ulog2(threshold)
|
||||
vqsub.u8 d8, d16, d8 // shift = imax(0, damping - ulog2(threshold))
|
||||
vneg.s8 d8, d8 // -shift
|
||||
.if \sec
|
||||
vdup.8 q6, d8[1]
|
||||
.endif
|
||||
.if \pri
|
||||
vdup.8 q5, d8[0]
|
||||
.endif
|
||||
|
||||
1:
|
||||
.if \w == 8
|
||||
add r12, r2, #16
|
||||
vld1.8 {d0}, [r2, :64] // px
|
||||
vld1.8 {d1}, [r12, :64] // px
|
||||
.else
|
||||
add r12, r2, #8
|
||||
vld1.32 {d0[0]}, [r2, :32] // px
|
||||
add r9, r2, #2*8
|
||||
vld1.32 {d0[1]}, [r12, :32] // px
|
||||
add r12, r12, #2*8
|
||||
vld1.32 {d1[0]}, [r9, :32] // px
|
||||
vld1.32 {d1[1]}, [r12, :32] // px
|
||||
.endif
|
||||
|
||||
vmov.u8 q1, #0 // sum
|
||||
vmov.u8 q2, #0 // sum
|
||||
.if \min
|
||||
vmov.u16 q3, q0 // min
|
||||
vmov.u16 q4, q0 // max
|
||||
.endif
|
||||
|
||||
// Instead of loading sec_taps 2, 1 from memory, just set it
|
||||
// to 2 initially and decrease for the second round.
|
||||
// This is also used as loop counter.
|
||||
mov lr, #2 // sec_taps[0]
|
||||
|
||||
2:
|
||||
.if \pri
|
||||
ldrsb r9, [r5] // off1
|
||||
|
||||
load_px_8 d28, d29, d30, d31, \w
|
||||
.endif
|
||||
|
||||
.if \sec
|
||||
add r5, r5, #4 // +2*2
|
||||
ldrsb r9, [r5] // off2
|
||||
.endif
|
||||
|
||||
.if \pri
|
||||
ldrb r12, [r8] // *pri_taps
|
||||
vdup.8 q7, r3 // threshold
|
||||
|
||||
handle_pixel_8 q14, q15, q7, q5, r12, \min
|
||||
.endif
|
||||
|
||||
.if \sec
|
||||
load_px_8 d28, d29, d30, d31, \w
|
||||
|
||||
add r5, r5, #8 // +2*4
|
||||
ldrsb r9, [r5] // off3
|
||||
|
||||
vdup.8 q7, r4 // threshold
|
||||
|
||||
handle_pixel_8 q14, q15, q7, q6, lr, \min
|
||||
|
||||
load_px_8 d28, d29, d30, d31, \w
|
||||
|
||||
handle_pixel_8 q14, q15, q7, q6, lr, \min
|
||||
|
||||
sub r5, r5, #11 // r5 -= 2*(2+4); r5 += 1;
|
||||
.else
|
||||
add r5, r5, #1 // r5 += 1
|
||||
.endif
|
||||
subs lr, lr, #1 // sec_tap-- (value)
|
||||
.if \pri
|
||||
add r8, r8, #1 // pri_taps++ (pointer)
|
||||
.endif
|
||||
bne 2b
|
||||
|
||||
vshr.s16 q14, q1, #15 // -(sum < 0)
|
||||
vshr.s16 q15, q2, #15 // -(sum < 0)
|
||||
vadd.i16 q1, q1, q14 // sum - (sum < 0)
|
||||
vadd.i16 q2, q2, q15 // sum - (sum < 0)
|
||||
vrshr.s16 q1, q1, #4 // (8 + sum - (sum < 0)) >> 4
|
||||
vrshr.s16 q2, q2, #4 // (8 + sum - (sum < 0)) >> 4
|
||||
vaddw.u8 q1, q1, d0 // px + (8 + sum ...) >> 4
|
||||
vaddw.u8 q2, q2, d1 // px + (8 + sum ...) >> 4
|
||||
vqmovun.s16 d0, q1
|
||||
vqmovun.s16 d1, q2
|
||||
.if \min
|
||||
vmin.u8 q0, q0, q4
|
||||
vmax.u8 q0, q0, q3 // iclip(px + .., min, max)
|
||||
.endif
|
||||
.if \w == 8
|
||||
vst1.8 {d0}, [r0, :64], r1
|
||||
add r2, r2, #2*16 // tmp += 2*tmp_stride
|
||||
subs r7, r7, #2 // h -= 2
|
||||
vst1.8 {d1}, [r0, :64], r1
|
||||
.else
|
||||
vst1.32 {d0[0]}, [r0, :32], r1
|
||||
add r2, r2, #4*8 // tmp += 4*tmp_stride
|
||||
vst1.32 {d0[1]}, [r0, :32], r1
|
||||
subs r7, r7, #4 // h -= 4
|
||||
vst1.32 {d1[0]}, [r0, :32], r1
|
||||
vst1.32 {d1[1]}, [r0, :32], r1
|
||||
.endif
|
||||
|
||||
// Reset pri_taps and directions back to the original point
|
||||
sub r5, r5, #2
|
||||
.if \pri
|
||||
sub r8, r8, #2
|
||||
.endif
|
||||
|
||||
bgt 1b
|
||||
vpop {q4-q7}
|
||||
pop {r4-r9,pc}
|
||||
endfunc
|
||||
.endm
|
||||
|
||||
.macro filter_8 w
|
||||
filter_func_8 \w, pri=1, sec=0, min=0, suffix=_pri
|
||||
filter_func_8 \w, pri=0, sec=1, min=0, suffix=_sec
|
||||
filter_func_8 \w, pri=1, sec=1, min=1, suffix=_pri_sec
|
||||
.endm
|
||||
|
||||
filter_8 8
|
||||
filter_8 4
|
||||
|
||||
const div_table, align=4
|
||||
.short 840, 420, 280, 210, 168, 140, 120, 105
|
||||
endconst
|
||||
|
@ -451,9 +765,9 @@ const alt_fact, align=4
|
|||
.short 420, 210, 140, 105, 105, 105, 105, 105, 140, 210, 420, 0
|
||||
endconst
|
||||
|
||||
// int dav1d_cdef_find_dir_neon(const pixel *img, const ptrdiff_t stride,
|
||||
// unsigned *const var)
|
||||
function cdef_find_dir_neon, export=1
|
||||
// int dav1d_cdef_find_dir_8bpc_neon(const pixel *img, const ptrdiff_t stride,
|
||||
// unsigned *const var)
|
||||
function cdef_find_dir_8bpc_neon, export=1
|
||||
push {lr}
|
||||
vpush {q4-q7}
|
||||
sub sp, sp, #32 // cost
|
||||
|
|
|
@ -143,8 +143,8 @@ function lpf_8_wd\wd\()_neon
|
|||
vaddw.s8 q1, q1, d4
|
||||
vmov.i8 d7, #3
|
||||
vqmovn.s16 d2, q1 // f
|
||||
vqadd.s8 d4, d6, d2 // imin(f + 4, 128)
|
||||
vqadd.s8 d5, d7, d2 // imin(f + 3, 128)
|
||||
vqadd.s8 d4, d6, d2 // imin(f + 4, 127)
|
||||
vqadd.s8 d5, d7, d2 // imin(f + 3, 127)
|
||||
vshr.s8 d4, d4, #3 // f1
|
||||
vshr.s8 d5, d5, #3 // f2
|
||||
vmovl.u8 q1, d23 // p0
|
||||
|
@ -734,13 +734,13 @@ function lpf_h_16_8_neon
|
|||
bx r12
|
||||
endfunc
|
||||
|
||||
// void dav1d_lpf_v_sb_y_neon(pixel *dst, const ptrdiff_t stride,
|
||||
// const uint32_t *const vmask,
|
||||
// const uint8_t (*l)[4], ptrdiff_t b4_stride,
|
||||
// const Av1FilterLUT *lut, const int w)
|
||||
// void dav1d_lpf_v_sb_y_8bpc_neon(pixel *dst, const ptrdiff_t stride,
|
||||
// const uint32_t *const vmask,
|
||||
// const uint8_t (*l)[4], ptrdiff_t b4_stride,
|
||||
// const Av1FilterLUT *lut, const int w)
|
||||
|
||||
.macro lpf_func dir, type
|
||||
function lpf_\dir\()_sb_\type\()_neon, export=1
|
||||
function lpf_\dir\()_sb_\type\()_8bpc_neon, export=1
|
||||
push {r4-r11,lr}
|
||||
vpush {q4-q7}
|
||||
ldrd r4, r5, [sp, #100]
|
||||
|
|
|
@ -28,11 +28,11 @@
|
|||
#include "src/arm/asm.S"
|
||||
#include "util.S"
|
||||
|
||||
// void dav1d_wiener_filter_h_neon(int16_t *dst, const pixel (*left)[4],
|
||||
// const pixel *src, ptrdiff_t stride,
|
||||
// const int16_t fh[7], const intptr_t w,
|
||||
// int h, enum LrEdgeFlags edges);
|
||||
function wiener_filter_h_neon, export=1
|
||||
// void dav1d_wiener_filter_h_8bpc_neon(int16_t *dst, const pixel (*left)[4],
|
||||
// const pixel *src, ptrdiff_t stride,
|
||||
// const int16_t fh[7], const intptr_t w,
|
||||
// int h, enum LrEdgeFlags edges);
|
||||
function wiener_filter_h_8bpc_neon, export=1
|
||||
push {r4-r11,lr}
|
||||
vpush {q4}
|
||||
ldrd r4, r5, [sp, #52]
|
||||
|
@ -367,11 +367,11 @@ L(variable_shift_tbl):
|
|||
.purgem filter_4
|
||||
endfunc
|
||||
|
||||
// void dav1d_wiener_filter_v_neon(pixel *dst, ptrdiff_t stride,
|
||||
// const int16_t *mid, int w, int h,
|
||||
// const int16_t fv[7], enum LrEdgeFlags edges,
|
||||
// ptrdiff_t mid_stride);
|
||||
function wiener_filter_v_neon, export=1
|
||||
// void dav1d_wiener_filter_v_8bpc_neon(pixel *dst, ptrdiff_t stride,
|
||||
// const int16_t *mid, int w, int h,
|
||||
// const int16_t fv[7], enum LrEdgeFlags edges,
|
||||
// ptrdiff_t mid_stride);
|
||||
function wiener_filter_v_8bpc_neon, export=1
|
||||
push {r4-r7,lr}
|
||||
ldrd r4, r5, [sp, #20]
|
||||
ldrd r6, r7, [sp, #28]
|
||||
|
@ -548,9 +548,9 @@ function wiener_filter_v_neon, export=1
|
|||
.purgem filter
|
||||
endfunc
|
||||
|
||||
// void dav1d_copy_narrow_neon(pixel *dst, ptrdiff_t stride,
|
||||
// const pixel *src, int w, int h);
|
||||
function copy_narrow_neon, export=1
|
||||
// void dav1d_copy_narrow_8bpc_neon(pixel *dst, ptrdiff_t stride,
|
||||
// const pixel *src, int w, int h);
|
||||
function copy_narrow_8bpc_neon, export=1
|
||||
push {r4,lr}
|
||||
ldr r4, [sp, #8]
|
||||
adr r12, L(copy_narrow_tbl)
|
||||
|
@ -687,12 +687,12 @@ endfunc
|
|||
|
||||
#define SUM_STRIDE (384+16)
|
||||
|
||||
// void dav1d_sgr_box3_h_neon(int32_t *sumsq, int16_t *sum,
|
||||
// const pixel (*left)[4],
|
||||
// const pixel *src, const ptrdiff_t stride,
|
||||
// const int w, const int h,
|
||||
// const enum LrEdgeFlags edges);
|
||||
function sgr_box3_h_neon, export=1
|
||||
// void dav1d_sgr_box3_h_8bpc_neon(int32_t *sumsq, int16_t *sum,
|
||||
// const pixel (*left)[4],
|
||||
// const pixel *src, const ptrdiff_t stride,
|
||||
// const int w, const int h,
|
||||
// const enum LrEdgeFlags edges);
|
||||
function sgr_box3_h_8bpc_neon, export=1
|
||||
push {r4-r11,lr}
|
||||
vpush {q4-q7}
|
||||
ldrd r4, r5, [sp, #100]
|
||||
|
@ -925,11 +925,11 @@ L(box3_variable_shift_tbl):
|
|||
vmull.u8 q6, d9, d9
|
||||
|
||||
add3 4
|
||||
subs r5, r5, #4
|
||||
vst1.16 {d6}, [r1, :64]!
|
||||
vst1.16 {d14}, [r11, :64]!
|
||||
vst1.32 {q12}, [r0, :128]!
|
||||
vst1.32 {q8}, [r10, :128]!
|
||||
subs r5, r5, #4
|
||||
ble 9f
|
||||
vext.8 q0, q0, q0, #4
|
||||
vext.8 q1, q1, q2, #8
|
||||
|
@ -961,12 +961,12 @@ L(box3_variable_shift_tbl):
|
|||
.purgem add3
|
||||
endfunc
|
||||
|
||||
// void dav1d_sgr_box5_h_neon(int32_t *sumsq, int16_t *sum,
|
||||
// const pixel (*left)[4],
|
||||
// const pixel *src, const ptrdiff_t stride,
|
||||
// const int w, const int h,
|
||||
// const enum LrEdgeFlags edges);
|
||||
function sgr_box5_h_neon, export=1
|
||||
// void dav1d_sgr_box5_h_8bpc_neon(int32_t *sumsq, int16_t *sum,
|
||||
// const pixel (*left)[4],
|
||||
// const pixel *src, const ptrdiff_t stride,
|
||||
// const int w, const int h,
|
||||
// const enum LrEdgeFlags edges);
|
||||
function sgr_box5_h_8bpc_neon, export=1
|
||||
push {r4-r11,lr}
|
||||
vpush {q4-q7}
|
||||
ldrd r4, r5, [sp, #100]
|
||||
|
@ -1038,7 +1038,7 @@ function sgr_box5_h_neon, export=1
|
|||
b 2f
|
||||
0:
|
||||
// !LR_HAVE_LEFT, fill q1 with the leftmost byte
|
||||
// and shift q0 to have 2x the first byte at the front.
|
||||
// and shift q0 to have 3x the first byte at the front.
|
||||
vdup.8 q1, d0[0]
|
||||
vdup.8 q5, d8[0]
|
||||
// Move r3 back to account for the last 3 bytes we loaded before,
|
||||
|
@ -1215,11 +1215,11 @@ L(box5_variable_shift_tbl):
|
|||
vmull.u8 q6, d9, d9
|
||||
|
||||
add5 4
|
||||
subs r5, r5, #4
|
||||
vst1.16 {d6}, [r1, :64]!
|
||||
vst1.16 {d14}, [r11, :64]!
|
||||
vst1.32 {q12}, [r0, :128]!
|
||||
vst1.32 {q10}, [r10, :128]!
|
||||
subs r5, r5, #4
|
||||
ble 9f
|
||||
vext.8 q0, q0, q0, #4
|
||||
vext.8 q1, q1, q2, #8
|
||||
|
@ -1661,11 +1661,11 @@ endfunc
|
|||
|
||||
#define FILTER_OUT_STRIDE 384
|
||||
|
||||
// void dav1d_sgr_finish_filter1_neon(coef *tmp,
|
||||
// const pixel *src, const ptrdiff_t stride,
|
||||
// const int32_t *a, const int16_t *b,
|
||||
// const int w, const int h);
|
||||
function sgr_finish_filter1_neon, export=1
|
||||
// void dav1d_sgr_finish_filter1_8bpc_neon(int16_t *tmp,
|
||||
// const pixel *src, const ptrdiff_t stride,
|
||||
// const int32_t *a, const int16_t *b,
|
||||
// const int w, const int h);
|
||||
function sgr_finish_filter1_8bpc_neon, export=1
|
||||
push {r4-r11,lr}
|
||||
vpush {q4-q7}
|
||||
ldrd r4, r5, [sp, #100]
|
||||
|
@ -1765,11 +1765,11 @@ function sgr_finish_filter1_neon, export=1
|
|||
pop {r4-r11,pc}
|
||||
endfunc
|
||||
|
||||
// void dav1d_sgr_finish_filter2_neon(coef *tmp,
|
||||
// const pixel *src, const ptrdiff_t stride,
|
||||
// const int32_t *a, const int16_t *b,
|
||||
// const int w, const int h);
|
||||
function sgr_finish_filter2_neon, export=1
|
||||
// void dav1d_sgr_finish_filter2_8bpc_neon(int16_t *tmp,
|
||||
// const pixel *src, const ptrdiff_t stride,
|
||||
// const int32_t *a, const int16_t *b,
|
||||
// const int w, const int h);
|
||||
function sgr_finish_filter2_8bpc_neon, export=1
|
||||
push {r4-r11,lr}
|
||||
vpush {q4-q7}
|
||||
ldrd r4, r5, [sp, #100]
|
||||
|
@ -1925,11 +1925,11 @@ function sgr_finish_filter2_neon, export=1
|
|||
pop {r4-r11,pc}
|
||||
endfunc
|
||||
|
||||
// void dav1d_sgr_weighted1_neon(pixel *dst, const ptrdiff_t dst_stride,
|
||||
// const pixel *src, const ptrdiff_t src_stride,
|
||||
// const coef *t1, const int w, const int h,
|
||||
// const int wt);
|
||||
function sgr_weighted1_neon, export=1
|
||||
// void dav1d_sgr_weighted1_8bpc_neon(pixel *dst, const ptrdiff_t dst_stride,
|
||||
// const pixel *src, const ptrdiff_t src_stride,
|
||||
// const int16_t *t1, const int w, const int h,
|
||||
// const int wt);
|
||||
function sgr_weighted1_8bpc_neon, export=1
|
||||
push {r4-r9,lr}
|
||||
ldrd r4, r5, [sp, #28]
|
||||
ldrd r6, r7, [sp, #36]
|
||||
|
@ -2009,12 +2009,12 @@ function sgr_weighted1_neon, export=1
|
|||
pop {r4-r9,pc}
|
||||
endfunc
|
||||
|
||||
// void dav1d_sgr_weighted2_neon(pixel *dst, const ptrdiff_t stride,
|
||||
// const pixel *src, const ptrdiff_t src_stride,
|
||||
// const coef *t1, const coef *t2,
|
||||
// const int w, const int h,
|
||||
// const int16_t wt[2]);
|
||||
function sgr_weighted2_neon, export=1
|
||||
// void dav1d_sgr_weighted2_8bpc_neon(pixel *dst, const ptrdiff_t stride,
|
||||
// const pixel *src, const ptrdiff_t src_stride,
|
||||
// const int16_t *t1, const int16_t *t2,
|
||||
// const int w, const int h,
|
||||
// const int16_t wt[2]);
|
||||
function sgr_weighted2_8bpc_neon, export=1
|
||||
push {r4-r11,lr}
|
||||
ldrd r4, r5, [sp, #36]
|
||||
ldrd r6, r7, [sp, #44]
|
||||
|
|
|
@ -753,7 +753,7 @@ L(blend_v_tbl):
|
|||
add r12, r0, r1
|
||||
lsl r1, r1, #1
|
||||
vsub.i8 d5, d22, d4
|
||||
sub r1, r1, #3
|
||||
sub r1, r1, #2
|
||||
4:
|
||||
vld1.u8 {d2}, [r2, :64]!
|
||||
vld1.32 {d0[]}, [r0, :32]
|
||||
|
@ -764,10 +764,8 @@ L(blend_v_tbl):
|
|||
vrshrn.i16 d20, q3, #6
|
||||
vst1.16 {d20[0]}, [r0, :16]!
|
||||
vst1.16 {d20[2]}, [r12, :16]!
|
||||
vst1.8 {d20[2]}, [r0]!
|
||||
vst1.8 {d20[6]}, [r12]!
|
||||
add r0, r0, r1
|
||||
add r12, r12, r1
|
||||
vst1.8 {d20[2]}, [r0], r1
|
||||
vst1.8 {d20[6]}, [r12], r1
|
||||
bgt 4b
|
||||
pop {r4-r5,pc}
|
||||
80:
|
||||
|
@ -776,7 +774,7 @@ L(blend_v_tbl):
|
|||
add r12, r0, r1
|
||||
lsl r1, r1, #1
|
||||
vsub.i8 d17, d16, d2
|
||||
sub r1, r1, #6
|
||||
sub r1, r1, #4
|
||||
8:
|
||||
vld1.u8 {d4, d5}, [r2, :128]!
|
||||
vld1.u8 {d0}, [r0, :64]
|
||||
|
@ -790,10 +788,8 @@ L(blend_v_tbl):
|
|||
vrshrn.i16 d23, q10, #6
|
||||
vst1.32 {d22[0]}, [r0, :32]!
|
||||
vst1.32 {d23[0]}, [r12, :32]!
|
||||
vst1.16 {d22[2]}, [r0, :16]!
|
||||
vst1.16 {d23[2]}, [r12, :16]!
|
||||
add r0, r0, r1
|
||||
add r12, r12, r1
|
||||
vst1.16 {d22[2]}, [r0, :16], r1
|
||||
vst1.16 {d23[2]}, [r12, :16], r1
|
||||
bgt 8b
|
||||
pop {r4-r5,pc}
|
||||
160:
|
||||
|
@ -802,7 +798,7 @@ L(blend_v_tbl):
|
|||
add r12, r0, r1
|
||||
lsl r1, r1, #1
|
||||
vsub.i8 q11, q12, q14
|
||||
sub r1, r1, #12
|
||||
sub r1, r1, #8
|
||||
16:
|
||||
vld1.u8 {q1, q2}, [r2, :128]!
|
||||
vld1.u8 {q0}, [r0, :128]
|
||||
|
@ -822,20 +818,18 @@ L(blend_v_tbl):
|
|||
vrshrn.i16 d21, q8, #6
|
||||
vst1.u8 {d18}, [r0, :64]!
|
||||
vst1.u8 {d20}, [r12, :64]!
|
||||
vst1.32 {d19[0]}, [r0, :32]!
|
||||
vst1.32 {d21[0]}, [r12, :32]!
|
||||
add r0, r0, r1
|
||||
add r12, r12, r1
|
||||
vst1.32 {d19[0]}, [r0, :32], r1
|
||||
vst1.32 {d21[0]}, [r12, :32], r1
|
||||
bgt 16b
|
||||
pop {r4-r5,pc}
|
||||
320:
|
||||
vmov.i8 q10, #64
|
||||
vld1.u8 {q2, q3}, [r5, :128]
|
||||
vsub.i8 q11, q10, q2
|
||||
vsub.i8 q12, q10, q3
|
||||
vsub.i8 d24, d20, d6
|
||||
32:
|
||||
vld1.u8 {q8, q9}, [r2, :128]!
|
||||
vld1.u8 {q0, q1}, [r0, :128]
|
||||
vld1.u8 {d0, d1, d2}, [r0, :64]
|
||||
subs r4, r4, #1
|
||||
vmull.u8 q15, d16, d4
|
||||
vmlal.u8 q15, d0, d22
|
||||
|
|
|
@ -27,6 +27,7 @@
|
|||
|
||||
#include "src/arm/asm.S"
|
||||
#include "util.S"
|
||||
#include "cdef_tmpl.S"
|
||||
|
||||
.macro pad_top_bottom s1, s2, w, stride, rn, rw, ret
|
||||
tst w6, #1 // CDEF_HAVE_LEFT
|
||||
|
@ -137,13 +138,15 @@
|
|||
.endif
|
||||
.endm
|
||||
|
||||
// void dav1d_cdef_paddingX_neon(uint16_t *tmp, const pixel *src,
|
||||
// ptrdiff_t src_stride, const pixel (*left)[2],
|
||||
// /*const*/ pixel *const top[2], int h,
|
||||
// enum CdefEdgeFlags edges);
|
||||
// void dav1d_cdef_paddingX_8bpc_neon(uint16_t *tmp, const pixel *src,
|
||||
// ptrdiff_t src_stride, const pixel (*left)[2],
|
||||
// const pixel *const top, int h,
|
||||
// enum CdefEdgeFlags edges);
|
||||
|
||||
.macro padding_func w, stride, rn, rw
|
||||
function cdef_padding\w\()_neon, export=1
|
||||
function cdef_padding\w\()_8bpc_neon, export=1
|
||||
cmp w6, #0xf // fully edged
|
||||
b.eq cdef_padding\w\()_edged_8bpc_neon
|
||||
movi v30.8h, #0x80, lsl #8
|
||||
mov v31.16b, v30.16b
|
||||
sub x0, x0, #2*(2*\stride+2)
|
||||
|
@ -157,9 +160,8 @@ function cdef_padding\w\()_neon, export=1
|
|||
b 3f
|
||||
1:
|
||||
// CDEF_HAVE_TOP
|
||||
ldr x8, [x4]
|
||||
ldr x9, [x4, #8]
|
||||
pad_top_bottom x8, x9, \w, \stride, \rn, \rw, 0
|
||||
add x9, x4, x2
|
||||
pad_top_bottom x4, x9, \w, \stride, \rn, \rw, 0
|
||||
|
||||
// Middle section
|
||||
3:
|
||||
|
@ -242,358 +244,274 @@ endfunc
|
|||
padding_func 8, 16, d, q
|
||||
padding_func 4, 8, s, d
|
||||
|
||||
.macro dir_table w, stride
|
||||
const directions\w
|
||||
.byte -1 * \stride + 1, -2 * \stride + 2
|
||||
.byte 0 * \stride + 1, -1 * \stride + 2
|
||||
.byte 0 * \stride + 1, 0 * \stride + 2
|
||||
.byte 0 * \stride + 1, 1 * \stride + 2
|
||||
.byte 1 * \stride + 1, 2 * \stride + 2
|
||||
.byte 1 * \stride + 0, 2 * \stride + 1
|
||||
.byte 1 * \stride + 0, 2 * \stride + 0
|
||||
.byte 1 * \stride + 0, 2 * \stride - 1
|
||||
// Repeated, to avoid & 7
|
||||
.byte -1 * \stride + 1, -2 * \stride + 2
|
||||
.byte 0 * \stride + 1, -1 * \stride + 2
|
||||
.byte 0 * \stride + 1, 0 * \stride + 2
|
||||
.byte 0 * \stride + 1, 1 * \stride + 2
|
||||
.byte 1 * \stride + 1, 2 * \stride + 2
|
||||
.byte 1 * \stride + 0, 2 * \stride + 1
|
||||
endconst
|
||||
// void cdef_paddingX_edged_8bpc_neon(uint8_t *tmp, const pixel *src,
|
||||
// ptrdiff_t src_stride, const pixel (*left)[2],
|
||||
// const pixel *const top, int h,
|
||||
// enum CdefEdgeFlags edges);
|
||||
|
||||
.macro padding_func_edged w, stride, reg
|
||||
function cdef_padding\w\()_edged_8bpc_neon, export=1
|
||||
sub x4, x4, #2
|
||||
sub x0, x0, #(2*\stride+2)
|
||||
|
||||
.if \w == 4
|
||||
ldr d0, [x4]
|
||||
ldr d1, [x4, x2]
|
||||
st1 {v0.8b, v1.8b}, [x0], #16
|
||||
.else
|
||||
add x9, x4, x2
|
||||
ldr d0, [x4]
|
||||
ldr s1, [x4, #8]
|
||||
ldr d2, [x9]
|
||||
ldr s3, [x9, #8]
|
||||
str d0, [x0]
|
||||
str s1, [x0, #8]
|
||||
str d2, [x0, #\stride]
|
||||
str s3, [x0, #\stride+8]
|
||||
add x0, x0, #2*\stride
|
||||
.endif
|
||||
|
||||
0:
|
||||
ld1 {v0.h}[0], [x3], #2
|
||||
ldr h2, [x1, #\w]
|
||||
load_n_incr v1, x1, x2, \w
|
||||
subs w5, w5, #1
|
||||
str h0, [x0]
|
||||
stur \reg\()1, [x0, #2]
|
||||
str h2, [x0, #2+\w]
|
||||
add x0, x0, #\stride
|
||||
b.gt 0b
|
||||
|
||||
sub x1, x1, #2
|
||||
.if \w == 4
|
||||
ldr d0, [x1]
|
||||
ldr d1, [x1, x2]
|
||||
st1 {v0.8b, v1.8b}, [x0], #16
|
||||
.else
|
||||
add x9, x1, x2
|
||||
ldr d0, [x1]
|
||||
ldr s1, [x1, #8]
|
||||
ldr d2, [x9]
|
||||
ldr s3, [x9, #8]
|
||||
str d0, [x0]
|
||||
str s1, [x0, #8]
|
||||
str d2, [x0, #\stride]
|
||||
str s3, [x0, #\stride+8]
|
||||
.endif
|
||||
ret
|
||||
endfunc
|
||||
.endm
|
||||
|
||||
dir_table 8, 16
|
||||
dir_table 4, 8
|
||||
padding_func_edged 8, 16, d
|
||||
padding_func_edged 4, 8, s
|
||||
|
||||
const pri_taps
|
||||
.byte 4, 2, 3, 3
|
||||
endconst
|
||||
tables
|
||||
|
||||
.macro load_px d1, d2, w
|
||||
filter 8, 8
|
||||
filter 4, 8
|
||||
|
||||
find_dir 8
|
||||
|
||||
.macro load_px_8 d1, d2, w
|
||||
.if \w == 8
|
||||
add x6, x2, w9, sxtb #1 // x + off
|
||||
sub x9, x2, w9, sxtb #1 // x - off
|
||||
ld1 {\d1\().8h}, [x6] // p0
|
||||
ld1 {\d2\().8h}, [x9] // p1
|
||||
.else
|
||||
add x6, x2, w9, sxtb #1 // x + off
|
||||
sub x9, x2, w9, sxtb #1 // x - off
|
||||
ld1 {\d1\().4h}, [x6] // p0
|
||||
add x6, x6, #2*8 // += stride
|
||||
ld1 {\d2\().4h}, [x9] // p1
|
||||
add x9, x9, #2*8 // += stride
|
||||
add x6, x2, w9, sxtb // x + off
|
||||
sub x9, x2, w9, sxtb // x - off
|
||||
ld1 {\d1\().d}[0], [x6] // p0
|
||||
add x6, x6, #16 // += stride
|
||||
ld1 {\d2\().d}[0], [x9] // p1
|
||||
add x9, x9, #16 // += stride
|
||||
ld1 {\d1\().d}[1], [x6] // p0
|
||||
ld1 {\d2\().d}[1], [x9] // p1
|
||||
ld1 {\d2\().d}[1], [x9] // p0
|
||||
.else
|
||||
add x6, x2, w9, sxtb // x + off
|
||||
sub x9, x2, w9, sxtb // x - off
|
||||
ld1 {\d1\().s}[0], [x6] // p0
|
||||
add x6, x6, #8 // += stride
|
||||
ld1 {\d2\().s}[0], [x9] // p1
|
||||
add x9, x9, #8 // += stride
|
||||
ld1 {\d1\().s}[1], [x6] // p0
|
||||
add x6, x6, #8 // += stride
|
||||
ld1 {\d2\().s}[1], [x9] // p1
|
||||
add x9, x9, #8 // += stride
|
||||
ld1 {\d1\().s}[2], [x6] // p0
|
||||
add x6, x6, #8 // += stride
|
||||
ld1 {\d2\().s}[2], [x9] // p1
|
||||
add x9, x9, #8 // += stride
|
||||
ld1 {\d1\().s}[3], [x6] // p0
|
||||
ld1 {\d2\().s}[3], [x9] // p1
|
||||
.endif
|
||||
.endm
|
||||
.macro handle_pixel s1, s2, threshold, thresh_vec, shift, tap
|
||||
umin v2.8h, v2.8h, \s1\().8h
|
||||
smax v3.8h, v3.8h, \s1\().8h
|
||||
umin v2.8h, v2.8h, \s2\().8h
|
||||
smax v3.8h, v3.8h, \s2\().8h
|
||||
|
||||
cbz \threshold, 3f
|
||||
uabd v16.8h, v0.8h, \s1\().8h // abs(diff)
|
||||
uabd v20.8h, v0.8h, \s2\().8h // abs(diff)
|
||||
ushl v17.8h, v16.8h, \shift // abs(diff) >> shift
|
||||
ushl v21.8h, v20.8h, \shift // abs(diff) >> shift
|
||||
uqsub v17.8h, \thresh_vec, v17.8h // clip = imax(0, threshold - (abs(diff) >> shift))
|
||||
uqsub v21.8h, \thresh_vec, v21.8h // clip = imax(0, threshold - (abs(diff) >> shift))
|
||||
sub v18.8h, \s1\().8h, v0.8h // diff = p0 - px
|
||||
sub v22.8h, \s2\().8h, v0.8h // diff = p1 - px
|
||||
neg v16.8h, v17.8h // -clip
|
||||
neg v20.8h, v21.8h // -clip
|
||||
smin v18.8h, v18.8h, v17.8h // imin(diff, clip)
|
||||
smin v22.8h, v22.8h, v21.8h // imin(diff, clip)
|
||||
dup v19.8h, \tap // taps[k]
|
||||
smax v18.8h, v18.8h, v16.8h // constrain() = imax(imin(diff, clip), -clip)
|
||||
smax v22.8h, v22.8h, v20.8h // constrain() = imax(imin(diff, clip), -clip)
|
||||
mla v1.8h, v18.8h, v19.8h // sum += taps[k] * constrain()
|
||||
mla v1.8h, v22.8h, v19.8h // sum += taps[k] * constrain()
|
||||
3:
|
||||
.macro handle_pixel_8 s1, s2, thresh_vec, shift, tap, min
|
||||
.if \min
|
||||
umin v3.16b, v3.16b, \s1\().16b
|
||||
umax v4.16b, v4.16b, \s1\().16b
|
||||
umin v3.16b, v3.16b, \s2\().16b
|
||||
umax v4.16b, v4.16b, \s2\().16b
|
||||
.endif
|
||||
uabd v16.16b, v0.16b, \s1\().16b // abs(diff)
|
||||
uabd v20.16b, v0.16b, \s2\().16b // abs(diff)
|
||||
ushl v17.16b, v16.16b, \shift // abs(diff) >> shift
|
||||
ushl v21.16b, v20.16b, \shift // abs(diff) >> shift
|
||||
uqsub v17.16b, \thresh_vec, v17.16b // clip = imax(0, threshold - (abs(diff) >> shift))
|
||||
uqsub v21.16b, \thresh_vec, v21.16b // clip = imax(0, threshold - (abs(diff) >> shift))
|
||||
cmhi v18.16b, v0.16b, \s1\().16b // px > p0
|
||||
cmhi v22.16b, v0.16b, \s2\().16b // px > p1
|
||||
umin v17.16b, v17.16b, v16.16b // imin(abs(diff), clip)
|
||||
umin v21.16b, v21.16b, v20.16b // imin(abs(diff), clip)
|
||||
dup v19.16b, \tap // taps[k]
|
||||
neg v16.16b, v17.16b // -imin()
|
||||
neg v20.16b, v21.16b // -imin()
|
||||
bsl v18.16b, v16.16b, v17.16b // constrain() = apply_sign()
|
||||
bsl v22.16b, v20.16b, v21.16b // constrain() = apply_sign()
|
||||
smlal v1.8h, v18.8b, v19.8b // sum += taps[k] * constrain()
|
||||
smlal v1.8h, v22.8b, v19.8b // sum += taps[k] * constrain()
|
||||
smlal2 v2.8h, v18.16b, v19.16b // sum += taps[k] * constrain()
|
||||
smlal2 v2.8h, v22.16b, v19.16b // sum += taps[k] * constrain()
|
||||
.endm
|
||||
|
||||
// void dav1d_cdef_filterX_neon(pixel *dst, ptrdiff_t dst_stride,
|
||||
// const uint16_t *tmp, int pri_strength,
|
||||
// int sec_strength, int dir, int damping, int h);
|
||||
.macro filter w
|
||||
function cdef_filter\w\()_neon, export=1
|
||||
// void cdef_filterX_edged_8bpc_neon(pixel *dst, ptrdiff_t dst_stride,
|
||||
// const uint8_t *tmp, int pri_strength,
|
||||
// int sec_strength, int dir, int damping,
|
||||
// int h);
|
||||
.macro filter_func_8 w, pri, sec, min, suffix
|
||||
function cdef_filter\w\suffix\()_edged_8bpc_neon
|
||||
.if \pri
|
||||
movrel x8, pri_taps
|
||||
and w9, w3, #1
|
||||
add x8, x8, w9, uxtw #1
|
||||
.endif
|
||||
movrel x9, directions\w
|
||||
add x5, x9, w5, uxtw #1
|
||||
movi v30.4h, #15
|
||||
dup v28.4h, w6 // damping
|
||||
movi v30.8b, #7
|
||||
dup v28.8b, w6 // damping
|
||||
|
||||
dup v25.8h, w3 // threshold
|
||||
dup v27.8h, w4 // threshold
|
||||
trn1 v24.4h, v25.4h, v27.4h
|
||||
clz v24.4h, v24.4h // clz(threshold)
|
||||
sub v24.4h, v30.4h, v24.4h // ulog2(threshold)
|
||||
uqsub v24.4h, v28.4h, v24.4h // shift = imax(0, damping - ulog2(threshold))
|
||||
neg v24.4h, v24.4h // -shift
|
||||
dup v26.8h, v24.h[1]
|
||||
dup v24.8h, v24.h[0]
|
||||
.if \pri
|
||||
dup v25.16b, w3 // threshold
|
||||
.endif
|
||||
.if \sec
|
||||
dup v27.16b, w4 // threshold
|
||||
.endif
|
||||
trn1 v24.8b, v25.8b, v27.8b
|
||||
clz v24.8b, v24.8b // clz(threshold)
|
||||
sub v24.8b, v30.8b, v24.8b // ulog2(threshold)
|
||||
uqsub v24.8b, v28.8b, v24.8b // shift = imax(0, damping - ulog2(threshold))
|
||||
neg v24.8b, v24.8b // -shift
|
||||
.if \sec
|
||||
dup v26.16b, v24.b[1]
|
||||
.endif
|
||||
.if \pri
|
||||
dup v24.16b, v24.b[0]
|
||||
.endif
|
||||
|
||||
1:
|
||||
.if \w == 8
|
||||
ld1 {v0.8h}, [x2] // px
|
||||
.else
|
||||
add x12, x2, #2*8
|
||||
ld1 {v0.4h}, [x2] // px
|
||||
add x12, x2, #16
|
||||
ld1 {v0.d}[0], [x2] // px
|
||||
ld1 {v0.d}[1], [x12] // px
|
||||
.else
|
||||
add x12, x2, #1*8
|
||||
add x13, x2, #2*8
|
||||
add x14, x2, #3*8
|
||||
ld1 {v0.s}[0], [x2] // px
|
||||
ld1 {v0.s}[1], [x12] // px
|
||||
ld1 {v0.s}[2], [x13] // px
|
||||
ld1 {v0.s}[3], [x14] // px
|
||||
.endif
|
||||
|
||||
movi v1.8h, #0 // sum
|
||||
mov v2.16b, v0.16b // min
|
||||
mov v3.16b, v0.16b // max
|
||||
movi v2.8h, #0 // sum
|
||||
.if \min
|
||||
mov v3.16b, v0.16b // min
|
||||
mov v4.16b, v0.16b // max
|
||||
.endif
|
||||
|
||||
// Instead of loading sec_taps 2, 1 from memory, just set it
|
||||
// to 2 initially and decrease for the second round.
|
||||
// This is also used as loop counter.
|
||||
mov w11, #2 // sec_taps[0]
|
||||
|
||||
2:
|
||||
.if \pri
|
||||
ldrb w9, [x5] // off1
|
||||
|
||||
load_px v4, v5, \w
|
||||
|
||||
add x5, x5, #4 // +2*2
|
||||
ldrb w9, [x5] // off2
|
||||
load_px v6, v7, \w
|
||||
|
||||
ldrb w10, [x8] // *pri_taps
|
||||
|
||||
handle_pixel v4, v5, w3, v25.8h, v24.8h, w10
|
||||
|
||||
add x5, x5, #8 // +2*4
|
||||
ldrb w9, [x5] // off3
|
||||
load_px v4, v5, \w
|
||||
|
||||
handle_pixel v6, v7, w4, v27.8h, v26.8h, w11
|
||||
|
||||
handle_pixel v4, v5, w4, v27.8h, v26.8h, w11
|
||||
|
||||
sub x5, x5, #11 // x8 -= 2*(2+4); x8 += 1;
|
||||
subs w11, w11, #1 // sec_tap-- (value)
|
||||
add x8, x8, #1 // pri_taps++ (pointer)
|
||||
b.ne 2b
|
||||
|
||||
sshr v4.8h, v1.8h, #15 // -(sum < 0)
|
||||
add v1.8h, v1.8h, v4.8h // sum - (sum < 0)
|
||||
srshr v1.8h, v1.8h, #4 // (8 + sum - (sum < 0)) >> 4
|
||||
add v0.8h, v0.8h, v1.8h // px + (8 + sum ...) >> 4
|
||||
smin v0.8h, v0.8h, v3.8h
|
||||
smax v0.8h, v0.8h, v2.8h // iclip(px + .., min, max)
|
||||
xtn v0.8b, v0.8h
|
||||
.if \w == 8
|
||||
add x2, x2, #2*16 // tmp += tmp_stride
|
||||
subs w7, w7, #1 // h--
|
||||
st1 {v0.8b}, [x0], x1
|
||||
.else
|
||||
st1 {v0.s}[0], [x0], x1
|
||||
add x2, x2, #2*16 // tmp += 2*tmp_stride
|
||||
subs w7, w7, #2 // h -= 2
|
||||
st1 {v0.s}[1], [x0], x1
|
||||
load_px_8 v5, v6, \w
|
||||
.endif
|
||||
|
||||
// Reset pri_taps/sec_taps back to the original point
|
||||
.if \sec
|
||||
add x5, x5, #4 // +2*2
|
||||
ldrb w9, [x5] // off2
|
||||
load_px_8 v28, v29, \w
|
||||
.endif
|
||||
|
||||
.if \pri
|
||||
ldrb w10, [x8] // *pri_taps
|
||||
|
||||
handle_pixel_8 v5, v6, v25.16b, v24.16b, w10, \min
|
||||
.endif
|
||||
|
||||
.if \sec
|
||||
add x5, x5, #8 // +2*4
|
||||
ldrb w9, [x5] // off3
|
||||
load_px_8 v5, v6, \w
|
||||
|
||||
handle_pixel_8 v28, v29, v27.16b, v26.16b, w11, \min
|
||||
|
||||
handle_pixel_8 v5, v6, v27.16b, v26.16b, w11, \min
|
||||
|
||||
sub x5, x5, #11 // x5 -= 2*(2+4); x5 += 1;
|
||||
.else
|
||||
add x5, x5, #1 // x5 += 1
|
||||
.endif
|
||||
subs w11, w11, #1 // sec_tap-- (value)
|
||||
.if \pri
|
||||
add x8, x8, #1 // pri_taps++ (pointer)
|
||||
.endif
|
||||
b.ne 2b
|
||||
|
||||
sshr v5.8h, v1.8h, #15 // -(sum < 0)
|
||||
sshr v6.8h, v2.8h, #15 // -(sum < 0)
|
||||
add v1.8h, v1.8h, v5.8h // sum - (sum < 0)
|
||||
add v2.8h, v2.8h, v6.8h // sum - (sum < 0)
|
||||
srshr v1.8h, v1.8h, #4 // (8 + sum - (sum < 0)) >> 4
|
||||
srshr v2.8h, v2.8h, #4 // (8 + sum - (sum < 0)) >> 4
|
||||
uaddw v1.8h, v1.8h, v0.8b // px + (8 + sum ...) >> 4
|
||||
uaddw2 v2.8h, v2.8h, v0.16b // px + (8 + sum ...) >> 4
|
||||
sqxtun v0.8b, v1.8h
|
||||
sqxtun2 v0.16b, v2.8h
|
||||
.if \min
|
||||
umin v0.16b, v0.16b, v4.16b
|
||||
umax v0.16b, v0.16b, v3.16b // iclip(px + .., min, max)
|
||||
.endif
|
||||
.if \w == 8
|
||||
st1 {v0.d}[0], [x0], x1
|
||||
add x2, x2, #2*16 // tmp += 2*tmp_stride
|
||||
subs w7, w7, #2 // h -= 2
|
||||
st1 {v0.d}[1], [x0], x1
|
||||
.else
|
||||
st1 {v0.s}[0], [x0], x1
|
||||
add x2, x2, #4*8 // tmp += 4*tmp_stride
|
||||
st1 {v0.s}[1], [x0], x1
|
||||
subs w7, w7, #4 // h -= 4
|
||||
st1 {v0.s}[2], [x0], x1
|
||||
st1 {v0.s}[3], [x0], x1
|
||||
.endif
|
||||
|
||||
// Reset pri_taps and directions back to the original point
|
||||
sub x5, x5, #2
|
||||
.if \pri
|
||||
sub x8, x8, #2
|
||||
.endif
|
||||
|
||||
b.gt 1b
|
||||
ret
|
||||
endfunc
|
||||
.endm
|
||||
|
||||
filter 8
|
||||
filter 4
|
||||
|
||||
const div_table
|
||||
.short 840, 420, 280, 210, 168, 140, 120, 105
|
||||
endconst
|
||||
|
||||
const alt_fact
|
||||
.short 420, 210, 140, 105, 105, 105, 105, 105, 140, 210, 420, 0
|
||||
endconst
|
||||
|
||||
// int dav1d_cdef_find_dir_neon(const pixel *img, const ptrdiff_t stride,
|
||||
// unsigned *const var)
|
||||
function cdef_find_dir_neon, export=1
|
||||
sub sp, sp, #32 // cost
|
||||
mov w3, #8
|
||||
movi v31.16b, #128
|
||||
movi v30.16b, #0
|
||||
movi v1.8h, #0 // v0-v1 sum_diag[0]
|
||||
movi v3.8h, #0 // v2-v3 sum_diag[1]
|
||||
movi v5.8h, #0 // v4-v5 sum_hv[0-1]
|
||||
movi v7.8h, #0 // v6-v7 sum_alt[0]
|
||||
movi v17.8h, #0 // v16-v17 sum_alt[1]
|
||||
movi v18.8h, #0 // v18-v19 sum_alt[2]
|
||||
movi v19.8h, #0
|
||||
movi v21.8h, #0 // v20-v21 sum_alt[3]
|
||||
|
||||
.irpc i, 01234567
|
||||
ld1 {v26.8b}, [x0], x1
|
||||
usubl v26.8h, v26.8b, v31.8b
|
||||
|
||||
addv h25, v26.8h // [y]
|
||||
rev64 v27.8h, v26.8h
|
||||
addp v28.8h, v26.8h, v30.8h // [(x >> 1)]
|
||||
add v5.8h, v5.8h, v26.8h // sum_hv[1]
|
||||
ext v27.16b, v27.16b, v27.16b, #8 // [-x]
|
||||
rev64 v29.4h, v28.4h // [-(x >> 1)]
|
||||
ins v4.h[\i], v25.h[0] // sum_hv[0]
|
||||
|
||||
.if \i == 0
|
||||
mov v0.16b, v26.16b // sum_diag[0]
|
||||
mov v2.16b, v27.16b // sum_diag[1]
|
||||
mov v6.16b, v28.16b // sum_alt[0]
|
||||
mov v16.16b, v29.16b // sum_alt[1]
|
||||
.else
|
||||
ext v22.16b, v30.16b, v26.16b, #(16-2*\i)
|
||||
ext v23.16b, v26.16b, v30.16b, #(16-2*\i)
|
||||
ext v24.16b, v30.16b, v27.16b, #(16-2*\i)
|
||||
ext v25.16b, v27.16b, v30.16b, #(16-2*\i)
|
||||
add v0.8h, v0.8h, v22.8h // sum_diag[0]
|
||||
add v1.8h, v1.8h, v23.8h // sum_diag[0]
|
||||
add v2.8h, v2.8h, v24.8h // sum_diag[1]
|
||||
add v3.8h, v3.8h, v25.8h // sum_diag[1]
|
||||
ext v22.16b, v30.16b, v28.16b, #(16-2*\i)
|
||||
ext v23.16b, v28.16b, v30.16b, #(16-2*\i)
|
||||
ext v24.16b, v30.16b, v29.16b, #(16-2*\i)
|
||||
ext v25.16b, v29.16b, v30.16b, #(16-2*\i)
|
||||
add v6.8h, v6.8h, v22.8h // sum_alt[0]
|
||||
add v7.4h, v7.4h, v23.4h // sum_alt[0]
|
||||
add v16.8h, v16.8h, v24.8h // sum_alt[1]
|
||||
add v17.4h, v17.4h, v25.4h // sum_alt[1]
|
||||
.endif
|
||||
.if \i < 6
|
||||
ext v22.16b, v30.16b, v26.16b, #(16-2*(3-(\i/2)))
|
||||
ext v23.16b, v26.16b, v30.16b, #(16-2*(3-(\i/2)))
|
||||
add v18.8h, v18.8h, v22.8h // sum_alt[2]
|
||||
add v19.4h, v19.4h, v23.4h // sum_alt[2]
|
||||
.else
|
||||
add v18.8h, v18.8h, v26.8h // sum_alt[2]
|
||||
.endif
|
||||
.if \i == 0
|
||||
mov v20.16b, v26.16b // sum_alt[3]
|
||||
.elseif \i == 1
|
||||
add v20.8h, v20.8h, v26.8h // sum_alt[3]
|
||||
.else
|
||||
ext v24.16b, v30.16b, v26.16b, #(16-2*(\i/2))
|
||||
ext v25.16b, v26.16b, v30.16b, #(16-2*(\i/2))
|
||||
add v20.8h, v20.8h, v24.8h // sum_alt[3]
|
||||
add v21.4h, v21.4h, v25.4h // sum_alt[3]
|
||||
.endif
|
||||
.endr
|
||||
|
||||
movi v31.4s, #105
|
||||
|
||||
smull v26.4s, v4.4h, v4.4h // sum_hv[0]*sum_hv[0]
|
||||
smlal2 v26.4s, v4.8h, v4.8h
|
||||
smull v27.4s, v5.4h, v5.4h // sum_hv[1]*sum_hv[1]
|
||||
smlal2 v27.4s, v5.8h, v5.8h
|
||||
mul v26.4s, v26.4s, v31.4s // cost[2] *= 105
|
||||
mul v27.4s, v27.4s, v31.4s // cost[6] *= 105
|
||||
addv s4, v26.4s // cost[2]
|
||||
addv s5, v27.4s // cost[6]
|
||||
|
||||
rev64 v1.8h, v1.8h
|
||||
rev64 v3.8h, v3.8h
|
||||
ext v1.16b, v1.16b, v1.16b, #10 // sum_diag[0][14-n]
|
||||
ext v3.16b, v3.16b, v3.16b, #10 // sum_diag[1][14-n]
|
||||
|
||||
str s4, [sp, #2*4] // cost[2]
|
||||
str s5, [sp, #6*4] // cost[6]
|
||||
|
||||
movrel x4, div_table
|
||||
ld1 {v31.8h}, [x4]
|
||||
|
||||
smull v22.4s, v0.4h, v0.4h // sum_diag[0]*sum_diag[0]
|
||||
smull2 v23.4s, v0.8h, v0.8h
|
||||
smlal v22.4s, v1.4h, v1.4h
|
||||
smlal2 v23.4s, v1.8h, v1.8h
|
||||
smull v24.4s, v2.4h, v2.4h // sum_diag[1]*sum_diag[1]
|
||||
smull2 v25.4s, v2.8h, v2.8h
|
||||
smlal v24.4s, v3.4h, v3.4h
|
||||
smlal2 v25.4s, v3.8h, v3.8h
|
||||
uxtl v30.4s, v31.4h // div_table
|
||||
uxtl2 v31.4s, v31.8h
|
||||
mul v22.4s, v22.4s, v30.4s // cost[0]
|
||||
mla v22.4s, v23.4s, v31.4s // cost[0]
|
||||
mul v24.4s, v24.4s, v30.4s // cost[4]
|
||||
mla v24.4s, v25.4s, v31.4s // cost[4]
|
||||
addv s0, v22.4s // cost[0]
|
||||
addv s2, v24.4s // cost[4]
|
||||
|
||||
movrel x5, alt_fact
|
||||
ld1 {v29.4h, v30.4h, v31.4h}, [x5]// div_table[2*m+1] + 105
|
||||
|
||||
str s0, [sp, #0*4] // cost[0]
|
||||
str s2, [sp, #4*4] // cost[4]
|
||||
|
||||
uxtl v29.4s, v29.4h // div_table[2*m+1] + 105
|
||||
uxtl v30.4s, v30.4h
|
||||
uxtl v31.4s, v31.4h
|
||||
|
||||
.macro cost_alt d1, d2, s1, s2, s3, s4
|
||||
smull v22.4s, \s1\().4h, \s1\().4h // sum_alt[n]*sum_alt[n]
|
||||
smull2 v23.4s, \s1\().8h, \s1\().8h
|
||||
smull v24.4s, \s2\().4h, \s2\().4h
|
||||
smull v25.4s, \s3\().4h, \s3\().4h // sum_alt[n]*sum_alt[n]
|
||||
smull2 v26.4s, \s3\().8h, \s3\().8h
|
||||
smull v27.4s, \s4\().4h, \s4\().4h
|
||||
mul v22.4s, v22.4s, v29.4s // sum_alt[n]^2*fact
|
||||
mla v22.4s, v23.4s, v30.4s
|
||||
mla v22.4s, v24.4s, v31.4s
|
||||
mul v25.4s, v25.4s, v29.4s // sum_alt[n]^2*fact
|
||||
mla v25.4s, v26.4s, v30.4s
|
||||
mla v25.4s, v27.4s, v31.4s
|
||||
addv \d1, v22.4s // *cost_ptr
|
||||
addv \d2, v25.4s // *cost_ptr
|
||||
.macro filter_8 w
|
||||
filter_func_8 \w, pri=1, sec=0, min=0, suffix=_pri
|
||||
filter_func_8 \w, pri=0, sec=1, min=0, suffix=_sec
|
||||
filter_func_8 \w, pri=1, sec=1, min=1, suffix=_pri_sec
|
||||
.endm
|
||||
cost_alt s6, s16, v6, v7, v16, v17 // cost[1], cost[3]
|
||||
cost_alt s18, s20, v18, v19, v20, v21 // cost[5], cost[7]
|
||||
str s6, [sp, #1*4] // cost[1]
|
||||
str s16, [sp, #3*4] // cost[3]
|
||||
|
||||
mov w0, #0 // best_dir
|
||||
mov w1, v0.s[0] // best_cost
|
||||
mov w3, #1 // n
|
||||
|
||||
str s18, [sp, #5*4] // cost[5]
|
||||
str s20, [sp, #7*4] // cost[7]
|
||||
|
||||
mov w4, v6.s[0]
|
||||
|
||||
.macro find_best s1, s2, s3
|
||||
.ifnb \s2
|
||||
mov w5, \s2\().s[0]
|
||||
.endif
|
||||
cmp w4, w1 // cost[n] > best_cost
|
||||
csel w0, w3, w0, gt // best_dir = n
|
||||
csel w1, w4, w1, gt // best_cost = cost[n]
|
||||
.ifnb \s2
|
||||
add w3, w3, #1 // n++
|
||||
cmp w5, w1 // cost[n] > best_cost
|
||||
mov w4, \s3\().s[0]
|
||||
csel w0, w3, w0, gt // best_dir = n
|
||||
csel w1, w5, w1, gt // best_cost = cost[n]
|
||||
add w3, w3, #1 // n++
|
||||
.endif
|
||||
.endm
|
||||
find_best v6, v4, v16
|
||||
find_best v16, v2, v18
|
||||
find_best v18, v5, v20
|
||||
find_best v20
|
||||
|
||||
eor w3, w0, #4 // best_dir ^4
|
||||
ldr w4, [sp, w3, uxtw #2]
|
||||
sub w1, w1, w4 // best_cost - cost[best_dir ^ 4]
|
||||
lsr w1, w1, #10
|
||||
str w1, [x2] // *var
|
||||
|
||||
add sp, sp, #32
|
||||
ret
|
||||
endfunc
|
||||
filter_8 8
|
||||
filter_8 4
|
||||
|
|
|
@ -0,0 +1,228 @@
|
|||
/*
|
||||
* Copyright © 2018, VideoLAN and dav1d authors
|
||||
* Copyright © 2020, Martin Storsjo
|
||||
* All rights reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright notice, this
|
||||
* list of conditions and the following disclaimer.
|
||||
*
|
||||
* 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||
* this list of conditions and the following disclaimer in the documentation
|
||||
* and/or other materials provided with the distribution.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
|
||||
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
|
||||
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
|
||||
* ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
|
||||
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
|
||||
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
|
||||
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
#include "src/arm/asm.S"
|
||||
#include "util.S"
|
||||
#include "cdef_tmpl.S"
|
||||
|
||||
.macro pad_top_bot_16 s1, s2, w, stride, reg, ret
|
||||
tst w6, #1 // CDEF_HAVE_LEFT
|
||||
b.eq 2f
|
||||
// CDEF_HAVE_LEFT
|
||||
sub \s1, \s1, #4
|
||||
sub \s2, \s2, #4
|
||||
tst w6, #2 // CDEF_HAVE_RIGHT
|
||||
b.eq 1f
|
||||
// CDEF_HAVE_LEFT+CDEF_HAVE_RIGHT
|
||||
ldr \reg\()0, [\s1]
|
||||
ldr d1, [\s1, #2*\w]
|
||||
ldr \reg\()2, [\s2]
|
||||
ldr d3, [\s2, #2*\w]
|
||||
str \reg\()0, [x0]
|
||||
str d1, [x0, #2*\w]
|
||||
add x0, x0, #2*\stride
|
||||
str \reg\()2, [x0]
|
||||
str d3, [x0, #2*\w]
|
||||
.if \ret
|
||||
ret
|
||||
.else
|
||||
add x0, x0, #2*\stride
|
||||
b 3f
|
||||
.endif
|
||||
|
||||
1:
|
||||
// CDEF_HAVE_LEFT+!CDEF_HAVE_RIGHT
|
||||
ldr \reg\()0, [\s1]
|
||||
ldr s1, [\s1, #2*\w]
|
||||
ldr \reg\()2, [\s2]
|
||||
ldr s3, [\s2, #2*\w]
|
||||
str \reg\()0, [x0]
|
||||
str s1, [x0, #2*\w]
|
||||
str s31, [x0, #2*\w+4]
|
||||
add x0, x0, #2*\stride
|
||||
str \reg\()2, [x0]
|
||||
str s3, [x0, #2*\w]
|
||||
str s31, [x0, #2*\w+4]
|
||||
.if \ret
|
||||
ret
|
||||
.else
|
||||
add x0, x0, #2*\stride
|
||||
b 3f
|
||||
.endif
|
||||
|
||||
2:
|
||||
// !CDEF_HAVE_LEFT
|
||||
tst w6, #2 // CDEF_HAVE_RIGHT
|
||||
b.eq 1f
|
||||
// !CDEF_HAVE_LEFT+CDEF_HAVE_RIGHT
|
||||
ldr \reg\()0, [\s1]
|
||||
ldr s1, [\s1, #2*\w]
|
||||
ldr \reg\()2, [\s2]
|
||||
ldr s3, [\s2, #2*\w]
|
||||
str s31, [x0]
|
||||
stur \reg\()0, [x0, #4]
|
||||
str s1, [x0, #4+2*\w]
|
||||
add x0, x0, #2*\stride
|
||||
str s31, [x0]
|
||||
stur \reg\()2, [x0, #4]
|
||||
str s3, [x0, #4+2*\w]
|
||||
.if \ret
|
||||
ret
|
||||
.else
|
||||
add x0, x0, #2*\stride
|
||||
b 3f
|
||||
.endif
|
||||
|
||||
1:
|
||||
// !CDEF_HAVE_LEFT+!CDEF_HAVE_RIGHT
|
||||
ldr \reg\()0, [\s1]
|
||||
ldr \reg\()1, [\s2]
|
||||
str s31, [x0]
|
||||
stur \reg\()0, [x0, #4]
|
||||
str s31, [x0, #4+2*\w]
|
||||
add x0, x0, #2*\stride
|
||||
str s31, [x0]
|
||||
stur \reg\()1, [x0, #4]
|
||||
str s31, [x0, #4+2*\w]
|
||||
.if \ret
|
||||
ret
|
||||
.else
|
||||
add x0, x0, #2*\stride
|
||||
.endif
|
||||
3:
|
||||
.endm
|
||||
|
||||
.macro load_n_incr_16 dst, src, incr, w
|
||||
.if \w == 4
|
||||
ld1 {\dst\().4h}, [\src], \incr
|
||||
.else
|
||||
ld1 {\dst\().8h}, [\src], \incr
|
||||
.endif
|
||||
.endm
|
||||
|
||||
// void dav1d_cdef_paddingX_16bpc_neon(uint16_t *tmp, const pixel *src,
|
||||
// ptrdiff_t src_stride, const pixel (*left)[2],
|
||||
// const pixel *const top, int h,
|
||||
// enum CdefEdgeFlags edges);
|
||||
|
||||
.macro padding_func_16 w, stride, reg
|
||||
function cdef_padding\w\()_16bpc_neon, export=1
|
||||
movi v30.8h, #0x80, lsl #8
|
||||
mov v31.16b, v30.16b
|
||||
sub x0, x0, #2*(2*\stride+2)
|
||||
tst w6, #4 // CDEF_HAVE_TOP
|
||||
b.ne 1f
|
||||
// !CDEF_HAVE_TOP
|
||||
st1 {v30.8h, v31.8h}, [x0], #32
|
||||
.if \w == 8
|
||||
st1 {v30.8h, v31.8h}, [x0], #32
|
||||
.endif
|
||||
b 3f
|
||||
1:
|
||||
// CDEF_HAVE_TOP
|
||||
add x9, x4, x2
|
||||
pad_top_bot_16 x4, x9, \w, \stride, \reg, 0
|
||||
|
||||
// Middle section
|
||||
3:
|
||||
tst w6, #1 // CDEF_HAVE_LEFT
|
||||
b.eq 2f
|
||||
// CDEF_HAVE_LEFT
|
||||
tst w6, #2 // CDEF_HAVE_RIGHT
|
||||
b.eq 1f
|
||||
// CDEF_HAVE_LEFT+CDEF_HAVE_RIGHT
|
||||
0:
|
||||
ld1 {v0.s}[0], [x3], #4
|
||||
ldr s2, [x1, #2*\w]
|
||||
load_n_incr_16 v1, x1, x2, \w
|
||||
subs w5, w5, #1
|
||||
str s0, [x0]
|
||||
stur \reg\()1, [x0, #4]
|
||||
str s2, [x0, #4+2*\w]
|
||||
add x0, x0, #2*\stride
|
||||
b.gt 0b
|
||||
b 3f
|
||||
1:
|
||||
// CDEF_HAVE_LEFT+!CDEF_HAVE_RIGHT
|
||||
ld1 {v0.s}[0], [x3], #4
|
||||
load_n_incr_16 v1, x1, x2, \w
|
||||
subs w5, w5, #1
|
||||
str s0, [x0]
|
||||
stur \reg\()1, [x0, #4]
|
||||
str s31, [x0, #4+2*\w]
|
||||
add x0, x0, #2*\stride
|
||||
b.gt 1b
|
||||
b 3f
|
||||
2:
|
||||
tst w6, #2 // CDEF_HAVE_RIGHT
|
||||
b.eq 1f
|
||||
// !CDEF_HAVE_LEFT+CDEF_HAVE_RIGHT
|
||||
0:
|
||||
ldr s1, [x1, #2*\w]
|
||||
load_n_incr_16 v0, x1, x2, \w
|
||||
subs w5, w5, #1
|
||||
str s31, [x0]
|
||||
stur \reg\()0, [x0, #4]
|
||||
str s1, [x0, #4+2*\w]
|
||||
add x0, x0, #2*\stride
|
||||
b.gt 0b
|
||||
b 3f
|
||||
1:
|
||||
// !CDEF_HAVE_LEFT+!CDEF_HAVE_RIGHT
|
||||
load_n_incr_16 v0, x1, x2, \w
|
||||
subs w5, w5, #1
|
||||
str s31, [x0]
|
||||
stur \reg\()0, [x0, #4]
|
||||
str s31, [x0, #4+2*\w]
|
||||
add x0, x0, #2*\stride
|
||||
b.gt 1b
|
||||
|
||||
3:
|
||||
tst w6, #8 // CDEF_HAVE_BOTTOM
|
||||
b.ne 1f
|
||||
// !CDEF_HAVE_BOTTOM
|
||||
st1 {v30.8h, v31.8h}, [x0], #32
|
||||
.if \w == 8
|
||||
st1 {v30.8h, v31.8h}, [x0], #32
|
||||
.endif
|
||||
ret
|
||||
1:
|
||||
// CDEF_HAVE_BOTTOM
|
||||
add x9, x1, x2
|
||||
pad_top_bot_16 x1, x9, \w, \stride, \reg, 1
|
||||
endfunc
|
||||
.endm
|
||||
|
||||
padding_func_16 8, 16, q
|
||||
padding_func_16 4, 8, d
|
||||
|
||||
tables
|
||||
|
||||
filter 8, 16
|
||||
filter 4, 16
|
||||
|
||||
find_dir 16
|
|
@ -0,0 +1,482 @@
|
|||
/*
|
||||
* Copyright © 2018, VideoLAN and dav1d authors
|
||||
* Copyright © 2020, Martin Storsjo
|
||||
* All rights reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright notice, this
|
||||
* list of conditions and the following disclaimer.
|
||||
*
|
||||
* 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||
* this list of conditions and the following disclaimer in the documentation
|
||||
* and/or other materials provided with the distribution.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
|
||||
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
|
||||
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
|
||||
* ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
|
||||
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
|
||||
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
|
||||
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
#include "src/arm/asm.S"
|
||||
#include "util.S"
|
||||
|
||||
.macro dir_table w, stride
|
||||
const directions\w
|
||||
.byte -1 * \stride + 1, -2 * \stride + 2
|
||||
.byte 0 * \stride + 1, -1 * \stride + 2
|
||||
.byte 0 * \stride + 1, 0 * \stride + 2
|
||||
.byte 0 * \stride + 1, 1 * \stride + 2
|
||||
.byte 1 * \stride + 1, 2 * \stride + 2
|
||||
.byte 1 * \stride + 0, 2 * \stride + 1
|
||||
.byte 1 * \stride + 0, 2 * \stride + 0
|
||||
.byte 1 * \stride + 0, 2 * \stride - 1
|
||||
// Repeated, to avoid & 7
|
||||
.byte -1 * \stride + 1, -2 * \stride + 2
|
||||
.byte 0 * \stride + 1, -1 * \stride + 2
|
||||
.byte 0 * \stride + 1, 0 * \stride + 2
|
||||
.byte 0 * \stride + 1, 1 * \stride + 2
|
||||
.byte 1 * \stride + 1, 2 * \stride + 2
|
||||
.byte 1 * \stride + 0, 2 * \stride + 1
|
||||
endconst
|
||||
.endm
|
||||
|
||||
.macro tables
|
||||
dir_table 8, 16
|
||||
dir_table 4, 8
|
||||
|
||||
const pri_taps
|
||||
.byte 4, 2, 3, 3
|
||||
endconst
|
||||
.endm
|
||||
|
||||
.macro load_px d1, d2, w
|
||||
.if \w == 8
|
||||
add x6, x2, w9, sxtb #1 // x + off
|
||||
sub x9, x2, w9, sxtb #1 // x - off
|
||||
ld1 {\d1\().8h}, [x6] // p0
|
||||
ld1 {\d2\().8h}, [x9] // p1
|
||||
.else
|
||||
add x6, x2, w9, sxtb #1 // x + off
|
||||
sub x9, x2, w9, sxtb #1 // x - off
|
||||
ld1 {\d1\().4h}, [x6] // p0
|
||||
add x6, x6, #2*8 // += stride
|
||||
ld1 {\d2\().4h}, [x9] // p1
|
||||
add x9, x9, #2*8 // += stride
|
||||
ld1 {\d1\().d}[1], [x6] // p0
|
||||
ld1 {\d2\().d}[1], [x9] // p1
|
||||
.endif
|
||||
.endm
|
||||
.macro handle_pixel s1, s2, thresh_vec, shift, tap, min
|
||||
.if \min
|
||||
umin v2.8h, v2.8h, \s1\().8h
|
||||
smax v3.8h, v3.8h, \s1\().8h
|
||||
umin v2.8h, v2.8h, \s2\().8h
|
||||
smax v3.8h, v3.8h, \s2\().8h
|
||||
.endif
|
||||
uabd v16.8h, v0.8h, \s1\().8h // abs(diff)
|
||||
uabd v20.8h, v0.8h, \s2\().8h // abs(diff)
|
||||
ushl v17.8h, v16.8h, \shift // abs(diff) >> shift
|
||||
ushl v21.8h, v20.8h, \shift // abs(diff) >> shift
|
||||
uqsub v17.8h, \thresh_vec, v17.8h // clip = imax(0, threshold - (abs(diff) >> shift))
|
||||
uqsub v21.8h, \thresh_vec, v21.8h // clip = imax(0, threshold - (abs(diff) >> shift))
|
||||
sub v18.8h, \s1\().8h, v0.8h // diff = p0 - px
|
||||
sub v22.8h, \s2\().8h, v0.8h // diff = p1 - px
|
||||
neg v16.8h, v17.8h // -clip
|
||||
neg v20.8h, v21.8h // -clip
|
||||
smin v18.8h, v18.8h, v17.8h // imin(diff, clip)
|
||||
smin v22.8h, v22.8h, v21.8h // imin(diff, clip)
|
||||
dup v19.8h, \tap // taps[k]
|
||||
smax v18.8h, v18.8h, v16.8h // constrain() = imax(imin(diff, clip), -clip)
|
||||
smax v22.8h, v22.8h, v20.8h // constrain() = imax(imin(diff, clip), -clip)
|
||||
mla v1.8h, v18.8h, v19.8h // sum += taps[k] * constrain()
|
||||
mla v1.8h, v22.8h, v19.8h // sum += taps[k] * constrain()
|
||||
.endm
|
||||
|
||||
// void dav1d_cdef_filterX_Ybpc_neon(pixel *dst, ptrdiff_t dst_stride,
|
||||
// const uint16_t *tmp, int pri_strength,
|
||||
// int sec_strength, int dir, int damping,
|
||||
// int h, size_t edges);
|
||||
.macro filter_func w, bpc, pri, sec, min, suffix
|
||||
function cdef_filter\w\suffix\()_\bpc\()bpc_neon
|
||||
.if \bpc == 8
|
||||
ldr w8, [sp] // bitdepth_max
|
||||
cmp w8, #0xf
|
||||
b.eq cdef_filter\w\suffix\()_edged_8bpc_neon
|
||||
.endif
|
||||
.if \pri
|
||||
.if \bpc == 16
|
||||
ldr w9, [sp, #8] // bitdepth_max
|
||||
clz w9, w9
|
||||
sub w9, w9, #24 // -bitdepth_min_8
|
||||
neg w9, w9 // bitdepth_min_8
|
||||
.endif
|
||||
movrel x8, pri_taps
|
||||
.if \bpc == 16
|
||||
lsr w9, w3, w9 // pri_strength >> bitdepth_min_8
|
||||
and w9, w9, #1 // (pri_strength >> bitdepth_min_8) & 1
|
||||
.else
|
||||
and w9, w3, #1
|
||||
.endif
|
||||
add x8, x8, w9, uxtw #1
|
||||
.endif
|
||||
movrel x9, directions\w
|
||||
add x5, x9, w5, uxtw #1
|
||||
movi v30.4h, #15
|
||||
dup v28.4h, w6 // damping
|
||||
|
||||
.if \pri
|
||||
dup v25.8h, w3 // threshold
|
||||
.endif
|
||||
.if \sec
|
||||
dup v27.8h, w4 // threshold
|
||||
.endif
|
||||
trn1 v24.4h, v25.4h, v27.4h
|
||||
clz v24.4h, v24.4h // clz(threshold)
|
||||
sub v24.4h, v30.4h, v24.4h // ulog2(threshold)
|
||||
uqsub v24.4h, v28.4h, v24.4h // shift = imax(0, damping - ulog2(threshold))
|
||||
neg v24.4h, v24.4h // -shift
|
||||
.if \sec
|
||||
dup v26.8h, v24.h[1]
|
||||
.endif
|
||||
.if \pri
|
||||
dup v24.8h, v24.h[0]
|
||||
.endif
|
||||
|
||||
1:
|
||||
.if \w == 8
|
||||
ld1 {v0.8h}, [x2] // px
|
||||
.else
|
||||
add x12, x2, #2*8
|
||||
ld1 {v0.4h}, [x2] // px
|
||||
ld1 {v0.d}[1], [x12] // px
|
||||
.endif
|
||||
|
||||
movi v1.8h, #0 // sum
|
||||
.if \min
|
||||
mov v2.16b, v0.16b // min
|
||||
mov v3.16b, v0.16b // max
|
||||
.endif
|
||||
|
||||
// Instead of loading sec_taps 2, 1 from memory, just set it
|
||||
// to 2 initially and decrease for the second round.
|
||||
// This is also used as loop counter.
|
||||
mov w11, #2 // sec_taps[0]
|
||||
|
||||
2:
|
||||
.if \pri
|
||||
ldrb w9, [x5] // off1
|
||||
|
||||
load_px v4, v5, \w
|
||||
.endif
|
||||
|
||||
.if \sec
|
||||
add x5, x5, #4 // +2*2
|
||||
ldrb w9, [x5] // off2
|
||||
load_px v6, v7, \w
|
||||
.endif
|
||||
|
||||
.if \pri
|
||||
ldrb w10, [x8] // *pri_taps
|
||||
|
||||
handle_pixel v4, v5, v25.8h, v24.8h, w10, \min
|
||||
.endif
|
||||
|
||||
.if \sec
|
||||
add x5, x5, #8 // +2*4
|
||||
ldrb w9, [x5] // off3
|
||||
load_px v4, v5, \w
|
||||
|
||||
handle_pixel v6, v7, v27.8h, v26.8h, w11, \min
|
||||
|
||||
handle_pixel v4, v5, v27.8h, v26.8h, w11, \min
|
||||
|
||||
sub x5, x5, #11 // x5 -= 2*(2+4); x5 += 1;
|
||||
.else
|
||||
add x5, x5, #1 // x5 += 1
|
||||
.endif
|
||||
subs w11, w11, #1 // sec_tap-- (value)
|
||||
.if \pri
|
||||
add x8, x8, #1 // pri_taps++ (pointer)
|
||||
.endif
|
||||
b.ne 2b
|
||||
|
||||
sshr v4.8h, v1.8h, #15 // -(sum < 0)
|
||||
add v1.8h, v1.8h, v4.8h // sum - (sum < 0)
|
||||
srshr v1.8h, v1.8h, #4 // (8 + sum - (sum < 0)) >> 4
|
||||
add v0.8h, v0.8h, v1.8h // px + (8 + sum ...) >> 4
|
||||
.if \min
|
||||
smin v0.8h, v0.8h, v3.8h
|
||||
smax v0.8h, v0.8h, v2.8h // iclip(px + .., min, max)
|
||||
.endif
|
||||
.if \bpc == 8
|
||||
xtn v0.8b, v0.8h
|
||||
.endif
|
||||
.if \w == 8
|
||||
add x2, x2, #2*16 // tmp += tmp_stride
|
||||
subs w7, w7, #1 // h--
|
||||
.if \bpc == 8
|
||||
st1 {v0.8b}, [x0], x1
|
||||
.else
|
||||
st1 {v0.8h}, [x0], x1
|
||||
.endif
|
||||
.else
|
||||
.if \bpc == 8
|
||||
st1 {v0.s}[0], [x0], x1
|
||||
.else
|
||||
st1 {v0.d}[0], [x0], x1
|
||||
.endif
|
||||
add x2, x2, #2*16 // tmp += 2*tmp_stride
|
||||
subs w7, w7, #2 // h -= 2
|
||||
.if \bpc == 8
|
||||
st1 {v0.s}[1], [x0], x1
|
||||
.else
|
||||
st1 {v0.d}[1], [x0], x1
|
||||
.endif
|
||||
.endif
|
||||
|
||||
// Reset pri_taps and directions back to the original point
|
||||
sub x5, x5, #2
|
||||
.if \pri
|
||||
sub x8, x8, #2
|
||||
.endif
|
||||
|
||||
b.gt 1b
|
||||
ret
|
||||
endfunc
|
||||
.endm
|
||||
|
||||
.macro filter w, bpc
|
||||
filter_func \w, \bpc, pri=1, sec=0, min=0, suffix=_pri
|
||||
filter_func \w, \bpc, pri=0, sec=1, min=0, suffix=_sec
|
||||
filter_func \w, \bpc, pri=1, sec=1, min=1, suffix=_pri_sec
|
||||
|
||||
function cdef_filter\w\()_\bpc\()bpc_neon, export=1
|
||||
cbnz w3, 1f // pri_strength
|
||||
b cdef_filter\w\()_sec_\bpc\()bpc_neon // only sec
|
||||
1:
|
||||
cbnz w4, 1f // sec_strength
|
||||
b cdef_filter\w\()_pri_\bpc\()bpc_neon // only pri
|
||||
1:
|
||||
b cdef_filter\w\()_pri_sec_\bpc\()bpc_neon // both pri and sec
|
||||
endfunc
|
||||
.endm
|
||||
|
||||
const div_table
|
||||
.short 840, 420, 280, 210, 168, 140, 120, 105
|
||||
endconst
|
||||
|
||||
const alt_fact
|
||||
.short 420, 210, 140, 105, 105, 105, 105, 105, 140, 210, 420, 0
|
||||
endconst
|
||||
|
||||
.macro cost_alt d1, d2, s1, s2, s3, s4
|
||||
smull v22.4s, \s1\().4h, \s1\().4h // sum_alt[n]*sum_alt[n]
|
||||
smull2 v23.4s, \s1\().8h, \s1\().8h
|
||||
smull v24.4s, \s2\().4h, \s2\().4h
|
||||
smull v25.4s, \s3\().4h, \s3\().4h // sum_alt[n]*sum_alt[n]
|
||||
smull2 v26.4s, \s3\().8h, \s3\().8h
|
||||
smull v27.4s, \s4\().4h, \s4\().4h
|
||||
mul v22.4s, v22.4s, v29.4s // sum_alt[n]^2*fact
|
||||
mla v22.4s, v23.4s, v30.4s
|
||||
mla v22.4s, v24.4s, v31.4s
|
||||
mul v25.4s, v25.4s, v29.4s // sum_alt[n]^2*fact
|
||||
mla v25.4s, v26.4s, v30.4s
|
||||
mla v25.4s, v27.4s, v31.4s
|
||||
addv \d1, v22.4s // *cost_ptr
|
||||
addv \d2, v25.4s // *cost_ptr
|
||||
.endm
|
||||
|
||||
.macro find_best s1, s2, s3
|
||||
.ifnb \s2
|
||||
mov w5, \s2\().s[0]
|
||||
.endif
|
||||
cmp w4, w1 // cost[n] > best_cost
|
||||
csel w0, w3, w0, gt // best_dir = n
|
||||
csel w1, w4, w1, gt // best_cost = cost[n]
|
||||
.ifnb \s2
|
||||
add w3, w3, #1 // n++
|
||||
cmp w5, w1 // cost[n] > best_cost
|
||||
mov w4, \s3\().s[0]
|
||||
csel w0, w3, w0, gt // best_dir = n
|
||||
csel w1, w5, w1, gt // best_cost = cost[n]
|
||||
add w3, w3, #1 // n++
|
||||
.endif
|
||||
.endm
|
||||
|
||||
// int dav1d_cdef_find_dir_Xbpc_neon(const pixel *img, const ptrdiff_t stride,
|
||||
// unsigned *const var)
|
||||
.macro find_dir bpc
|
||||
function cdef_find_dir_\bpc\()bpc_neon, export=1
|
||||
.if \bpc == 16
|
||||
str d8, [sp, #-0x10]!
|
||||
clz w3, w3 // clz(bitdepth_max)
|
||||
sub w3, w3, #24 // -bitdepth_min_8
|
||||
dup v8.8h, w3
|
||||
.endif
|
||||
sub sp, sp, #32 // cost
|
||||
mov w3, #8
|
||||
.if \bpc == 8
|
||||
movi v31.16b, #128
|
||||
.else
|
||||
movi v31.8h, #128
|
||||
.endif
|
||||
movi v30.16b, #0
|
||||
movi v1.8h, #0 // v0-v1 sum_diag[0]
|
||||
movi v3.8h, #0 // v2-v3 sum_diag[1]
|
||||
movi v5.8h, #0 // v4-v5 sum_hv[0-1]
|
||||
movi v7.8h, #0 // v6-v7 sum_alt[0]
|
||||
movi v17.8h, #0 // v16-v17 sum_alt[1]
|
||||
movi v18.8h, #0 // v18-v19 sum_alt[2]
|
||||
movi v19.8h, #0
|
||||
movi v21.8h, #0 // v20-v21 sum_alt[3]
|
||||
|
||||
.irpc i, 01234567
|
||||
.if \bpc == 8
|
||||
ld1 {v26.8b}, [x0], x1
|
||||
usubl v26.8h, v26.8b, v31.8b
|
||||
.else
|
||||
ld1 {v26.8h}, [x0], x1
|
||||
ushl v26.8h, v26.8h, v8.8h
|
||||
sub v26.8h, v26.8h, v31.8h
|
||||
.endif
|
||||
|
||||
addv h25, v26.8h // [y]
|
||||
rev64 v27.8h, v26.8h
|
||||
addp v28.8h, v26.8h, v30.8h // [(x >> 1)]
|
||||
add v5.8h, v5.8h, v26.8h // sum_hv[1]
|
||||
ext v27.16b, v27.16b, v27.16b, #8 // [-x]
|
||||
rev64 v29.4h, v28.4h // [-(x >> 1)]
|
||||
ins v4.h[\i], v25.h[0] // sum_hv[0]
|
||||
|
||||
.if \i == 0
|
||||
mov v0.16b, v26.16b // sum_diag[0]
|
||||
mov v2.16b, v27.16b // sum_diag[1]
|
||||
mov v6.16b, v28.16b // sum_alt[0]
|
||||
mov v16.16b, v29.16b // sum_alt[1]
|
||||
.else
|
||||
ext v22.16b, v30.16b, v26.16b, #(16-2*\i)
|
||||
ext v23.16b, v26.16b, v30.16b, #(16-2*\i)
|
||||
ext v24.16b, v30.16b, v27.16b, #(16-2*\i)
|
||||
ext v25.16b, v27.16b, v30.16b, #(16-2*\i)
|
||||
add v0.8h, v0.8h, v22.8h // sum_diag[0]
|
||||
add v1.8h, v1.8h, v23.8h // sum_diag[0]
|
||||
add v2.8h, v2.8h, v24.8h // sum_diag[1]
|
||||
add v3.8h, v3.8h, v25.8h // sum_diag[1]
|
||||
ext v22.16b, v30.16b, v28.16b, #(16-2*\i)
|
||||
ext v23.16b, v28.16b, v30.16b, #(16-2*\i)
|
||||
ext v24.16b, v30.16b, v29.16b, #(16-2*\i)
|
||||
ext v25.16b, v29.16b, v30.16b, #(16-2*\i)
|
||||
add v6.8h, v6.8h, v22.8h // sum_alt[0]
|
||||
add v7.4h, v7.4h, v23.4h // sum_alt[0]
|
||||
add v16.8h, v16.8h, v24.8h // sum_alt[1]
|
||||
add v17.4h, v17.4h, v25.4h // sum_alt[1]
|
||||
.endif
|
||||
.if \i < 6
|
||||
ext v22.16b, v30.16b, v26.16b, #(16-2*(3-(\i/2)))
|
||||
ext v23.16b, v26.16b, v30.16b, #(16-2*(3-(\i/2)))
|
||||
add v18.8h, v18.8h, v22.8h // sum_alt[2]
|
||||
add v19.4h, v19.4h, v23.4h // sum_alt[2]
|
||||
.else
|
||||
add v18.8h, v18.8h, v26.8h // sum_alt[2]
|
||||
.endif
|
||||
.if \i == 0
|
||||
mov v20.16b, v26.16b // sum_alt[3]
|
||||
.elseif \i == 1
|
||||
add v20.8h, v20.8h, v26.8h // sum_alt[3]
|
||||
.else
|
||||
ext v24.16b, v30.16b, v26.16b, #(16-2*(\i/2))
|
||||
ext v25.16b, v26.16b, v30.16b, #(16-2*(\i/2))
|
||||
add v20.8h, v20.8h, v24.8h // sum_alt[3]
|
||||
add v21.4h, v21.4h, v25.4h // sum_alt[3]
|
||||
.endif
|
||||
.endr
|
||||
|
||||
movi v31.4s, #105
|
||||
|
||||
smull v26.4s, v4.4h, v4.4h // sum_hv[0]*sum_hv[0]
|
||||
smlal2 v26.4s, v4.8h, v4.8h
|
||||
smull v27.4s, v5.4h, v5.4h // sum_hv[1]*sum_hv[1]
|
||||
smlal2 v27.4s, v5.8h, v5.8h
|
||||
mul v26.4s, v26.4s, v31.4s // cost[2] *= 105
|
||||
mul v27.4s, v27.4s, v31.4s // cost[6] *= 105
|
||||
addv s4, v26.4s // cost[2]
|
||||
addv s5, v27.4s // cost[6]
|
||||
|
||||
rev64 v1.8h, v1.8h
|
||||
rev64 v3.8h, v3.8h
|
||||
ext v1.16b, v1.16b, v1.16b, #10 // sum_diag[0][14-n]
|
||||
ext v3.16b, v3.16b, v3.16b, #10 // sum_diag[1][14-n]
|
||||
|
||||
str s4, [sp, #2*4] // cost[2]
|
||||
str s5, [sp, #6*4] // cost[6]
|
||||
|
||||
movrel x4, div_table
|
||||
ld1 {v31.8h}, [x4]
|
||||
|
||||
smull v22.4s, v0.4h, v0.4h // sum_diag[0]*sum_diag[0]
|
||||
smull2 v23.4s, v0.8h, v0.8h
|
||||
smlal v22.4s, v1.4h, v1.4h
|
||||
smlal2 v23.4s, v1.8h, v1.8h
|
||||
smull v24.4s, v2.4h, v2.4h // sum_diag[1]*sum_diag[1]
|
||||
smull2 v25.4s, v2.8h, v2.8h
|
||||
smlal v24.4s, v3.4h, v3.4h
|
||||
smlal2 v25.4s, v3.8h, v3.8h
|
||||
uxtl v30.4s, v31.4h // div_table
|
||||
uxtl2 v31.4s, v31.8h
|
||||
mul v22.4s, v22.4s, v30.4s // cost[0]
|
||||
mla v22.4s, v23.4s, v31.4s // cost[0]
|
||||
mul v24.4s, v24.4s, v30.4s // cost[4]
|
||||
mla v24.4s, v25.4s, v31.4s // cost[4]
|
||||
addv s0, v22.4s // cost[0]
|
||||
addv s2, v24.4s // cost[4]
|
||||
|
||||
movrel x5, alt_fact
|
||||
ld1 {v29.4h, v30.4h, v31.4h}, [x5]// div_table[2*m+1] + 105
|
||||
|
||||
str s0, [sp, #0*4] // cost[0]
|
||||
str s2, [sp, #4*4] // cost[4]
|
||||
|
||||
uxtl v29.4s, v29.4h // div_table[2*m+1] + 105
|
||||
uxtl v30.4s, v30.4h
|
||||
uxtl v31.4s, v31.4h
|
||||
|
||||
cost_alt s6, s16, v6, v7, v16, v17 // cost[1], cost[3]
|
||||
cost_alt s18, s20, v18, v19, v20, v21 // cost[5], cost[7]
|
||||
str s6, [sp, #1*4] // cost[1]
|
||||
str s16, [sp, #3*4] // cost[3]
|
||||
|
||||
mov w0, #0 // best_dir
|
||||
mov w1, v0.s[0] // best_cost
|
||||
mov w3, #1 // n
|
||||
|
||||
str s18, [sp, #5*4] // cost[5]
|
||||
str s20, [sp, #7*4] // cost[7]
|
||||
|
||||
mov w4, v6.s[0]
|
||||
|
||||
find_best v6, v4, v16
|
||||
find_best v16, v2, v18
|
||||
find_best v18, v5, v20
|
||||
find_best v20
|
||||
|
||||
eor w3, w0, #4 // best_dir ^4
|
||||
ldr w4, [sp, w3, uxtw #2]
|
||||
sub w1, w1, w4 // best_cost - cost[best_dir ^ 4]
|
||||
lsr w1, w1, #10
|
||||
str w1, [x2] // *var
|
||||
|
||||
add sp, sp, #32
|
||||
.if \bpc == 16
|
||||
ldr d8, [sp], 0x10
|
||||
.endif
|
||||
ret
|
||||
endfunc
|
||||
.endm
|
|
@ -161,31 +161,6 @@ endconst
|
|||
.endif
|
||||
.endm
|
||||
|
||||
.macro scale_wide sz, c, r0, r1, r2 r3, r4, r5, r6, r7
|
||||
smull_sz v2, v3, \r0, \c, \sz
|
||||
smull_sz v4, v5, \r1, \c, \sz
|
||||
smull_sz v6, v7, \r2, \c, \sz
|
||||
rshrn_sz \r0, v2, v3, #12, \sz
|
||||
smull_sz v2, v3, \r3, \c, \sz
|
||||
rshrn_sz \r1, v4, v5, #12, \sz
|
||||
.ifnb \r4
|
||||
smull_sz v4, v5, \r4, \c, \sz
|
||||
.endif
|
||||
rshrn_sz \r2, v6, v7, #12, \sz
|
||||
.ifnb \r4
|
||||
smull_sz v6, v7, \r5, \c, \sz
|
||||
.endif
|
||||
rshrn_sz \r3, v2, v3, #12, \sz
|
||||
.ifnb \r4
|
||||
smull_sz v2, v3, \r6, \c, \sz
|
||||
rshrn_sz \r4, v4, v5, #12, \sz
|
||||
smull_sz v4, v5, \r7, \c, \sz
|
||||
rshrn_sz \r5, v6, v7, #12, \sz
|
||||
rshrn_sz \r6, v2, v3, #12, \sz
|
||||
rshrn_sz \r7, v4, v5, #12, \sz
|
||||
.endif
|
||||
.endm
|
||||
|
||||
.macro load_add_store load, shift, addsrc, adddst, narrowsrc, narrowdst, store, dst, src, shiftbits=4
|
||||
.ifnb \load
|
||||
ld1 {\load}, [\src], x1
|
||||
|
@ -599,41 +574,40 @@ function inv_flipadst_8x4_neon
|
|||
endfunc
|
||||
|
||||
function inv_identity_4x4_neon
|
||||
mov w16, #5793
|
||||
mov w16, #(5793-4096)*8
|
||||
dup v0.4h, w16
|
||||
smull v4.4s, v16.4h, v0.h[0]
|
||||
smull v5.4s, v17.4h, v0.h[0]
|
||||
smull v6.4s, v18.4h, v0.h[0]
|
||||
smull v7.4s, v19.4h, v0.h[0]
|
||||
rshrn v16.4h, v4.4s, #12
|
||||
rshrn v17.4h, v5.4s, #12
|
||||
rshrn v18.4h, v6.4s, #12
|
||||
rshrn v19.4h, v7.4s, #12
|
||||
sqrdmulh v4.4h, v16.4h, v0.h[0]
|
||||
sqrdmulh v5.4h, v17.4h, v0.h[0]
|
||||
sqrdmulh v6.4h, v18.4h, v0.h[0]
|
||||
sqrdmulh v7.4h, v19.4h, v0.h[0]
|
||||
sqadd v16.4h, v16.4h, v4.4h
|
||||
sqadd v17.4h, v17.4h, v5.4h
|
||||
sqadd v18.4h, v18.4h, v6.4h
|
||||
sqadd v19.4h, v19.4h, v7.4h
|
||||
ret
|
||||
endfunc
|
||||
|
||||
function inv_identity_8x4_neon
|
||||
mov w16, #5793
|
||||
mov w16, #(5793-4096)*8
|
||||
dup v0.4h, w16
|
||||
smull v2.4s, v16.4h, v0.h[0]
|
||||
smull2 v3.4s, v16.8h, v0.h[0]
|
||||
smull v4.4s, v17.4h, v0.h[0]
|
||||
smull2 v5.4s, v17.8h, v0.h[0]
|
||||
rshrn v16.4h, v2.4s, #12
|
||||
rshrn2 v16.8h, v3.4s, #12
|
||||
smull v6.4s, v18.4h, v0.h[0]
|
||||
smull2 v7.4s, v18.8h, v0.h[0]
|
||||
rshrn v17.4h, v4.4s, #12
|
||||
rshrn2 v17.8h, v5.4s, #12
|
||||
smull v2.4s, v19.4h, v0.h[0]
|
||||
smull2 v3.4s, v19.8h, v0.h[0]
|
||||
rshrn v18.4h, v6.4s, #12
|
||||
rshrn2 v18.8h, v7.4s, #12
|
||||
rshrn v19.4h, v2.4s, #12
|
||||
rshrn2 v19.8h, v3.4s, #12
|
||||
sqrdmulh v4.8h, v16.8h, v0.h[0]
|
||||
sqrdmulh v5.8h, v17.8h, v0.h[0]
|
||||
sqrdmulh v6.8h, v18.8h, v0.h[0]
|
||||
sqrdmulh v7.8h, v19.8h, v0.h[0]
|
||||
sqadd v16.8h, v16.8h, v4.8h
|
||||
sqadd v17.8h, v17.8h, v5.8h
|
||||
sqadd v18.8h, v18.8h, v6.8h
|
||||
sqadd v19.8h, v19.8h, v7.8h
|
||||
ret
|
||||
endfunc
|
||||
|
||||
.macro identity_8x4_shift1 r0, r1, r2, r3, c
|
||||
.irp i, \r0\().8h, \r1\().8h, \r2\().8h, \r3\().8h
|
||||
sqrdmulh v2.8h, \i, \c
|
||||
srhadd \i, \i, v2.8h
|
||||
.endr
|
||||
.endm
|
||||
|
||||
function inv_txfm_add_wht_wht_4x4_neon, export=1
|
||||
mov x15, x30
|
||||
movi v31.8h, #0
|
||||
|
@ -877,30 +851,31 @@ function inv_flipadst_4x8_neon
|
|||
endfunc
|
||||
|
||||
function inv_identity_8x8_neon
|
||||
shl v16.8h, v16.8h, #1
|
||||
shl v17.8h, v17.8h, #1
|
||||
shl v18.8h, v18.8h, #1
|
||||
shl v19.8h, v19.8h, #1
|
||||
shl v20.8h, v20.8h, #1
|
||||
shl v21.8h, v21.8h, #1
|
||||
shl v22.8h, v22.8h, #1
|
||||
shl v23.8h, v23.8h, #1
|
||||
sqshl v16.8h, v16.8h, #1
|
||||
sqshl v17.8h, v17.8h, #1
|
||||
sqshl v18.8h, v18.8h, #1
|
||||
sqshl v19.8h, v19.8h, #1
|
||||
sqshl v20.8h, v20.8h, #1
|
||||
sqshl v21.8h, v21.8h, #1
|
||||
sqshl v22.8h, v22.8h, #1
|
||||
sqshl v23.8h, v23.8h, #1
|
||||
ret
|
||||
endfunc
|
||||
|
||||
function inv_identity_4x8_neon
|
||||
shl v16.4h, v16.4h, #1
|
||||
shl v17.4h, v17.4h, #1
|
||||
shl v18.4h, v18.4h, #1
|
||||
shl v19.4h, v19.4h, #1
|
||||
shl v20.4h, v20.4h, #1
|
||||
shl v21.4h, v21.4h, #1
|
||||
shl v22.4h, v22.4h, #1
|
||||
shl v23.4h, v23.4h, #1
|
||||
sqshl v16.4h, v16.4h, #1
|
||||
sqshl v17.4h, v17.4h, #1
|
||||
sqshl v18.4h, v18.4h, #1
|
||||
sqshl v19.4h, v19.4h, #1
|
||||
sqshl v20.4h, v20.4h, #1
|
||||
sqshl v21.4h, v21.4h, #1
|
||||
sqshl v22.4h, v22.4h, #1
|
||||
sqshl v23.4h, v23.4h, #1
|
||||
ret
|
||||
endfunc
|
||||
|
||||
function inv_txfm_add_8x8_neon
|
||||
.macro def_fn_8x8_base variant
|
||||
function inv_txfm_\variant\()add_8x8_neon
|
||||
movi v28.8h, #0
|
||||
movi v29.8h, #0
|
||||
movi v30.8h, #0
|
||||
|
@ -910,6 +885,9 @@ function inv_txfm_add_8x8_neon
|
|||
ld1 {v20.8h,v21.8h,v22.8h,v23.8h}, [x2]
|
||||
st1 {v28.8h,v29.8h,v30.8h,v31.8h}, [x2]
|
||||
|
||||
.ifc \variant, identity_
|
||||
// The identity shl #1 and downshift srshr #1 cancel out
|
||||
.else
|
||||
blr x4
|
||||
|
||||
srshr v16.8h, v16.8h, #1
|
||||
|
@ -920,6 +898,7 @@ function inv_txfm_add_8x8_neon
|
|||
srshr v21.8h, v21.8h, #1
|
||||
srshr v22.8h, v22.8h, #1
|
||||
srshr v23.8h, v23.8h, #1
|
||||
.endif
|
||||
|
||||
transpose_8x8h v16, v17, v18, v19, v20, v21, v22, v23, v24, v25
|
||||
|
||||
|
@ -928,6 +907,10 @@ function inv_txfm_add_8x8_neon
|
|||
load_add_store_8x8 x0, x7
|
||||
br x15
|
||||
endfunc
|
||||
.endm
|
||||
|
||||
def_fn_8x8_base
|
||||
def_fn_8x8_base identity_
|
||||
|
||||
.macro def_fn_8x8 txfm1, txfm2
|
||||
function inv_txfm_add_\txfm1\()_\txfm2\()_8x8_neon, export=1
|
||||
|
@ -936,9 +919,13 @@ function inv_txfm_add_\txfm1\()_\txfm2\()_8x8_neon, export=1
|
|||
.ifc \txfm1\()_\txfm2, dct_dct
|
||||
idct_dc 8, 8, 1
|
||||
.endif
|
||||
adr x4, inv_\txfm1\()_8x8_neon
|
||||
adr x5, inv_\txfm2\()_8x8_neon
|
||||
.ifc \txfm1, identity
|
||||
b inv_txfm_identity_add_8x8_neon
|
||||
.else
|
||||
adr x4, inv_\txfm1\()_8x8_neon
|
||||
b inv_txfm_add_8x8_neon
|
||||
.endif
|
||||
endfunc
|
||||
.endm
|
||||
|
||||
|
@ -1083,9 +1070,12 @@ def_fns_48 8, 4
|
|||
rshrn_sz v27, v6, v7, #12, \sz // t14a
|
||||
|
||||
smull_smlsl v4, v5, v29, v23, v0.h[2], v0.h[3], \sz // -> t13a
|
||||
neg v29\sz, v29\sz
|
||||
smull_smlsl v6, v7, v29, v23, v0.h[3], v0.h[2], \sz // -> t10a
|
||||
smull_smlal v6, v7, v29, v23, v0.h[3], v0.h[2], \sz // -> t10a
|
||||
rshrn_sz v29, v4, v5, #12, \sz // t13a
|
||||
neg v6.4s, v6.4s
|
||||
.ifc \sz, .8h
|
||||
neg v7.4s, v7.4s
|
||||
.endif
|
||||
rshrn_sz v23, v6, v7, #12, \sz // t10a
|
||||
|
||||
sqsub v2\sz, v17\sz, v19\sz // t11a
|
||||
|
@ -1333,27 +1323,59 @@ function inv_flipadst_4x16_neon
|
|||
endfunc
|
||||
|
||||
function inv_identity_8x16_neon
|
||||
mov w16, #2*5793
|
||||
mov w16, #2*(5793-4096)*8
|
||||
dup v0.4h, w16
|
||||
.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
|
||||
smull v2.4s, v\i\().4h, v0.h[0]
|
||||
smull2 v3.4s, v\i\().8h, v0.h[0]
|
||||
rshrn v\i\().4h, v2.4s, #12
|
||||
rshrn2 v\i\().8h, v3.4s, #12
|
||||
sqrdmulh v2.8h, v\i\().8h, v0.h[0]
|
||||
sqadd v\i\().8h, v\i\().8h, v\i\().8h
|
||||
sqadd v\i\().8h, v\i\().8h, v2.8h
|
||||
.endr
|
||||
ret
|
||||
endfunc
|
||||
|
||||
function inv_identity_4x16_neon
|
||||
mov w16, #2*5793
|
||||
mov w16, #2*(5793-4096)*8
|
||||
dup v0.4h, w16
|
||||
.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
|
||||
smull v2.4s, v\i\().4h, v0.h[0]
|
||||
rshrn v\i\().4h, v2.4s, #12
|
||||
sqrdmulh v2.4h, v\i\().4h, v0.h[0]
|
||||
sqadd v\i\().4h, v\i\().4h, v\i\().4h
|
||||
sqadd v\i\().4h, v\i\().4h, v2.4h
|
||||
.endr
|
||||
ret
|
||||
endfunc
|
||||
|
||||
.macro identity_8x16_shift2 c
|
||||
.irp i, v16.8h, v17.8h, v18.8h, v19.8h, v20.8h, v21.8h, v22.8h, v23.8h, v24.8h, v25.8h, v26.8h, v27.8h, v28.8h, v29.8h, v30.8h, v31.8h
|
||||
sqrdmulh v2.8h, \i, \c
|
||||
sshr v2.8h, v2.8h, #1
|
||||
srhadd \i, \i, v2.8h
|
||||
.endr
|
||||
.endm
|
||||
|
||||
.macro identity_8x16_shift1 c
|
||||
.irp i, v16.8h, v17.8h, v18.8h, v19.8h, v20.8h, v21.8h, v22.8h, v23.8h, v24.8h, v25.8h, v26.8h, v27.8h, v28.8h, v29.8h, v30.8h, v31.8h
|
||||
sqrdmulh v2.8h, \i, \c
|
||||
srshr v2.8h, v2.8h, #1
|
||||
sqadd \i, \i, v2.8h
|
||||
.endr
|
||||
.endm
|
||||
|
||||
.macro identity_8x8_shift1 c
|
||||
.irp i, v16.8h, v17.8h, v18.8h, v19.8h, v20.8h, v21.8h, v22.8h, v23.8h
|
||||
sqrdmulh v2.8h, \i, \c
|
||||
srshr v2.8h, v2.8h, #1
|
||||
sqadd \i, \i, v2.8h
|
||||
.endr
|
||||
.endm
|
||||
|
||||
.macro identity_8x8 c
|
||||
.irp i, v16.8h, v17.8h, v18.8h, v19.8h, v20.8h, v21.8h, v22.8h, v23.8h
|
||||
sqrdmulh v2.8h, \i, \c
|
||||
sqadd \i, \i, \i
|
||||
sqadd \i, \i, v2.8h
|
||||
.endr
|
||||
.endm
|
||||
|
||||
function inv_txfm_horz_16x8_neon
|
||||
mov x14, x30
|
||||
movi v7.8h, #0
|
||||
|
@ -1375,6 +1397,26 @@ function inv_txfm_horz_16x8_neon
|
|||
br x14
|
||||
endfunc
|
||||
|
||||
function inv_txfm_horz_identity_16x8_neon
|
||||
mov x14, x30
|
||||
movi v7.8h, #0
|
||||
.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
|
||||
ld1 {v\i\().8h}, [x7]
|
||||
st1 {v7.8h}, [x7], x8
|
||||
.endr
|
||||
mov w16, #2*(5793-4096)*8
|
||||
dup v0.4h, w16
|
||||
identity_8x16_shift2 v0.h[0]
|
||||
transpose_8x8h v16, v17, v18, v19, v20, v21, v22, v23, v4, v5
|
||||
transpose_8x8h v24, v25, v26, v27, v28, v29, v30, v31, v4, v5
|
||||
|
||||
.irp i, 16, 24, 17, 25, 18, 26, 19, 27, 20, 28, 21, 29, 22, 30, 23, 31
|
||||
st1 {v\i\().8h}, [x6], #16
|
||||
.endr
|
||||
|
||||
br x14
|
||||
endfunc
|
||||
|
||||
function inv_txfm_horz_scale_16x8_neon
|
||||
mov x14, x30
|
||||
movi v7.8h, #0
|
||||
|
@ -1421,7 +1463,7 @@ function inv_txfm_add_16x16_neon
|
|||
.endif
|
||||
add x7, x2, #(\i*2)
|
||||
mov x8, #16*2
|
||||
bl inv_txfm_horz_16x8_neon
|
||||
blr x9
|
||||
.endr
|
||||
b 2f
|
||||
1:
|
||||
|
@ -1449,7 +1491,12 @@ function inv_txfm_add_\txfm1\()_\txfm2\()_16x16_neon, export=1
|
|||
.ifc \txfm1\()_\txfm2, dct_dct
|
||||
idct_dc 16, 16, 2
|
||||
.endif
|
||||
.ifc \txfm1, identity
|
||||
adr x9, inv_txfm_horz_identity_16x8_neon
|
||||
.else
|
||||
adr x9, inv_txfm_horz_16x8_neon
|
||||
adr x4, inv_\txfm1\()_8x16_neon
|
||||
.endif
|
||||
adr x5, inv_\txfm2\()_8x16_neon
|
||||
mov x13, #\eob_half
|
||||
b inv_txfm_add_16x16_neon
|
||||
|
@ -1469,12 +1516,35 @@ def_fn_16x16 flipadst, adst, 36
|
|||
def_fn_16x16 flipadst, flipadst, 36
|
||||
def_fn_16x16 identity, dct, 8
|
||||
|
||||
function inv_txfm_add_16x4_neon
|
||||
.macro def_fn_416_base variant
|
||||
function inv_txfm_\variant\()add_16x4_neon
|
||||
mov x15, x30
|
||||
movi v4.8h, #0
|
||||
|
||||
.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
|
||||
ld1 {v\i\().4h}, [x2]
|
||||
.ifc \variant, identity_
|
||||
.irp i, v16.4h, v17.4h, v18.4h, v19.4h
|
||||
ld1 {\i}, [x2]
|
||||
st1 {v4.4h}, [x2], #8
|
||||
.endr
|
||||
.irp i, v16.d, v17.d, v18.d, v19.d
|
||||
ld1 {\i}[1], [x2]
|
||||
st1 {v4.4h}, [x2], #8
|
||||
.endr
|
||||
mov w16, #2*(5793-4096)*8
|
||||
dup v0.4h, w16
|
||||
.irp i, v20.4h, v21.4h, v22.4h, v23.4h
|
||||
ld1 {\i}, [x2]
|
||||
st1 {v4.4h}, [x2], #8
|
||||
.endr
|
||||
.irp i, v20.d, v21.d, v22.d, v23.d
|
||||
ld1 {\i}[1], [x2]
|
||||
st1 {v4.4h}, [x2], #8
|
||||
.endr
|
||||
|
||||
identity_8x16_shift1 v0.h[0]
|
||||
.else
|
||||
.irp i, v16.4h, v17.4h, v18.4h, v19.4h, v20.4h, v21.4h, v22.4h, v23.4h, v24.4h, v25.4h, v26.4h, v27.4h, v28.4h, v29.4h, v30.4h, v31.4h
|
||||
ld1 {\i}, [x2]
|
||||
st1 {v4.4h}, [x2], #8
|
||||
.endr
|
||||
|
||||
|
@ -1484,14 +1554,21 @@ function inv_txfm_add_16x4_neon
|
|||
ins v17.d[1], v21.d[0]
|
||||
ins v18.d[1], v22.d[0]
|
||||
ins v19.d[1], v23.d[0]
|
||||
.irp i, 16, 17, 18, 19
|
||||
srshr v\i\().8h, v\i\().8h, #1
|
||||
.irp i, v16.8h, v17.8h, v18.8h, v19.8h
|
||||
srshr \i, \i, #1
|
||||
.endr
|
||||
.endif
|
||||
transpose_4x8h v16, v17, v18, v19, v2, v3, v4, v5
|
||||
blr x5
|
||||
mov x6, x0
|
||||
load_add_store_8x4 x6, x7
|
||||
|
||||
.ifc \variant, identity_
|
||||
mov v16.16b, v20.16b
|
||||
mov v17.16b, v21.16b
|
||||
mov v18.16b, v22.16b
|
||||
mov v19.16b, v23.16b
|
||||
.else
|
||||
ins v24.d[1], v28.d[0]
|
||||
ins v25.d[1], v29.d[0]
|
||||
ins v26.d[1], v30.d[0]
|
||||
|
@ -1500,6 +1577,7 @@ function inv_txfm_add_16x4_neon
|
|||
srshr v17.8h, v25.8h, #1
|
||||
srshr v18.8h, v26.8h, #1
|
||||
srshr v19.8h, v27.8h, #1
|
||||
.endif
|
||||
transpose_4x8h v16, v17, v18, v19, v2, v3, v4, v5
|
||||
blr x5
|
||||
add x6, x0, #8
|
||||
|
@ -1508,7 +1586,7 @@ function inv_txfm_add_16x4_neon
|
|||
br x15
|
||||
endfunc
|
||||
|
||||
function inv_txfm_add_4x16_neon
|
||||
function inv_txfm_\variant\()add_4x16_neon
|
||||
mov x15, x30
|
||||
movi v2.8h, #0
|
||||
|
||||
|
@ -1517,8 +1595,17 @@ function inv_txfm_add_4x16_neon
|
|||
b.lt 1f
|
||||
|
||||
add x6, x2, #16
|
||||
.irp i, 16, 17, 18, 19
|
||||
ld1 {v\i\().8h}, [x6]
|
||||
.ifc \variant, identity_
|
||||
.irp i, v24.8h, v25.8h, v26.8h, v27.8h
|
||||
ld1 {\i}, [x6]
|
||||
st1 {v2.8h}, [x6], x11
|
||||
.endr
|
||||
mov w16, #(5793-4096)*8
|
||||
dup v0.4h, w16
|
||||
identity_8x4_shift1 v24, v25, v26, v27, v0.h[0]
|
||||
.else
|
||||
.irp i, v16.8h, v17.8h, v18.8h, v19.8h
|
||||
ld1 {\i}, [x6]
|
||||
st1 {v2.8h}, [x6], x11
|
||||
.endr
|
||||
blr x4
|
||||
|
@ -1526,6 +1613,7 @@ function inv_txfm_add_4x16_neon
|
|||
srshr v25.8h, v17.8h, #1
|
||||
srshr v26.8h, v18.8h, #1
|
||||
srshr v27.8h, v19.8h, #1
|
||||
.endif
|
||||
transpose_4x8h v24, v25, v26, v27, v4, v5, v6, v7
|
||||
ins v28.d[0], v24.d[1]
|
||||
ins v29.d[0], v25.d[1]
|
||||
|
@ -1534,19 +1622,25 @@ function inv_txfm_add_4x16_neon
|
|||
|
||||
b 2f
|
||||
1:
|
||||
.irp i, 24, 25, 26, 27, 28, 29, 30, 31
|
||||
movi v\i\().4h, #0
|
||||
.irp i, v24.4h, v25.4h, v26.4h, v27.4h, v28.4h, v29.4h, v30.4h, v31.4h
|
||||
movi \i, #0
|
||||
.endr
|
||||
2:
|
||||
movi v2.8h, #0
|
||||
.irp i, 16, 17, 18, 19
|
||||
ld1 {v\i\().8h}, [x2]
|
||||
.irp i, v16.8h, v17.8h, v18.8h, v19.8h
|
||||
ld1 {\i}, [x2]
|
||||
st1 {v2.8h}, [x2], x11
|
||||
.endr
|
||||
.ifc \variant, identity_
|
||||
mov w16, #(5793-4096)*8
|
||||
dup v0.4h, w16
|
||||
identity_8x4_shift1 v16, v17, v18, v19, v0.h[0]
|
||||
.else
|
||||
blr x4
|
||||
.irp i, 16, 17, 18, 19
|
||||
srshr v\i\().8h, v\i\().8h, #1
|
||||
.irp i, v16.8h, v17.8h, v18.8h, v19.8h
|
||||
srshr \i, \i, #1
|
||||
.endr
|
||||
.endif
|
||||
transpose_4x8h v16, v17, v18, v19, v4, v5, v6, v7
|
||||
ins v20.d[0], v16.d[1]
|
||||
ins v21.d[0], v17.d[1]
|
||||
|
@ -1559,6 +1653,10 @@ function inv_txfm_add_4x16_neon
|
|||
|
||||
br x15
|
||||
endfunc
|
||||
.endm
|
||||
|
||||
def_fn_416_base
|
||||
def_fn_416_base identity_
|
||||
|
||||
.macro def_fn_416 w, h, txfm1, txfm2, eob_half
|
||||
function inv_txfm_add_\txfm1\()_\txfm2\()_\w\()x\h\()_neon, export=1
|
||||
|
@ -1573,7 +1671,11 @@ function inv_txfm_add_\txfm1\()_\txfm2\()_\w\()x\h\()_neon, export=1
|
|||
adr x4, inv_\txfm1\()_4x\w\()_neon
|
||||
adr x5, inv_\txfm2\()_8x\h\()_neon
|
||||
.endif
|
||||
.ifc \txfm1, identity
|
||||
b inv_txfm_identity_add_\w\()x\h\()_neon
|
||||
.else
|
||||
b inv_txfm_add_\w\()x\h\()_neon
|
||||
.endif
|
||||
endfunc
|
||||
.endm
|
||||
|
||||
|
@ -1600,24 +1702,31 @@ def_fns_416 4, 16
|
|||
def_fns_416 16, 4
|
||||
|
||||
|
||||
function inv_txfm_add_16x8_neon
|
||||
.macro def_fn_816_base variant
|
||||
function inv_txfm_\variant\()add_16x8_neon
|
||||
mov x15, x30
|
||||
movi v4.8h, #0
|
||||
mov w16, #2896*8
|
||||
dup v0.4h, w16
|
||||
|
||||
.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
|
||||
ld1 {v\i\().8h}, [x2]
|
||||
.irp i, v16.8h, v17.8h, v18.8h, v19.8h, v20.8h, v21.8h, v22.8h, v23.8h, v24.8h, v25.8h, v26.8h, v27.8h, v28.8h, v29.8h, v30.8h, v31.8h
|
||||
ld1 {\i}, [x2]
|
||||
st1 {v4.8h}, [x2], #16
|
||||
.endr
|
||||
|
||||
scale_input .8h, v0.h[0], v16, v17, v18, v19, v20, v21, v22, v23
|
||||
scale_input .8h, v0.h[0], v24, v25, v26, v27, v28, v29, v30, v31
|
||||
.ifc \variant, identity_
|
||||
mov w16, #2*(5793-4096)*8
|
||||
dup v0.4h, w16
|
||||
identity_8x16_shift1 v0.h[0]
|
||||
.else
|
||||
blr x4
|
||||
|
||||
.irp i, 16, 17, 18, 19, 20, 21, 22, 23
|
||||
srshr v\i\().8h, v\i\().8h, #1
|
||||
.irp i, v16.8h, v17.8h, v18.8h, v19.8h, v20.8h, v21.8h, v22.8h, v23.8h
|
||||
srshr \i, \i, #1
|
||||
.endr
|
||||
.endif
|
||||
transpose_8x8h v16, v17, v18, v19, v20, v21, v22, v23, v2, v3
|
||||
|
||||
blr x5
|
||||
|
@ -1625,6 +1734,16 @@ function inv_txfm_add_16x8_neon
|
|||
mov x6, x0
|
||||
load_add_store_8x8 x6, x7
|
||||
|
||||
.ifc \variant, identity_
|
||||
mov v16.16b, v24.16b
|
||||
mov v17.16b, v25.16b
|
||||
mov v18.16b, v26.16b
|
||||
mov v19.16b, v27.16b
|
||||
mov v20.16b, v28.16b
|
||||
mov v21.16b, v29.16b
|
||||
mov v22.16b, v30.16b
|
||||
mov v23.16b, v31.16b
|
||||
.else
|
||||
srshr v16.8h, v24.8h, #1
|
||||
srshr v17.8h, v25.8h, #1
|
||||
srshr v18.8h, v26.8h, #1
|
||||
|
@ -1633,6 +1752,7 @@ function inv_txfm_add_16x8_neon
|
|||
srshr v21.8h, v29.8h, #1
|
||||
srshr v22.8h, v30.8h, #1
|
||||
srshr v23.8h, v31.8h, #1
|
||||
.endif
|
||||
|
||||
transpose_8x8h v16, v17, v18, v19, v20, v21, v22, v23, v2, v3
|
||||
|
||||
|
@ -1644,7 +1764,7 @@ function inv_txfm_add_16x8_neon
|
|||
br x15
|
||||
endfunc
|
||||
|
||||
function inv_txfm_add_8x16_neon
|
||||
function inv_txfm_\variant\()add_8x16_neon
|
||||
mov x15, x30
|
||||
movi v4.8h, #0
|
||||
mov w16, #2896*8
|
||||
|
@ -1655,8 +1775,16 @@ function inv_txfm_add_8x16_neon
|
|||
b.lt 1f
|
||||
|
||||
add x6, x2, #16
|
||||
.irp i, 16, 17, 18, 19, 20, 21, 22, 23
|
||||
ld1 {v\i\().8h}, [x6]
|
||||
.ifc \variant, identity_
|
||||
.irp i, v24.8h, v25.8h, v26.8h, v27.8h, v28.8h, v29.8h, v30.8h, v31.8h
|
||||
ld1 {\i}, [x6]
|
||||
st1 {v4.8h}, [x6], x11
|
||||
.endr
|
||||
scale_input .8h, v0.h[0], v24, v25, v26, v27, v28, v29, v30, v31
|
||||
// The identity shl #1 and downshift srshr #1 cancel out
|
||||
.else
|
||||
.irp i, v16.8h, v17.8h, v18.8h, v19.8h, v20.8h, v21.8h, v22.8h, v23.8h
|
||||
ld1 {\i}, [x6]
|
||||
st1 {v4.8h}, [x6], x11
|
||||
.endr
|
||||
scale_input .8h, v0.h[0], v16, v17, v18, v19, v20, v21, v22, v23
|
||||
|
@ -1670,13 +1798,14 @@ function inv_txfm_add_8x16_neon
|
|||
srshr v29.8h, v21.8h, #1
|
||||
srshr v30.8h, v22.8h, #1
|
||||
srshr v31.8h, v23.8h, #1
|
||||
.endif
|
||||
transpose_8x8h v24, v25, v26, v27, v28, v29, v30, v31, v2, v3
|
||||
|
||||
b 2f
|
||||
|
||||
1:
|
||||
.irp i, 24, 25, 26, 27, 28, 29, 30, 31
|
||||
movi v\i\().8h, #0
|
||||
.irp i, v24.8h, v25.8h, v26.8h, v27.8h, v28.8h, v29.8h, v30.8h, v31.8h
|
||||
movi \i, #0
|
||||
.endr
|
||||
|
||||
2:
|
||||
|
@ -1684,16 +1813,20 @@ function inv_txfm_add_8x16_neon
|
|||
mov w16, #2896*8
|
||||
dup v0.4h, w16
|
||||
|
||||
.irp i, 16, 17, 18, 19, 20, 21, 22, 23
|
||||
ld1 {v\i\().8h}, [x2]
|
||||
.irp i, v16.8h, v17.8h, v18.8h, v19.8h, v20.8h, v21.8h, v22.8h, v23.8h
|
||||
ld1 {\i}, [x2]
|
||||
st1 {v4.8h}, [x2], x11
|
||||
.endr
|
||||
scale_input .8h, v0.h[0], v16, v17, v18, v19, v20, v21, v22, v23
|
||||
.ifc \variant, identity_
|
||||
// The identity shl #1 and downshift srshr #1 cancel out
|
||||
.else
|
||||
blr x4
|
||||
|
||||
.irp i, 16, 17, 18, 19, 20, 21, 22, 23
|
||||
srshr v\i\().8h, v\i\().8h, #1
|
||||
.irp i, v16.8h, v17.8h, v18.8h, v19.8h, v20.8h, v21.8h, v22.8h, v23.8h
|
||||
srshr \i, \i, #1
|
||||
.endr
|
||||
.endif
|
||||
|
||||
transpose_8x8h v16, v17, v18, v19, v20, v21, v22, v23, v2, v3
|
||||
|
||||
|
@ -1703,6 +1836,10 @@ function inv_txfm_add_8x16_neon
|
|||
|
||||
br x15
|
||||
endfunc
|
||||
.endm
|
||||
|
||||
def_fn_816_base
|
||||
def_fn_816_base identity_
|
||||
|
||||
.macro def_fn_816 w, h, txfm1, txfm2, eob_half
|
||||
function inv_txfm_add_\txfm1\()_\txfm2\()_\w\()x\h\()_neon, export=1
|
||||
|
@ -1714,7 +1851,11 @@ function inv_txfm_add_\txfm1\()_\txfm2\()_\w\()x\h\()_neon, export=1
|
|||
.if \w == 8
|
||||
mov x13, #\eob_half
|
||||
.endif
|
||||
.ifc \txfm1, identity
|
||||
b inv_txfm_identity_add_\w\()x\h\()_neon
|
||||
.else
|
||||
b inv_txfm_add_\w\()x\h\()_neon
|
||||
.endif
|
||||
endfunc
|
||||
.endm
|
||||
|
||||
|
@ -2120,7 +2261,7 @@ endfunc
|
|||
.macro def_identity_1632 w, h, wshort, hshort
|
||||
function inv_txfm_add_identity_identity_\w\()x\h\()_neon, export=1
|
||||
mov w16, #2896*8
|
||||
mov w17, #2*5793
|
||||
mov w17, #2*(5793-4096)*8
|
||||
dup v1.4h, w16
|
||||
movi v0.8h, #0
|
||||
mov v1.h[1], w17
|
||||
|
@ -2140,12 +2281,11 @@ function inv_txfm_add_identity_identity_\w\()x\h\()_neon, export=1
|
|||
|
||||
.if \w == 16
|
||||
// 16x32
|
||||
scale_wide .8h, v1.h[1], v16, v17, v18, v19, v20, v21, v22, v23
|
||||
shift_8_regs srshr, 1
|
||||
identity_8x8_shift1 v1.h[1]
|
||||
.else
|
||||
// 32x16
|
||||
shift_8_regs shl, 1
|
||||
scale_wide .8h, v1.h[1], v16, v17, v18, v19, v20, v21, v22, v23
|
||||
shift_8_regs sqshl, 1
|
||||
identity_8x8 v1.h[1]
|
||||
.endif
|
||||
|
||||
transpose_8x8h v16, v17, v18, v19, v20, v21, v22, v23, v4, v5
|
||||
|
|
|
@ -151,8 +151,8 @@ function lpf_16_wd\wd\()_neon
|
|||
movi v7.16b, #3
|
||||
sqxtn v2.8b, v2.8h // f
|
||||
sqxtn2 v2.16b, v3.8h
|
||||
sqadd v4.16b, v6.16b, v2.16b // imin(f + 4, 128)
|
||||
sqadd v5.16b, v7.16b, v2.16b // imin(f + 3, 128)
|
||||
sqadd v4.16b, v6.16b, v2.16b // imin(f + 4, 127)
|
||||
sqadd v5.16b, v7.16b, v2.16b // imin(f + 3, 127)
|
||||
sshr v4.16b, v4.16b, #3 // f1
|
||||
sshr v5.16b, v5.16b, #3 // f2
|
||||
uxtl v2.8h, v23.8b // p0
|
||||
|
@ -981,13 +981,13 @@ function lpf_h_16_16_neon
br x15
endfunc

// void dav1d_lpf_v_sb_y_neon(pixel *dst, const ptrdiff_t stride,
// const uint32_t *const vmask,
// const uint8_t (*l)[4], ptrdiff_t b4_stride,
// const Av1FilterLUT *lut, const int w)
// void dav1d_lpf_v_sb_y_8bpc_neon(pixel *dst, const ptrdiff_t stride,
// const uint32_t *const vmask,
// const uint8_t (*l)[4], ptrdiff_t b4_stride,
// const Av1FilterLUT *lut, const int w)

.macro lpf_func dir, type
function lpf_\dir\()_sb_\type\()_neon, export=1
function lpf_\dir\()_sb_\type\()_8bpc_neon, export=1
mov x11, x30
stp d8, d9, [sp, #-0x40]!
stp d10, d11, [sp, #0x10]

@ -0,0 +1,907 @@
/*
* Copyright © 2018, VideoLAN and dav1d authors
* Copyright © 2020, Martin Storsjo
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
* ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/

#include "src/arm/asm.S"
#include "util.S"

.macro loop_filter wd
function lpf_8_wd\wd\()_neon
uabd v0.8h, v22.8h, v23.8h // abs(p1 - p0)
uabd v1.8h, v25.8h, v24.8h // abs(q1 - q0)
uabd v2.8h, v23.8h, v24.8h // abs(p0 - q0)
uabd v3.8h, v22.8h, v25.8h // abs(p1 - q1)
.if \wd >= 6
uabd v4.8h, v21.8h, v22.8h // abs(p2 - p1)
uabd v5.8h, v26.8h, v25.8h // abs(q2 - q1)
.endif
.if \wd >= 8
uabd v6.8h, v20.8h, v21.8h // abs(p3 - p2)
uabd v7.8h, v27.8h, v26.8h // abs(q3 - q2)
.endif
.if \wd >= 6
umax v4.8h, v4.8h, v5.8h
.endif
uqadd v2.8h, v2.8h, v2.8h // abs(p0 - q0) * 2
.if \wd >= 8
umax v6.8h, v6.8h, v7.8h
.endif
ushr v3.8h, v3.8h, #1
.if \wd >= 8
umax v4.8h, v4.8h, v6.8h
.endif
.if \wd >= 6
and v4.16b, v4.16b, v14.16b
.endif
umax v0.8h, v0.8h, v1.8h // max(abs(p1 - p0), abs(q1 - q0))
uqadd v2.8h, v2.8h, v3.8h // abs(p0 - q0) * 2 + abs(p1 - q1) >> 1
.if \wd >= 6
umax v4.8h, v0.8h, v4.8h
cmhs v1.8h, v11.8h, v4.8h // max(abs(p1 - p0), abs(q1 - q0), abs(p2 - p1), abs(q2 - q1), ...) <= I
.else
cmhs v1.8h, v11.8h, v0.8h // max(abs(p1 - p0), abs(q1 - q0)) <= I
.endif
cmhs v2.8h, v10.8h, v2.8h // abs(p0 - q0) * 2 + abs(p1 - q1) >> 1 <= E
and v1.16b, v1.16b, v2.16b // fm
and v1.16b, v1.16b, v13.16b // fm && wd >= 4
.if \wd >= 6
and v14.16b, v14.16b, v1.16b // fm && wd > 4
.endif
.if \wd >= 16
and v15.16b, v15.16b, v1.16b // fm && wd == 16
.endif

mov x16, v1.d[0]
mov x17, v1.d[1]
adds x16, x16, x17
b.eq 9f // if (!fm || wd < 4) return;

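// For reference, a scalar sketch of the filter-enable decision computed so
// far, assembled from the inline comments above (not code from this patch):
//   fm = abs(p1 - p0) <= I && abs(q1 - q0) <= I &&
//        2 * abs(p0 - q0) + (abs(p1 - q1) >> 1) <= E;
//   if (wd >= 6) fm &= abs(p2 - p1) <= I && abs(q2 - q1) <= I;
//   if (wd >= 8) fm &= abs(p3 - p2) <= I && abs(q3 - q2) <= I;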
.if \wd >= 6
|
||||
movi v10.8h, #1
|
||||
uabd v2.8h, v21.8h, v23.8h // abs(p2 - p0)
|
||||
uabd v3.8h, v22.8h, v23.8h // abs(p1 - p0)
|
||||
uabd v4.8h, v25.8h, v24.8h // abs(q1 - q0)
|
||||
uabd v5.8h, v26.8h, v24.8h // abs(q2 - q0)
|
||||
dup v9.8h, w9 // bitdepth_min_8
|
||||
.if \wd >= 8
|
||||
uabd v6.8h, v20.8h, v23.8h // abs(p3 - p0)
|
||||
uabd v7.8h, v27.8h, v24.8h // abs(q3 - q0)
|
||||
.endif
|
||||
umax v2.8h, v2.8h, v3.8h
|
||||
umax v4.8h, v4.8h, v5.8h
|
||||
.if \wd >= 8
|
||||
umax v6.8h, v6.8h, v7.8h
|
||||
.endif
|
||||
umax v2.8h, v2.8h, v4.8h
|
||||
ushl v10.8h, v10.8h, v9.8h // F = 1 << bitdepth_min_8
|
||||
.if \wd >= 8
|
||||
umax v2.8h, v2.8h, v6.8h
|
||||
.endif
|
||||
|
||||
.if \wd == 16
|
||||
uabd v3.8h, v17.8h, v23.8h // abs(p6 - p0)
|
||||
uabd v4.8h, v18.8h, v23.8h // abs(p5 - p0)
|
||||
uabd v5.8h, v19.8h, v23.8h // abs(p4 - p0)
|
||||
.endif
|
||||
cmhs v2.8h, v10.8h, v2.8h // flat8in
|
||||
.if \wd == 16
|
||||
uabd v6.8h, v28.8h, v24.8h // abs(q4 - q0)
|
||||
uabd v7.8h, v29.8h, v24.8h // abs(q5 - q0)
|
||||
uabd v8.8h, v30.8h, v24.8h // abs(q6 - q0)
|
||||
.endif
|
||||
and v14.16b, v2.16b, v14.16b // flat8in && fm && wd > 4
|
||||
bic v1.16b, v1.16b, v14.16b // fm && wd >= 4 && !flat8in
|
||||
.if \wd == 16
|
||||
umax v3.8h, v3.8h, v4.8h
|
||||
umax v5.8h, v5.8h, v6.8h
|
||||
.endif
|
||||
mov x16, v1.d[0]
|
||||
mov x17, v1.d[1]
|
||||
.if \wd == 16
|
||||
umax v7.8h, v7.8h, v8.8h
|
||||
umax v3.8h, v3.8h, v5.8h
|
||||
umax v3.8h, v3.8h, v7.8h
|
||||
cmhs v3.8h, v10.8h, v3.8h // flat8out
|
||||
.endif
|
||||
adds x16, x16, x17
|
||||
.if \wd == 16
|
||||
and v15.16b, v15.16b, v3.16b // flat8out && fm && wd == 16
|
||||
and v15.16b, v15.16b, v14.16b // flat8out && flat8in && fm && wd == 16
|
||||
bic v14.16b, v14.16b, v15.16b // flat8in && fm && wd >= 4 && !flat8out
|
||||
.endif
|
||||
b.eq 1f // skip wd == 4 case
|
||||
.endif
|
||||
|
||||
dup v3.8h, w8 // bitdepth_max
|
||||
sub v2.8h, v22.8h, v25.8h // p1 - q1
|
||||
ushr v3.8h, v3.8h, #1 // 128 << bitdepth_min_8 - 1
|
||||
cmhi v0.8h, v0.8h, v12.8h // hev
|
||||
not v9.16b, v3.16b // - 128 * (1 << bitdepth_min_8)
|
||||
smin v2.8h, v2.8h, v3.8h // iclip_diff(p1 - q1)
|
||||
smax v2.8h, v2.8h, v9.8h // iclip_diff(p1 - q1)
|
||||
and v4.16b, v2.16b, v0.16b // if (hev) iclip_diff(p1 - q1)
|
||||
sub v2.8h, v24.8h, v23.8h
|
||||
movi v5.8h, #3
|
||||
bic v0.16b, v1.16b, v0.16b // (fm && wd >= 4 && !hev)
|
||||
mul v2.8h, v2.8h, v5.8h
|
||||
movi v6.8h, #4
|
||||
add v2.8h, v2.8h, v4.8h
|
||||
smin v2.8h, v2.8h, v3.8h // f = iclip_diff()
|
||||
movi v7.8h, #3
|
||||
smax v2.8h, v2.8h, v9.8h // f = iclip_diff()
|
||||
sqadd v4.8h, v6.8h, v2.8h // f + 4
|
||||
sqadd v5.8h, v7.8h, v2.8h // f + 3
|
||||
smin v4.8h, v4.8h, v3.8h // imin(f + 4, 128 << bitdepth_min_8 - 1)
|
||||
smin v5.8h, v5.8h, v3.8h // imin(f + 3, 128 << bitdepth_min_8 - 1)
|
||||
sshr v4.8h, v4.8h, #3 // f1
|
||||
sshr v5.8h, v5.8h, #3 // f2
|
||||
movi v9.8h, #0
|
||||
dup v3.8h, w8 // bitdepth_max
|
||||
sqadd v2.8h, v23.8h, v5.8h // p0 + f2
|
||||
sqsub v6.8h, v24.8h, v4.8h // q0 - f1
|
||||
srshr v4.8h, v4.8h, #1 // (f1 + 1) >> 1
|
||||
smin v2.8h, v2.8h, v3.8h // out p0 = iclip_pixel()
|
||||
smin v6.8h, v6.8h, v3.8h // out q0 = iclip_pixel()
|
||||
smax v2.8h, v2.8h, v9.8h // out p0 = iclip_pixel()
|
||||
smax v6.8h, v6.8h, v9.8h // out q0 = iclip_pixel()
|
||||
bit v23.16b, v2.16b, v1.16b // if (fm && wd >= 4)
|
||||
bit v24.16b, v6.16b, v1.16b // if (fm && wd >= 4)
|
||||
sqadd v2.8h, v22.8h, v4.8h // p1 + f
|
||||
sqsub v6.8h, v25.8h, v4.8h // q1 - f
|
||||
smin v2.8h, v2.8h, v3.8h // out p1 = iclip_pixel()
|
||||
smin v6.8h, v6.8h, v3.8h // out q1 = iclip_pixel()
|
||||
smax v2.8h, v2.8h, v9.8h // out p1 = iclip_pixel()
|
||||
smax v6.8h, v6.8h, v9.8h // out q1 = iclip_pixel()
|
||||
bit v22.16b, v2.16b, v0.16b // if (fm && wd >= 4 && !hev)
|
||||
bit v25.16b, v6.16b, v0.16b // if (fm && wd >= 4 && !hev)
|
||||
1:
|
||||
|
||||
.if \wd == 6
|
||||
mov x16, v14.d[0]
|
||||
mov x17, v14.d[1]
|
||||
adds x16, x16, x17
|
||||
b.eq 2f // skip if there's no flat8in
|
||||
|
||||
add v0.8h, v21.8h, v21.8h // p2 * 2
|
||||
add v2.8h, v21.8h, v22.8h // p2 + p1
|
||||
add v4.8h, v22.8h, v23.8h // p1 + p0
|
||||
add v6.8h, v23.8h, v24.8h // p0 + q0
|
||||
add v8.8h, v0.8h, v2.8h
|
||||
add v10.8h, v4.8h, v6.8h
|
||||
add v12.8h, v24.8h, v25.8h // q0 + q1
|
||||
add v8.8h, v8.8h, v10.8h
|
||||
sub v12.8h, v12.8h, v0.8h
|
||||
add v10.8h, v25.8h, v26.8h // q1 + q2
|
||||
urshr v0.8h, v8.8h, #3 // out p1
|
||||
|
||||
add v8.8h, v8.8h, v12.8h
|
||||
sub v10.8h, v10.8h, v2.8h
|
||||
add v12.8h, v26.8h, v26.8h // q2 + q2
|
||||
urshr v1.8h, v8.8h, #3 // out p0
|
||||
|
||||
add v8.8h, v8.8h, v10.8h
|
||||
sub v12.8h, v12.8h, v4.8h
|
||||
urshr v2.8h, v8.8h, #3 // out q0
|
||||
|
||||
bit v22.16b, v0.16b, v14.16b // p1 if (flat8in)
|
||||
add v8.8h, v8.8h, v12.8h
|
||||
bit v23.16b, v1.16b, v14.16b // p0 if (flat8in)
|
||||
urshr v3.8h, v8.8h, #3 // out q1
|
||||
bit v24.16b, v2.16b, v14.16b // q0 if (flat8in)
|
||||
bit v25.16b, v3.16b, v14.16b // q1 if (flat8in)
|
||||
.elseif \wd >= 8
|
||||
mov x16, v14.d[0]
|
||||
mov x17, v14.d[1]
|
||||
adds x16, x16, x17
|
||||
.if \wd == 8
|
||||
b.eq 8f // skip if there's no flat8in
|
||||
.else
|
||||
b.eq 2f // skip if there's no flat8in
|
||||
.endif
|
||||
|
||||
add v0.8h, v20.8h, v21.8h // p3 + p2
|
||||
add v2.8h, v22.8h, v25.8h // p1 + q1
|
||||
add v4.8h, v20.8h, v22.8h // p3 + p1
|
||||
add v6.8h, v23.8h, v26.8h // p0 + q2
|
||||
add v8.8h, v0.8h, v0.8h // 2 * (p3 + p2)
|
||||
add v9.8h, v23.8h, v24.8h // p0 + q0
|
||||
add v8.8h, v8.8h, v4.8h // + p3 + p1
|
||||
sub v2.8h, v2.8h, v0.8h // p1 + q1 - p3 - p2
|
||||
add v8.8h, v8.8h, v9.8h // + p0 + q0
|
||||
sub v6.8h, v6.8h, v4.8h // p0 + q2 - p3 - p1
|
||||
urshr v10.8h, v8.8h, #3 // out p2
|
||||
|
||||
add v8.8h, v8.8h, v2.8h
|
||||
add v0.8h, v20.8h, v23.8h // p3 + p0
|
||||
add v2.8h, v24.8h, v27.8h // q0 + q3
|
||||
urshr v11.8h, v8.8h, #3 // out p1
|
||||
|
||||
add v8.8h, v8.8h, v6.8h
|
||||
sub v2.8h, v2.8h, v0.8h // q0 + q3 - p3 - p0
|
||||
add v4.8h, v21.8h, v24.8h // p2 + q0
|
||||
add v6.8h, v25.8h, v27.8h // q1 + q3
|
||||
urshr v12.8h, v8.8h, #3 // out p0
|
||||
|
||||
add v8.8h, v8.8h, v2.8h
|
||||
sub v6.8h, v6.8h, v4.8h // q1 + q3 - p2 - q0
|
||||
add v0.8h, v22.8h, v25.8h // p1 + q1
|
||||
add v2.8h, v26.8h, v27.8h // q2 + q3
|
||||
urshr v13.8h, v8.8h, #3 // out q0
|
||||
|
||||
add v8.8h, v8.8h, v6.8h
|
||||
sub v2.8h, v2.8h, v0.8h // q2 + q3 - p1 - q1
|
||||
urshr v0.8h, v8.8h, #3 // out q1
|
||||
|
||||
add v8.8h, v8.8h, v2.8h
|
||||
|
||||
bit v21.16b, v10.16b, v14.16b
|
||||
bit v22.16b, v11.16b, v14.16b
|
||||
bit v23.16b, v12.16b, v14.16b
|
||||
urshr v1.8h, v8.8h, #3 // out q2
|
||||
bit v24.16b, v13.16b, v14.16b
|
||||
bit v25.16b, v0.16b, v14.16b
|
||||
bit v26.16b, v1.16b, v14.16b
|
||||
.endif
|
||||
2:
|
||||
.if \wd == 16
|
||||
mov x16, v15.d[0]
|
||||
mov x17, v15.d[1]
|
||||
adds x16, x16, x17
|
||||
b.ne 1f // check if flat8out is needed
|
||||
mov x16, v14.d[0]
|
||||
mov x17, v14.d[1]
|
||||
adds x16, x16, x17
|
||||
b.eq 8f // if there was no flat8in, just write the inner 4 pixels
|
||||
b 7f // if flat8in was used, write the inner 6 pixels
|
||||
1:
|
||||
|
||||
add v2.8h, v17.8h, v17.8h // p6 + p6
|
||||
add v4.8h, v17.8h, v18.8h // p6 + p5
|
||||
add v6.8h, v17.8h, v19.8h // p6 + p4
|
||||
add v8.8h, v17.8h, v20.8h // p6 + p3
|
||||
add v12.8h, v2.8h, v4.8h
|
||||
add v10.8h, v6.8h, v8.8h
|
||||
add v6.8h, v17.8h, v21.8h // p6 + p2
|
||||
add v12.8h, v12.8h, v10.8h
|
||||
add v8.8h, v17.8h, v22.8h // p6 + p1
|
||||
add v10.8h, v18.8h, v23.8h // p5 + p0
|
||||
add v6.8h, v6.8h, v8.8h
|
||||
add v8.8h, v19.8h, v24.8h // p4 + q0
|
||||
add v12.8h, v12.8h, v6.8h
|
||||
add v10.8h, v10.8h, v8.8h
|
||||
add v6.8h, v20.8h, v25.8h // p3 + q1
|
||||
add v12.8h, v12.8h, v10.8h
|
||||
sub v6.8h, v6.8h, v2.8h
|
||||
add v2.8h, v21.8h, v26.8h // p2 + q2
|
||||
urshr v0.8h, v12.8h, #4 // out p5
|
||||
add v12.8h, v12.8h, v6.8h // - (p6 + p6) + (p3 + q1)
|
||||
sub v2.8h, v2.8h, v4.8h
|
||||
add v4.8h, v22.8h, v27.8h // p1 + q3
|
||||
add v6.8h, v17.8h, v19.8h // p6 + p4
|
||||
urshr v1.8h, v12.8h, #4 // out p4
|
||||
add v12.8h, v12.8h, v2.8h // - (p6 + p5) + (p2 + q2)
|
||||
sub v4.8h, v4.8h, v6.8h
|
||||
add v6.8h, v23.8h, v28.8h // p0 + q4
|
||||
add v8.8h, v17.8h, v20.8h // p6 + p3
|
||||
urshr v2.8h, v12.8h, #4 // out p3
|
||||
add v12.8h, v12.8h, v4.8h // - (p6 + p4) + (p1 + q3)
|
||||
sub v6.8h, v6.8h, v8.8h
|
||||
add v8.8h, v24.8h, v29.8h // q0 + q5
|
||||
add v4.8h, v17.8h, v21.8h // p6 + p2
|
||||
urshr v3.8h, v12.8h, #4 // out p2
|
||||
add v12.8h, v12.8h, v6.8h // - (p6 + p3) + (p0 + q4)
|
||||
sub v8.8h, v8.8h, v4.8h
|
||||
add v6.8h, v25.8h, v30.8h // q1 + q6
|
||||
add v10.8h, v17.8h, v22.8h // p6 + p1
|
||||
urshr v4.8h, v12.8h, #4 // out p1
|
||||
add v12.8h, v12.8h, v8.8h // - (p6 + p2) + (q0 + q5)
|
||||
sub v6.8h, v6.8h, v10.8h
|
||||
add v8.8h, v26.8h, v30.8h // q2 + q6
|
||||
bif v0.16b, v18.16b, v15.16b // out p5
|
||||
add v10.8h, v18.8h, v23.8h // p5 + p0
|
||||
urshr v5.8h, v12.8h, #4 // out p0
|
||||
add v12.8h, v12.8h, v6.8h // - (p6 + p1) + (q1 + q6)
|
||||
sub v8.8h, v8.8h, v10.8h
|
||||
add v10.8h, v27.8h, v30.8h // q3 + q6
|
||||
bif v1.16b, v19.16b, v15.16b // out p4
|
||||
add v18.8h, v19.8h, v24.8h // p4 + q0
|
||||
urshr v6.8h, v12.8h, #4 // out q0
|
||||
add v12.8h, v12.8h, v8.8h // - (p5 + p0) + (q2 + q6)
|
||||
sub v10.8h, v10.8h, v18.8h
|
||||
add v8.8h, v28.8h, v30.8h // q4 + q6
|
||||
bif v2.16b, v20.16b, v15.16b // out p3
|
||||
add v18.8h, v20.8h, v25.8h // p3 + q1
|
||||
urshr v7.8h, v12.8h, #4 // out q1
|
||||
add v12.8h, v12.8h, v10.8h // - (p4 + q0) + (q3 + q6)
|
||||
sub v18.8h, v8.8h, v18.8h
|
||||
add v10.8h, v29.8h, v30.8h // q5 + q6
|
||||
bif v3.16b, v21.16b, v15.16b // out p2
|
||||
add v20.8h, v21.8h, v26.8h // p2 + q2
|
||||
urshr v8.8h, v12.8h, #4 // out q2
|
||||
add v12.8h, v12.8h, v18.8h // - (p3 + q1) + (q4 + q6)
|
||||
sub v10.8h, v10.8h, v20.8h
|
||||
add v18.8h, v30.8h, v30.8h // q6 + q6
|
||||
bif v4.16b, v22.16b, v15.16b // out p1
|
||||
add v20.8h, v22.8h, v27.8h // p1 + q3
|
||||
urshr v9.8h, v12.8h, #4 // out q3
|
||||
add v12.8h, v12.8h, v10.8h // - (p2 + q2) + (q5 + q6)
|
||||
sub v18.8h, v18.8h, v20.8h
|
||||
bif v5.16b, v23.16b, v15.16b // out p0
|
||||
urshr v10.8h, v12.8h, #4 // out q4
|
||||
add v12.8h, v12.8h, v18.8h // - (p1 + q3) + (q6 + q6)
|
||||
urshr v11.8h, v12.8h, #4 // out q5
|
||||
bif v6.16b, v24.16b, v15.16b // out q0
|
||||
bif v7.16b, v25.16b, v15.16b // out q1
|
||||
bif v8.16b, v26.16b, v15.16b // out q2
|
||||
bif v9.16b, v27.16b, v15.16b // out q3
|
||||
bif v10.16b, v28.16b, v15.16b // out q4
|
||||
bif v11.16b, v29.16b, v15.16b // out q5
|
||||
.endif
|
||||
|
||||
ret
|
||||
.if \wd == 16
|
||||
7:
|
||||
// Return to a shorter epilogue, writing only the inner 6 pixels
|
||||
br x13
|
||||
.endif
|
||||
.if \wd >= 8
|
||||
8:
|
||||
// Return to a shorter epilogue, writing only the inner 4 pixels
|
||||
br x14
|
||||
.endif
|
||||
9:
|
||||
// Return directly without writing back any pixels
|
||||
br x15
|
||||
endfunc
|
||||
.endm
|
||||
|
||||
loop_filter 16
|
||||
loop_filter 8
|
||||
loop_filter 6
|
||||
loop_filter 4
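// For reference, a scalar sketch of the wd == 4 inner filter implemented in
// loop_filter above, assembled from its inline comments (an assumption:
// iclip_diff clips to +/-(128 << bitdepth_min_8), iclip_pixel to
// [0, bitdepth_max], as in the dav1d C code):
//   f  = iclip_diff(3 * (q0 - p0) + (hev ? iclip_diff(p1 - q1) : 0));
//   f1 = imin(f + 4, (128 << bitdepth_min_8) - 1) >> 3;
//   f2 = imin(f + 3, (128 << bitdepth_min_8) - 1) >> 3;
//   p0 = iclip_pixel(p0 + f2);
//   q0 = iclip_pixel(q0 - f1);
//   if (!hev) { p1 = iclip_pixel(p1 + ((f1 + 1) >> 1));
//               q1 = iclip_pixel(q1 - ((f1 + 1) >> 1)); }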

.macro lpf_8_wd16
adr x13, 7f
adr x14, 8f
bl lpf_8_wd16_neon
.endm

.macro lpf_8_wd8
adr x14, 8f
bl lpf_8_wd8_neon
.endm

.macro lpf_8_wd6
bl lpf_8_wd6_neon
.endm

.macro lpf_8_wd4
bl lpf_8_wd4_neon
.endm

function lpf_v_4_8_neon
|
||||
mov x15, x30
|
||||
sub x16, x0, x1, lsl #1
|
||||
ld1 {v22.8h}, [x16], x1 // p1
|
||||
ld1 {v24.8h}, [x0], x1 // q0
|
||||
ld1 {v23.8h}, [x16], x1 // p0
|
||||
ld1 {v25.8h}, [x0], x1 // q1
|
||||
sub x0, x0, x1, lsl #1
|
||||
|
||||
lpf_8_wd4
|
||||
|
||||
sub x16, x0, x1, lsl #1
|
||||
st1 {v22.8h}, [x16], x1 // p1
|
||||
st1 {v24.8h}, [x0], x1 // q0
|
||||
st1 {v23.8h}, [x16], x1 // p0
|
||||
st1 {v25.8h}, [x0], x1 // q1
|
||||
sub x0, x0, x1, lsl #1
|
||||
br x15
|
||||
endfunc
|
||||
|
||||
function lpf_h_4_8_neon
|
||||
mov x15, x30
|
||||
sub x16, x0, #4
|
||||
add x0, x16, x1, lsl #2
|
||||
ld1 {v22.d}[0], [x16], x1
|
||||
ld1 {v22.d}[1], [x0], x1
|
||||
ld1 {v23.d}[0], [x16], x1
|
||||
ld1 {v23.d}[1], [x0], x1
|
||||
ld1 {v24.d}[0], [x16], x1
|
||||
ld1 {v24.d}[1], [x0], x1
|
||||
ld1 {v25.d}[0], [x16], x1
|
||||
ld1 {v25.d}[1], [x0], x1
|
||||
add x0, x0, #4
|
||||
|
||||
transpose_4x8h v22, v23, v24, v25, v26, v27, v28, v29
|
||||
|
||||
lpf_8_wd4
|
||||
|
||||
sub x16, x0, x1, lsl #3
|
||||
sub x16, x16, #4
|
||||
transpose_4x8h v22, v23, v24, v25, v26, v27, v28, v29
|
||||
add x0, x16, x1, lsl #2
|
||||
|
||||
st1 {v22.d}[0], [x16], x1
|
||||
st1 {v22.d}[1], [x0], x1
|
||||
st1 {v23.d}[0], [x16], x1
|
||||
st1 {v23.d}[1], [x0], x1
|
||||
st1 {v24.d}[0], [x16], x1
|
||||
st1 {v24.d}[1], [x0], x1
|
||||
st1 {v25.d}[0], [x16], x1
|
||||
st1 {v25.d}[1], [x0], x1
|
||||
add x0, x0, #4
|
||||
br x15
|
||||
endfunc
|
||||
|
||||
function lpf_v_6_8_neon
|
||||
mov x15, x30
|
||||
sub x16, x0, x1, lsl #1
|
||||
sub x16, x16, x1
|
||||
ld1 {v21.8h}, [x16], x1 // p2
|
||||
ld1 {v24.8h}, [x0], x1 // q0
|
||||
ld1 {v22.8h}, [x16], x1 // p1
|
||||
ld1 {v25.8h}, [x0], x1 // q1
|
||||
ld1 {v23.8h}, [x16], x1 // p0
|
||||
ld1 {v26.8h}, [x0], x1 // q2
|
||||
sub x0, x0, x1, lsl #1
|
||||
sub x0, x0, x1
|
||||
|
||||
lpf_8_wd6
|
||||
|
||||
sub x16, x0, x1, lsl #1
|
||||
st1 {v22.8h}, [x16], x1 // p1
|
||||
st1 {v24.8h}, [x0], x1 // q0
|
||||
st1 {v23.8h}, [x16], x1 // p0
|
||||
st1 {v25.8h}, [x0], x1 // q1
|
||||
sub x0, x0, x1, lsl #1
|
||||
br x15
|
||||
endfunc
|
||||
|
||||
function lpf_h_6_8_neon
|
||||
mov x15, x30
|
||||
sub x16, x0, #8
|
||||
add x0, x16, x1, lsl #2
|
||||
ld1 {v20.8h}, [x16], x1
|
||||
ld1 {v24.8h}, [x0], x1
|
||||
ld1 {v21.8h}, [x16], x1
|
||||
ld1 {v25.8h}, [x0], x1
|
||||
ld1 {v22.8h}, [x16], x1
|
||||
ld1 {v26.8h}, [x0], x1
|
||||
ld1 {v23.8h}, [x16], x1
|
||||
ld1 {v27.8h}, [x0], x1
|
||||
add x0, x0, #8
|
||||
|
||||
transpose_8x8h v20, v21, v22, v23, v24, v25, v26, v27, v28, v29
|
||||
|
||||
lpf_8_wd6
|
||||
|
||||
sub x16, x0, x1, lsl #3
|
||||
sub x16, x16, #4
|
||||
transpose_4x8h v22, v23, v24, v25, v26, v27, v28, v29
|
||||
add x0, x16, x1, lsl #2
|
||||
|
||||
st1 {v22.d}[0], [x16], x1
|
||||
st1 {v22.d}[1], [x0], x1
|
||||
st1 {v23.d}[0], [x16], x1
|
||||
st1 {v23.d}[1], [x0], x1
|
||||
st1 {v24.d}[0], [x16], x1
|
||||
st1 {v24.d}[1], [x0], x1
|
||||
st1 {v25.d}[0], [x16], x1
|
||||
st1 {v25.d}[1], [x0], x1
|
||||
add x0, x0, #4
|
||||
br x15
|
||||
endfunc
|
||||
|
||||
function lpf_v_8_8_neon
|
||||
mov x15, x30
|
||||
sub x16, x0, x1, lsl #2
|
||||
ld1 {v20.8h}, [x16], x1 // p3
|
||||
ld1 {v24.8h}, [x0], x1 // q0
|
||||
ld1 {v21.8h}, [x16], x1 // p2
|
||||
ld1 {v25.8h}, [x0], x1 // q1
|
||||
ld1 {v22.8h}, [x16], x1 // p1
|
||||
ld1 {v26.8h}, [x0], x1 // q2
|
||||
ld1 {v23.8h}, [x16], x1 // p0
|
||||
ld1 {v27.8h}, [x0], x1 // q3
|
||||
sub x0, x0, x1, lsl #2
|
||||
|
||||
lpf_8_wd8
|
||||
|
||||
sub x16, x0, x1, lsl #1
|
||||
sub x16, x16, x1
|
||||
st1 {v21.8h}, [x16], x1 // p2
|
||||
st1 {v24.8h}, [x0], x1 // q0
|
||||
st1 {v22.8h}, [x16], x1 // p1
|
||||
st1 {v25.8h}, [x0], x1 // q1
|
||||
st1 {v23.8h}, [x16], x1 // p0
|
||||
st1 {v26.8h}, [x0], x1 // q2
|
||||
sub x0, x0, x1, lsl #1
|
||||
sub x0, x0, x1
|
||||
br x15
|
||||
|
||||
8:
|
||||
sub x16, x0, x1, lsl #1
|
||||
st1 {v22.8h}, [x16], x1 // p1
|
||||
st1 {v24.8h}, [x0], x1 // q0
|
||||
st1 {v23.8h}, [x16], x1 // p0
|
||||
st1 {v25.8h}, [x0], x1 // q1
|
||||
sub x0, x0, x1, lsl #1
|
||||
br x15
|
||||
endfunc
|
||||
|
||||
function lpf_h_8_8_neon
|
||||
mov x15, x30
|
||||
sub x16, x0, #8
|
||||
add x0, x16, x1, lsl #2
|
||||
ld1 {v20.8h}, [x16], x1
|
||||
ld1 {v24.8h}, [x0], x1
|
||||
ld1 {v21.8h}, [x16], x1
|
||||
ld1 {v25.8h}, [x0], x1
|
||||
ld1 {v22.8h}, [x16], x1
|
||||
ld1 {v26.8h}, [x0], x1
|
||||
ld1 {v23.8h}, [x16], x1
|
||||
ld1 {v27.8h}, [x0], x1
|
||||
add x0, x0, #8
|
||||
|
||||
transpose_8x8h v20, v21, v22, v23, v24, v25, v26, v27, v28, v29
|
||||
|
||||
lpf_8_wd8
|
||||
|
||||
sub x16, x0, x1, lsl #3
|
||||
sub x16, x16, #8
|
||||
transpose_8x8h v20, v21, v22, v23, v24, v25, v26, v27, v28, v29
|
||||
add x0, x16, x1, lsl #2
|
||||
|
||||
st1 {v20.8h}, [x16], x1
|
||||
st1 {v24.8h}, [x0], x1
|
||||
st1 {v21.8h}, [x16], x1
|
||||
st1 {v25.8h}, [x0], x1
|
||||
st1 {v22.8h}, [x16], x1
|
||||
st1 {v26.8h}, [x0], x1
|
||||
st1 {v23.8h}, [x16], x1
|
||||
st1 {v27.8h}, [x0], x1
|
||||
add x0, x0, #8
|
||||
br x15
|
||||
8:
|
||||
sub x16, x0, x1, lsl #3
|
||||
sub x16, x16, #4
|
||||
transpose_4x8h v22, v23, v24, v25, v26, v27, v28, v29
|
||||
add x0, x16, x1, lsl #2
|
||||
|
||||
st1 {v22.d}[0], [x16], x1
|
||||
st1 {v22.d}[1], [x0], x1
|
||||
st1 {v23.d}[0], [x16], x1
|
||||
st1 {v23.d}[1], [x0], x1
|
||||
st1 {v24.d}[0], [x16], x1
|
||||
st1 {v24.d}[1], [x0], x1
|
||||
st1 {v25.d}[0], [x16], x1
|
||||
st1 {v25.d}[1], [x0], x1
|
||||
add x0, x0, #4
|
||||
br x15
|
||||
endfunc
|
||||
|
||||
function lpf_v_16_8_neon
|
||||
mov x15, x30
|
||||
|
||||
sub x16, x0, x1, lsl #3
|
||||
add x16, x16, x1
|
||||
ld1 {v17.8h}, [x16], x1 // p6
|
||||
ld1 {v24.8h}, [x0], x1 // q0
|
||||
ld1 {v18.8h}, [x16], x1 // p5
|
||||
ld1 {v25.8h}, [x0], x1 // q1
|
||||
ld1 {v19.8h}, [x16], x1 // p4
|
||||
ld1 {v26.8h}, [x0], x1 // q2
|
||||
ld1 {v20.8h}, [x16], x1 // p3
|
||||
ld1 {v27.8h}, [x0], x1 // q3
|
||||
ld1 {v21.8h}, [x16], x1 // p2
|
||||
ld1 {v28.8h}, [x0], x1 // q4
|
||||
ld1 {v22.8h}, [x16], x1 // p1
|
||||
ld1 {v29.8h}, [x0], x1 // q5
|
||||
ld1 {v23.8h}, [x16], x1 // p0
|
||||
ld1 {v30.8h}, [x0], x1 // q6
|
||||
sub x0, x0, x1, lsl #3
|
||||
add x0, x0, x1
|
||||
|
||||
lpf_8_wd16
|
||||
|
||||
sub x16, x0, x1, lsl #2
|
||||
sub x16, x16, x1, lsl #1
|
||||
st1 {v0.8h}, [x16], x1 // p5
|
||||
st1 {v6.8h}, [x0], x1 // q0
|
||||
st1 {v1.8h}, [x16], x1 // p4
|
||||
st1 {v7.8h}, [x0], x1 // q1
|
||||
st1 {v2.8h}, [x16], x1 // p3
|
||||
st1 {v8.8h}, [x0], x1 // q2
|
||||
st1 {v3.8h}, [x16], x1 // p2
|
||||
st1 {v9.8h}, [x0], x1 // q3
|
||||
st1 {v4.8h}, [x16], x1 // p1
|
||||
st1 {v10.8h}, [x0], x1 // q4
|
||||
st1 {v5.8h}, [x16], x1 // p0
|
||||
st1 {v11.8h}, [x0], x1 // q5
|
||||
sub x0, x0, x1, lsl #2
|
||||
sub x0, x0, x1, lsl #1
|
||||
br x15
|
||||
7:
|
||||
sub x16, x0, x1
|
||||
sub x16, x16, x1, lsl #1
|
||||
st1 {v21.8h}, [x16], x1 // p2
|
||||
st1 {v24.8h}, [x0], x1 // q0
|
||||
st1 {v22.8h}, [x16], x1 // p1
|
||||
st1 {v25.8h}, [x0], x1 // q1
|
||||
st1 {v23.8h}, [x16], x1 // p0
|
||||
st1 {v26.8h}, [x0], x1 // q2
|
||||
sub x0, x0, x1, lsl #1
|
||||
sub x0, x0, x1
|
||||
br x15
|
||||
|
||||
8:
|
||||
sub x16, x0, x1, lsl #1
|
||||
st1 {v22.8h}, [x16], x1 // p1
|
||||
st1 {v24.8h}, [x0], x1 // q0
|
||||
st1 {v23.8h}, [x16], x1 // p0
|
||||
st1 {v25.8h}, [x0], x1 // q1
|
||||
sub x0, x0, x1, lsl #1
|
||||
br x15
|
||||
endfunc
|
||||
|
||||
function lpf_h_16_8_neon
|
||||
mov x15, x30
|
||||
sub x16, x0, #16
|
||||
ld1 {v16.8h}, [x16], x1
|
||||
ld1 {v24.8h}, [x0], x1
|
||||
ld1 {v17.8h}, [x16], x1
|
||||
ld1 {v25.8h}, [x0], x1
|
||||
ld1 {v18.8h}, [x16], x1
|
||||
ld1 {v26.8h}, [x0], x1
|
||||
ld1 {v19.8h}, [x16], x1
|
||||
ld1 {v27.8h}, [x0], x1
|
||||
ld1 {v20.8h}, [x16], x1
|
||||
ld1 {v28.8h}, [x0], x1
|
||||
ld1 {v21.8h}, [x16], x1
|
||||
ld1 {v29.8h}, [x0], x1
|
||||
ld1 {v22.8h}, [x16], x1
|
||||
ld1 {v30.8h}, [x0], x1
|
||||
ld1 {v23.8h}, [x16], x1
|
||||
ld1 {v31.8h}, [x0], x1
|
||||
|
||||
transpose_8x8h v16, v17, v18, v19, v20, v21, v22, v23, v0, v1
|
||||
transpose_8x8h v24, v25, v26, v27, v28, v29, v30, v31, v0, v1
|
||||
|
||||
lpf_8_wd16
|
||||
|
||||
sub x0, x0, x1, lsl #3
|
||||
sub x16, x0, #16
|
||||
|
||||
transpose_8x8h v16, v17, v0, v1, v2, v3, v4, v5, v18, v19
|
||||
transpose_8x8h v6, v7, v8, v9, v10, v11, v30, v31, v18, v19
|
||||
|
||||
st1 {v16.8h}, [x16], x1
|
||||
st1 {v6.8h}, [x0], x1
|
||||
st1 {v17.8h}, [x16], x1
|
||||
st1 {v7.8h}, [x0], x1
|
||||
st1 {v0.8h}, [x16], x1
|
||||
st1 {v8.8h}, [x0], x1
|
||||
st1 {v1.8h}, [x16], x1
|
||||
st1 {v9.8h}, [x0], x1
|
||||
st1 {v2.8h}, [x16], x1
|
||||
st1 {v10.8h}, [x0], x1
|
||||
st1 {v3.8h}, [x16], x1
|
||||
st1 {v11.8h}, [x0], x1
|
||||
st1 {v4.8h}, [x16], x1
|
||||
st1 {v30.8h}, [x0], x1
|
||||
st1 {v5.8h}, [x16], x1
|
||||
st1 {v31.8h}, [x0], x1
|
||||
br x15
|
||||
|
||||
7:
|
||||
sub x16, x0, x1, lsl #3
|
||||
sub x16, x16, #8
|
||||
transpose_8x8h v20, v21, v22, v23, v24, v25, v26, v27, v28, v29
|
||||
add x0, x16, x1, lsl #2
|
||||
|
||||
st1 {v20.8h}, [x16], x1
|
||||
st1 {v24.8h}, [x0], x1
|
||||
st1 {v21.8h}, [x16], x1
|
||||
st1 {v25.8h}, [x0], x1
|
||||
st1 {v22.8h}, [x16], x1
|
||||
st1 {v26.8h}, [x0], x1
|
||||
st1 {v23.8h}, [x16], x1
|
||||
st1 {v27.8h}, [x0], x1
|
||||
add x0, x0, #8
|
||||
br x15
|
||||
8:
|
||||
sub x16, x0, x1, lsl #3
|
||||
sub x16, x16, #4
|
||||
transpose_4x8h v22, v23, v24, v25, v26, v27, v28, v29
|
||||
add x0, x16, x1, lsl #2
|
||||
|
||||
st1 {v22.d}[0], [x16], x1
|
||||
st1 {v22.d}[1], [x0], x1
|
||||
st1 {v23.d}[0], [x16], x1
|
||||
st1 {v23.d}[1], [x0], x1
|
||||
st1 {v24.d}[0], [x16], x1
|
||||
st1 {v24.d}[1], [x0], x1
|
||||
st1 {v25.d}[0], [x16], x1
|
||||
st1 {v25.d}[1], [x0], x1
|
||||
add x0, x0, #4
|
||||
br x15
|
||||
endfunc

// void dav1d_lpf_v_sb_y_16bpc_neon(pixel *dst, const ptrdiff_t stride,
// const uint32_t *const vmask,
// const uint8_t (*l)[4], ptrdiff_t b4_stride,
// const Av1FilterLUT *lut, const int w,
// const int bitdepth_max)

.macro lpf_func dir, type
function lpf_\dir\()_sb_\type\()_16bpc_neon, export=1
mov x11, x30
mov w8, w7 // bitdepth_max
clz w9, w8
mov w10, #24
sub w9, w10, w9 // bitdepth_min_8
stp d8, d9, [sp, #-0x40]!
stp d10, d11, [sp, #0x10]
stp d12, d13, [sp, #0x20]
stp d14, d15, [sp, #0x30]
ldp w6, w7, [x2] // vmask[0], vmask[1]
.ifc \type, y
ldr w2, [x2, #8] // vmask[2]
.endif
add x5, x5, #128 // Move to sharp part of lut
.ifc \type, y
orr w7, w7, w2 // vmask[1] |= vmask[2]
.endif
.ifc \dir, v
sub x4, x3, x4, lsl #2
.else
sub x3, x3, #4
lsl x4, x4, #2
.endif
orr w6, w6, w7 // vmask[0] |= vmask[1]

1:
|
||||
tst w6, #0x0f
|
||||
.ifc \dir, v
|
||||
ld1 {v0.8b}, [x4], #8
|
||||
ld1 {v1.8b}, [x3], #8
|
||||
.else
|
||||
ld2 {v0.s,v1.s}[0], [x3], x4
|
||||
ld2 {v0.s,v1.s}[1], [x3], x4
|
||||
.endif
|
||||
b.eq 7f // if (!(vm & bits)) continue;
|
||||
|
||||
ld1r {v5.8b}, [x5] // sharp[0]
|
||||
add x5, x5, #8
|
||||
movi v2.2s, #0xff
|
||||
dup v13.2s, w6 // vmask[0]
|
||||
dup v31.8h, w9 // bitdepth_min_8
|
||||
|
||||
and v0.8b, v0.8b, v2.8b // Keep only lowest byte in each 32 bit word
|
||||
and v1.8b, v1.8b, v2.8b
|
||||
cmtst v3.8b, v1.8b, v2.8b // Check for nonzero values in l[0][0]
|
||||
movi v4.8b, #1
|
||||
ld1r {v6.8b}, [x5] // sharp[1]
|
||||
sub x5, x5, #8
|
||||
bif v1.8b, v0.8b, v3.8b // if (!l[0][0]) L = l[offset][0]
|
||||
mul v1.2s, v1.2s, v4.2s // L
|
||||
.ifc \type, y
|
||||
dup v15.2s, w2 // vmask[2]
|
||||
.endif
|
||||
cmtst v2.2s, v1.2s, v2.2s // L != 0
|
||||
dup v14.2s, w7 // vmask[1]
|
||||
mov x16, v2.d[0]
|
||||
cmp x16, #0
|
||||
b.eq 7f // if (!L) continue;
|
||||
neg v5.8b, v5.8b // -sharp[0]
|
||||
movrel x16, word_12
|
||||
ushr v12.8b, v1.8b, #4 // H
|
||||
ld1 {v16.2s}, [x16]
|
||||
sshl v3.8b, v1.8b, v5.8b // L >> sharp[0]
|
||||
.ifc \type, y
|
||||
cmtst v15.2s, v15.2s, v16.2s // if (vmask[2] & bits)
|
||||
.endif
|
||||
movi v7.8b, #2
|
||||
umin v3.8b, v3.8b, v6.8b // imin(L >> sharp[0], sharp[1])
|
||||
add v0.8b, v1.8b, v7.8b // L + 2
|
||||
umax v11.8b, v3.8b, v4.8b // imax(imin(), 1) = limit = I
|
||||
add v0.8b, v0.8b, v0.8b // 2*(L + 2)
|
||||
cmtst v14.2s, v14.2s, v16.2s // if (vmask[1] & bits)
|
||||
uxtl v12.8h, v12.8b
|
||||
add v10.8b, v0.8b, v11.8b // 2*(L + 2) + limit = E
|
||||
cmtst v13.2s, v13.2s, v16.2s // if (vmask[0] & bits)
|
||||
uxtl v11.8h, v11.8b
|
||||
uxtl v10.8h, v10.8b
|
||||
and v13.8b, v13.8b, v2.8b // vmask[0] &= L != 0
|
||||
sxtl v14.8h, v14.8b
|
||||
sxtl v13.8h, v13.8b
|
||||
.ifc \type, y
|
||||
sxtl v15.8h, v15.8b
|
||||
.endif
|
||||
ushl v12.8h, v12.8h, v31.8h
|
||||
ushl v11.8h, v11.8h, v31.8h
|
||||
ushl v10.8h, v10.8h, v31.8h
|
||||
|
||||
.ifc \type, y
|
||||
tst w2, #0x0f
|
||||
b.eq 2f
|
||||
// wd16
|
||||
bl lpf_\dir\()_16_8_neon
|
||||
b 8f
|
||||
2:
|
||||
.endif
|
||||
tst w7, #0x0f
|
||||
b.eq 3f
|
||||
.ifc \type, y
|
||||
// wd8
|
||||
bl lpf_\dir\()_8_8_neon
|
||||
.else
|
||||
// wd6
|
||||
bl lpf_\dir\()_6_8_neon
|
||||
.endif
|
||||
b 8f
|
||||
3:
|
||||
// wd4
|
||||
bl lpf_\dir\()_4_8_neon
|
||||
.ifc \dir, h
|
||||
b 8f
|
||||
7:
|
||||
// For dir h, the functions above increment x0.
|
||||
// If the whole function is skipped, increment it here instead.
|
||||
add x0, x0, x1, lsl #3
|
||||
.else
|
||||
7:
|
||||
.endif
|
||||
8:
|
||||
lsr w6, w6, #2 // vmask[0] >>= 2
|
||||
lsr w7, w7, #2 // vmask[1] >>= 2
|
||||
.ifc \type, y
|
||||
lsr w2, w2, #2 // vmask[2] >>= 2
|
||||
.endif
|
||||
.ifc \dir, v
|
||||
add x0, x0, #16
|
||||
.else
|
||||
// For dir h, x0 is returned incremented
|
||||
.endif
|
||||
cbnz w6, 1b
|
||||
|
||||
ldp d14, d15, [sp, #0x30]
|
||||
ldp d12, d13, [sp, #0x20]
|
||||
ldp d10, d11, [sp, #0x10]
|
||||
ldp d8, d9, [sp], 0x40
|
||||
br x11

endfunc
.endm

lpf_func v, y
lpf_func h, y
lpf_func v, uv
lpf_func h, uv

const word_12
.word 1, 2
endconst

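For reference, lpf_func derives the per-edge thresholds from the per-block
filter level L and the sharpness entries of the lut, as its inline comments
spell out. A scalar sketch (imin/imax as in the dav1d C code; sharp[0] and
sharp[1] stand for the two sharpness fields the code loads from the lut):

    const int L = l[0][0] ? l[0][0] : l[offset][0];       // filter level
    const int H = L >> 4;                                  // hev threshold
    const int I = imax(imin(L >> sharp[0], sharp[1]), 1); // limit
    const int E = 2 * (L + 2) + I;                         // edge limit
    // for 16 bpc, H, I and E are then shifted left by bitdepth_min_8

If L is zero for a whole group of columns, the edge is skipped entirely
(the "if (!L) continue;" early-out in the function above).
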
@ -28,11 +28,11 @@
#include "src/arm/asm.S"
#include "util.S"

// void dav1d_wiener_filter_h_neon(int16_t *dst, const pixel (*left)[4],
// const pixel *src, ptrdiff_t stride,
// const int16_t fh[7], const intptr_t w,
// int h, enum LrEdgeFlags edges);
function wiener_filter_h_neon, export=1
// void dav1d_wiener_filter_h_8bpc_neon(int16_t *dst, const pixel (*left)[4],
// const pixel *src, ptrdiff_t stride,
// const int16_t fh[7], const intptr_t w,
// int h, enum LrEdgeFlags edges);
function wiener_filter_h_8bpc_neon, export=1
mov w8, w5
ld1 {v0.8h}, [x4]
mov w9, #(1 << 14) - (1 << 2)

@ -306,11 +306,11 @@ L(variable_shift_tbl):
.purgem filter
endfunc

// void dav1d_wiener_filter_v_neon(pixel *dst, ptrdiff_t stride,
// const int16_t *mid, int w, int h,
// const int16_t fv[7], enum LrEdgeFlags edges,
// ptrdiff_t mid_stride);
function wiener_filter_v_neon, export=1
// void dav1d_wiener_filter_v_8bpc_neon(pixel *dst, ptrdiff_t stride,
// const int16_t *mid, int w, int h,
// const int16_t fv[7], enum LrEdgeFlags edges,
// ptrdiff_t mid_stride);
function wiener_filter_v_8bpc_neon, export=1
mov w8, w4
ld1 {v0.8h}, [x5]
movi v1.8h, #128

@ -482,9 +482,9 @@ function wiener_filter_v_neon, export=1
.purgem filter
endfunc

// void dav1d_copy_narrow_neon(pixel *dst, ptrdiff_t stride,
// const pixel *src, int w, int h);
function copy_narrow_neon, export=1
// void dav1d_copy_narrow_8bpc_neon(pixel *dst, ptrdiff_t stride,
// const pixel *src, int w, int h);
function copy_narrow_8bpc_neon, export=1
adr x5, L(copy_narrow_tbl)
ldrh w6, [x5, w3, uxtw #1]
sub x5, x5, w6, uxth

@ -617,12 +617,14 @@ endfunc

#define SUM_STRIDE (384+16)

// void dav1d_sgr_box3_h_neon(int32_t *sumsq, int16_t *sum,
// const pixel (*left)[4],
// const pixel *src, const ptrdiff_t stride,
// const int w, const int h,
// const enum LrEdgeFlags edges);
function sgr_box3_h_neon, export=1
#include "looprestoration_tmpl.S"

// void dav1d_sgr_box3_h_8bpc_neon(int32_t *sumsq, int16_t *sum,
// const pixel (*left)[4],
// const pixel *src, const ptrdiff_t stride,
// const int w, const int h,
// const enum LrEdgeFlags edges);
function sgr_box3_h_8bpc_neon, export=1
add w5, w5, #2 // w += 2

// Set up pointers for reading/writing alternate rows

@ -844,11 +846,11 @@ L(box3_variable_shift_tbl):
umull2 v6.8h, v4.16b, v4.16b

add3 4
subs w5, w5, #4
st1 {v3.4h}, [x1], #8
st1 {v7.4h}, [x11], #8
st1 {v26.4s}, [x0], #16
st1 {v28.4s}, [x10], #16
subs w5, w5, #4
b.le 9f
ext v0.16b, v0.16b, v0.16b, #4
ext v4.16b, v4.16b, v4.16b, #4

@ -879,12 +881,12 @@ L(box3_variable_shift_tbl):
.purgem add3
endfunc

// void dav1d_sgr_box5_h_neon(int32_t *sumsq, int16_t *sum,
// const pixel (*left)[4],
// const pixel *src, const ptrdiff_t stride,
// const int w, const int h,
// const enum LrEdgeFlags edges);
function sgr_box5_h_neon, export=1
// void dav1d_sgr_box5_h_8bpc_neon(int32_t *sumsq, int16_t *sum,
// const pixel (*left)[4],
// const pixel *src, const ptrdiff_t stride,
// const int w, const int h,
// const enum LrEdgeFlags edges);
function sgr_box5_h_8bpc_neon, export=1
add w5, w5, #2 // w += 2

// Set up pointers for reading/writing alternate rows

@ -950,7 +952,7 @@ function sgr_box5_h_neon, export=1
b 2f
0:
// !LR_HAVE_LEFT, fill v1 with the leftmost byte
// and shift v0 to have 2x the first byte at the front.
// and shift v0 to have 3x the first byte at the front.
dup v1.16b, v0.b[0]
dup v5.16b, v4.b[0]
// Move x3 back to account for the last 3 bytes we loaded before,

@ -1114,11 +1116,11 @@ L(box5_variable_shift_tbl):
umull2 v6.8h, v4.16b, v4.16b

add5 4
subs w5, w5, #4
st1 {v3.4h}, [x1], #8
st1 {v7.4h}, [x11], #8
st1 {v26.4s}, [x0], #16
st1 {v28.4s}, [x10], #16
subs w5, w5, #4
b.le 9f
ext v0.16b, v0.16b, v0.16b, #4
ext v1.16b, v1.16b, v2.16b, #8

@ -1147,839 +1149,4 @@ L(box5_variable_shift_tbl):
.purgem add5
endfunc

// void dav1d_sgr_box3_v_neon(int32_t *sumsq, int16_t *sum,
// const int w, const int h,
// const enum LrEdgeFlags edges);
function sgr_box3_v_neon, export=1
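// For reference, a sketch of what this function computes, inferred from its
// structure rather than stated in this patch: each output element is the
// sum of three vertically adjacent inputs, with the h+2 summed rows padded
// at the edges according to the LR_HAVE_TOP/LR_HAVE_BOTTOM flags:
//   sumsq_out[y][x] = sumsq[y - 1][x] + sumsq[y][x] + sumsq[y + 1][x];
//   sum_out[y][x]   = sum[y - 1][x]   + sum[y][x]   + sum[y + 1][x];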
add w10, w3, #2 // Number of output rows to move back
|
||||
mov w11, w3 // Number of input rows to move back
|
||||
add w2, w2, #2 // Actual summed width
|
||||
mov x7, #(4*SUM_STRIDE) // sumsq stride
|
||||
mov x8, #(2*SUM_STRIDE) // sum stride
|
||||
sub x0, x0, #(4*SUM_STRIDE) // sumsq -= stride
|
||||
sub x1, x1, #(2*SUM_STRIDE) // sum -= stride
|
||||
|
||||
tst w4, #4 // LR_HAVE_TOP
|
||||
b.eq 0f
|
||||
// If have top, read from row -2.
|
||||
sub x5, x0, #(4*SUM_STRIDE)
|
||||
sub x6, x1, #(2*SUM_STRIDE)
|
||||
add w11, w11, #2
|
||||
b 1f
|
||||
0:
|
||||
// !LR_HAVE_TOP
|
||||
// If we don't have top, read from row 0 even if
|
||||
// we start writing to row -1.
|
||||
add x5, x0, #(4*SUM_STRIDE)
|
||||
add x6, x1, #(2*SUM_STRIDE)
|
||||
1:
|
||||
|
||||
tst w4, #8 // LR_HAVE_BOTTOM
|
||||
b.eq 1f
|
||||
// LR_HAVE_BOTTOM
|
||||
add w3, w3, #2 // Sum all h+2 lines with the main loop
|
||||
add w11, w11, #2
|
||||
1:
|
||||
mov w9, w3 // Backup of h for next loops
|
||||
|
||||
1:
|
||||
// Start of horizontal loop; start one vertical filter slice.
|
||||
// Start loading rows into v16-v21 and v24-v26 taking top
|
||||
// padding into consideration.
|
||||
tst w4, #4 // LR_HAVE_TOP
|
||||
ld1 {v16.4s, v17.4s}, [x5], x7
|
||||
ld1 {v24.8h}, [x6], x8
|
||||
b.eq 2f
|
||||
// LR_HAVE_TOP
|
||||
ld1 {v18.4s, v19.4s}, [x5], x7
|
||||
ld1 {v25.8h}, [x6], x8
|
||||
ld1 {v20.4s, v21.4s}, [x5], x7
|
||||
ld1 {v26.8h}, [x6], x8
|
||||
b 3f
|
||||
2: // !LR_HAVE_TOP
|
||||
mov v18.16b, v16.16b
|
||||
mov v19.16b, v17.16b
|
||||
mov v25.16b, v24.16b
|
||||
mov v20.16b, v16.16b
|
||||
mov v21.16b, v17.16b
|
||||
mov v26.16b, v24.16b
|
||||
|
||||
3:
|
||||
subs w3, w3, #1
|
||||
.macro add3
|
||||
add v16.4s, v16.4s, v18.4s
|
||||
add v17.4s, v17.4s, v19.4s
|
||||
add v24.8h, v24.8h, v25.8h
|
||||
add v16.4s, v16.4s, v20.4s
|
||||
add v17.4s, v17.4s, v21.4s
|
||||
add v24.8h, v24.8h, v26.8h
|
||||
st1 {v16.4s, v17.4s}, [x0], x7
|
||||
st1 {v24.8h}, [x1], x8
|
||||
.endm
|
||||
add3
|
||||
mov v16.16b, v18.16b
|
||||
mov v17.16b, v19.16b
|
||||
mov v24.16b, v25.16b
|
||||
mov v18.16b, v20.16b
|
||||
mov v19.16b, v21.16b
|
||||
mov v25.16b, v26.16b
|
||||
b.le 4f
|
||||
ld1 {v20.4s, v21.4s}, [x5], x7
|
||||
ld1 {v26.8h}, [x6], x8
|
||||
b 3b
|
||||
|
||||
4:
|
||||
tst w4, #8 // LR_HAVE_BOTTOM
|
||||
b.ne 5f
|
||||
// !LR_HAVE_BOTTOM
|
||||
// Produce two more rows, extending the already loaded rows.
|
||||
add3
|
||||
mov v16.16b, v18.16b
|
||||
mov v17.16b, v19.16b
|
||||
mov v24.16b, v25.16b
|
||||
add3
|
||||
|
||||
5: // End of one vertical slice.
|
||||
subs w2, w2, #8
|
||||
b.le 0f
|
||||
// Move pointers back up to the top and loop horizontally.
|
||||
// Input pointers
|
||||
msub x5, x7, x11, x5
|
||||
msub x6, x8, x11, x6
|
||||
// Output pointers
|
||||
msub x0, x7, x10, x0
|
||||
msub x1, x8, x10, x1
|
||||
add x0, x0, #32
|
||||
add x1, x1, #16
|
||||
add x5, x5, #32
|
||||
add x6, x6, #16
|
||||
mov w3, w9
|
||||
b 1b
|
||||
|
||||
0:
|
||||
ret
|
||||
.purgem add3
|
||||
endfunc
|
||||
|
||||
// void dav1d_sgr_box5_v_neon(int32_t *sumsq, int16_t *sum,
|
||||
// const int w, const int h,
|
||||
// const enum LrEdgeFlags edges);
|
||||
function sgr_box5_v_neon, export=1
|
||||
add w10, w3, #2 // Number of output rows to move back
|
||||
mov w11, w3 // Number of input rows to move back
|
||||
add w2, w2, #8 // Actual summed width
|
||||
mov x7, #(4*SUM_STRIDE) // sumsq stride
|
||||
mov x8, #(2*SUM_STRIDE) // sum stride
|
||||
sub x0, x0, #(4*SUM_STRIDE) // sumsq -= stride
|
||||
sub x1, x1, #(2*SUM_STRIDE) // sum -= stride
|
||||
|
||||
tst w4, #4 // LR_HAVE_TOP
|
||||
b.eq 0f
|
||||
// If have top, read from row -2.
|
||||
sub x5, x0, #(4*SUM_STRIDE)
|
||||
sub x6, x1, #(2*SUM_STRIDE)
|
||||
add w11, w11, #2
|
||||
b 1f
|
||||
0:
|
||||
// !LR_HAVE_TOP
|
||||
// If we don't have top, read from row 0 even if
|
||||
// we start writing to row -1.
|
||||
add x5, x0, #(4*SUM_STRIDE)
|
||||
add x6, x1, #(2*SUM_STRIDE)
|
||||
1:
|
||||
|
||||
tst w4, #8 // LR_HAVE_BOTTOM
|
||||
b.eq 0f
|
||||
// LR_HAVE_BOTTOM
|
||||
add w3, w3, #2 // Handle h+2 lines with the main loop
|
||||
add w11, w11, #2
|
||||
b 1f
|
||||
0:
|
||||
// !LR_HAVE_BOTTOM
|
||||
sub w3, w3, #1 // Handle h-1 lines with the main loop
|
||||
1:
|
||||
mov w9, w3 // Backup of h for next loops
|
||||
|
||||
1:
|
||||
// Start of horizontal loop; start one vertical filter slice.
|
||||
// Start loading rows into v16-v25 and v26-v30 taking top
|
||||
// padding into consideration.
|
||||
tst w4, #4 // LR_HAVE_TOP
|
||||
ld1 {v16.4s, v17.4s}, [x5], x7
|
||||
ld1 {v26.8h}, [x6], x8
|
||||
b.eq 2f
|
||||
// LR_HAVE_TOP
|
||||
ld1 {v20.4s, v21.4s}, [x5], x7
|
||||
ld1 {v28.8h}, [x6], x8
|
||||
mov v18.16b, v16.16b
|
||||
mov v19.16b, v17.16b
|
||||
mov v27.16b, v26.16b
|
||||
ld1 {v22.4s, v23.4s}, [x5], x7
|
||||
ld1 {v29.8h}, [x6], x8
|
||||
b 3f
|
||||
2: // !LR_HAVE_TOP
|
||||
mov v18.16b, v16.16b
|
||||
mov v19.16b, v17.16b
|
||||
mov v27.16b, v26.16b
|
||||
mov v20.16b, v16.16b
|
||||
mov v21.16b, v17.16b
|
||||
mov v28.16b, v26.16b
|
||||
mov v22.16b, v16.16b
|
||||
mov v23.16b, v17.16b
|
||||
mov v29.16b, v26.16b
|
||||
|
||||
3:
|
||||
cbz w3, 4f
|
||||
ld1 {v24.4s, v25.4s}, [x5], x7
|
||||
ld1 {v30.8h}, [x6], x8
|
||||
|
||||
3:
|
||||
// Start of vertical loop
|
||||
subs w3, w3, #2
|
||||
.macro add5
|
||||
add v16.4s, v16.4s, v18.4s
|
||||
add v17.4s, v17.4s, v19.4s
|
||||
add v26.8h, v26.8h, v27.8h
|
||||
add v0.4s, v20.4s, v22.4s
|
||||
add v1.4s, v21.4s, v23.4s
|
||||
add v2.8h, v28.8h, v29.8h
|
||||
add v16.4s, v16.4s, v24.4s
|
||||
add v17.4s, v17.4s, v25.4s
|
||||
add v26.8h, v26.8h, v30.8h
|
||||
add v16.4s, v16.4s, v0.4s
|
||||
add v17.4s, v17.4s, v1.4s
|
||||
add v26.8h, v26.8h, v2.8h
|
||||
st1 {v16.4s, v17.4s}, [x0], x7
|
||||
st1 {v26.8h}, [x1], x8
|
||||
.endm
|
||||
add5
|
||||
.macro shift2
|
||||
mov v16.16b, v20.16b
|
||||
mov v17.16b, v21.16b
|
||||
mov v26.16b, v28.16b
|
||||
mov v18.16b, v22.16b
|
||||
mov v19.16b, v23.16b
|
||||
mov v27.16b, v29.16b
|
||||
mov v20.16b, v24.16b
|
||||
mov v21.16b, v25.16b
|
||||
mov v28.16b, v30.16b
|
||||
.endm
|
||||
shift2
|
||||
add x0, x0, x7
|
||||
add x1, x1, x8
|
||||
b.le 5f
|
||||
ld1 {v22.4s, v23.4s}, [x5], x7
|
||||
ld1 {v29.8h}, [x6], x8
|
||||
ld1 {v24.4s, v25.4s}, [x5], x7
|
||||
ld1 {v30.8h}, [x6], x8
|
||||
b 3b
|
||||
|
||||
4:
|
||||
// h == 1, !LR_HAVE_BOTTOM.
|
||||
// Pad the last row with the only content row, and add.
|
||||
mov v24.16b, v22.16b
|
||||
mov v25.16b, v23.16b
|
||||
mov v30.16b, v29.16b
|
||||
add5
|
||||
shift2
|
||||
add x0, x0, x7
|
||||
add x1, x1, x8
|
||||
add5
|
||||
b 6f
|
||||
|
||||
5:
|
||||
tst w4, #8 // LR_HAVE_BOTTOM
|
||||
b.ne 6f
|
||||
// !LR_HAVE_BOTTOM
|
||||
cbnz w3, 5f
|
||||
// The intended three edge rows left; output the one at h-2 and
|
||||
// the past edge one at h.
|
||||
ld1 {v22.4s, v23.4s}, [x5], x7
|
||||
ld1 {v29.8h}, [x6], x8
|
||||
// Pad the past-edge row from the last content row.
|
||||
mov v24.16b, v22.16b
|
||||
mov v25.16b, v23.16b
|
||||
mov v30.16b, v29.16b
|
||||
add5
|
||||
shift2
|
||||
add x0, x0, x7
|
||||
add x1, x1, x8
|
||||
// The last two rows are already padded properly here.
|
||||
add5
|
||||
b 6f
|
||||
|
||||
5:
|
||||
// w3 == -1, two rows left, output one.
|
||||
// Pad the last two rows from the mid one.
|
||||
mov v22.16b, v20.16b
|
||||
mov v23.16b, v21.16b
|
||||
mov v29.16b, v28.16b
|
||||
mov v24.16b, v20.16b
|
||||
mov v25.16b, v21.16b
|
||||
mov v30.16b, v28.16b
|
||||
add5
|
||||
add x0, x0, x7
|
||||
add x1, x1, x8
|
||||
b 6f
|
||||
|
||||
6: // End of one vertical slice.
|
||||
subs w2, w2, #8
|
||||
b.le 0f
|
||||
// Move pointers back up to the top and loop horizontally.
|
||||
// Input pointers
|
||||
msub x5, x7, x11, x5
|
||||
msub x6, x8, x11, x6
|
||||
// Output pointers
|
||||
msub x0, x7, x10, x0
|
||||
msub x1, x8, x10, x1
|
||||
add x0, x0, #32
|
||||
add x1, x1, #16
|
||||
add x5, x5, #32
|
||||
add x6, x6, #16
|
||||
mov w3, w9
|
||||
b 1b
|
||||
|
||||
0:
|
||||
ret
|
||||
.purgem add5
|
||||
endfunc
|
||||
|
||||
// void dav1d_sgr_calc_ab1_neon(int32_t *a, int16_t *b,
|
||||
// const int w, const int h, const int strength);
|
||||
// void dav1d_sgr_calc_ab2_neon(int32_t *a, int16_t *b,
|
||||
// const int w, const int h, const int strength);
|
||||
function sgr_calc_ab1_neon, export=1
|
||||
add x3, x3, #2 // h += 2
|
||||
movi v31.4s, #9 // n
|
||||
mov x5, #455
|
||||
mov x8, #SUM_STRIDE
|
||||
b sgr_calc_ab_neon
|
||||
endfunc
|
||||
|
||||
function sgr_calc_ab2_neon, export=1
|
||||
add x3, x3, #3 // h += 3
|
||||
asr x3, x3, #1 // h /= 2
|
||||
movi v31.4s, #25 // n
|
||||
mov x5, #164
|
||||
mov x8, #(2*SUM_STRIDE)
|
||||
endfunc
|
||||
|
||||
function sgr_calc_ab_neon
|
||||
movrel x12, X(sgr_x_by_x)
|
||||
ld1 {v16.16b, v17.16b, v18.16b}, [x12]
|
||||
movi v19.16b, #5
|
||||
movi v20.8b, #55 // idx of last 5
|
||||
movi v21.8b, #72 // idx of last 4
|
||||
movi v22.8b, #101 // idx of last 3
|
||||
movi v23.8b, #169 // idx of last 2
|
||||
movi v24.8b, #254 // idx of last 1
|
||||
add x2, x2, #2 // w += 2
|
||||
add x7, x2, #7
|
||||
bic x7, x7, #7 // aligned w
|
||||
sub x7, x8, x7 // increment between rows
|
||||
movi v29.8h, #1, lsl #8
|
||||
dup v28.4s, w4
|
||||
dup v30.4s, w5 // one_by_x
|
||||
sub x0, x0, #(4*(SUM_STRIDE))
|
||||
sub x1, x1, #(2*(SUM_STRIDE))
|
||||
mov x6, x2 // backup of w
|
||||
sub v16.16b, v16.16b, v19.16b
|
||||
sub v17.16b, v17.16b, v19.16b
|
||||
sub v18.16b, v18.16b, v19.16b
|
||||
1:
|
||||
subs x2, x2, #8
|
||||
ld1 {v0.4s, v1.4s}, [x0] // a
|
||||
ld1 {v2.8h}, [x1] // b
|
||||
mul v0.4s, v0.4s, v31.4s // a * n
|
||||
mul v1.4s, v1.4s, v31.4s // a * n
|
||||
umull v3.4s, v2.4h, v2.4h // b * b
|
||||
umull2 v4.4s, v2.8h, v2.8h // b * b
|
||||
uqsub v0.4s, v0.4s, v3.4s // imax(a * n - b * b, 0)
|
||||
uqsub v1.4s, v1.4s, v4.4s // imax(a * n - b * b, 0)
|
||||
mul v0.4s, v0.4s, v28.4s // p * s
|
||||
mul v1.4s, v1.4s, v28.4s // p * s
|
||||
uqshrn v0.4h, v0.4s, #16
|
||||
uqshrn2 v0.8h, v1.4s, #16
|
||||
uqrshrn v0.8b, v0.8h, #4 // imin(z, 255)
|
||||
|
||||
cmhi v25.8b, v0.8b, v20.8b // = -1 if sgr_x_by_x[v0] < 5
|
||||
cmhi v26.8b, v0.8b, v21.8b // = -1 if sgr_x_by_x[v0] < 4
|
||||
tbl v1.8b, {v16.16b, v17.16b, v18.16b}, v0.8b
|
||||
cmhi v27.8b, v0.8b, v22.8b // = -1 if sgr_x_by_x[v0] < 3
|
||||
cmhi v5.8b, v0.8b, v23.8b // = -1 if sgr_x_by_x[v0] < 2
|
||||
add v25.8b, v25.8b, v26.8b
|
||||
cmhi v6.8b, v0.8b, v24.8b // = -1 if sgr_x_by_x[v0] < 1
|
||||
add v27.8b, v27.8b, v5.8b
|
||||
add v6.8b, v6.8b, v19.8b
|
||||
add v25.8b, v25.8b, v27.8b
|
||||
add v1.8b, v1.8b, v6.8b
|
||||
add v1.8b, v1.8b, v25.8b
|
||||
uxtl v1.8h, v1.8b // x
|
||||
|
||||
umull v3.4s, v1.4h, v2.4h // x * BB[i]
|
||||
umull2 v4.4s, v1.8h, v2.8h // x * BB[i]
|
||||
mul v3.4s, v3.4s, v30.4s // x * BB[i] * sgr_one_by_x
|
||||
mul v4.4s, v4.4s, v30.4s // x * BB[i] * sgr_one_by_x
|
||||
srshr v3.4s, v3.4s, #12 // AA[i]
|
||||
srshr v4.4s, v4.4s, #12 // AA[i]
|
||||
sub v2.8h, v29.8h, v1.8h // 256 - x
|
||||
|
||||
st1 {v3.4s, v4.4s}, [x0], #32
|
||||
st1 {v2.8h}, [x1], #16
|
||||
b.gt 1b
|
||||
|
||||
subs x3, x3, #1
|
||||
b.le 0f
|
||||
add x0, x0, x7, lsl #2
|
||||
add x1, x1, x7, lsl #1
|
||||
mov x2, x6
|
||||
b 1b
|
||||
0:
|
||||
ret
|
||||
endfunc
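// For reference, a scalar sketch of sgr_calc_ab's per-element math, pieced
// together from the inline comments above (the exact rounding in the z step
// is an assumption; n, s and one_by_x are the constants set up in
// sgr_calc_ab1/sgr_calc_ab2):
//   p  = imax(a * n - b * b, 0);
//   z  = imin((p * s + (1 << 19)) >> 20, 255);
//   x  = dav1d_sgr_x_by_x[z];
//   a' = (x * b * one_by_x + (1 << 11)) >> 12;  // written back to a
//   b' = 256 - x;                               // written back to b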
|
||||
|
||||
#define FILTER_OUT_STRIDE 384
|
||||
|
||||
// void dav1d_sgr_finish_filter1_neon(coef *tmp,
|
||||
// const pixel *src, const ptrdiff_t stride,
|
||||
// const int32_t *a, const int16_t *b,
|
||||
// const int w, const int h);
|
||||
function sgr_finish_filter1_neon, export=1
|
||||
sub x7, x3, #(4*SUM_STRIDE)
|
||||
add x8, x3, #(4*SUM_STRIDE)
|
||||
sub x9, x4, #(2*SUM_STRIDE)
|
||||
add x10, x4, #(2*SUM_STRIDE)
|
||||
mov x11, #SUM_STRIDE
|
||||
mov x12, #FILTER_OUT_STRIDE
|
||||
add x13, x5, #7
|
||||
bic x13, x13, #7 // Aligned width
|
||||
sub x2, x2, x13
|
||||
sub x12, x12, x13
|
||||
sub x11, x11, x13
|
||||
sub x11, x11, #4 // We read 4 extra elements from a
|
||||
sub x14, x11, #4 // We read 8 extra elements from b
|
||||
mov x13, x5
|
||||
movi v6.8h, #3
|
||||
movi v7.4s, #3
|
||||
1:
|
||||
ld1 {v0.8h, v1.8h}, [x9], #32
|
||||
ld1 {v2.8h, v3.8h}, [x4], #32
|
||||
ld1 {v4.8h, v5.8h}, [x10], #32
|
||||
ld1 {v16.4s, v17.4s, v18.4s}, [x7], #48
|
||||
ld1 {v19.4s, v20.4s, v21.4s}, [x3], #48
|
||||
ld1 {v22.4s, v23.4s, v24.4s}, [x8], #48
|
||||
|
||||
2:
|
||||
subs x5, x5, #8
|
||||
ext v25.16b, v0.16b, v1.16b, #2 // -stride
|
||||
ext v26.16b, v2.16b, v3.16b, #2 // 0
|
||||
ext v27.16b, v4.16b, v5.16b, #2 // +stride
|
||||
ext v28.16b, v0.16b, v1.16b, #4 // +1-stride
|
||||
ext v29.16b, v2.16b, v3.16b, #4 // +1
|
||||
ext v30.16b, v4.16b, v5.16b, #4 // +1+stride
|
||||
add v2.8h, v2.8h, v25.8h // -1, -stride
|
||||
add v26.8h, v26.8h, v27.8h // 0, +stride
|
||||
add v0.8h, v0.8h, v28.8h // -1-stride, +1-stride
|
||||
add v2.8h, v2.8h, v26.8h
|
||||
add v4.8h, v4.8h, v30.8h // -1+stride, +1+stride
|
||||
add v2.8h, v2.8h, v29.8h // +1
|
||||
add v0.8h, v0.8h, v4.8h
|
||||
|
||||
ext v25.16b, v16.16b, v17.16b, #4 // -stride
|
||||
ext v26.16b, v17.16b, v18.16b, #4
|
||||
shl v2.8h, v2.8h, #2
|
||||
ext v27.16b, v16.16b, v17.16b, #8 // +1-stride
|
||||
ext v28.16b, v17.16b, v18.16b, #8
|
||||
ext v29.16b, v19.16b, v20.16b, #4 // 0
|
||||
ext v30.16b, v20.16b, v21.16b, #4
|
||||
mla v2.8h, v0.8h, v6.8h // * 3 -> a
|
||||
add v25.4s, v25.4s, v19.4s // -stride, -1
|
||||
add v26.4s, v26.4s, v20.4s
|
||||
add v16.4s, v16.4s, v27.4s // -1-stride, +1-stride
|
||||
add v17.4s, v17.4s, v28.4s
|
||||
ext v27.16b, v19.16b, v20.16b, #8 // +1
|
||||
ext v28.16b, v20.16b, v21.16b, #8
|
||||
add v16.4s, v16.4s, v22.4s // -1+stride
|
||||
add v17.4s, v17.4s, v23.4s
|
||||
add v29.4s, v29.4s, v27.4s // 0, +1
|
||||
add v30.4s, v30.4s, v28.4s
|
||||
add v25.4s, v25.4s, v29.4s
|
||||
add v26.4s, v26.4s, v30.4s
|
||||
ext v27.16b, v22.16b, v23.16b, #4 // +stride
|
||||
ext v28.16b, v23.16b, v24.16b, #4
|
||||
ext v29.16b, v22.16b, v23.16b, #8 // +1+stride
|
||||
ext v30.16b, v23.16b, v24.16b, #8
|
||||
ld1 {v19.8b}, [x1], #8 // src
|
||||
add v25.4s, v25.4s, v27.4s // +stride
|
||||
add v26.4s, v26.4s, v28.4s
|
||||
add v16.4s, v16.4s, v29.4s // +1+stride
|
||||
add v17.4s, v17.4s, v30.4s
|
||||
shl v25.4s, v25.4s, #2
|
||||
shl v26.4s, v26.4s, #2
|
||||
mla v25.4s, v16.4s, v7.4s // * 3 -> b
|
||||
mla v26.4s, v17.4s, v7.4s
|
||||
uxtl v19.8h, v19.8b // src
|
||||
mov v0.16b, v1.16b
|
||||
umlal v25.4s, v2.4h, v19.4h // b + a * src
|
||||
umlal2 v26.4s, v2.8h, v19.8h
|
||||
mov v2.16b, v3.16b
|
||||
rshrn v25.4h, v25.4s, #9
|
||||
rshrn2 v25.8h, v26.4s, #9
|
||||
mov v4.16b, v5.16b
|
||||
st1 {v25.8h}, [x0], #16
|
||||
|
||||
b.le 3f
|
||||
mov v16.16b, v18.16b
|
||||
mov v19.16b, v21.16b
|
||||
mov v22.16b, v24.16b
|
||||
ld1 {v1.8h}, [x9], #16
|
||||
ld1 {v3.8h}, [x4], #16
|
||||
ld1 {v5.8h}, [x10], #16
|
||||
ld1 {v17.4s, v18.4s}, [x7], #32
|
||||
ld1 {v20.4s, v21.4s}, [x3], #32
|
||||
ld1 {v23.4s, v24.4s}, [x8], #32
|
||||
b 2b
|
||||
|
||||
3:
|
||||
subs x6, x6, #1
|
||||
b.le 0f
|
||||
mov x5, x13
|
||||
add x0, x0, x12, lsl #1
|
||||
add x1, x1, x2
|
||||
add x3, x3, x11, lsl #2
|
||||
add x7, x7, x11, lsl #2
|
||||
add x8, x8, x11, lsl #2
|
||||
add x4, x4, x14, lsl #1
|
||||
add x9, x9, x14, lsl #1
|
||||
add x10, x10, x14, lsl #1
|
||||
b 1b
|
||||
0:
|
||||
ret
|
||||
endfunc
|
||||
|
||||
// void dav1d_sgr_finish_filter2_neon(coef *tmp,
|
||||
// const pixel *src, const ptrdiff_t stride,
|
||||
// const int32_t *a, const int16_t *b,
|
||||
// const int w, const int h);
|
||||
function sgr_finish_filter2_neon, export=1
|
||||
add x7, x3, #(4*(SUM_STRIDE))
|
||||
sub x3, x3, #(4*(SUM_STRIDE))
|
||||
add x8, x4, #(2*(SUM_STRIDE))
|
||||
sub x4, x4, #(2*(SUM_STRIDE))
|
||||
mov x9, #(2*SUM_STRIDE)
|
||||
mov x10, #FILTER_OUT_STRIDE
|
||||
add x11, x5, #7
|
||||
bic x11, x11, #7 // Aligned width
|
||||
sub x2, x2, x11
|
||||
sub x10, x10, x11
|
||||
sub x9, x9, x11
|
||||
sub x9, x9, #4 // We read 4 extra elements from a
|
||||
sub x12, x9, #4 // We read 8 extra elements from b
|
||||
mov x11, x5
|
||||
movi v4.8h, #5
|
||||
movi v5.4s, #5
|
||||
movi v6.8h, #6
|
||||
movi v7.4s, #6
|
||||
1:
|
||||
ld1 {v0.8h, v1.8h}, [x4], #32
|
||||
ld1 {v2.8h, v3.8h}, [x8], #32
|
||||
ld1 {v16.4s, v17.4s, v18.4s}, [x3], #48
|
||||
ld1 {v19.4s, v20.4s, v21.4s}, [x7], #48
|
||||
|
||||
2:
|
||||
subs x5, x5, #8
|
||||
ext v24.16b, v0.16b, v1.16b, #4 // +1-stride
|
||||
ext v25.16b, v2.16b, v3.16b, #4 // +1+stride
|
||||
ext v22.16b, v0.16b, v1.16b, #2 // -stride
|
||||
ext v23.16b, v2.16b, v3.16b, #2 // +stride
|
||||
add v0.8h, v0.8h, v24.8h // -1-stride, +1-stride
|
||||
add v25.8h, v2.8h, v25.8h // -1+stride, +1+stride
|
||||
add v2.8h, v22.8h, v23.8h // -stride, +stride
|
||||
add v0.8h, v0.8h, v25.8h
|
||||
|
||||
ext v22.16b, v16.16b, v17.16b, #4 // -stride
|
||||
ext v23.16b, v17.16b, v18.16b, #4
|
||||
ext v24.16b, v19.16b, v20.16b, #4 // +stride
|
||||
ext v25.16b, v20.16b, v21.16b, #4
|
||||
ext v26.16b, v16.16b, v17.16b, #8 // +1-stride
|
||||
ext v27.16b, v17.16b, v18.16b, #8
|
||||
ext v28.16b, v19.16b, v20.16b, #8 // +1+stride
|
||||
ext v29.16b, v20.16b, v21.16b, #8
|
||||
mul v0.8h, v0.8h, v4.8h // * 5
|
||||
mla v0.8h, v2.8h, v6.8h // * 6
|
||||
ld1 {v31.8b}, [x1], #8
|
||||
add v16.4s, v16.4s, v26.4s // -1-stride, +1-stride
|
||||
add v17.4s, v17.4s, v27.4s
|
||||
add v19.4s, v19.4s, v28.4s // -1+stride, +1+stride
|
||||
add v20.4s, v20.4s, v29.4s
|
||||
add v16.4s, v16.4s, v19.4s
|
||||
add v17.4s, v17.4s, v20.4s
|
||||
|
||||
add v22.4s, v22.4s, v24.4s // -stride, +stride
|
||||
add v23.4s, v23.4s, v25.4s
|
||||
// This is, surprisingly, faster than other variants where the
|
||||
// mul+mla pairs are further apart, on Cortex A53.
|
||||
mul v16.4s, v16.4s, v5.4s // * 5
|
||||
mla v16.4s, v22.4s, v7.4s // * 6
|
||||
mul v17.4s, v17.4s, v5.4s // * 5
|
||||
mla v17.4s, v23.4s, v7.4s // * 6
|
||||
|
||||
uxtl v31.8h, v31.8b
|
||||
umlal v16.4s, v0.4h, v31.4h // b + a * src
|
||||
umlal2 v17.4s, v0.8h, v31.8h
|
||||
mov v0.16b, v1.16b
|
||||
rshrn v16.4h, v16.4s, #9
|
||||
rshrn2 v16.8h, v17.4s, #9
|
||||
mov v2.16b, v3.16b
|
||||
st1 {v16.8h}, [x0], #16
|
||||
|
||||
b.le 3f
|
||||
mov v16.16b, v18.16b
|
||||
mov v19.16b, v21.16b
|
||||
ld1 {v1.8h}, [x4], #16
|
||||
ld1 {v3.8h}, [x8], #16
|
||||
ld1 {v17.4s, v18.4s}, [x3], #32
|
||||
ld1 {v20.4s, v21.4s}, [x7], #32
|
||||
b 2b
|
||||
|
||||
3:
|
||||
subs x6, x6, #1
|
||||
b.le 0f
|
||||
mov x5, x11
|
||||
add x0, x0, x10, lsl #1
|
||||
add x1, x1, x2
|
||||
add x3, x3, x9, lsl #2
|
||||
add x7, x7, x9, lsl #2
|
||||
add x4, x4, x12, lsl #1
|
||||
add x8, x8, x12, lsl #1
|
||||
mov x13, x3
|
||||
mov x14, x4
|
||||
|
||||
ld1 {v0.8h, v1.8h}, [x4], #32
|
||||
ld1 {v16.4s, v17.4s, v18.4s}, [x3], #48
|
||||
|
||||
4:
|
||||
subs x5, x5, #8
|
||||
ext v23.16b, v0.16b, v1.16b, #4 // +1
|
||||
ext v22.16b, v0.16b, v1.16b, #2 // 0
|
||||
add v0.8h, v0.8h, v23.8h // -1, +1
|
||||
|
||||
ext v24.16b, v16.16b, v17.16b, #4 // 0
|
||||
ext v25.16b, v17.16b, v18.16b, #4
|
||||
ext v26.16b, v16.16b, v17.16b, #8 // +1
|
||||
ext v27.16b, v17.16b, v18.16b, #8
|
||||
mul v2.8h, v22.8h, v6.8h // * 6
|
||||
mla v2.8h, v0.8h, v4.8h // * 5 -> a
|
||||
ld1 {v31.8b}, [x1], #8
|
||||
add v16.4s, v16.4s, v26.4s // -1, +1
|
||||
add v17.4s, v17.4s, v27.4s
|
||||
uxtl v31.8h, v31.8b
|
||||
// This is, surprisingly, faster than other variants where the
|
||||
// mul+mla pairs are further apart, on Cortex A53.
|
||||
mul v24.4s, v24.4s, v7.4s // * 6
|
||||
mla v24.4s, v16.4s, v5.4s // * 5 -> b
|
||||
mul v25.4s, v25.4s, v7.4s // * 6
|
||||
mla v25.4s, v17.4s, v5.4s // * 5 -> b
|
||||
|
||||
umlal v24.4s, v2.4h, v31.4h // b + a * src
|
||||
umlal2 v25.4s, v2.8h, v31.8h
|
||||
mov v0.16b, v1.16b
|
||||
rshrn v24.4h, v24.4s, #8
|
||||
rshrn2 v24.8h, v25.4s, #8
|
||||
mov v16.16b, v18.16b
|
||||
st1 {v24.8h}, [x0], #16
|
||||
|
||||
b.le 5f
|
||||
ld1 {v1.8h}, [x4], #16
|
||||
ld1 {v17.4s, v18.4s}, [x3], #32
|
||||
b 4b
|
||||
|
||||
5:
|
||||
subs x6, x6, #1
|
||||
b.le 0f
|
||||
mov x5, x11
|
||||
add x0, x0, x10, lsl #1
|
||||
add x1, x1, x2
|
||||
mov x3, x13 // Rewind x3/x4 to where they started
|
||||
mov x4, x14
|
||||
b 1b
|
||||
0:
|
||||
ret
|
||||
endfunc
|
||||
|
||||
// void dav1d_sgr_weighted1_neon(pixel *dst, const ptrdiff_t dst_stride,
|
||||
// const pixel *src, const ptrdiff_t src_stride,
|
||||
// const coef *t1, const int w, const int h,
|
||||
// const int wt);
|
||||
function sgr_weighted1_neon, export=1
|
||||
dup v31.8h, w7
|
||||
cmp x6, #2
|
||||
add x9, x0, x1
|
||||
add x10, x2, x3
|
||||
add x11, x4, #2*FILTER_OUT_STRIDE
|
||||
mov x7, #(4*FILTER_OUT_STRIDE)
|
||||
lsl x1, x1, #1
|
||||
lsl x3, x3, #1
|
||||
add x8, x5, #7
|
||||
bic x8, x8, #7 // Aligned width
|
||||
sub x1, x1, x8
|
||||
sub x3, x3, x8
|
||||
sub x7, x7, x8, lsl #1
|
||||
mov x8, x5
|
||||
b.lt 2f
|
||||
1:
|
||||
ld1 {v0.8b}, [x2], #8
|
||||
ld1 {v4.8b}, [x10], #8
|
||||
ld1 {v1.8h}, [x4], #16
|
||||
ld1 {v5.8h}, [x11], #16
|
||||
subs x5, x5, #8
|
||||
ushll v0.8h, v0.8b, #4 // u
|
||||
ushll v4.8h, v4.8b, #4 // u
|
||||
sub v1.8h, v1.8h, v0.8h // t1 - u
|
||||
sub v5.8h, v5.8h, v4.8h // t1 - u
|
||||
ushll v2.4s, v0.4h, #7 // u << 7
|
||||
ushll2 v3.4s, v0.8h, #7 // u << 7
|
||||
ushll v6.4s, v4.4h, #7 // u << 7
|
||||
ushll2 v7.4s, v4.8h, #7 // u << 7
|
||||
smlal v2.4s, v1.4h, v31.4h // v
|
||||
smlal2 v3.4s, v1.8h, v31.8h // v
|
||||
smlal v6.4s, v5.4h, v31.4h // v
|
||||
smlal2 v7.4s, v5.8h, v31.8h // v
|
||||
rshrn v2.4h, v2.4s, #11
|
||||
rshrn2 v2.8h, v3.4s, #11
|
||||
rshrn v6.4h, v6.4s, #11
|
||||
rshrn2 v6.8h, v7.4s, #11
|
||||
sqxtun v2.8b, v2.8h
|
||||
sqxtun v6.8b, v6.8h
|
||||
st1 {v2.8b}, [x0], #8
|
||||
st1 {v6.8b}, [x9], #8
|
||||
b.gt 1b
|
||||
|
||||
sub x6, x6, #2
|
||||
cmp x6, #1
|
||||
b.lt 0f
|
||||
mov x5, x8
|
||||
add x0, x0, x1
|
||||
add x9, x9, x1
|
||||
add x2, x2, x3
|
||||
add x10, x10, x3
|
||||
add x4, x4, x7
|
||||
add x11, x11, x7
|
||||
b.eq 2f
|
||||
b 1b
|
||||
|
||||
2:
|
||||
ld1 {v0.8b}, [x2], #8
|
||||
ld1 {v1.8h}, [x4], #16
|
||||
subs x5, x5, #8
|
||||
ushll v0.8h, v0.8b, #4 // u
|
||||
sub v1.8h, v1.8h, v0.8h // t1 - u
|
||||
ushll v2.4s, v0.4h, #7 // u << 7
|
||||
ushll2 v3.4s, v0.8h, #7 // u << 7
|
||||
smlal v2.4s, v1.4h, v31.4h // v
|
||||
smlal2 v3.4s, v1.8h, v31.8h // v
|
||||
rshrn v2.4h, v2.4s, #11
|
||||
rshrn2 v2.8h, v3.4s, #11
|
||||
sqxtun v2.8b, v2.8h
|
||||
st1 {v2.8b}, [x0], #8
|
||||
b.gt 2b
|
||||
0:
|
||||
ret
|
||||
endfunc
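// For reference, a scalar sketch of the per-pixel blend in sgr_weighted1,
// derived from its inline comments (an assumption: iclip_u8 is the usual
// clip to [0, 255], matching the sqxtun saturation):
//   u      = src[x] << 4;
//   v      = (u << 7) + wt * (t1[x] - u);
//   dst[x] = iclip_u8((v + (1 << 10)) >> 11);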
|
||||
|
||||
// void dav1d_sgr_weighted2_neon(pixel *dst, const ptrdiff_t stride,
// const pixel *src, const ptrdiff_t src_stride,
// const coef *t1, const coef *t2,
// const int w, const int h,
// const int16_t wt[2]);
function sgr_weighted2_neon, export=1
ldr x8, [sp]
cmp x7, #2
add x10, x0, x1
add x11, x2, x3
add x12, x4, #2*FILTER_OUT_STRIDE
add x13, x5, #2*FILTER_OUT_STRIDE
ld2r {v30.8h, v31.8h}, [x8] // wt[0], wt[1]
mov x8, #4*FILTER_OUT_STRIDE
lsl x1, x1, #1
lsl x3, x3, #1
add x9, x6, #7
bic x9, x9, #7 // Aligned width
sub x1, x1, x9
sub x3, x3, x9
sub x8, x8, x9, lsl #1
mov x9, x6
b.lt 2f
1:
ld1 {v0.8b}, [x2], #8
ld1 {v16.8b}, [x11], #8
ld1 {v1.8h}, [x4], #16
ld1 {v17.8h}, [x12], #16
ld1 {v2.8h}, [x5], #16
ld1 {v18.8h}, [x13], #16
subs x6, x6, #8
ushll v0.8h, v0.8b, #4 // u
ushll v16.8h, v16.8b, #4 // u
sub v1.8h, v1.8h, v0.8h // t1 - u
sub v2.8h, v2.8h, v0.8h // t2 - u
sub v17.8h, v17.8h, v16.8h // t1 - u
sub v18.8h, v18.8h, v16.8h // t2 - u
ushll v3.4s, v0.4h, #7 // u << 7
ushll2 v4.4s, v0.8h, #7 // u << 7
ushll v19.4s, v16.4h, #7 // u << 7
ushll2 v20.4s, v16.8h, #7 // u << 7
smlal v3.4s, v1.4h, v30.4h // wt[0] * (t1 - u)
smlal v3.4s, v2.4h, v31.4h // wt[1] * (t2 - u)
smlal2 v4.4s, v1.8h, v30.8h // wt[0] * (t1 - u)
smlal2 v4.4s, v2.8h, v31.8h // wt[1] * (t2 - u)
smlal v19.4s, v17.4h, v30.4h // wt[0] * (t1 - u)
smlal v19.4s, v18.4h, v31.4h // wt[1] * (t2 - u)
smlal2 v20.4s, v17.8h, v30.8h // wt[0] * (t1 - u)
smlal2 v20.4s, v18.8h, v31.8h // wt[1] * (t2 - u)
rshrn v3.4h, v3.4s, #11
rshrn2 v3.8h, v4.4s, #11
rshrn v19.4h, v19.4s, #11
rshrn2 v19.8h, v20.4s, #11
sqxtun v3.8b, v3.8h
sqxtun v19.8b, v19.8h
st1 {v3.8b}, [x0], #8
st1 {v19.8b}, [x10], #8
b.gt 1b

subs x7, x7, #2
cmp x7, #1
b.lt 0f
mov x6, x9
add x0, x0, x1
add x10, x10, x1
add x2, x2, x3
add x11, x11, x3
add x4, x4, x8
add x12, x12, x8
add x5, x5, x8
add x13, x13, x8
b.eq 2f
b 1b

2:
ld1 {v0.8b}, [x2], #8
ld1 {v1.8h}, [x4], #16
ld1 {v2.8h}, [x5], #16
subs x6, x6, #8
ushll v0.8h, v0.8b, #4 // u
sub v1.8h, v1.8h, v0.8h // t1 - u
sub v2.8h, v2.8h, v0.8h // t2 - u
ushll v3.4s, v0.4h, #7 // u << 7
ushll2 v4.4s, v0.8h, #7 // u << 7
smlal v3.4s, v1.4h, v30.4h // wt[0] * (t1 - u)
smlal v3.4s, v2.4h, v31.4h // wt[1] * (t2 - u)
smlal2 v4.4s, v1.8h, v30.8h // wt[0] * (t1 - u)
smlal2 v4.4s, v2.8h, v31.8h // wt[1] * (t2 - u)
rshrn v3.4h, v3.4s, #11
rshrn2 v3.8h, v4.4s, #11
sqxtun v3.8b, v3.8h
st1 {v3.8b}, [x0], #8
b.gt 1b
0:
ret
endfunc
sgr_funcs 8
@ -0,0 +1,432 @@
/*
|
||||
* Copyright © 2018, VideoLAN and dav1d authors
|
||||
* Copyright © 2018, Martin Storsjo
|
||||
* All rights reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright notice, this
|
||||
* list of conditions and the following disclaimer.
|
||||
*
|
||||
* 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||
* this list of conditions and the following disclaimer in the documentation
|
||||
* and/or other materials provided with the distribution.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
|
||||
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
|
||||
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
|
||||
* ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
|
||||
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
|
||||
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
|
||||
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
#include "src/arm/asm.S"
|
||||
#include "util.S"
|
||||
|
||||
#define SUM_STRIDE (384+16)
|
||||
|
||||
// void dav1d_sgr_box3_v_neon(int32_t *sumsq, int16_t *sum,
|
||||
// const int w, const int h,
|
||||
// const enum LrEdgeFlags edges);
|
||||
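// A rough reading of the function below: each output row of sumsq/sum is
// the sum of a sliding three-row window of the input, with the nearest
// valid row replicated whenever LR_HAVE_TOP/LR_HAVE_BOTTOM is absent.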
function sgr_box3_v_neon, export=1
|
||||
add w10, w3, #2 // Number of output rows to move back
|
||||
mov w11, w3 // Number of input rows to move back
|
||||
add w2, w2, #2 // Actual summed width
|
||||
mov x7, #(4*SUM_STRIDE) // sumsq stride
|
||||
mov x8, #(2*SUM_STRIDE) // sum stride
|
||||
sub x0, x0, #(4*SUM_STRIDE) // sumsq -= stride
|
||||
sub x1, x1, #(2*SUM_STRIDE) // sum -= stride
|
||||
|
||||
tst w4, #4 // LR_HAVE_TOP
|
||||
b.eq 0f
|
||||
// If have top, read from row -2.
|
||||
sub x5, x0, #(4*SUM_STRIDE)
|
||||
sub x6, x1, #(2*SUM_STRIDE)
|
||||
add w11, w11, #2
|
||||
b 1f
|
||||
0:
|
||||
// !LR_HAVE_TOP
|
||||
// If we don't have top, read from row 0 even if
|
||||
// we start writing to row -1.
|
||||
add x5, x0, #(4*SUM_STRIDE)
|
||||
add x6, x1, #(2*SUM_STRIDE)
|
||||
1:
|
||||
|
||||
tst w4, #8 // LR_HAVE_BOTTOM
|
||||
b.eq 1f
|
||||
// LR_HAVE_BOTTOM
|
||||
add w3, w3, #2 // Sum all h+2 lines with the main loop
|
||||
add w11, w11, #2
|
||||
1:
|
||||
mov w9, w3 // Backup of h for next loops
|
||||
|
||||
1:
|
||||
// Start of horizontal loop; start one vertical filter slice.
|
||||
// Start loading rows into v16-v21 and v24-v26 taking top
|
||||
// padding into consideration.
|
||||
tst w4, #4 // LR_HAVE_TOP
|
||||
ld1 {v16.4s, v17.4s}, [x5], x7
|
||||
ld1 {v24.8h}, [x6], x8
|
||||
b.eq 2f
|
||||
// LR_HAVE_TOP
|
||||
ld1 {v18.4s, v19.4s}, [x5], x7
|
||||
ld1 {v25.8h}, [x6], x8
|
||||
ld1 {v20.4s, v21.4s}, [x5], x7
|
||||
ld1 {v26.8h}, [x6], x8
|
||||
b 3f
|
||||
2: // !LR_HAVE_TOP
|
||||
mov v18.16b, v16.16b
|
||||
mov v19.16b, v17.16b
|
||||
mov v25.16b, v24.16b
|
||||
mov v20.16b, v16.16b
|
||||
mov v21.16b, v17.16b
|
||||
mov v26.16b, v24.16b
|
||||
|
||||
3:
|
||||
subs w3, w3, #1
|
||||
.macro add3
|
||||
add v16.4s, v16.4s, v18.4s
|
||||
add v17.4s, v17.4s, v19.4s
|
||||
add v24.8h, v24.8h, v25.8h
|
||||
add v16.4s, v16.4s, v20.4s
|
||||
add v17.4s, v17.4s, v21.4s
|
||||
add v24.8h, v24.8h, v26.8h
|
||||
st1 {v16.4s, v17.4s}, [x0], x7
|
||||
st1 {v24.8h}, [x1], x8
|
||||
.endm
|
||||
add3
|
||||
mov v16.16b, v18.16b
|
||||
mov v17.16b, v19.16b
|
||||
mov v24.16b, v25.16b
|
||||
mov v18.16b, v20.16b
|
||||
mov v19.16b, v21.16b
|
||||
mov v25.16b, v26.16b
|
||||
b.le 4f
|
||||
ld1 {v20.4s, v21.4s}, [x5], x7
|
||||
ld1 {v26.8h}, [x6], x8
|
||||
b 3b
|
||||
|
||||
4:
|
||||
tst w4, #8 // LR_HAVE_BOTTOM
|
||||
b.ne 5f
|
||||
// !LR_HAVE_BOTTOM
|
||||
// Produce two more rows, extending the already loaded rows.
|
||||
add3
|
||||
mov v16.16b, v18.16b
|
||||
mov v17.16b, v19.16b
|
||||
mov v24.16b, v25.16b
|
||||
add3
|
||||
|
||||
5: // End of one vertical slice.
|
||||
subs w2, w2, #8
|
||||
b.le 0f
|
||||
// Move pointers back up to the top and loop horizontally.
|
||||
// Input pointers
|
||||
msub x5, x7, x11, x5
|
||||
msub x6, x8, x11, x6
|
||||
// Output pointers
|
||||
msub x0, x7, x10, x0
|
||||
msub x1, x8, x10, x1
|
||||
add x0, x0, #32
|
||||
add x1, x1, #16
|
||||
add x5, x5, #32
|
||||
add x6, x6, #16
|
||||
mov w3, w9
|
||||
b 1b
|
||||
|
||||
0:
|
||||
ret
|
||||
.purgem add3
|
||||
endfunc
|
||||
|
||||
// void dav1d_sgr_box5_v_neon(int32_t *sumsq, int16_t *sum,
|
||||
// const int w, const int h,
|
||||
// const enum LrEdgeFlags edges);
|
||||
function sgr_box5_v_neon, export=1
|
||||
add w10, w3, #2 // Number of output rows to move back
|
||||
mov w11, w3 // Number of input rows to move back
|
||||
add w2, w2, #8 // Actual summed width
|
||||
mov x7, #(4*SUM_STRIDE) // sumsq stride
|
||||
mov x8, #(2*SUM_STRIDE) // sum stride
|
||||
sub x0, x0, #(4*SUM_STRIDE) // sumsq -= stride
|
||||
sub x1, x1, #(2*SUM_STRIDE) // sum -= stride
|
||||
|
||||
tst w4, #4 // LR_HAVE_TOP
|
||||
b.eq 0f
|
||||
// If have top, read from row -2.
|
||||
sub x5, x0, #(4*SUM_STRIDE)
|
||||
sub x6, x1, #(2*SUM_STRIDE)
|
||||
add w11, w11, #2
|
||||
b 1f
|
||||
0:
|
||||
// !LR_HAVE_TOP
|
||||
// If we don't have top, read from row 0 even if
|
||||
// we start writing to row -1.
|
||||
add x5, x0, #(4*SUM_STRIDE)
|
||||
add x6, x1, #(2*SUM_STRIDE)
|
||||
1:
|
||||
|
||||
tst w4, #8 // LR_HAVE_BOTTOM
|
||||
b.eq 0f
|
||||
// LR_HAVE_BOTTOM
|
||||
add w3, w3, #2 // Handle h+2 lines with the main loop
|
||||
add w11, w11, #2
|
||||
b 1f
|
||||
0:
|
||||
// !LR_HAVE_BOTTOM
|
||||
sub w3, w3, #1 // Handle h-1 lines with the main loop
|
||||
1:
|
||||
mov w9, w3 // Backup of h for next loops
|
||||
|
||||
1:
|
||||
// Start of horizontal loop; start one vertical filter slice.
|
||||
// Start loading rows into v16-v25 and v26-v30 taking top
|
||||
// padding into consideration.
|
||||
tst w4, #4 // LR_HAVE_TOP
|
||||
ld1 {v16.4s, v17.4s}, [x5], x7
|
||||
ld1 {v26.8h}, [x6], x8
|
||||
b.eq 2f
|
||||
// LR_HAVE_TOP
|
||||
ld1 {v20.4s, v21.4s}, [x5], x7
|
||||
ld1 {v28.8h}, [x6], x8
|
||||
mov v18.16b, v16.16b
|
||||
mov v19.16b, v17.16b
|
||||
mov v27.16b, v26.16b
|
||||
ld1 {v22.4s, v23.4s}, [x5], x7
|
||||
ld1 {v29.8h}, [x6], x8
|
||||
b 3f
|
||||
2: // !LR_HAVE_TOP
|
||||
mov v18.16b, v16.16b
|
||||
mov v19.16b, v17.16b
|
||||
mov v27.16b, v26.16b
|
||||
mov v20.16b, v16.16b
|
||||
mov v21.16b, v17.16b
|
||||
mov v28.16b, v26.16b
|
||||
mov v22.16b, v16.16b
|
||||
mov v23.16b, v17.16b
|
||||
mov v29.16b, v26.16b
|
||||
|
||||
3:
|
||||
cbz w3, 4f
|
||||
ld1 {v24.4s, v25.4s}, [x5], x7
|
||||
ld1 {v30.8h}, [x6], x8
|
||||
|
||||
3:
|
||||
// Start of vertical loop
|
||||
subs w3, w3, #2
|
||||
.macro add5
|
||||
add v16.4s, v16.4s, v18.4s
|
||||
add v17.4s, v17.4s, v19.4s
|
||||
add v26.8h, v26.8h, v27.8h
|
||||
add v0.4s, v20.4s, v22.4s
|
||||
add v1.4s, v21.4s, v23.4s
|
||||
add v2.8h, v28.8h, v29.8h
|
||||
add v16.4s, v16.4s, v24.4s
|
||||
add v17.4s, v17.4s, v25.4s
|
||||
add v26.8h, v26.8h, v30.8h
|
||||
add v16.4s, v16.4s, v0.4s
|
||||
add v17.4s, v17.4s, v1.4s
|
||||
add v26.8h, v26.8h, v2.8h
|
||||
st1 {v16.4s, v17.4s}, [x0], x7
|
||||
st1 {v26.8h}, [x1], x8
|
||||
.endm
|
||||
add5
|
||||
.macro shift2
|
||||
mov v16.16b, v20.16b
|
||||
mov v17.16b, v21.16b
|
||||
mov v26.16b, v28.16b
|
||||
mov v18.16b, v22.16b
|
||||
mov v19.16b, v23.16b
|
||||
mov v27.16b, v29.16b
|
||||
mov v20.16b, v24.16b
|
||||
mov v21.16b, v25.16b
|
||||
mov v28.16b, v30.16b
|
||||
.endm
|
||||
shift2
|
||||
add x0, x0, x7
|
||||
add x1, x1, x8
|
||||
b.le 5f
|
||||
ld1 {v22.4s, v23.4s}, [x5], x7
|
||||
ld1 {v29.8h}, [x6], x8
|
||||
ld1 {v24.4s, v25.4s}, [x5], x7
|
||||
ld1 {v30.8h}, [x6], x8
|
||||
b 3b
|
||||
|
||||
4:
|
||||
// h == 1, !LR_HAVE_BOTTOM.
|
||||
// Pad the last row with the only content row, and add.
|
||||
mov v24.16b, v22.16b
|
||||
mov v25.16b, v23.16b
|
||||
mov v30.16b, v29.16b
|
||||
add5
|
||||
shift2
|
||||
add x0, x0, x7
|
||||
add x1, x1, x8
|
||||
add5
|
||||
b 6f
|
||||
|
||||
5:
|
||||
tst w4, #8 // LR_HAVE_BOTTOM
|
||||
b.ne 6f
|
||||
// !LR_HAVE_BOTTOM
|
||||
cbnz w3, 5f
|
||||
// The intended three edge rows left; output the one at h-2 and
|
||||
// the past edge one at h.
|
||||
ld1 {v22.4s, v23.4s}, [x5], x7
|
||||
ld1 {v29.8h}, [x6], x8
|
||||
// Pad the past-edge row from the last content row.
|
||||
mov v24.16b, v22.16b
|
||||
mov v25.16b, v23.16b
|
||||
mov v30.16b, v29.16b
|
||||
add5
|
||||
shift2
|
||||
add x0, x0, x7
|
||||
add x1, x1, x8
|
||||
// The last two rows are already padded properly here.
|
||||
add5
|
||||
b 6f
|
||||
|
||||
5:
|
||||
// w3 == -1, two rows left, output one.
|
||||
// Pad the last two rows from the mid one.
|
||||
mov v22.16b, v20.16b
|
||||
mov v23.16b, v21.16b
|
||||
mov v29.16b, v28.16b
|
||||
mov v24.16b, v20.16b
|
||||
mov v25.16b, v21.16b
|
||||
mov v30.16b, v28.16b
|
||||
add5
|
||||
add x0, x0, x7
|
||||
add x1, x1, x8
|
||||
b 6f
|
||||
|
||||
6: // End of one vertical slice.
|
||||
subs w2, w2, #8
|
||||
b.le 0f
|
||||
// Move pointers back up to the top and loop horizontally.
|
||||
// Input pointers
|
||||
msub x5, x7, x11, x5
|
||||
msub x6, x8, x11, x6
|
||||
// Output pointers
|
||||
msub x0, x7, x10, x0
|
||||
msub x1, x8, x10, x1
|
||||
add x0, x0, #32
|
||||
add x1, x1, #16
|
||||
add x5, x5, #32
|
||||
add x6, x6, #16
|
||||
mov w3, w9
|
||||
b 1b
|
||||
|
||||
0:
|
||||
ret
|
||||
.purgem add5
|
||||
endfunc
|
||||
|
||||
// void dav1d_sgr_calc_ab1_neon(int32_t *a, int16_t *b,
|
||||
// const int w, const int h, const int strength,
|
||||
// const int bitdepth_max);
|
||||
// void dav1d_sgr_calc_ab2_neon(int32_t *a, int16_t *b,
|
||||
// const int w, const int h, const int strength,
|
||||
// const int bitdepth_max);
|
||||
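// Both entry points continue in sgr_calc_ab_neon below, which computes
// roughly the following per coefficient pair (a sketch of the annotated
// steps, ignoring the bitdepth prescaling done via the srshl pair):
//   p = imax(a * n - b * b, 0);
//   z = imin((p * strength) >> 20, 255);   // saturating, rounded
//   x = sgr_x_by_x[z];
//   a_out = (x * b * one_by_x + (1 << 11)) >> 12;
//   b_out = 256 - x;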
function sgr_calc_ab1_neon, export=1
|
||||
clz w9, w5
|
||||
add x3, x3, #2 // h += 2
|
||||
movi v31.4s, #9 // n
|
||||
mov x5, #455
|
||||
mov x8, #SUM_STRIDE
|
||||
b sgr_calc_ab_neon
|
||||
endfunc
|
||||
|
||||
function sgr_calc_ab2_neon, export=1
|
||||
clz w9, w5
|
||||
add x3, x3, #3 // h += 3
|
||||
asr x3, x3, #1 // h /= 2
|
||||
movi v31.4s, #25 // n
|
||||
mov x5, #164
|
||||
mov x8, #(2*SUM_STRIDE)
|
||||
endfunc
|
||||
|
||||
function sgr_calc_ab_neon
|
||||
sub w9, w9, #24 // -bitdepth_min_8
|
||||
movrel x12, X(sgr_x_by_x)
|
||||
ld1 {v16.16b, v17.16b, v18.16b}, [x12]
|
||||
dup v6.8h, w9 // -bitdepth_min_8
|
||||
movi v19.16b, #5
|
||||
movi v20.8b, #55 // idx of last 5
|
||||
movi v21.8b, #72 // idx of last 4
|
||||
movi v22.8b, #101 // idx of last 3
|
||||
movi v23.8b, #169 // idx of last 2
|
||||
movi v24.8b, #254 // idx of last 1
|
||||
saddl v7.4s, v6.4h, v6.4h // -2*bitdepth_min_8
|
||||
add x2, x2, #2 // w += 2
|
||||
add x7, x2, #7
|
||||
bic x7, x7, #7 // aligned w
|
||||
sub x7, x8, x7 // increment between rows
|
||||
movi v29.8h, #1, lsl #8
|
||||
dup v28.4s, w4
|
||||
dup v30.4s, w5 // one_by_x
|
||||
sub x0, x0, #(4*(SUM_STRIDE))
|
||||
sub x1, x1, #(2*(SUM_STRIDE))
|
||||
mov x6, x2 // backup of w
|
||||
sub v16.16b, v16.16b, v19.16b
|
||||
sub v17.16b, v17.16b, v19.16b
|
||||
sub v18.16b, v18.16b, v19.16b
|
||||
1:
|
||||
subs x2, x2, #8
|
||||
ld1 {v0.4s, v1.4s}, [x0] // a
|
||||
ld1 {v2.8h}, [x1] // b
|
||||
srshl v0.4s, v0.4s, v7.4s
|
||||
srshl v1.4s, v1.4s, v7.4s
|
||||
srshl v4.8h, v2.8h, v6.8h
|
||||
mul v0.4s, v0.4s, v31.4s // a * n
|
||||
mul v1.4s, v1.4s, v31.4s // a * n
|
||||
umull v3.4s, v4.4h, v4.4h // b * b
|
||||
umull2 v4.4s, v4.8h, v4.8h // b * b
|
||||
uqsub v0.4s, v0.4s, v3.4s // imax(a * n - b * b, 0)
|
||||
uqsub v1.4s, v1.4s, v4.4s // imax(a * n - b * b, 0)
|
||||
mul v0.4s, v0.4s, v28.4s // p * s
|
||||
mul v1.4s, v1.4s, v28.4s // p * s
|
||||
uqshrn v0.4h, v0.4s, #16
|
||||
uqshrn2 v0.8h, v1.4s, #16
|
||||
uqrshrn v0.8b, v0.8h, #4 // imin(z, 255)
|
||||
|
||||
cmhi v25.8b, v0.8b, v20.8b // = -1 if sgr_x_by_x[v0] < 5
|
||||
cmhi v26.8b, v0.8b, v21.8b // = -1 if sgr_x_by_x[v0] < 4
|
||||
tbl v1.8b, {v16.16b, v17.16b, v18.16b}, v0.8b
|
||||
cmhi v27.8b, v0.8b, v22.8b // = -1 if sgr_x_by_x[v0] < 3
|
||||
cmhi v4.8b, v0.8b, v23.8b // = -1 if sgr_x_by_x[v0] < 2
|
||||
add v25.8b, v25.8b, v26.8b
|
||||
cmhi v5.8b, v0.8b, v24.8b // = -1 if sgr_x_by_x[v0] < 1
|
||||
add v27.8b, v27.8b, v4.8b
|
||||
add v5.8b, v5.8b, v19.8b
|
||||
add v25.8b, v25.8b, v27.8b
|
||||
add v1.8b, v1.8b, v5.8b
|
||||
add v1.8b, v1.8b, v25.8b
|
||||
uxtl v1.8h, v1.8b // x
|
||||
|
||||
umull v3.4s, v1.4h, v2.4h // x * BB[i]
|
||||
umull2 v4.4s, v1.8h, v2.8h // x * BB[i]
|
||||
mul v3.4s, v3.4s, v30.4s // x * BB[i] * sgr_one_by_x
|
||||
mul v4.4s, v4.4s, v30.4s // x * BB[i] * sgr_one_by_x
|
||||
srshr v3.4s, v3.4s, #12 // AA[i]
|
||||
srshr v4.4s, v4.4s, #12 // AA[i]
|
||||
sub v2.8h, v29.8h, v1.8h // 256 - x
|
||||
|
||||
st1 {v3.4s, v4.4s}, [x0], #32
|
||||
st1 {v2.8h}, [x1], #16
|
||||
b.gt 1b
|
||||
|
||||
subs x3, x3, #1
|
||||
b.le 0f
|
||||
add x0, x0, x7, lsl #2
|
||||
add x1, x1, x7, lsl #1
|
||||
mov x2, x6
|
||||
b 1b
|
||||
0:
|
||||
ret
|
||||
endfunc
|
|
@ -0,0 +1,597 @@
|
|||
/*
|
||||
* Copyright © 2018, VideoLAN and dav1d authors
|
||||
* Copyright © 2018, Martin Storsjo
|
||||
* All rights reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright notice, this
|
||||
* list of conditions and the following disclaimer.
|
||||
*
|
||||
* 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||
* this list of conditions and the following disclaimer in the documentation
|
||||
* and/or other materials provided with the distribution.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
|
||||
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
|
||||
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
|
||||
* ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
|
||||
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
|
||||
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
|
||||
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
#include "src/arm/asm.S"
|
||||
|
||||
#define FILTER_OUT_STRIDE 384
|
||||
|
||||
.macro sgr_funcs bpc
|
||||
// void dav1d_sgr_finish_filter1_Xbpc_neon(int16_t *tmp,
|
||||
// const pixel *src, const ptrdiff_t stride,
|
||||
// const int32_t *a, const int16_t *b,
|
||||
// const int w, const int h);
|
||||
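// A rough reading of the function below: both the 16-bit and the 32-bit
// plane are filtered with the 3x3 kernel [3 4 3; 4 4 4; 3 4 3], and the
// result is combined as approximately (b + a * src + (1 << 8)) >> 9.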
function sgr_finish_filter1_\bpc\()bpc_neon, export=1
|
||||
sub x7, x3, #(4*SUM_STRIDE)
|
||||
add x8, x3, #(4*SUM_STRIDE)
|
||||
sub x9, x4, #(2*SUM_STRIDE)
|
||||
add x10, x4, #(2*SUM_STRIDE)
|
||||
mov x11, #SUM_STRIDE
|
||||
mov x12, #FILTER_OUT_STRIDE
|
||||
add x13, x5, #7
|
||||
bic x13, x13, #7 // Aligned width
|
||||
.if \bpc == 8
|
||||
sub x2, x2, x13
|
||||
.else
|
||||
sub x2, x2, x13, lsl #1
|
||||
.endif
|
||||
sub x12, x12, x13
|
||||
sub x11, x11, x13
|
||||
sub x11, x11, #4 // We read 4 extra elements from a
|
||||
sub x14, x11, #4 // We read 8 extra elements from b
|
||||
mov x13, x5
|
||||
movi v6.8h, #3
|
||||
movi v7.4s, #3
|
||||
1:
|
||||
ld1 {v0.8h, v1.8h}, [x9], #32
|
||||
ld1 {v2.8h, v3.8h}, [x4], #32
|
||||
ld1 {v4.8h, v5.8h}, [x10], #32
|
||||
ld1 {v16.4s, v17.4s, v18.4s}, [x7], #48
|
||||
ld1 {v19.4s, v20.4s, v21.4s}, [x3], #48
|
||||
ld1 {v22.4s, v23.4s, v24.4s}, [x8], #48
|
||||
|
||||
2:
|
||||
subs x5, x5, #8
|
||||
ext v25.16b, v0.16b, v1.16b, #2 // -stride
|
||||
ext v26.16b, v2.16b, v3.16b, #2 // 0
|
||||
ext v27.16b, v4.16b, v5.16b, #2 // +stride
|
||||
ext v28.16b, v0.16b, v1.16b, #4 // +1-stride
|
||||
ext v29.16b, v2.16b, v3.16b, #4 // +1
|
||||
ext v30.16b, v4.16b, v5.16b, #4 // +1+stride
|
||||
add v2.8h, v2.8h, v25.8h // -1, -stride
|
||||
add v26.8h, v26.8h, v27.8h // 0, +stride
|
||||
add v0.8h, v0.8h, v28.8h // -1-stride, +1-stride
|
||||
add v2.8h, v2.8h, v26.8h
|
||||
add v4.8h, v4.8h, v30.8h // -1+stride, +1+stride
|
||||
add v2.8h, v2.8h, v29.8h // +1
|
||||
add v0.8h, v0.8h, v4.8h
|
||||
|
||||
ext v25.16b, v16.16b, v17.16b, #4 // -stride
|
||||
ext v26.16b, v17.16b, v18.16b, #4
|
||||
shl v2.8h, v2.8h, #2
|
||||
ext v27.16b, v16.16b, v17.16b, #8 // +1-stride
|
||||
ext v28.16b, v17.16b, v18.16b, #8
|
||||
ext v29.16b, v19.16b, v20.16b, #4 // 0
|
||||
ext v30.16b, v20.16b, v21.16b, #4
|
||||
mla v2.8h, v0.8h, v6.8h // * 3 -> a
|
||||
add v25.4s, v25.4s, v19.4s // -stride, -1
|
||||
add v26.4s, v26.4s, v20.4s
|
||||
add v16.4s, v16.4s, v27.4s // -1-stride, +1-stride
|
||||
add v17.4s, v17.4s, v28.4s
|
||||
ext v27.16b, v19.16b, v20.16b, #8 // +1
|
||||
ext v28.16b, v20.16b, v21.16b, #8
|
||||
add v16.4s, v16.4s, v22.4s // -1+stride
|
||||
add v17.4s, v17.4s, v23.4s
|
||||
add v29.4s, v29.4s, v27.4s // 0, +1
|
||||
add v30.4s, v30.4s, v28.4s
|
||||
add v25.4s, v25.4s, v29.4s
|
||||
add v26.4s, v26.4s, v30.4s
|
||||
ext v27.16b, v22.16b, v23.16b, #4 // +stride
|
||||
ext v28.16b, v23.16b, v24.16b, #4
|
||||
ext v29.16b, v22.16b, v23.16b, #8 // +1+stride
|
||||
ext v30.16b, v23.16b, v24.16b, #8
|
||||
.if \bpc == 8
|
||||
ld1 {v19.8b}, [x1], #8 // src
|
||||
.else
|
||||
ld1 {v19.8h}, [x1], #16 // src
|
||||
.endif
|
||||
add v25.4s, v25.4s, v27.4s // +stride
|
||||
add v26.4s, v26.4s, v28.4s
|
||||
add v16.4s, v16.4s, v29.4s // +1+stride
|
||||
add v17.4s, v17.4s, v30.4s
|
||||
shl v25.4s, v25.4s, #2
|
||||
shl v26.4s, v26.4s, #2
|
||||
mla v25.4s, v16.4s, v7.4s // * 3 -> b
|
||||
mla v26.4s, v17.4s, v7.4s
|
||||
.if \bpc == 8
|
||||
uxtl v19.8h, v19.8b // src
|
||||
.endif
|
||||
mov v0.16b, v1.16b
|
||||
umlal v25.4s, v2.4h, v19.4h // b + a * src
|
||||
umlal2 v26.4s, v2.8h, v19.8h
|
||||
mov v2.16b, v3.16b
|
||||
rshrn v25.4h, v25.4s, #9
|
||||
rshrn2 v25.8h, v26.4s, #9
|
||||
mov v4.16b, v5.16b
|
||||
st1 {v25.8h}, [x0], #16
|
||||
|
||||
b.le 3f
|
||||
mov v16.16b, v18.16b
|
||||
mov v19.16b, v21.16b
|
||||
mov v22.16b, v24.16b
|
||||
ld1 {v1.8h}, [x9], #16
|
||||
ld1 {v3.8h}, [x4], #16
|
||||
ld1 {v5.8h}, [x10], #16
|
||||
ld1 {v17.4s, v18.4s}, [x7], #32
|
||||
ld1 {v20.4s, v21.4s}, [x3], #32
|
||||
ld1 {v23.4s, v24.4s}, [x8], #32
|
||||
b 2b
|
||||
|
||||
3:
|
||||
subs x6, x6, #1
|
||||
b.le 0f
|
||||
mov x5, x13
|
||||
add x0, x0, x12, lsl #1
|
||||
add x1, x1, x2
|
||||
add x3, x3, x11, lsl #2
|
||||
add x7, x7, x11, lsl #2
|
||||
add x8, x8, x11, lsl #2
|
||||
add x4, x4, x14, lsl #1
|
||||
add x9, x9, x14, lsl #1
|
||||
add x10, x10, x14, lsl #1
|
||||
b 1b
|
||||
0:
|
||||
ret
|
||||
endfunc
|
||||
|
||||
// void dav1d_sgr_finish_filter2_Xbpc_neon(int16_t *tmp,
|
||||
// const pixel *src, const ptrdiff_t stride,
|
||||
// const int32_t *a, const int16_t *b,
|
||||
// const int w, const int h);
|
||||
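// A rough reading of the function below: on full rows the corner taps are
// weighted by 5 and the vertical neighbours by 6, while the intermediate
// rows handled in the label-4 loop use the horizontal [5 6 5] pattern;
// the combine is b + a * src, rounding-shifted right by 9 and 8 bits
// respectively.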
function sgr_finish_filter2_\bpc\()bpc_neon, export=1
|
||||
add x7, x3, #(4*(SUM_STRIDE))
|
||||
sub x3, x3, #(4*(SUM_STRIDE))
|
||||
add x8, x4, #(2*(SUM_STRIDE))
|
||||
sub x4, x4, #(2*(SUM_STRIDE))
|
||||
mov x9, #(2*SUM_STRIDE)
|
||||
mov x10, #FILTER_OUT_STRIDE
|
||||
add x11, x5, #7
|
||||
bic x11, x11, #7 // Aligned width
|
||||
.if \bpc == 8
|
||||
sub x2, x2, x11
|
||||
.else
|
||||
sub x2, x2, x11, lsl #1
|
||||
.endif
|
||||
sub x10, x10, x11
|
||||
sub x9, x9, x11
|
||||
sub x9, x9, #4 // We read 4 extra elements from a
|
||||
sub x12, x9, #4 // We read 8 extra elements from b
|
||||
mov x11, x5
|
||||
movi v4.8h, #5
|
||||
movi v5.4s, #5
|
||||
movi v6.8h, #6
|
||||
movi v7.4s, #6
|
||||
1:
|
||||
ld1 {v0.8h, v1.8h}, [x4], #32
|
||||
ld1 {v2.8h, v3.8h}, [x8], #32
|
||||
ld1 {v16.4s, v17.4s, v18.4s}, [x3], #48
|
||||
ld1 {v19.4s, v20.4s, v21.4s}, [x7], #48
|
||||
|
||||
2:
|
||||
subs x5, x5, #8
|
||||
ext v24.16b, v0.16b, v1.16b, #4 // +1-stride
|
||||
ext v25.16b, v2.16b, v3.16b, #4 // +1+stride
|
||||
ext v22.16b, v0.16b, v1.16b, #2 // -stride
|
||||
ext v23.16b, v2.16b, v3.16b, #2 // +stride
|
||||
add v0.8h, v0.8h, v24.8h // -1-stride, +1-stride
|
||||
add v25.8h, v2.8h, v25.8h // -1+stride, +1+stride
|
||||
add v2.8h, v22.8h, v23.8h // -stride, +stride
|
||||
add v0.8h, v0.8h, v25.8h
|
||||
|
||||
ext v22.16b, v16.16b, v17.16b, #4 // -stride
|
||||
ext v23.16b, v17.16b, v18.16b, #4
|
||||
ext v24.16b, v19.16b, v20.16b, #4 // +stride
|
||||
ext v25.16b, v20.16b, v21.16b, #4
|
||||
ext v26.16b, v16.16b, v17.16b, #8 // +1-stride
|
||||
ext v27.16b, v17.16b, v18.16b, #8
|
||||
ext v28.16b, v19.16b, v20.16b, #8 // +1+stride
|
||||
ext v29.16b, v20.16b, v21.16b, #8
|
||||
mul v0.8h, v0.8h, v4.8h // * 5
|
||||
mla v0.8h, v2.8h, v6.8h // * 6
|
||||
.if \bpc == 8
|
||||
ld1 {v31.8b}, [x1], #8
|
||||
.else
|
||||
ld1 {v31.8h}, [x1], #16
|
||||
.endif
|
||||
add v16.4s, v16.4s, v26.4s // -1-stride, +1-stride
|
||||
add v17.4s, v17.4s, v27.4s
|
||||
add v19.4s, v19.4s, v28.4s // -1+stride, +1+stride
|
||||
add v20.4s, v20.4s, v29.4s
|
||||
add v16.4s, v16.4s, v19.4s
|
||||
add v17.4s, v17.4s, v20.4s
|
||||
|
||||
add v22.4s, v22.4s, v24.4s // -stride, +stride
|
||||
add v23.4s, v23.4s, v25.4s
|
||||
// This is, surprisingly, faster than other variants where the
|
||||
// mul+mla pairs are further apart, on Cortex A53.
|
||||
mul v16.4s, v16.4s, v5.4s // * 5
|
||||
mla v16.4s, v22.4s, v7.4s // * 6
|
||||
mul v17.4s, v17.4s, v5.4s // * 5
|
||||
mla v17.4s, v23.4s, v7.4s // * 6
|
||||
|
||||
.if \bpc == 8
|
||||
uxtl v31.8h, v31.8b
|
||||
.endif
|
||||
umlal v16.4s, v0.4h, v31.4h // b + a * src
|
||||
umlal2 v17.4s, v0.8h, v31.8h
|
||||
mov v0.16b, v1.16b
|
||||
rshrn v16.4h, v16.4s, #9
|
||||
rshrn2 v16.8h, v17.4s, #9
|
||||
mov v2.16b, v3.16b
|
||||
st1 {v16.8h}, [x0], #16
|
||||
|
||||
b.le 3f
|
||||
mov v16.16b, v18.16b
|
||||
mov v19.16b, v21.16b
|
||||
ld1 {v1.8h}, [x4], #16
|
||||
ld1 {v3.8h}, [x8], #16
|
||||
ld1 {v17.4s, v18.4s}, [x3], #32
|
||||
ld1 {v20.4s, v21.4s}, [x7], #32
|
||||
b 2b
|
||||
|
||||
3:
|
||||
subs x6, x6, #1
|
||||
b.le 0f
|
||||
mov x5, x11
|
||||
add x0, x0, x10, lsl #1
|
||||
add x1, x1, x2
|
||||
add x3, x3, x9, lsl #2
|
||||
add x7, x7, x9, lsl #2
|
||||
add x4, x4, x12, lsl #1
|
||||
add x8, x8, x12, lsl #1
|
||||
mov x13, x3
|
||||
mov x14, x4
|
||||
|
||||
ld1 {v0.8h, v1.8h}, [x4], #32
|
||||
ld1 {v16.4s, v17.4s, v18.4s}, [x3], #48
|
||||
|
||||
4:
|
||||
subs x5, x5, #8
|
||||
ext v23.16b, v0.16b, v1.16b, #4 // +1
|
||||
ext v22.16b, v0.16b, v1.16b, #2 // 0
|
||||
add v0.8h, v0.8h, v23.8h // -1, +1
|
||||
|
||||
ext v24.16b, v16.16b, v17.16b, #4 // 0
|
||||
ext v25.16b, v17.16b, v18.16b, #4
|
||||
ext v26.16b, v16.16b, v17.16b, #8 // +1
|
||||
ext v27.16b, v17.16b, v18.16b, #8
|
||||
mul v2.8h, v22.8h, v6.8h // * 6
|
||||
mla v2.8h, v0.8h, v4.8h // * 5 -> a
|
||||
.if \bpc == 8
|
||||
ld1 {v31.8b}, [x1], #8
|
||||
.else
|
||||
ld1 {v31.8h}, [x1], #16
|
||||
.endif
|
||||
add v16.4s, v16.4s, v26.4s // -1, +1
|
||||
add v17.4s, v17.4s, v27.4s
|
||||
.if \bpc == 8
|
||||
uxtl v31.8h, v31.8b
|
||||
.endif
|
||||
// This is, surprisingly, faster than other variants where the
|
||||
// mul+mla pairs are further apart, on Cortex A53.
|
||||
mul v24.4s, v24.4s, v7.4s // * 6
|
||||
mla v24.4s, v16.4s, v5.4s // * 5 -> b
|
||||
mul v25.4s, v25.4s, v7.4s // * 6
|
||||
mla v25.4s, v17.4s, v5.4s // * 5 -> b
|
||||
|
||||
umlal v24.4s, v2.4h, v31.4h // b + a * src
|
||||
umlal2 v25.4s, v2.8h, v31.8h
|
||||
mov v0.16b, v1.16b
|
||||
rshrn v24.4h, v24.4s, #8
|
||||
rshrn2 v24.8h, v25.4s, #8
|
||||
mov v16.16b, v18.16b
|
||||
st1 {v24.8h}, [x0], #16
|
||||
|
||||
b.le 5f
|
||||
ld1 {v1.8h}, [x4], #16
|
||||
ld1 {v17.4s, v18.4s}, [x3], #32
|
||||
b 4b
|
||||
|
||||
5:
|
||||
subs x6, x6, #1
|
||||
b.le 0f
|
||||
mov x5, x11
|
||||
add x0, x0, x10, lsl #1
|
||||
add x1, x1, x2
|
||||
mov x3, x13 // Rewind x3/x4 to where they started
|
||||
mov x4, x14
|
||||
b 1b
|
||||
0:
|
||||
ret
|
||||
endfunc
|
||||
|
||||
// void dav1d_sgr_weighted1_Xbpc_neon(pixel *dst, const ptrdiff_t dst_stride,
|
||||
// const pixel *src, const ptrdiff_t src_stride,
|
||||
// const int16_t *t1, const int w, const int h,
|
||||
// const int wt, const int bitdepth_max);
|
||||
function sgr_weighted1_\bpc\()bpc_neon, export=1
|
||||
.if \bpc == 16
|
||||
ldr w8, [sp]
|
||||
.endif
|
||||
dup v31.8h, w7
|
||||
cmp x6, #2
|
||||
.if \bpc == 16
|
||||
dup v30.8h, w8
|
||||
.endif
|
||||
add x9, x0, x1
|
||||
add x10, x2, x3
|
||||
add x11, x4, #2*FILTER_OUT_STRIDE
|
||||
mov x7, #(4*FILTER_OUT_STRIDE)
|
||||
lsl x1, x1, #1
|
||||
lsl x3, x3, #1
|
||||
add x8, x5, #7
|
||||
bic x8, x8, #7 // Aligned width
|
||||
.if \bpc == 8
|
||||
sub x1, x1, x8
|
||||
sub x3, x3, x8
|
||||
.else
|
||||
sub x1, x1, x8, lsl #1
|
||||
sub x3, x3, x8, lsl #1
|
||||
.endif
|
||||
sub x7, x7, x8, lsl #1
|
||||
mov x8, x5
|
||||
b.lt 2f
|
||||
1:
|
||||
.if \bpc == 8
|
||||
ld1 {v0.8b}, [x2], #8
|
||||
ld1 {v4.8b}, [x10], #8
|
||||
.else
|
||||
ld1 {v0.8h}, [x2], #16
|
||||
ld1 {v4.8h}, [x10], #16
|
||||
.endif
|
||||
ld1 {v1.8h}, [x4], #16
|
||||
ld1 {v5.8h}, [x11], #16
|
||||
subs x5, x5, #8
|
||||
.if \bpc == 8
|
||||
ushll v0.8h, v0.8b, #4 // u
|
||||
ushll v4.8h, v4.8b, #4 // u
|
||||
.else
|
||||
shl v0.8h, v0.8h, #4 // u
|
||||
shl v4.8h, v4.8h, #4 // u
|
||||
.endif
|
||||
sub v1.8h, v1.8h, v0.8h // t1 - u
|
||||
sub v5.8h, v5.8h, v4.8h // t1 - u
|
||||
ushll v2.4s, v0.4h, #7 // u << 7
|
||||
ushll2 v3.4s, v0.8h, #7 // u << 7
|
||||
ushll v6.4s, v4.4h, #7 // u << 7
|
||||
ushll2 v7.4s, v4.8h, #7 // u << 7
|
||||
smlal v2.4s, v1.4h, v31.4h // v
|
||||
smlal2 v3.4s, v1.8h, v31.8h // v
|
||||
smlal v6.4s, v5.4h, v31.4h // v
|
||||
smlal2 v7.4s, v5.8h, v31.8h // v
|
||||
.if \bpc == 8
|
||||
rshrn v2.4h, v2.4s, #11
|
||||
rshrn2 v2.8h, v3.4s, #11
|
||||
rshrn v6.4h, v6.4s, #11
|
||||
rshrn2 v6.8h, v7.4s, #11
|
||||
sqxtun v2.8b, v2.8h
|
||||
sqxtun v6.8b, v6.8h
|
||||
st1 {v2.8b}, [x0], #8
|
||||
st1 {v6.8b}, [x9], #8
|
||||
.else
|
||||
sqrshrun v2.4h, v2.4s, #11
|
||||
sqrshrun2 v2.8h, v3.4s, #11
|
||||
sqrshrun v6.4h, v6.4s, #11
|
||||
sqrshrun2 v6.8h, v7.4s, #11
|
||||
umin v2.8h, v2.8h, v30.8h
|
||||
umin v6.8h, v6.8h, v30.8h
|
||||
st1 {v2.8h}, [x0], #16
|
||||
st1 {v6.8h}, [x9], #16
|
||||
.endif
|
||||
b.gt 1b
|
||||
|
||||
sub x6, x6, #2
|
||||
cmp x6, #1
|
||||
b.lt 0f
|
||||
mov x5, x8
|
||||
add x0, x0, x1
|
||||
add x9, x9, x1
|
||||
add x2, x2, x3
|
||||
add x10, x10, x3
|
||||
add x4, x4, x7
|
||||
add x11, x11, x7
|
||||
b.eq 2f
|
||||
b 1b
|
||||
|
||||
2:
|
||||
.if \bpc == 8
|
||||
ld1 {v0.8b}, [x2], #8
|
||||
.else
|
||||
ld1 {v0.8h}, [x2], #16
|
||||
.endif
|
||||
ld1 {v1.8h}, [x4], #16
|
||||
subs x5, x5, #8
|
||||
.if \bpc == 8
|
||||
ushll v0.8h, v0.8b, #4 // u
|
||||
.else
|
||||
shl v0.8h, v0.8h, #4 // u
|
||||
.endif
|
||||
sub v1.8h, v1.8h, v0.8h // t1 - u
|
||||
ushll v2.4s, v0.4h, #7 // u << 7
|
||||
ushll2 v3.4s, v0.8h, #7 // u << 7
|
||||
smlal v2.4s, v1.4h, v31.4h // v
|
||||
smlal2 v3.4s, v1.8h, v31.8h // v
|
||||
.if \bpc == 8
|
||||
rshrn v2.4h, v2.4s, #11
|
||||
rshrn2 v2.8h, v3.4s, #11
|
||||
sqxtun v2.8b, v2.8h
|
||||
st1 {v2.8b}, [x0], #8
|
||||
.else
|
||||
sqrshrun v2.4h, v2.4s, #11
|
||||
sqrshrun2 v2.8h, v3.4s, #11
|
||||
umin v2.8h, v2.8h, v30.8h
|
||||
st1 {v2.8h}, [x0], #16
|
||||
.endif
|
||||
b.gt 2b
|
||||
0:
|
||||
ret
|
||||
endfunc
|
||||
|
||||
// void dav1d_sgr_weighted2_Xbpc_neon(pixel *dst, const ptrdiff_t stride,
|
||||
// const pixel *src, const ptrdiff_t src_stride,
|
||||
// const int16_t *t1, const int16_t *t2,
|
||||
// const int w, const int h,
|
||||
// const int16_t wt[2]);
|
||||
function sgr_weighted2_\bpc\()bpc_neon, export=1
|
||||
.if \bpc == 8
|
||||
ldr x8, [sp]
|
||||
.else
|
||||
ldp x8, x9, [sp]
|
||||
.endif
|
||||
cmp x7, #2
|
||||
add x10, x0, x1
|
||||
add x11, x2, x3
|
||||
add x12, x4, #2*FILTER_OUT_STRIDE
|
||||
add x13, x5, #2*FILTER_OUT_STRIDE
|
||||
ld2r {v30.8h, v31.8h}, [x8] // wt[0], wt[1]
|
||||
.if \bpc == 16
|
||||
dup v29.8h, w9
|
||||
.endif
|
||||
mov x8, #4*FILTER_OUT_STRIDE
|
||||
lsl x1, x1, #1
|
||||
lsl x3, x3, #1
|
||||
add x9, x6, #7
|
||||
bic x9, x9, #7 // Aligned width
|
||||
.if \bpc == 8
|
||||
sub x1, x1, x9
|
||||
sub x3, x3, x9
|
||||
.else
|
||||
sub x1, x1, x9, lsl #1
|
||||
sub x3, x3, x9, lsl #1
|
||||
.endif
|
||||
sub x8, x8, x9, lsl #1
|
||||
mov x9, x6
|
||||
b.lt 2f
|
||||
1:
|
||||
.if \bpc == 8
|
||||
ld1 {v0.8b}, [x2], #8
|
||||
ld1 {v16.8b}, [x11], #8
|
||||
.else
|
||||
ld1 {v0.8h}, [x2], #16
|
||||
ld1 {v16.8h}, [x11], #16
|
||||
.endif
|
||||
ld1 {v1.8h}, [x4], #16
|
||||
ld1 {v17.8h}, [x12], #16
|
||||
ld1 {v2.8h}, [x5], #16
|
||||
ld1 {v18.8h}, [x13], #16
|
||||
subs x6, x6, #8
|
||||
.if \bpc == 8
|
||||
ushll v0.8h, v0.8b, #4 // u
|
||||
ushll v16.8h, v16.8b, #4 // u
|
||||
.else
|
||||
shl v0.8h, v0.8h, #4 // u
|
||||
shl v16.8h, v16.8h, #4 // u
|
||||
.endif
|
||||
sub v1.8h, v1.8h, v0.8h // t1 - u
|
||||
sub v2.8h, v2.8h, v0.8h // t2 - u
|
||||
sub v17.8h, v17.8h, v16.8h // t1 - u
|
||||
sub v18.8h, v18.8h, v16.8h // t2 - u
|
||||
ushll v3.4s, v0.4h, #7 // u << 7
|
||||
ushll2 v4.4s, v0.8h, #7 // u << 7
|
||||
ushll v19.4s, v16.4h, #7 // u << 7
|
||||
ushll2 v20.4s, v16.8h, #7 // u << 7
|
||||
smlal v3.4s, v1.4h, v30.4h // wt[0] * (t1 - u)
|
||||
smlal v3.4s, v2.4h, v31.4h // wt[1] * (t2 - u)
|
||||
smlal2 v4.4s, v1.8h, v30.8h // wt[0] * (t1 - u)
|
||||
smlal2 v4.4s, v2.8h, v31.8h // wt[1] * (t2 - u)
|
||||
smlal v19.4s, v17.4h, v30.4h // wt[0] * (t1 - u)
|
||||
smlal v19.4s, v18.4h, v31.4h // wt[1] * (t2 - u)
|
||||
smlal2 v20.4s, v17.8h, v30.8h // wt[0] * (t1 - u)
|
||||
smlal2 v20.4s, v18.8h, v31.8h // wt[1] * (t2 - u)
|
||||
.if \bpc == 8
|
||||
rshrn v3.4h, v3.4s, #11
|
||||
rshrn2 v3.8h, v4.4s, #11
|
||||
rshrn v19.4h, v19.4s, #11
|
||||
rshrn2 v19.8h, v20.4s, #11
|
||||
sqxtun v3.8b, v3.8h
|
||||
sqxtun v19.8b, v19.8h
|
||||
st1 {v3.8b}, [x0], #8
|
||||
st1 {v19.8b}, [x10], #8
|
||||
.else
|
||||
sqrshrun v3.4h, v3.4s, #11
|
||||
sqrshrun2 v3.8h, v4.4s, #11
|
||||
sqrshrun v19.4h, v19.4s, #11
|
||||
sqrshrun2 v19.8h, v20.4s, #11
|
||||
umin v3.8h, v3.8h, v29.8h
|
||||
umin v19.8h, v19.8h, v29.8h
|
||||
st1 {v3.8h}, [x0], #16
|
||||
st1 {v19.8h}, [x10], #16
|
||||
.endif
|
||||
b.gt 1b
|
||||
|
||||
subs x7, x7, #2
|
||||
cmp x7, #1
|
||||
b.lt 0f
|
||||
mov x6, x9
|
||||
add x0, x0, x1
|
||||
add x10, x10, x1
|
||||
add x2, x2, x3
|
||||
add x11, x11, x3
|
||||
add x4, x4, x8
|
||||
add x12, x12, x8
|
||||
add x5, x5, x8
|
||||
add x13, x13, x8
|
||||
b.eq 2f
|
||||
b 1b
|
||||
|
||||
2:
|
||||
.if \bpc == 8
|
||||
ld1 {v0.8b}, [x2], #8
|
||||
.else
|
||||
ld1 {v0.8h}, [x2], #16
|
||||
.endif
|
||||
ld1 {v1.8h}, [x4], #16
|
||||
ld1 {v2.8h}, [x5], #16
|
||||
subs x6, x6, #8
|
||||
.if \bpc == 8
|
||||
ushll v0.8h, v0.8b, #4 // u
|
||||
.else
|
||||
shl v0.8h, v0.8h, #4 // u
|
||||
.endif
|
||||
sub v1.8h, v1.8h, v0.8h // t1 - u
|
||||
sub v2.8h, v2.8h, v0.8h // t2 - u
|
||||
ushll v3.4s, v0.4h, #7 // u << 7
|
||||
ushll2 v4.4s, v0.8h, #7 // u << 7
|
||||
smlal v3.4s, v1.4h, v30.4h // wt[0] * (t1 - u)
|
||||
smlal v3.4s, v2.4h, v31.4h // wt[1] * (t2 - u)
|
||||
smlal2 v4.4s, v1.8h, v30.8h // wt[0] * (t1 - u)
|
||||
smlal2 v4.4s, v2.8h, v31.8h // wt[1] * (t2 - u)
|
||||
.if \bpc == 8
|
||||
rshrn v3.4h, v3.4s, #11
|
||||
rshrn2 v3.8h, v4.4s, #11
|
||||
sqxtun v3.8b, v3.8h
|
||||
st1 {v3.8b}, [x0], #8
|
||||
.else
|
||||
sqrshrun v3.4h, v3.4s, #11
|
||||
sqrshrun2 v3.8h, v4.4s, #11
|
||||
umin v3.8h, v3.8h, v29.8h
|
||||
st1 {v3.8h}, [x0], #16
|
||||
.endif
|
||||
b.gt 1b
|
||||
0:
|
||||
ret
|
||||
endfunc
|
||||
.endm
|
|
@ -29,14 +29,7 @@
|
|||
#include "src/arm/asm.S"
|
||||
#include "util.S"
|
||||
|
||||
.macro avg dst, t0, t1
|
||||
ld1 {\t0\().8h}, [x2], 16
|
||||
ld1 {\t1\().8h}, [x3], 16
|
||||
add \t0\().8h, \t0\().8h, \t1\().8h
|
||||
sqrshrun \dst\().8b, \t0\().8h, #5
|
||||
.endm
|
||||
|
||||
.macro avg16 dst, t0, t1, t2, t3
|
||||
.macro avg dst, t0, t1, t2, t3
|
||||
ld1 {\t0\().8h,\t1\().8h}, [x2], 32
|
||||
ld1 {\t2\().8h,\t3\().8h}, [x3], 32
|
||||
add \t0\().8h, \t0\().8h, \t2\().8h
|
||||
|
@ -45,16 +38,7 @@
|
|||
sqrshrun2 \dst\().16b, \t1\().8h, #5
|
||||
.endm
|
||||
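// Functionally the widened avg above matches the 8-byte variant it
// replaces: each lane becomes roughly iclip_u8((t + u + 16) >> 5) via the
// sqrshrun pair; it just consumes two 8h vectors per source so that one
// invocation now produces a full 16-byte row.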
|
||||
.macro w_avg dst, t0, t1
|
||||
ld1 {\t0\().8h}, [x2], 16
|
||||
ld1 {\t1\().8h}, [x3], 16
|
||||
sub \t0\().8h, \t1\().8h, \t0\().8h
|
||||
sqdmulh \t0\().8h, \t0\().8h, v30.8h
|
||||
add \t0\().8h, \t1\().8h, \t0\().8h
|
||||
sqrshrun \dst\().8b, \t0\().8h, #4
|
||||
.endm
|
||||
|
||||
.macro w_avg16 dst, t0, t1, t2, t3
|
||||
.macro w_avg dst, t0, t1, t2, t3
|
||||
ld1 {\t0\().8h,\t1\().8h}, [x2], 32
|
||||
ld1 {\t2\().8h,\t3\().8h}, [x3], 32
|
||||
sub \t0\().8h, \t2\().8h, \t0\().8h
|
||||
|
@ -67,19 +51,7 @@
|
|||
sqrshrun2 \dst\().16b, \t1\().8h, #4
|
||||
.endm
|
||||
|
||||
.macro mask dst, t0, t1
|
||||
ld1 {v30.8b}, [x6], 8
|
||||
ld1 {\t0\().8h}, [x2], 16
|
||||
mul v30.8b, v30.8b, v31.8b
|
||||
ld1 {\t1\().8h}, [x3], 16
|
||||
shll v30.8h, v30.8b, #8
|
||||
sub \t0\().8h, \t1\().8h, \t0\().8h
|
||||
sqdmulh \t0\().8h, \t0\().8h, v30.8h
|
||||
add \t0\().8h, \t1\().8h, \t0\().8h
|
||||
sqrshrun \dst\().8b, \t0\().8h, #4
|
||||
.endm
|
||||
|
||||
.macro mask16 dst, t0, t1, t2, t3
|
||||
.macro mask dst, t0, t1, t2, t3
|
||||
ld1 {v30.16b}, [x6], 16
|
||||
ld1 {\t0\().8h,\t1\().8h}, [x2], 32
|
||||
mul v30.16b, v30.16b, v31.16b
|
||||
|
@ -109,113 +81,108 @@ function \type\()_8bpc_neon, export=1
|
|||
.endif
|
||||
adr x7, L(\type\()_tbl)
|
||||
sub w4, w4, #24
|
||||
\type v4, v0, v1
|
||||
ldrh w4, [x7, x4, lsl #1]
|
||||
\type v5, v2, v3
|
||||
\type v4, v0, v1, v2, v3
|
||||
sub x7, x7, w4, uxtw
|
||||
br x7
|
||||
40:
|
||||
add x7, x0, x1
|
||||
lsl x1, x1, #1
|
||||
4:
|
||||
cmp w5, #4
|
||||
st1 {v4.s}[0], [x0], x1
|
||||
st1 {v4.s}[1], [x0], x1
|
||||
st1 {v5.s}[0], [x0], x1
|
||||
st1 {v5.s}[1], [x0], x1
|
||||
st1 {v4.s}[1], [x7], x1
|
||||
st1 {v4.s}[2], [x0], x1
|
||||
st1 {v4.s}[3], [x7], x1
|
||||
b.eq 0f
|
||||
\type v6, v0, v1
|
||||
\type v7, v2, v3
|
||||
\type v5, v0, v1, v2, v3
|
||||
cmp w5, #8
|
||||
st1 {v6.s}[0], [x0], x1
|
||||
st1 {v6.s}[1], [x0], x1
|
||||
st1 {v7.s}[0], [x0], x1
|
||||
st1 {v7.s}[1], [x0], x1
|
||||
b.eq 0f
|
||||
\type v4, v0, v1
|
||||
\type v5, v2, v3
|
||||
st1 {v4.s}[0], [x0], x1
|
||||
st1 {v4.s}[1], [x0], x1
|
||||
\type v6, v0, v1
|
||||
st1 {v5.s}[0], [x0], x1
|
||||
st1 {v5.s}[1], [x0], x1
|
||||
\type v7, v2, v3
|
||||
st1 {v6.s}[0], [x0], x1
|
||||
st1 {v6.s}[1], [x0], x1
|
||||
st1 {v7.s}[0], [x0], x1
|
||||
st1 {v7.s}[1], [x0], x1
|
||||
st1 {v5.s}[1], [x7], x1
|
||||
st1 {v5.s}[2], [x0], x1
|
||||
st1 {v5.s}[3], [x7], x1
|
||||
b.eq 0f
|
||||
\type v4, v0, v1, v2, v3
|
||||
st1 {v4.s}[0], [x0], x1
|
||||
st1 {v4.s}[1], [x7], x1
|
||||
\type v5, v0, v1, v2, v3
|
||||
st1 {v4.s}[2], [x0], x1
|
||||
st1 {v4.s}[3], [x7], x1
|
||||
st1 {v5.s}[0], [x0], x1
|
||||
st1 {v5.s}[1], [x7], x1
|
||||
st1 {v5.s}[2], [x0], x1
|
||||
st1 {v5.s}[3], [x7], x1
|
||||
ret
|
||||
80:
|
||||
add x7, x0, x1
|
||||
lsl x1, x1, #1
|
||||
8:
|
||||
st1 {v4.8b}, [x0], x1
|
||||
\type v6, v0, v1
|
||||
st1 {v5.8b}, [x0], x1
|
||||
\type v7, v0, v1
|
||||
st1 {v6.8b}, [x0], x1
|
||||
st1 {v4.d}[0], [x0], x1
|
||||
\type v5, v0, v1, v2, v3
|
||||
st1 {v4.d}[1], [x7], x1
|
||||
st1 {v5.d}[0], [x0], x1
|
||||
subs w5, w5, #4
|
||||
st1 {v7.8b}, [x0], x1
|
||||
st1 {v5.d}[1], [x7], x1
|
||||
b.le 0f
|
||||
\type v4, v0, v1
|
||||
\type v5, v2, v3
|
||||
\type v4, v0, v1, v2, v3
|
||||
b 8b
|
||||
160:
|
||||
trn1 v4.2d, v4.2d, v5.2d
|
||||
16:
|
||||
\type\()16 v5, v0, v1, v2, v3
|
||||
\type v5, v0, v1, v2, v3
|
||||
st1 {v4.16b}, [x0], x1
|
||||
\type\()16 v6, v0, v1, v2, v3
|
||||
\type v6, v0, v1, v2, v3
|
||||
st1 {v5.16b}, [x0], x1
|
||||
\type\()16 v7, v0, v1, v2, v3
|
||||
\type v7, v0, v1, v2, v3
|
||||
st1 {v6.16b}, [x0], x1
|
||||
subs w5, w5, #4
|
||||
st1 {v7.16b}, [x0], x1
|
||||
b.le 0f
|
||||
\type\()16 v4, v0, v1, v2, v3
|
||||
\type v4, v0, v1, v2, v3
|
||||
b 16b
|
||||
320:
|
||||
trn1 v4.2d, v4.2d, v5.2d
|
||||
add x7, x0, x1
|
||||
lsl x1, x1, #1
|
||||
32:
|
||||
\type\()16 v5, v0, v1, v2, v3
|
||||
\type\()16 v6, v0, v1, v2, v3
|
||||
\type v5, v0, v1, v2, v3
|
||||
\type v6, v0, v1, v2, v3
|
||||
st1 {v4.16b,v5.16b}, [x0], x1
|
||||
\type\()16 v7, v0, v1, v2, v3
|
||||
\type v7, v0, v1, v2, v3
|
||||
subs w5, w5, #2
|
||||
st1 {v6.16b,v7.16b}, [x7], x1
|
||||
b.le 0f
|
||||
\type\()16 v4, v0, v1, v2, v3
|
||||
\type v4, v0, v1, v2, v3
|
||||
b 32b
|
||||
640:
|
||||
trn1 v4.2d, v4.2d, v5.2d
|
||||
add x7, x0, x1
|
||||
lsl x1, x1, #1
|
||||
64:
|
||||
\type\()16 v5, v0, v1, v2, v3
|
||||
\type\()16 v6, v0, v1, v2, v3
|
||||
\type\()16 v7, v0, v1, v2, v3
|
||||
\type\()16 v16, v0, v1, v2, v3
|
||||
\type\()16 v17, v0, v1, v2, v3
|
||||
\type v5, v0, v1, v2, v3
|
||||
\type v6, v0, v1, v2, v3
|
||||
\type v7, v0, v1, v2, v3
|
||||
\type v16, v0, v1, v2, v3
|
||||
\type v17, v0, v1, v2, v3
|
||||
st1 {v4.16b,v5.16b,v6.16b,v7.16b}, [x0], x1
|
||||
\type\()16 v18, v0, v1, v2, v3
|
||||
\type\()16 v19, v0, v1, v2, v3
|
||||
\type v18, v0, v1, v2, v3
|
||||
\type v19, v0, v1, v2, v3
|
||||
subs w5, w5, #2
|
||||
st1 {v16.16b,v17.16b,v18.16b,v19.16b}, [x7], x1
|
||||
b.le 0f
|
||||
\type\()16 v4, v0, v1, v2, v3
|
||||
\type v4, v0, v1, v2, v3
|
||||
b 64b
|
||||
1280:
|
||||
trn1 v4.2d, v4.2d, v5.2d
|
||||
add x7, x0, #64
|
||||
128:
|
||||
\type\()16 v5, v0, v1, v2, v3
|
||||
\type\()16 v6, v0, v1, v2, v3
|
||||
\type\()16 v7, v0, v1, v2, v3
|
||||
\type\()16 v16, v0, v1, v2, v3
|
||||
\type\()16 v17, v0, v1, v2, v3
|
||||
\type v5, v0, v1, v2, v3
|
||||
\type v6, v0, v1, v2, v3
|
||||
\type v7, v0, v1, v2, v3
|
||||
\type v16, v0, v1, v2, v3
|
||||
\type v17, v0, v1, v2, v3
|
||||
st1 {v4.16b,v5.16b,v6.16b,v7.16b}, [x0], x1
|
||||
\type\()16 v18, v0, v1, v2, v3
|
||||
\type\()16 v19, v0, v1, v2, v3
|
||||
\type v18, v0, v1, v2, v3
|
||||
\type v19, v0, v1, v2, v3
|
||||
subs w5, w5, #1
|
||||
st1 {v16.16b,v17.16b,v18.16b,v19.16b}, [x7], x1
|
||||
b.le 0f
|
||||
\type\()16 v4, v0, v1, v2, v3
|
||||
\type v4, v0, v1, v2, v3
|
||||
b 128b
|
||||
0:
|
||||
ret
|
||||
|
@ -223,9 +190,9 @@ L(\type\()_tbl):
|
|||
.hword L(\type\()_tbl) - 1280b
|
||||
.hword L(\type\()_tbl) - 640b
|
||||
.hword L(\type\()_tbl) - 320b
|
||||
.hword L(\type\()_tbl) - 160b
|
||||
.hword L(\type\()_tbl) - 8b
|
||||
.hword L(\type\()_tbl) - 4b
|
||||
.hword L(\type\()_tbl) - 16b
|
||||
.hword L(\type\()_tbl) - 80b
|
||||
.hword L(\type\()_tbl) - 40b
|
||||
endfunc
|
||||
.endm
|
||||
|
||||
|
@ -464,10 +431,10 @@ function blend_8bpc_neon, export=1
|
|||
sub x6, x6, w3, uxtw
|
||||
movi v4.16b, #64
|
||||
add x8, x0, x1
|
||||
lsl w1, w1, #1
|
||||
lsl x1, x1, #1
|
||||
br x6
|
||||
4:
|
||||
ld1 {v2.d}[0], [x5], #8
|
||||
ld1 {v2.8b}, [x5], #8
|
||||
ld1 {v1.d}[0], [x2], #8
|
||||
ld1 {v0.s}[0], [x0]
|
||||
subs w4, w4, #2
|
||||
|
@ -481,8 +448,8 @@ function blend_8bpc_neon, export=1
|
|||
b.gt 4b
|
||||
ret
|
||||
8:
|
||||
ld1 {v2.2d}, [x5], #16
|
||||
ld1 {v1.2d}, [x2], #16
|
||||
ld1 {v2.16b}, [x5], #16
|
||||
ld1 {v1.16b}, [x2], #16
|
||||
ld1 {v0.d}[0], [x0]
|
||||
ld1 {v0.d}[1], [x8]
|
||||
sub v3.16b, v4.16b, v2.16b
|
||||
|
@ -498,13 +465,13 @@ function blend_8bpc_neon, export=1
|
|||
b.gt 8b
|
||||
ret
|
||||
16:
|
||||
ld1 {v1.2d, v2.2d}, [x5], #32
|
||||
ld1 {v5.2d, v6.2d}, [x2], #32
|
||||
ld1 {v0.2d}, [x0]
|
||||
ld1 {v1.16b, v2.16b}, [x5], #32
|
||||
ld1 {v5.16b, v6.16b}, [x2], #32
|
||||
ld1 {v0.16b}, [x0]
|
||||
subs w4, w4, #2
|
||||
sub v7.16b, v4.16b, v1.16b
|
||||
sub v20.16b, v4.16b, v2.16b
|
||||
ld1 {v3.2d}, [x8]
|
||||
ld1 {v3.16b}, [x8]
|
||||
umull v16.8h, v5.8b, v1.8b
|
||||
umlal v16.8h, v0.8b, v7.8b
|
||||
umull2 v17.8h, v5.16b, v1.16b
|
||||
|
@ -517,16 +484,16 @@ function blend_8bpc_neon, export=1
|
|||
rshrn2 v18.16b, v17.8h, #6
|
||||
rshrn v19.8b, v21.8h, #6
|
||||
rshrn2 v19.16b, v22.8h, #6
|
||||
st1 {v18.2d}, [x0], x1
|
||||
st1 {v19.2d}, [x8], x1
|
||||
st1 {v18.16b}, [x0], x1
|
||||
st1 {v19.16b}, [x8], x1
|
||||
b.gt 16b
|
||||
ret
|
||||
32:
|
||||
ld1 {v0.2d, v1.2d, v2.2d, v3.2d}, [x5], #64
|
||||
ld1 {v16.2d, v17.2d, v18.2d, v19.2d}, [x2], #64
|
||||
ld1 {v20.2d, v21.2d}, [x0]
|
||||
ld1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x5], #64
|
||||
ld1 {v16.16b, v17.16b, v18.16b, v19.16b}, [x2], #64
|
||||
ld1 {v20.16b, v21.16b}, [x0]
|
||||
subs w4, w4, #2
|
||||
ld1 {v22.2d, v23.2d}, [x8]
|
||||
ld1 {v22.16b, v23.16b}, [x8]
|
||||
sub v5.16b, v4.16b, v0.16b
|
||||
sub v6.16b, v4.16b, v1.16b
|
||||
sub v30.16b, v4.16b, v2.16b
|
||||
|
@ -555,8 +522,8 @@ function blend_8bpc_neon, export=1
|
|||
rshrn2 v27.16b, v1.8h, #6
|
||||
rshrn v28.8b, v29.8h, #6
|
||||
rshrn2 v28.16b, v21.8h, #6
|
||||
st1 {v24.2d, v25.2d}, [x0], x1
|
||||
st1 {v27.2d, v28.2d}, [x8], x1
|
||||
st1 {v24.16b, v25.16b}, [x0], x1
|
||||
st1 {v27.16b, v28.16b}, [x8], x1
|
||||
b.gt 32b
|
||||
ret
|
||||
L(blend_tbl):
|
||||
|
@ -567,7 +534,7 @@ L(blend_tbl):
|
|||
endfunc
|
||||
|
||||
function blend_h_8bpc_neon, export=1
|
||||
adr x6, L(blend_h_tbl)
|
||||
adr x6, L(blend_h_tbl)
|
||||
movrel x5, X(obmc_masks)
|
||||
add x5, x5, w4, uxtw
|
||||
sub w4, w4, w4, lsr #2
|
||||
|
@ -596,7 +563,7 @@ function blend_h_8bpc_neon, export=1
|
|||
ret
|
||||
4:
|
||||
ld2r {v0.8b, v1.8b}, [x5], #2
|
||||
ld1 {v2.2s}, [x2], #8
|
||||
ld1 {v2.8b}, [x2], #8
|
||||
subs w4, w4, #2
|
||||
ext v0.8b, v0.8b, v1.8b, #4
|
||||
ld1 {v3.s}[0], [x0]
|
||||
|
@ -742,8 +709,8 @@ function blend_v_8bpc_neon, export=1
|
|||
ret
|
||||
40:
|
||||
ld1r {v0.2s}, [x5]
|
||||
sub x1, x1, #2
|
||||
sub v1.8b, v4.8b, v0.8b
|
||||
sub x1, x1, #3
|
||||
4:
|
||||
ld1 {v2.8b}, [x2], #8
|
||||
ld1 {v3.s}[0], [x0]
|
||||
|
@ -754,16 +721,14 @@ function blend_v_8bpc_neon, export=1
|
|||
rshrn v5.8b, v5.8h, #6
|
||||
st1 {v5.h}[0], [x0], #2
|
||||
st1 {v5.h}[2], [x8], #2
|
||||
st1 {v5.b}[2], [x0], #1
|
||||
st1 {v5.b}[6], [x8], #1
|
||||
add x0, x0, x1
|
||||
add x8, x8, x1
|
||||
st1 {v5.b}[2], [x0], x1
|
||||
st1 {v5.b}[6], [x8], x1
|
||||
b.gt 4b
|
||||
ret
|
||||
80:
|
||||
ld1r {v0.2d}, [x5]
|
||||
sub x1, x1, #4
|
||||
sub v1.16b, v4.16b, v0.16b
|
||||
sub x1, x1, #6
|
||||
8:
|
||||
ld1 {v2.16b}, [x2], #16
|
||||
ld1 {v3.d}[0], [x0]
|
||||
|
@ -777,16 +742,14 @@ function blend_v_8bpc_neon, export=1
|
|||
rshrn2 v7.16b, v6.8h, #6
|
||||
st1 {v7.s}[0], [x0], #4
|
||||
st1 {v7.s}[2], [x8], #4
|
||||
st1 {v7.h}[2], [x0], #2
|
||||
st1 {v7.h}[6], [x8], #2
|
||||
add x0, x0, x1
|
||||
add x8, x8, x1
|
||||
st1 {v7.h}[2], [x0], x1
|
||||
st1 {v7.h}[6], [x8], x1
|
||||
b.gt 8b
|
||||
ret
|
||||
160:
|
||||
ld1 {v0.16b}, [x5]
|
||||
sub x1, x1, #8
|
||||
sub v2.16b, v4.16b, v0.16b
|
||||
sub x1, x1, #12
|
||||
16:
|
||||
ld1 {v5.16b, v6.16b}, [x2], #32
|
||||
ld1 {v7.16b}, [x0]
|
||||
|
@ -806,17 +769,15 @@ function blend_v_8bpc_neon, export=1
|
|||
rshrn2 v22.16b, v21.8h, #6
|
||||
st1 {v19.8b}, [x0], #8
|
||||
st1 {v22.8b}, [x8], #8
|
||||
st1 {v19.s}[2], [x0], #4
|
||||
st1 {v22.s}[2], [x8], #4
|
||||
add x0, x0, x1
|
||||
add x8, x8, x1
|
||||
st1 {v19.s}[2], [x0], x1
|
||||
st1 {v22.s}[2], [x8], x1
|
||||
b.gt 16b
|
||||
ret
|
||||
320:
|
||||
ld1 {v0.16b, v1.16b}, [x5]
|
||||
sub x1, x1, #16
|
||||
sub v2.16b, v4.16b, v0.16b
|
||||
sub v3.16b, v4.16b, v1.16b
|
||||
sub x1, x1, #24
|
||||
sub v3.8b, v4.8b, v1.8b
|
||||
32:
|
||||
ld1 {v16.16b, v17.16b, v18.16b, v19.16b}, [x2], #64
|
||||
ld1 {v5.16b, v6.16b}, [x0]
|
||||
|
@ -828,30 +789,22 @@ function blend_v_8bpc_neon, export=1
|
|||
umlal2 v23.8h, v5.16b, v2.16b
|
||||
umull v28.8h, v17.8b, v1.8b
|
||||
umlal v28.8h, v6.8b, v3.8b
|
||||
umull2 v29.8h, v17.16b, v1.16b
|
||||
umlal2 v29.8h, v6.16b, v3.16b
|
||||
umull v30.8h, v18.8b, v0.8b
|
||||
umlal v30.8h, v20.8b, v2.8b
|
||||
umull2 v31.8h, v18.16b, v0.16b
|
||||
umlal2 v31.8h, v20.16b, v2.16b
|
||||
umull v25.8h, v19.8b, v1.8b
|
||||
umlal v25.8h, v21.8b, v3.8b
|
||||
umull2 v26.8h, v19.16b, v1.16b
|
||||
umlal2 v26.8h, v21.16b, v3.16b
|
||||
rshrn v24.8b, v22.8h, #6
|
||||
rshrn2 v24.16b, v23.8h, #6
|
||||
rshrn v28.8b, v28.8h, #6
|
||||
rshrn2 v28.16b, v29.8h, #6
|
||||
rshrn v30.8b, v30.8h, #6
|
||||
rshrn2 v30.16b, v31.8h, #6
|
||||
rshrn v27.8b, v25.8h, #6
|
||||
rshrn2 v27.16b, v26.8h, #6
|
||||
st1 {v24.16b}, [x0], #16
|
||||
st1 {v30.16b}, [x8], #16
|
||||
st1 {v28.8b}, [x0], #8
|
||||
st1 {v27.8b}, [x8], #8
|
||||
add x0, x0, x1
|
||||
add x8, x8, x1
|
||||
st1 {v28.8b}, [x0], x1
|
||||
st1 {v27.8b}, [x8], x1
|
||||
b.gt 32b
|
||||
ret
|
||||
L(blend_v_tbl):
|
||||
|
@ -2106,9 +2059,9 @@ L(\type\()_8tap_filter_2):
|
|||
st1 {v3.4h}, [\ds2], \d_strd
|
||||
.endif
|
||||
b.le 0f
|
||||
mov v16.16b, v18.16b
|
||||
mov v17.16b, v28.16b
|
||||
mov v18.16b, v29.16b
|
||||
mov v16.8b, v18.8b
|
||||
mov v17.8b, v28.8b
|
||||
mov v18.8b, v29.8b
|
||||
b 4b
|
||||
|
||||
480: // 4x8, 4x16, 4x32 hv
|
||||
|
@ -110,25 +110,10 @@ endconst
.endif
.endm

.macro umull_n d0, d1, d2, d3, s0, s1, s2, s3, n
umull \d0\().4s, \s0\().4h, \s2\().4h
.if \n >= 8
umull2 \d1\().4s, \s0\().8h, \s2\().8h
.endif
.macro sqdmulh_n d0, d1, s0, s1, s2, s3, sz, n
sqdmulh \d0\sz, \s0\sz, \s2\sz
.if \n == 16
umull \d2\().4s, \s1\().4h, \s3\().4h
umull2 \d3\().4s, \s1\().8h, \s3\().8h
.endif
.endm

.macro shrn_n d0, d1, s0, s1, s2, s3, shift, n
shrn \d0\().4h, \s0\().4s, \shift
.if \n >= 8
shrn2 \d0\().8h, \s1\().4s, \shift
.endif
.if \n == 16
shrn \d1\().4h, \s2\().4s, \shift
shrn2 \d1\().8h, \s3\().4s, \shift
sqdmulh \d1\sz, \s1\sz, \s3\sz
.endif
.endm

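// The sqdmulh_n macro above stands in for the removed umull_n/shrn_n
// pair: sqdmulh computes (2 * a * b) >> 16 with saturation, and the
// operands are pre-scaled (cdf & 0xffc0, rng & 0x7f00), which is what
// yields the ((cdf >> EC_PROB_SHIFT) * (r - 128)) >> 1 value annotated
// in the function body below.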
@ -149,17 +134,19 @@ function msac_decode_symbol_adapt4_neon, export=1
ld1_n v0, v1, x1, \sz, \n // cdf
ld1r {v4\sz}, [x8] // rng
movrel x9, coeffs, 30
movi v31\sz, #0x7f, lsl #8 // 0x7f00
sub x9, x9, x2, lsl #1
ushr_n v2, v3, v0, v1, #6, \sz, \n // cdf >> EC_PROB_SHIFT
mvni v30\sz, #0x3f // 0xffc0
and v7\szb, v4\szb, v31\szb // rng & 0x7f00
str h4, [sp, #14] // store original u = s->rng
ushr v4\sz, v4\sz, #8 // r = rng >> 8
and_n v2, v3, v0, v1, v30, v30, \szb, \n // cdf & 0xffc0

umull_n v16, v17, v18, v19, v4, v4, v2, v3, \n // r * (cdf >> EC_PROB_SHIFT)
ld1_n v4, v5, x9, \sz, \n // EC_MIN_PROB * (n_symbols - ret)
shrn_n v2, v3, v16, v17, v18, v19, #1, \n // v >>= 7 - EC_PROB_SHIFT
sqdmulh_n v6, v7, v2, v3, v7, v7, \sz, \n // ((cdf >> EC_PROB_SHIFT) * (r - 128)) >> 1
add x8, x0, #DIF + 6

add_n v4, v5, v2, v3, v4, v5, \sz, \n // v += EC_MIN_PROB * (n_symbols - ret)
add_n v4, v5, v2, v3, v4, v5, \sz, \n // v = cdf + EC_MIN_PROB * (n_symbols - ret)
add_n v4, v5, v6, v7, v4, v5, \sz, \n // v = ((cdf >> EC_PROB_SHIFT) * r) >> 1 + EC_MIN_PROB * (n_symbols - ret)

ld1r {v6.8h}, [x8] // dif >> (EC_WIN_SIZE - 16)
movrel x8, bits

@ -27,42 +27,46 @@
#include "src/cpu.h"
#include "src/cdef.h"

#if BITDEPTH == 8
decl_cdef_dir_fn(dav1d_cdef_find_dir_neon);
#if BITDEPTH == 8 || ARCH_AARCH64
decl_cdef_dir_fn(BF(dav1d_cdef_find_dir, neon));

void dav1d_cdef_padding4_neon(uint16_t *tmp, const pixel *src,
ptrdiff_t src_stride, const pixel (*left)[2],
/*const*/ pixel *const top[2], int h,
enum CdefEdgeFlags edges);
void dav1d_cdef_padding8_neon(uint16_t *tmp, const pixel *src,
ptrdiff_t src_stride, const pixel (*left)[2],
/*const*/ pixel *const top[2], int h,
enum CdefEdgeFlags edges);
void BF(dav1d_cdef_padding4, neon)(uint16_t *tmp, const pixel *src,
ptrdiff_t src_stride, const pixel (*left)[2],
const pixel *const top, int h,
enum CdefEdgeFlags edges);
void BF(dav1d_cdef_padding8, neon)(uint16_t *tmp, const pixel *src,
ptrdiff_t src_stride, const pixel (*left)[2],
const pixel *const top, int h,
enum CdefEdgeFlags edges);

void dav1d_cdef_filter4_neon(pixel *dst, ptrdiff_t dst_stride,
const uint16_t *tmp, int pri_strength,
int sec_strength, int dir, int damping, int h);
void dav1d_cdef_filter8_neon(pixel *dst, ptrdiff_t dst_stride,
const uint16_t *tmp, int pri_strength,
int sec_strength, int dir, int damping, int h);
// Passing edges to this function, to allow it to switch to a more
// optimized version for fully edged cases. Using size_t for edges,
// to avoid ABI differences for passing more than one argument on the stack.
void BF(dav1d_cdef_filter4, neon)(pixel *dst, ptrdiff_t dst_stride,
const uint16_t *tmp, int pri_strength,
int sec_strength, int dir, int damping, int h,
size_t edges HIGHBD_DECL_SUFFIX);
void BF(dav1d_cdef_filter8, neon)(pixel *dst, ptrdiff_t dst_stride,
const uint16_t *tmp, int pri_strength,
int sec_strength, int dir, int damping, int h,
size_t edges HIGHBD_DECL_SUFFIX);

#define DEFINE_FILTER(w, h, tmp_stride) \
static void \
cdef_filter_##w##x##h##_neon(pixel *dst, \
const ptrdiff_t stride, \
const pixel (*left)[2], \
/*const*/ pixel *const top[2], \
const int pri_strength, \
const int sec_strength, \
const int dir, \
const int damping, \
const enum CdefEdgeFlags edges) \
const pixel (*left)[2], const pixel *const top, \
const int pri_strength, const int sec_strength, \
const int dir, const int damping, \
const enum CdefEdgeFlags edges \
HIGHBD_DECL_SUFFIX) \
{ \
ALIGN_STK_16(uint16_t, tmp_buf, 12*tmp_stride + 8,); \
ALIGN_STK_16(uint16_t, tmp_buf, 12 * tmp_stride + 8,); \
uint16_t *tmp = tmp_buf + 2 * tmp_stride + 8; \
dav1d_cdef_padding##w##_neon(tmp, dst, stride, left, top, h, edges); \
dav1d_cdef_filter##w##_neon(dst, stride, tmp, pri_strength, \
sec_strength, dir, damping, h); \
BF(dav1d_cdef_padding##w, neon)(tmp, dst, stride, left, top, h, edges); \
BF(dav1d_cdef_filter##w, neon)(dst, stride, tmp, pri_strength, \
sec_strength, dir, damping, h, edges \
HIGHBD_TAIL_SUFFIX); \
}

DEFINE_FILTER(8, 8, 16)
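/* BF(name, neon) resolves to the bitdepth-suffixed symbol, i.e.
 * name_8bpc_neon or name_16bpc_neon, so one wrapper file serves both
 * template builds. As a sketch, DEFINE_FILTER(8, 8, 16) then expands to
 * roughly the following in the 8bpc build, where the HIGHBD_* macros are
 * empty:
 *
 * static void cdef_filter_8x8_neon(pixel *dst, const ptrdiff_t stride,
 *                                  const pixel (*left)[2],
 *                                  const pixel *const top,
 *                                  const int pri_strength,
 *                                  const int sec_strength,
 *                                  const int dir, const int damping,
 *                                  const enum CdefEdgeFlags edges) {
 *     ALIGN_STK_16(uint16_t, tmp_buf, 12 * 16 + 8,);
 *     uint16_t *tmp = tmp_buf + 2 * 16 + 8;
 *     dav1d_cdef_padding8_8bpc_neon(tmp, dst, stride, left, top, 8, edges);
 *     dav1d_cdef_filter8_8bpc_neon(dst, stride, tmp, pri_strength,
 *                                  sec_strength, dir, damping, 8, edges);
 * }
 */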
@ -76,8 +80,8 @@ COLD void bitfn(dav1d_cdef_dsp_init_arm)(Dav1dCdefDSPContext *const c) {

if (!(flags & DAV1D_ARM_CPU_FLAG_NEON)) return;

#if BITDEPTH == 8
c->dir = dav1d_cdef_find_dir_neon;
#if BITDEPTH == 8 || ARCH_AARCH64
c->dir = BF(dav1d_cdef_find_dir, neon);
c->fb[0] = cdef_filter_8x8_neon;
c->fb[1] = cdef_filter_4x8_neon;
c->fb[2] = cdef_filter_4x4_neon;

@ -28,20 +28,20 @@
#include "src/cpu.h"
#include "src/loopfilter.h"

decl_loopfilter_sb_fn(dav1d_lpf_h_sb_y_neon);
decl_loopfilter_sb_fn(dav1d_lpf_v_sb_y_neon);
decl_loopfilter_sb_fn(dav1d_lpf_h_sb_uv_neon);
decl_loopfilter_sb_fn(dav1d_lpf_v_sb_uv_neon);
decl_loopfilter_sb_fn(BF(dav1d_lpf_h_sb_y, neon));
decl_loopfilter_sb_fn(BF(dav1d_lpf_v_sb_y, neon));
decl_loopfilter_sb_fn(BF(dav1d_lpf_h_sb_uv, neon));
decl_loopfilter_sb_fn(BF(dav1d_lpf_v_sb_uv, neon));

COLD void bitfn(dav1d_loop_filter_dsp_init_arm)(Dav1dLoopFilterDSPContext *const c) {
const unsigned flags = dav1d_get_cpu_flags();

if (!(flags & DAV1D_ARM_CPU_FLAG_NEON)) return;

#if BITDEPTH == 8
c->loop_filter_sb[0][0] = dav1d_lpf_h_sb_y_neon;
c->loop_filter_sb[0][1] = dav1d_lpf_v_sb_y_neon;
c->loop_filter_sb[1][0] = dav1d_lpf_h_sb_uv_neon;
c->loop_filter_sb[1][1] = dav1d_lpf_v_sb_uv_neon;
#if BITDEPTH == 8 || ARCH_AARCH64
c->loop_filter_sb[0][0] = BF(dav1d_lpf_h_sb_y, neon);
c->loop_filter_sb[0][1] = BF(dav1d_lpf_v_sb_y, neon);
c->loop_filter_sb[1][0] = BF(dav1d_lpf_h_sb_uv, neon);
c->loop_filter_sb[1][1] = BF(dav1d_lpf_v_sb_uv, neon);
#endif
}

@ -29,19 +29,26 @@
#include "src/looprestoration.h"
#include "src/tables.h"

#if BITDEPTH == 8
// This calculates things slightly differently than the reference C version.
// This version calculates roughly this:
#if BITDEPTH == 8 || ARCH_AARCH64
// The 8bpc version calculates things slightly differently than the reference
// C version. That version calculates roughly this:
// int16_t sum = 0;
// for (int i = 0; i < 7; i++)
// sum += src[idx] * fh[i];
// int16_t sum2 = (src[x] << 7) - (1 << (BITDEPTH + 6)) + rounding_off_h;
// int16_t sum2 = (src[x] << 7) - (1 << (bitdepth + 6)) + rounding_off_h;
// sum = iclip(sum + sum2, INT16_MIN, INT16_MAX) >> round_bits_h;
// sum += 2048;
void dav1d_wiener_filter_h_neon(int16_t *dst, const pixel (*left)[4],
const pixel *src, ptrdiff_t stride,
const int16_t fh[7], const intptr_t w,
int h, enum LrEdgeFlags edges);
// sum += 1 << (bitdepth + 6 - round_bits_h);
// Compared to the reference C version, this is the output of the first pass
// _subtracted_ by 1 << (bitdepth + 6 - round_bits_h) = 2048, i.e.
// with round_offset precompensated.
// The 16bpc version calculates things pretty much the same way as the
// reference C version, but with the end result subtracted by
// 1 << (bitdepth + 6 - round_bits_h).
void BF(dav1d_wiener_filter_h, neon)(int16_t *dst, const pixel (*left)[4],
const pixel *src, ptrdiff_t stride,
const int16_t fh[7], const intptr_t w,
int h, enum LrEdgeFlags edges
HIGHBD_DECL_SUFFIX);
// This calculates things slightly differently than the reference C version.
// This version calculates roughly this:
// fv[3] += 128;
@ -50,217 +57,242 @@ void dav1d_wiener_filter_h_neon(int16_t *dst, const pixel (*left)[4],
|
|||
// sum += mid[idx] * fv[i];
|
||||
// sum = (sum + rounding_off_v) >> round_bits_v;
|
||||
// This function assumes that the width is a multiple of 8.
|
||||
void dav1d_wiener_filter_v_neon(pixel *dst, ptrdiff_t stride,
|
||||
const int16_t *mid, int w, int h,
|
||||
const int16_t fv[7], enum LrEdgeFlags edges,
|
||||
ptrdiff_t mid_stride);
|
||||
void dav1d_copy_narrow_neon(pixel *dst, ptrdiff_t stride,
|
||||
const pixel *src, int w, int h);
|
||||
void BF(dav1d_wiener_filter_v, neon)(pixel *dst, ptrdiff_t stride,
|
||||
const int16_t *mid, int w, int h,
|
||||
const int16_t fv[7], enum LrEdgeFlags edges,
|
||||
ptrdiff_t mid_stride HIGHBD_DECL_SUFFIX);
|
||||
void BF(dav1d_copy_narrow, neon)(pixel *dst, ptrdiff_t stride,
|
||||
const pixel *src, int w, int h);
|
||||
|
||||
static void wiener_filter_neon(pixel *const dst, const ptrdiff_t dst_stride,
|
||||
const pixel (*const left)[4],
|
||||
const pixel *lpf, const ptrdiff_t lpf_stride,
|
||||
const int w, const int h, const int16_t fh[7],
|
||||
const int16_t fv[7], const enum LrEdgeFlags edges)
|
||||
const int16_t fv[7], const enum LrEdgeFlags edges
|
||||
HIGHBD_DECL_SUFFIX)
|
||||
{
|
||||
ALIGN_STK_16(int16_t, mid, 68 * 384,);
|
||||
int mid_stride = (w + 7) & ~7;
|
||||
|
||||
// Horizontal filter
|
||||
dav1d_wiener_filter_h_neon(&mid[2 * mid_stride], left, dst, dst_stride,
|
||||
fh, w, h, edges);
|
||||
BF(dav1d_wiener_filter_h, neon)(&mid[2 * mid_stride], left, dst, dst_stride,
|
||||
fh, w, h, edges HIGHBD_TAIL_SUFFIX);
|
||||
if (edges & LR_HAVE_TOP)
|
||||
dav1d_wiener_filter_h_neon(mid, NULL, lpf, lpf_stride,
|
||||
fh, w, 2, edges);
|
||||
BF(dav1d_wiener_filter_h, neon)(mid, NULL, lpf, lpf_stride,
|
||||
fh, w, 2, edges HIGHBD_TAIL_SUFFIX);
|
||||
if (edges & LR_HAVE_BOTTOM)
|
||||
dav1d_wiener_filter_h_neon(&mid[(2 + h) * mid_stride], NULL,
|
||||
lpf + 6 * PXSTRIDE(lpf_stride), lpf_stride,
|
||||
fh, w, 2, edges);
|
||||
BF(dav1d_wiener_filter_h, neon)(&mid[(2 + h) * mid_stride], NULL,
|
||||
lpf + 6 * PXSTRIDE(lpf_stride),
|
||||
lpf_stride, fh, w, 2, edges
|
||||
HIGHBD_TAIL_SUFFIX);
|
||||
|
||||
// Vertical filter
|
||||
if (w >= 8)
|
||||
dav1d_wiener_filter_v_neon(dst, dst_stride, &mid[2*mid_stride],
|
||||
w & ~7, h, fv, edges, mid_stride * sizeof(*mid));
|
||||
BF(dav1d_wiener_filter_v, neon)(dst, dst_stride, &mid[2*mid_stride],
|
||||
w & ~7, h, fv, edges,
|
||||
mid_stride * sizeof(*mid)
|
||||
HIGHBD_TAIL_SUFFIX);
|
||||
if (w & 7) {
|
||||
// For uneven widths, do a full 8 pixel wide filtering into a temp
|
||||
// buffer and copy out the narrow slice of pixels separately into dest.
|
||||
ALIGN_STK_16(pixel, tmp, 64 * 8,);
|
||||
dav1d_wiener_filter_v_neon(tmp, w & 7, &mid[2*mid_stride + (w & ~7)],
|
||||
w & 7, h, fv, edges, mid_stride * sizeof(*mid));
|
||||
dav1d_copy_narrow_neon(dst + (w & ~7), dst_stride, tmp, w & 7, h);
|
||||
BF(dav1d_wiener_filter_v, neon)(tmp, (w & 7) * sizeof(pixel),
|
||||
&mid[2*mid_stride + (w & ~7)],
|
||||
w & 7, h, fv, edges,
|
||||
mid_stride * sizeof(*mid)
|
||||
HIGHBD_TAIL_SUFFIX);
|
||||
BF(dav1d_copy_narrow, neon)(dst + (w & ~7), dst_stride, tmp, w & 7, h);
|
||||
}
|
||||
}
|
||||
|
||||
void dav1d_sgr_box3_h_neon(int32_t *sumsq, int16_t *sum,
|
||||
const pixel (*left)[4],
|
||||
const pixel *src, const ptrdiff_t stride,
|
||||
const int w, const int h,
|
||||
const enum LrEdgeFlags edges);
|
||||
void BF(dav1d_sgr_box3_h, neon)(int32_t *sumsq, int16_t *sum,
|
||||
const pixel (*left)[4],
|
||||
const pixel *src, const ptrdiff_t stride,
|
||||
const int w, const int h,
|
||||
const enum LrEdgeFlags edges);
|
||||
void dav1d_sgr_box3_v_neon(int32_t *sumsq, int16_t *sum,
|
||||
const int w, const int h,
|
||||
const enum LrEdgeFlags edges);
|
||||
void dav1d_sgr_calc_ab1_neon(int32_t *a, int16_t *b,
|
||||
const int w, const int h, const int strength);
|
||||
void dav1d_sgr_finish_filter1_neon(coef *tmp,
|
||||
const pixel *src, const ptrdiff_t stride,
|
||||
const int32_t *a, const int16_t *b,
|
||||
const int w, const int h);
|
||||
const int w, const int h, const int strength,
|
||||
const int bitdepth_max);
|
||||
void BF(dav1d_sgr_finish_filter1, neon)(int16_t *tmp,
|
||||
const pixel *src, const ptrdiff_t stride,
|
||||
const int32_t *a, const int16_t *b,
|
||||
const int w, const int h);
|
||||
|
||||
/* filter with a 3x3 box (radius=1) */
|
||||
static void dav1d_sgr_filter1_neon(coef *tmp,
|
||||
static void dav1d_sgr_filter1_neon(int16_t *tmp,
|
||||
const pixel *src, const ptrdiff_t stride,
|
||||
const pixel (*left)[4],
|
||||
const pixel *lpf, const ptrdiff_t lpf_stride,
|
||||
const int w, const int h, const int strength,
|
||||
const enum LrEdgeFlags edges)
|
||||
const enum LrEdgeFlags edges
|
||||
HIGHBD_DECL_SUFFIX)
|
||||
{
|
||||
ALIGN_STK_16(int32_t, sumsq_mem, (384 + 16) * 68 + 8,);
|
||||
int32_t *const sumsq = &sumsq_mem[(384 + 16) * 2 + 8], *const a = sumsq;
|
||||
ALIGN_STK_16(int16_t, sum_mem, (384 + 16) * 68 + 16,);
|
||||
int16_t *const sum = &sum_mem[(384 + 16) * 2 + 16], *const b = sum;
|
||||
|
||||
dav1d_sgr_box3_h_neon(sumsq, sum, left, src, stride, w, h, edges);
|
||||
BF(dav1d_sgr_box3_h, neon)(sumsq, sum, left, src, stride, w, h, edges);
|
||||
if (edges & LR_HAVE_TOP)
|
||||
dav1d_sgr_box3_h_neon(&sumsq[-2 * (384 + 16)], &sum[-2 * (384 + 16)],
|
||||
NULL, lpf, lpf_stride, w, 2, edges);
|
||||
BF(dav1d_sgr_box3_h, neon)(&sumsq[-2 * (384 + 16)], &sum[-2 * (384 + 16)],
|
||||
NULL, lpf, lpf_stride, w, 2, edges);
|
||||
|
||||
if (edges & LR_HAVE_BOTTOM)
|
||||
dav1d_sgr_box3_h_neon(&sumsq[h * (384 + 16)], &sum[h * (384 + 16)],
|
||||
NULL, lpf + 6 * PXSTRIDE(lpf_stride),
|
||||
lpf_stride, w, 2, edges);
|
||||
BF(dav1d_sgr_box3_h, neon)(&sumsq[h * (384 + 16)], &sum[h * (384 + 16)],
|
||||
NULL, lpf + 6 * PXSTRIDE(lpf_stride),
|
||||
lpf_stride, w, 2, edges);
|
||||
|
||||
dav1d_sgr_box3_v_neon(sumsq, sum, w, h, edges);
|
||||
dav1d_sgr_calc_ab1_neon(a, b, w, h, strength);
|
||||
dav1d_sgr_finish_filter1_neon(tmp, src, stride, a, b, w, h);
|
||||
dav1d_sgr_calc_ab1_neon(a, b, w, h, strength, BITDEPTH_MAX);
|
||||
BF(dav1d_sgr_finish_filter1, neon)(tmp, src, stride, a, b, w, h);
|
||||
}
|
||||
|
||||
void dav1d_sgr_box5_h_neon(int32_t *sumsq, int16_t *sum,
|
||||
const pixel (*left)[4],
|
||||
const pixel *src, const ptrdiff_t stride,
|
||||
const int w, const int h,
|
||||
const enum LrEdgeFlags edges);
|
||||
void BF(dav1d_sgr_box5_h, neon)(int32_t *sumsq, int16_t *sum,
|
||||
const pixel (*left)[4],
|
||||
const pixel *src, const ptrdiff_t stride,
|
||||
const int w, const int h,
|
||||
const enum LrEdgeFlags edges);
|
||||
void dav1d_sgr_box5_v_neon(int32_t *sumsq, int16_t *sum,
|
||||
const int w, const int h,
|
||||
const enum LrEdgeFlags edges);
|
||||
void dav1d_sgr_calc_ab2_neon(int32_t *a, int16_t *b,
|
||||
const int w, const int h, const int strength);
|
||||
void dav1d_sgr_finish_filter2_neon(coef *tmp,
|
||||
const pixel *src, const ptrdiff_t stride,
|
||||
const int32_t *a, const int16_t *b,
|
||||
const int w, const int h);
|
||||
const int w, const int h, const int strength,
|
||||
const int bitdepth_max);
|
||||
void BF(dav1d_sgr_finish_filter2, neon)(int16_t *tmp,
|
||||
const pixel *src, const ptrdiff_t stride,
|
||||
const int32_t *a, const int16_t *b,
|
||||
const int w, const int h);
|
||||
|
||||
/* filter with a 5x5 box (radius=2) */
|
||||
static void dav1d_sgr_filter2_neon(coef *tmp,
|
||||
static void dav1d_sgr_filter2_neon(int16_t *tmp,
|
||||
const pixel *src, const ptrdiff_t stride,
|
||||
const pixel (*left)[4],
|
||||
const pixel *lpf, const ptrdiff_t lpf_stride,
|
||||
const int w, const int h, const int strength,
|
||||
const enum LrEdgeFlags edges)
|
||||
const enum LrEdgeFlags edges
|
||||
HIGHBD_DECL_SUFFIX)
|
||||
{
|
||||
ALIGN_STK_16(int32_t, sumsq_mem, (384 + 16) * 68 + 8,);
|
||||
int32_t *const sumsq = &sumsq_mem[(384 + 16) * 2 + 8], *const a = sumsq;
|
||||
ALIGN_STK_16(int16_t, sum_mem, (384 + 16) * 68 + 16,);
|
||||
int16_t *const sum = &sum_mem[(384 + 16) * 2 + 16], *const b = sum;
|
||||
|
||||
dav1d_sgr_box5_h_neon(sumsq, sum, left, src, stride, w, h, edges);
|
||||
BF(dav1d_sgr_box5_h, neon)(sumsq, sum, left, src, stride, w, h, edges);
|
||||
if (edges & LR_HAVE_TOP)
|
||||
dav1d_sgr_box5_h_neon(&sumsq[-2 * (384 + 16)], &sum[-2 * (384 + 16)],
|
||||
NULL, lpf, lpf_stride, w, 2, edges);
|
||||
BF(dav1d_sgr_box5_h, neon)(&sumsq[-2 * (384 + 16)], &sum[-2 * (384 + 16)],
|
||||
NULL, lpf, lpf_stride, w, 2, edges);
|
||||
|
||||
if (edges & LR_HAVE_BOTTOM)
|
||||
dav1d_sgr_box5_h_neon(&sumsq[h * (384 + 16)], &sum[h * (384 + 16)],
|
||||
NULL, lpf + 6 * PXSTRIDE(lpf_stride),
|
||||
lpf_stride, w, 2, edges);
|
||||
BF(dav1d_sgr_box5_h, neon)(&sumsq[h * (384 + 16)], &sum[h * (384 + 16)],
|
||||
NULL, lpf + 6 * PXSTRIDE(lpf_stride),
|
||||
lpf_stride, w, 2, edges);
|
||||
|
||||
dav1d_sgr_box5_v_neon(sumsq, sum, w, h, edges);
|
||||
dav1d_sgr_calc_ab2_neon(a, b, w, h, strength);
|
||||
dav1d_sgr_finish_filter2_neon(tmp, src, stride, a, b, w, h);
|
||||
dav1d_sgr_calc_ab2_neon(a, b, w, h, strength, BITDEPTH_MAX);
|
||||
BF(dav1d_sgr_finish_filter2, neon)(tmp, src, stride, a, b, w, h);
|
||||
}
|
||||
|
||||
void dav1d_sgr_weighted1_neon(pixel *dst, const ptrdiff_t dst_stride,
|
||||
const pixel *src, const ptrdiff_t src_stride,
|
||||
const coef *t1, const int w, const int h,
|
||||
const int wt);
|
||||
void dav1d_sgr_weighted2_neon(pixel *dst, const ptrdiff_t dst_stride,
|
||||
const pixel *src, const ptrdiff_t src_stride,
|
||||
const coef *t1, const coef *t2,
|
||||
const int w, const int h,
|
||||
const int16_t wt[2]);
|
||||
void BF(dav1d_sgr_weighted1, neon)(pixel *dst, const ptrdiff_t dst_stride,
|
||||
const pixel *src, const ptrdiff_t src_stride,
|
||||
const int16_t *t1, const int w, const int h,
|
||||
const int wt HIGHBD_DECL_SUFFIX);
|
||||
void BF(dav1d_sgr_weighted2, neon)(pixel *dst, const ptrdiff_t dst_stride,
|
||||
const pixel *src, const ptrdiff_t src_stride,
|
||||
const int16_t *t1, const int16_t *t2,
|
||||
const int w, const int h,
|
||||
const int16_t wt[2] HIGHBD_DECL_SUFFIX);
|
||||
|
||||
static void sgr_filter_neon(pixel *const dst, const ptrdiff_t dst_stride,
|
||||
const pixel (*const left)[4],
|
||||
const pixel *lpf, const ptrdiff_t lpf_stride,
|
||||
const int w, const int h, const int sgr_idx,
|
||||
const int16_t sgr_wt[7], const enum LrEdgeFlags edges)
|
||||
const int16_t sgr_wt[7], const enum LrEdgeFlags edges
|
||||
HIGHBD_DECL_SUFFIX)
|
||||
{
|
||||
if (!dav1d_sgr_params[sgr_idx][0]) {
|
||||
ALIGN_STK_16(coef, tmp, 64 * 384,);
|
||||
ALIGN_STK_16(int16_t, tmp, 64 * 384,);
|
||||
dav1d_sgr_filter1_neon(tmp, dst, dst_stride, left, lpf, lpf_stride,
|
||||
w, h, dav1d_sgr_params[sgr_idx][3], edges);
|
||||
w, h, dav1d_sgr_params[sgr_idx][3], edges
|
||||
HIGHBD_TAIL_SUFFIX);
|
||||
if (w >= 8)
|
||||
dav1d_sgr_weighted1_neon(dst, dst_stride, dst, dst_stride,
|
||||
tmp, w & ~7, h, (1 << 7) - sgr_wt[1]);
|
||||
BF(dav1d_sgr_weighted1, neon)(dst, dst_stride, dst, dst_stride,
|
||||
tmp, w & ~7, h, (1 << 7) - sgr_wt[1]
|
||||
HIGHBD_TAIL_SUFFIX);
|
||||
if (w & 7) {
|
||||
// For uneven widths, do a full 8 pixel wide filtering into a temp
|
||||
// buffer and copy out the narrow slice of pixels separately into
|
||||
// dest.
|
||||
ALIGN_STK_16(pixel, stripe, 64 * 8,);
|
||||
dav1d_sgr_weighted1_neon(stripe, w & 7, dst + (w & ~7), dst_stride,
|
||||
tmp + (w & ~7), w & 7, h,
|
||||
(1 << 7) - sgr_wt[1]);
|
||||
dav1d_copy_narrow_neon(dst + (w & ~7), dst_stride, stripe,
|
||||
w & 7, h);
|
||||
BF(dav1d_sgr_weighted1, neon)(stripe, (w & 7) * sizeof(pixel),
|
||||
dst + (w & ~7), dst_stride,
|
||||
tmp + (w & ~7), w & 7, h,
|
||||
(1 << 7) - sgr_wt[1]
|
||||
HIGHBD_TAIL_SUFFIX);
|
||||
BF(dav1d_copy_narrow, neon)(dst + (w & ~7), dst_stride, stripe,
|
||||
w & 7, h);
|
||||
}
|
||||
} else if (!dav1d_sgr_params[sgr_idx][1]) {
|
||||
ALIGN_STK_16(coef, tmp, 64 * 384,);
|
||||
ALIGN_STK_16(int16_t, tmp, 64 * 384,);
|
||||
dav1d_sgr_filter2_neon(tmp, dst, dst_stride, left, lpf, lpf_stride,
|
||||
w, h, dav1d_sgr_params[sgr_idx][2], edges);
|
||||
w, h, dav1d_sgr_params[sgr_idx][2], edges
|
||||
HIGHBD_TAIL_SUFFIX);
|
||||
if (w >= 8)
|
||||
dav1d_sgr_weighted1_neon(dst, dst_stride, dst, dst_stride,
|
||||
tmp, w & ~7, h, sgr_wt[0]);
|
||||
BF(dav1d_sgr_weighted1, neon)(dst, dst_stride, dst, dst_stride,
|
||||
tmp, w & ~7, h, sgr_wt[0]
|
||||
HIGHBD_TAIL_SUFFIX);
|
||||
if (w & 7) {
|
||||
// For uneven widths, do a full 8 pixel wide filtering into a temp
|
||||
// buffer and copy out the narrow slice of pixels separately into
|
||||
// dest.
|
||||
ALIGN_STK_16(pixel, stripe, 64 * 8,);
|
||||
dav1d_sgr_weighted1_neon(stripe, w & 7, dst + (w & ~7), dst_stride,
|
||||
tmp + (w & ~7), w & 7, h, sgr_wt[0]);
|
||||
dav1d_copy_narrow_neon(dst + (w & ~7), dst_stride, stripe,
|
||||
w & 7, h);
|
||||
BF(dav1d_sgr_weighted1, neon)(stripe, (w & 7) * sizeof(pixel),
|
||||
dst + (w & ~7), dst_stride,
|
||||
tmp + (w & ~7), w & 7, h, sgr_wt[0]
|
||||
HIGHBD_TAIL_SUFFIX);
|
||||
BF(dav1d_copy_narrow, neon)(dst + (w & ~7), dst_stride, stripe,
|
||||
w & 7, h);
|
||||
}
|
||||
} else {
|
||||
ALIGN_STK_16(coef, tmp1, 64 * 384,);
|
||||
ALIGN_STK_16(coef, tmp2, 64 * 384,);
|
||||
ALIGN_STK_16(int16_t, tmp1, 64 * 384,);
|
||||
ALIGN_STK_16(int16_t, tmp2, 64 * 384,);
|
||||
dav1d_sgr_filter2_neon(tmp1, dst, dst_stride, left, lpf, lpf_stride,
|
||||
w, h, dav1d_sgr_params[sgr_idx][2], edges);
|
||||
w, h, dav1d_sgr_params[sgr_idx][2], edges
|
||||
HIGHBD_TAIL_SUFFIX);
|
||||
dav1d_sgr_filter1_neon(tmp2, dst, dst_stride, left, lpf, lpf_stride,
|
||||
w, h, dav1d_sgr_params[sgr_idx][3], edges);
|
||||
w, h, dav1d_sgr_params[sgr_idx][3], edges
|
||||
HIGHBD_TAIL_SUFFIX);
|
||||
const int16_t wt[2] = { sgr_wt[0], 128 - sgr_wt[0] - sgr_wt[1] };
|
||||
if (w >= 8)
|
||||
dav1d_sgr_weighted2_neon(dst, dst_stride, dst, dst_stride,
|
||||
tmp1, tmp2, w & ~7, h, wt);
|
||||
BF(dav1d_sgr_weighted2, neon)(dst, dst_stride, dst, dst_stride,
|
||||
tmp1, tmp2, w & ~7, h, wt
|
||||
HIGHBD_TAIL_SUFFIX);
|
||||
if (w & 7) {
|
||||
// For uneven widths, do a full 8 pixel wide filtering into a temp
|
||||
// buffer and copy out the narrow slice of pixels separately into
|
||||
// dest.
|
||||
ALIGN_STK_16(pixel, stripe, 64 * 8,);
|
||||
dav1d_sgr_weighted2_neon(stripe, w & 7, dst + (w & ~7), dst_stride,
|
||||
tmp1 + (w & ~7), tmp2 + (w & ~7),
|
||||
w & 7, h, wt);
|
||||
dav1d_copy_narrow_neon(dst + (w & ~7), dst_stride, stripe,
|
||||
w & 7, h);
|
||||
BF(dav1d_sgr_weighted2, neon)(stripe, (w & 7) * sizeof(pixel),
|
||||
dst + (w & ~7), dst_stride,
|
||||
tmp1 + (w & ~7), tmp2 + (w & ~7),
|
||||
w & 7, h, wt HIGHBD_TAIL_SUFFIX);
|
||||
BF(dav1d_copy_narrow, neon)(dst + (w & ~7), dst_stride, stripe,
|
||||
w & 7, h);
|
||||
}
|
||||
}
|
||||
}
|
||||
#endif // BITDEPTH == 8
|
||||
|
||||
COLD void bitfn(dav1d_loop_restoration_dsp_init_arm)(Dav1dLoopRestorationDSPContext *const c) {
|
||||
COLD void bitfn(dav1d_loop_restoration_dsp_init_arm)(Dav1dLoopRestorationDSPContext *const c, int bpc) {
|
||||
const unsigned flags = dav1d_get_cpu_flags();
|
||||
|
||||
if (!(flags & DAV1D_ARM_CPU_FLAG_NEON)) return;
|
||||
|
||||
#if BITDEPTH == 8
|
||||
#if BITDEPTH == 8 || ARCH_AARCH64
|
||||
c->wiener = wiener_filter_neon;
|
||||
c->selfguided = sgr_filter_neon;
|
||||
if (bpc <= 10)
|
||||
c->selfguided = sgr_filter_neon;
|
||||
#endif
|
||||
}
|
||||
|
|
|
@ -30,52 +30,52 @@
|
|||
#include "src/mc.h"
|
||||
#include "src/cpu.h"
|
||||
|
||||
decl_mc_fn(dav1d_put_8tap_regular_8bpc_neon);
|
||||
decl_mc_fn(dav1d_put_8tap_regular_smooth_8bpc_neon);
|
||||
decl_mc_fn(dav1d_put_8tap_regular_sharp_8bpc_neon);
|
||||
decl_mc_fn(dav1d_put_8tap_smooth_8bpc_neon);
|
||||
decl_mc_fn(dav1d_put_8tap_smooth_regular_8bpc_neon);
|
||||
decl_mc_fn(dav1d_put_8tap_smooth_sharp_8bpc_neon);
|
||||
decl_mc_fn(dav1d_put_8tap_sharp_8bpc_neon);
|
||||
decl_mc_fn(dav1d_put_8tap_sharp_regular_8bpc_neon);
|
||||
decl_mc_fn(dav1d_put_8tap_sharp_smooth_8bpc_neon);
|
||||
decl_mc_fn(dav1d_put_bilin_8bpc_neon);
|
||||
decl_mc_fn(BF(dav1d_put_8tap_regular, neon));
|
||||
decl_mc_fn(BF(dav1d_put_8tap_regular_smooth, neon));
|
||||
decl_mc_fn(BF(dav1d_put_8tap_regular_sharp, neon));
|
||||
decl_mc_fn(BF(dav1d_put_8tap_smooth, neon));
|
||||
decl_mc_fn(BF(dav1d_put_8tap_smooth_regular, neon));
|
||||
decl_mc_fn(BF(dav1d_put_8tap_smooth_sharp, neon));
|
||||
decl_mc_fn(BF(dav1d_put_8tap_sharp, neon));
|
||||
decl_mc_fn(BF(dav1d_put_8tap_sharp_regular, neon));
|
||||
decl_mc_fn(BF(dav1d_put_8tap_sharp_smooth, neon));
|
||||
decl_mc_fn(BF(dav1d_put_bilin, neon));
|
||||
|
||||
decl_mct_fn(dav1d_prep_8tap_regular_8bpc_neon);
|
||||
decl_mct_fn(dav1d_prep_8tap_regular_smooth_8bpc_neon);
|
||||
decl_mct_fn(dav1d_prep_8tap_regular_sharp_8bpc_neon);
|
||||
decl_mct_fn(dav1d_prep_8tap_smooth_8bpc_neon);
|
||||
decl_mct_fn(dav1d_prep_8tap_smooth_regular_8bpc_neon);
|
||||
decl_mct_fn(dav1d_prep_8tap_smooth_sharp_8bpc_neon);
|
||||
decl_mct_fn(dav1d_prep_8tap_sharp_8bpc_neon);
|
||||
decl_mct_fn(dav1d_prep_8tap_sharp_regular_8bpc_neon);
|
||||
decl_mct_fn(dav1d_prep_8tap_sharp_smooth_8bpc_neon);
|
||||
decl_mct_fn(dav1d_prep_bilin_8bpc_neon);
|
||||
decl_mct_fn(BF(dav1d_prep_8tap_regular, neon));
|
||||
decl_mct_fn(BF(dav1d_prep_8tap_regular_smooth, neon));
|
||||
decl_mct_fn(BF(dav1d_prep_8tap_regular_sharp, neon));
|
||||
decl_mct_fn(BF(dav1d_prep_8tap_smooth, neon));
|
||||
decl_mct_fn(BF(dav1d_prep_8tap_smooth_regular, neon));
|
||||
decl_mct_fn(BF(dav1d_prep_8tap_smooth_sharp, neon));
|
||||
decl_mct_fn(BF(dav1d_prep_8tap_sharp, neon));
|
||||
decl_mct_fn(BF(dav1d_prep_8tap_sharp_regular, neon));
|
||||
decl_mct_fn(BF(dav1d_prep_8tap_sharp_smooth, neon));
|
||||
decl_mct_fn(BF(dav1d_prep_bilin, neon));
|
||||
|
||||
decl_avg_fn(dav1d_avg_8bpc_neon);
|
||||
decl_w_avg_fn(dav1d_w_avg_8bpc_neon);
|
||||
decl_mask_fn(dav1d_mask_8bpc_neon);
|
||||
decl_blend_fn(dav1d_blend_8bpc_neon);
|
||||
decl_blend_dir_fn(dav1d_blend_h_8bpc_neon);
|
||||
decl_blend_dir_fn(dav1d_blend_v_8bpc_neon);
|
||||
decl_avg_fn(BF(dav1d_avg, neon));
|
||||
decl_w_avg_fn(BF(dav1d_w_avg, neon));
|
||||
decl_mask_fn(BF(dav1d_mask, neon));
|
||||
decl_blend_fn(BF(dav1d_blend, neon));
|
||||
decl_blend_dir_fn(BF(dav1d_blend_h, neon));
|
||||
decl_blend_dir_fn(BF(dav1d_blend_v, neon));
|
||||
|
||||
decl_w_mask_fn(dav1d_w_mask_444_8bpc_neon);
|
||||
decl_w_mask_fn(dav1d_w_mask_422_8bpc_neon);
|
||||
decl_w_mask_fn(dav1d_w_mask_420_8bpc_neon);
|
||||
decl_w_mask_fn(BF(dav1d_w_mask_444, neon));
|
||||
decl_w_mask_fn(BF(dav1d_w_mask_422, neon));
|
||||
decl_w_mask_fn(BF(dav1d_w_mask_420, neon));
|
||||
|
||||
decl_warp8x8_fn(dav1d_warp_affine_8x8_8bpc_neon);
|
||||
decl_warp8x8t_fn(dav1d_warp_affine_8x8t_8bpc_neon);
|
||||
decl_warp8x8_fn(BF(dav1d_warp_affine_8x8, neon));
|
||||
decl_warp8x8t_fn(BF(dav1d_warp_affine_8x8t, neon));
|
||||
|
||||
void bitfn(dav1d_mc_dsp_init_arm)(Dav1dMCDSPContext *const c) {
|
||||
#define init_mc_fn(type, name, suffix) \
|
||||
c->mc[type] = dav1d_put_##name##_8bpc_##suffix
|
||||
c->mc[type] = BF(dav1d_put_##name, suffix)
|
||||
#define init_mct_fn(type, name, suffix) \
|
||||
c->mct[type] = dav1d_prep_##name##_8bpc_##suffix
|
||||
c->mct[type] = BF(dav1d_prep_##name, suffix)
|
||||
const unsigned flags = dav1d_get_cpu_flags();
|
||||
|
||||
if (!(flags & DAV1D_ARM_CPU_FLAG_NEON)) return;
|
||||
|
||||
#if BITDEPTH == 8
|
||||
#if BITDEPTH == 8 || ARCH_AARCH64
|
||||
init_mc_fn (FILTER_2D_8TAP_REGULAR, 8tap_regular, neon);
|
||||
init_mc_fn (FILTER_2D_8TAP_REGULAR_SMOOTH, 8tap_regular_smooth, neon);
|
||||
init_mc_fn (FILTER_2D_8TAP_REGULAR_SHARP, 8tap_regular_sharp, neon);
|
||||
|
@ -98,16 +98,16 @@ void bitfn(dav1d_mc_dsp_init_arm)(Dav1dMCDSPContext *const c) {
|
|||
init_mct_fn(FILTER_2D_8TAP_SHARP, 8tap_sharp, neon);
|
||||
init_mct_fn(FILTER_2D_BILINEAR, bilin, neon);
|
||||
|
||||
c->avg = dav1d_avg_8bpc_neon;
|
||||
c->w_avg = dav1d_w_avg_8bpc_neon;
|
||||
c->mask = dav1d_mask_8bpc_neon;
|
||||
c->blend = dav1d_blend_8bpc_neon;
|
||||
c->blend_h = dav1d_blend_h_8bpc_neon;
|
||||
c->blend_v = dav1d_blend_v_8bpc_neon;
|
||||
c->w_mask[0] = dav1d_w_mask_444_8bpc_neon;
|
||||
c->w_mask[1] = dav1d_w_mask_422_8bpc_neon;
|
||||
c->w_mask[2] = dav1d_w_mask_420_8bpc_neon;
|
||||
c->warp8x8 = dav1d_warp_affine_8x8_8bpc_neon;
|
||||
c->warp8x8t = dav1d_warp_affine_8x8t_8bpc_neon;
|
||||
c->avg = BF(dav1d_avg, neon);
|
||||
c->w_avg = BF(dav1d_w_avg, neon);
|
||||
c->mask = BF(dav1d_mask, neon);
|
||||
c->blend = BF(dav1d_blend, neon);
|
||||
c->blend_h = BF(dav1d_blend_h, neon);
|
||||
c->blend_v = BF(dav1d_blend_v, neon);
|
||||
c->w_mask[0] = BF(dav1d_w_mask_444, neon);
|
||||
c->w_mask[1] = BF(dav1d_w_mask_422, neon);
|
||||
c->w_mask[2] = BF(dav1d_w_mask_420, neon);
|
||||
c->warp8x8 = BF(dav1d_warp_affine_8x8, neon);
|
||||
c->warp8x8t = BF(dav1d_warp_affine_8x8t, neon);
|
||||
#endif
|
||||
}
|
||||
|
|
|
@ -52,7 +52,7 @@ typedef const void *const_left_pixel_row_2px;
|
|||
// order to get access to pre-filter top pixels, use $top.
|
||||
#define decl_cdef_fn(name) \
|
||||
void (name)(pixel *dst, ptrdiff_t stride, const_left_pixel_row_2px left, \
|
||||
/*const*/ pixel *const top[2], int pri_strength, int sec_strength, \
|
||||
const pixel *top, int pri_strength, int sec_strength, \
|
||||
int dir, int damping, enum CdefEdgeFlags edges HIGHBD_DECL_SUFFIX)
|
||||
typedef decl_cdef_fn(*cdef_fn);
|
||||
|
||||
|
|
|
@ -39,24 +39,28 @@ enum Backup2x8Flags {
|
|||
BACKUP_2X8_UV = 1 << 1,
|
||||
};
|
||||
|
||||
static void backup2lines(pixel *const dst[3][2],
|
||||
/*const*/ pixel *const src[3],
|
||||
const ptrdiff_t src_stride[2], int y_off, int w,
|
||||
static void backup2lines(pixel *const dst[3], /*const*/ pixel *const src[3],
|
||||
const ptrdiff_t stride[2],
|
||||
const enum Dav1dPixelLayout layout)
|
||||
{
|
||||
pixel_copy(dst[0][0], src[0] + (y_off - 2) * PXSTRIDE(src_stride[0]), w);
|
||||
pixel_copy(dst[0][1], src[0] + (y_off - 1) * PXSTRIDE(src_stride[0]), w);
|
||||
const ptrdiff_t y_stride = PXSTRIDE(stride[0]);
|
||||
if (y_stride < 0)
|
||||
pixel_copy(dst[0] + y_stride, src[0] + 7 * y_stride, -2 * y_stride);
|
||||
else
|
||||
pixel_copy(dst[0], src[0] + 6 * y_stride, 2 * y_stride);
|
||||
|
||||
if (layout == DAV1D_PIXEL_LAYOUT_I400) return;
|
||||
const int ss_ver = layout == DAV1D_PIXEL_LAYOUT_I420;
|
||||
const int ss_hor = layout != DAV1D_PIXEL_LAYOUT_I444;
|
||||
|
||||
w >>= ss_hor;
|
||||
y_off >>= ss_ver;
|
||||
pixel_copy(dst[1][0], src[1] + (y_off - 2) * PXSTRIDE(src_stride[1]), w);
|
||||
pixel_copy(dst[1][1], src[1] + (y_off - 1) * PXSTRIDE(src_stride[1]), w);
|
||||
pixel_copy(dst[2][0], src[2] + (y_off - 2) * PXSTRIDE(src_stride[1]), w);
|
||||
pixel_copy(dst[2][1], src[2] + (y_off - 1) * PXSTRIDE(src_stride[1]), w);
|
||||
if (layout != DAV1D_PIXEL_LAYOUT_I400) {
|
||||
const ptrdiff_t uv_stride = PXSTRIDE(stride[1]);
|
||||
if (uv_stride < 0) {
|
||||
const int uv_off = layout == DAV1D_PIXEL_LAYOUT_I420 ? 3 : 7;
|
||||
pixel_copy(dst[1] + uv_stride, src[1] + uv_off * uv_stride, -2 * uv_stride);
|
||||
pixel_copy(dst[2] + uv_stride, src[2] + uv_off * uv_stride, -2 * uv_stride);
|
||||
} else {
|
||||
const int uv_off = layout == DAV1D_PIXEL_LAYOUT_I420 ? 2 : 6;
|
||||
pixel_copy(dst[1], src[1] + uv_off * uv_stride, 2 * uv_stride);
|
||||
pixel_copy(dst[2], src[2] + uv_off * uv_stride, 2 * uv_stride);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
static void backup2x8(pixel dst[3][8][2],
|
||||
|
@ -105,7 +109,6 @@ void bytefn(dav1d_cdef_brow)(Dav1dFrameContext *const f,
|
|||
const int damping = f->frame_hdr->cdef.damping + bitdepth_min_8;
|
||||
const enum Dav1dPixelLayout layout = f->cur.p.layout;
|
||||
const int uv_idx = DAV1D_PIXEL_LAYOUT_I444 - layout;
|
||||
const int has_chroma = layout != DAV1D_PIXEL_LAYOUT_I400;
|
||||
const int ss_ver = layout == DAV1D_PIXEL_LAYOUT_I420;
|
||||
const int ss_hor = layout != DAV1D_PIXEL_LAYOUT_I444;
|
||||
|
||||
|
@ -114,19 +117,16 @@ void bytefn(dav1d_cdef_brow)(Dav1dFrameContext *const f,
|
|||
const int by_idx = by & 30;
|
||||
if (by + 2 >= f->bh) edges &= ~CDEF_HAVE_BOTTOM;
|
||||
|
||||
if (edges & CDEF_HAVE_BOTTOM) {
|
||||
// backup pre-filter data for next iteration
|
||||
backup2lines(f->lf.cdef_line[!tf], ptrs, f->cur.stride,
|
||||
8, f->bw * 4, layout);
|
||||
}
|
||||
if (edges & CDEF_HAVE_BOTTOM) // backup pre-filter data for next iteration
|
||||
backup2lines(f->lf.cdef_line[!tf], ptrs, f->cur.stride, layout);
|
||||
|
||||
pixel lr_bak[2 /* idx */][3 /* plane */][8 /* y */][2 /* x */];
|
||||
ALIGN_STK_16(pixel, lr_bak, 2 /* idx */, [3 /* plane */][8 /* y */][2 /* x */]);
|
||||
pixel *iptrs[3] = { ptrs[0], ptrs[1], ptrs[2] };
|
||||
edges &= ~CDEF_HAVE_LEFT;
|
||||
edges |= CDEF_HAVE_RIGHT;
|
||||
enum Backup2x8Flags prev_flag = 0;
|
||||
for (int sbx = 0, last_skip = 1; sbx < sb64w; sbx++, edges |= CDEF_HAVE_LEFT) {
|
||||
const int sb128x = sbx >>1;
|
||||
const int sb128x = sbx >> 1;
|
||||
const int sb64_idx = ((by & sbsz) >> 3) + (sbx & 1);
|
||||
const int cdef_idx = lflvl[sb128x].cdef_idx[sb64_idx];
|
||||
if (cdef_idx == -1 ||
|
||||
|
@ -141,6 +141,16 @@ void bytefn(dav1d_cdef_brow)(Dav1dFrameContext *const f,
|
|||
const int uv_lvl = f->frame_hdr->cdef.uv_strength[cdef_idx];
|
||||
const enum Backup2x8Flags flag = !!y_lvl + (!!uv_lvl << 1);
|
||||
|
||||
const int y_pri_lvl = (y_lvl >> 2) << bitdepth_min_8;
|
||||
int y_sec_lvl = y_lvl & 3;
|
||||
y_sec_lvl += y_sec_lvl == 3;
|
||||
y_sec_lvl <<= bitdepth_min_8;
|
||||
|
||||
const int uv_pri_lvl = (uv_lvl >> 2) << bitdepth_min_8;
|
||||
int uv_sec_lvl = uv_lvl & 3;
|
||||
uv_sec_lvl += uv_sec_lvl == 3;
|
||||
uv_sec_lvl <<= bitdepth_min_8;
|
||||
|
||||
pixel *bptrs[3] = { iptrs[0], iptrs[1], iptrs[2] };
|
||||
for (int bx = sbx * sbsz; bx < imin((sbx + 1) * sbsz, f->bw);
|
||||
bx += 2, edges |= CDEF_HAVE_LEFT)
|
||||
|
@ -169,41 +179,32 @@ void bytefn(dav1d_cdef_brow)(Dav1dFrameContext *const f,
|
|||
backup2x8(lr_bak[!bit], bptrs, f->cur.stride, 8, layout, flag);
|
||||
}
|
||||
|
||||
// the actual filter
|
||||
const int y_pri_lvl = (y_lvl >> 2) << bitdepth_min_8;
|
||||
int y_sec_lvl = y_lvl & 3;
|
||||
y_sec_lvl += y_sec_lvl == 3;
|
||||
y_sec_lvl <<= bitdepth_min_8;
|
||||
const int uv_pri_lvl = (uv_lvl >> 2) << bitdepth_min_8;
|
||||
int uv_sec_lvl = uv_lvl & 3;
|
||||
uv_sec_lvl += uv_sec_lvl == 3;
|
||||
uv_sec_lvl <<= bitdepth_min_8;
|
||||
int dir;
|
||||
unsigned variance;
|
||||
const int dir = dsp->cdef.dir(bptrs[0], f->cur.stride[0],
|
||||
&variance HIGHBD_CALL_SUFFIX);
|
||||
if (y_lvl) {
|
||||
if (y_pri_lvl || uv_pri_lvl)
|
||||
dir = dsp->cdef.dir(bptrs[0], f->cur.stride[0],
|
||||
&variance HIGHBD_CALL_SUFFIX);
|
||||
|
||||
if (y_pri_lvl) {
|
||||
const int adj_y_pri_lvl = adjust_strength(y_pri_lvl, variance);
|
||||
if (adj_y_pri_lvl || y_sec_lvl)
|
||||
dsp->cdef.fb[0](bptrs[0], f->cur.stride[0], lr_bak[bit][0],
|
||||
&f->lf.cdef_line[tf][0][bx * 4],
|
||||
adj_y_pri_lvl, y_sec_lvl, dir,
|
||||
damping, edges HIGHBD_CALL_SUFFIX);
|
||||
} else if (y_sec_lvl)
|
||||
dsp->cdef.fb[0](bptrs[0], f->cur.stride[0], lr_bak[bit][0],
|
||||
(pixel *const [2]) {
|
||||
&f->lf.cdef_line[tf][0][0][bx * 4],
|
||||
&f->lf.cdef_line[tf][0][1][bx * 4],
|
||||
},
|
||||
adjust_strength(y_pri_lvl, variance),
|
||||
y_sec_lvl, y_pri_lvl ? dir : 0,
|
||||
&f->lf.cdef_line[tf][0][bx * 4],
|
||||
0, y_sec_lvl, 0,
|
||||
damping, edges HIGHBD_CALL_SUFFIX);
|
||||
}
|
||||
if (uv_lvl && has_chroma) {
|
||||
const int uvdir =
|
||||
f->cur.p.layout != DAV1D_PIXEL_LAYOUT_I422 ? dir :
|
||||
((uint8_t[]) { 7, 0, 2, 4, 5, 6, 6, 6 })[dir];
|
||||
if (uv_lvl) {
|
||||
assert(layout != DAV1D_PIXEL_LAYOUT_I400);
|
||||
const int uvdir = uv_pri_lvl ? layout == DAV1D_PIXEL_LAYOUT_I422 ?
|
||||
((const uint8_t[]) { 7, 0, 2, 4, 5, 6, 6, 6 })[dir] : dir : 0;
|
||||
for (int pl = 1; pl <= 2; pl++) {
|
||||
dsp->cdef.fb[uv_idx](bptrs[pl], f->cur.stride[1],
|
||||
lr_bak[bit][pl],
|
||||
(pixel *const [2]) {
|
||||
&f->lf.cdef_line[tf][pl][0][bx * 4 >> ss_hor],
|
||||
&f->lf.cdef_line[tf][pl][1][bx * 4 >> ss_hor],
|
||||
},
|
||||
uv_pri_lvl, uv_sec_lvl,
|
||||
uv_pri_lvl ? uvdir : 0,
|
||||
dsp->cdef.fb[uv_idx](bptrs[pl], f->cur.stride[1], lr_bak[bit][pl],
|
||||
&f->lf.cdef_line[tf][pl][bx * 4 >> ss_hor],
|
||||
uv_pri_lvl, uv_sec_lvl, uvdir,
|
||||
damping - 1, edges HIGHBD_CALL_SUFFIX);
|
||||
}
|
||||
}
|
||||
|
|
|
@ -32,29 +32,30 @@
|
|||
#include "common/intops.h"
|
||||
|
||||
#include "src/cdef.h"
|
||||
#include "src/tables.h"
|
||||
|
||||
static inline int constrain(const int diff, const int threshold,
|
||||
const int damping)
|
||||
const int shift)
|
||||
{
|
||||
if (!threshold) return 0;
|
||||
const int shift = imax(0, damping - ulog2(threshold));
|
||||
return apply_sign(imin(abs(diff), imax(0, threshold - (abs(diff) >> shift))),
|
||||
diff);
|
||||
const int adiff = abs(diff);
|
||||
return apply_sign(imin(adiff, imax(0, threshold - (adiff >> shift))), diff);
|
||||
}
|
||||
|
||||
static inline void fill(uint16_t *tmp, const ptrdiff_t stride,
|
||||
static inline void fill(int16_t *tmp, const ptrdiff_t stride,
|
||||
const int w, const int h)
|
||||
{
|
||||
/* Use a value that's a large positive number when interpreted as unsigned,
|
||||
* and a large negative number when interpreted as signed. */
|
||||
for (int y = 0; y < h; y++) {
|
||||
for (int x = 0; x < w; x++)
|
||||
tmp[x] = INT16_MAX;
|
||||
tmp[x] = INT16_MIN;
|
||||
tmp += stride;
|
||||
}
|
||||
}
|
||||
|
||||
static void padding(uint16_t *tmp, const ptrdiff_t tmp_stride,
|
||||
static void padding(int16_t *tmp, const ptrdiff_t tmp_stride,
|
||||
const pixel *src, const ptrdiff_t src_stride,
|
||||
const pixel (*left)[2], pixel *const top[2],
|
||||
const pixel (*left)[2], const pixel *top,
|
||||
const int w, const int h,
|
||||
const enum CdefEdgeFlags edges)
|
||||
{
|
||||
|
@ -77,9 +78,11 @@ static void padding(uint16_t *tmp, const ptrdiff_t tmp_stride,
|
|||
x_end -= 2;
|
||||
}
|
||||
|
||||
for (int y = y_start; y < 0; y++)
|
||||
for (int y = y_start; y < 0; y++) {
|
||||
for (int x = x_start; x < x_end; x++)
|
||||
tmp[x + y * tmp_stride] = top[y & 1][x];
|
||||
tmp[x + y * tmp_stride] = top[x];
|
||||
top += PXSTRIDE(src_stride);
|
||||
}
|
||||
for (int y = 0; y < h; y++)
|
||||
for (int x = x_start; x < 0; x++)
|
||||
tmp[x + y * tmp_stride] = left[y][2 + x];
|
||||
|
@ -93,75 +96,113 @@ static void padding(uint16_t *tmp, const ptrdiff_t tmp_stride,
|
|||
|
||||
static NOINLINE void
|
||||
cdef_filter_block_c(pixel *dst, const ptrdiff_t dst_stride,
|
||||
const pixel (*left)[2], /*const*/ pixel *const top[2],
|
||||
const int w, const int h, const int pri_strength,
|
||||
const int sec_strength, const int dir,
|
||||
const int damping, const enum CdefEdgeFlags edges
|
||||
HIGHBD_DECL_SUFFIX)
|
||||
const pixel (*left)[2], const pixel *const top,
|
||||
const int pri_strength, const int sec_strength,
|
||||
const int dir, const int damping, const int w, int h,
|
||||
const enum CdefEdgeFlags edges HIGHBD_DECL_SUFFIX)
|
||||
{
|
||||
static const int8_t cdef_directions[8 /* dir */][2 /* pass */] = {
|
||||
{ -1 * 12 + 1, -2 * 12 + 2 },
|
||||
{ 0 * 12 + 1, -1 * 12 + 2 },
|
||||
{ 0 * 12 + 1, 0 * 12 + 2 },
|
||||
{ 0 * 12 + 1, 1 * 12 + 2 },
|
||||
{ 1 * 12 + 1, 2 * 12 + 2 },
|
||||
{ 1 * 12 + 0, 2 * 12 + 1 },
|
||||
{ 1 * 12 + 0, 2 * 12 + 0 },
|
||||
{ 1 * 12 + 0, 2 * 12 - 1 }
|
||||
};
|
||||
const ptrdiff_t tmp_stride = 12;
|
||||
assert((w == 4 || w == 8) && (h == 4 || h == 8));
|
||||
uint16_t tmp_buf[144]; // 12*12 is the maximum value of tmp_stride * (h + 4)
|
||||
uint16_t *tmp = tmp_buf + 2 * tmp_stride + 2;
|
||||
const int bitdepth_min_8 = bitdepth_from_max(bitdepth_max) - 8;
|
||||
const int pri_tap = 4 - ((pri_strength >> bitdepth_min_8) & 1);
|
||||
int16_t tmp_buf[144]; // 12*12 is the maximum value of tmp_stride * (h + 4)
|
||||
int16_t *tmp = tmp_buf + 2 * tmp_stride + 2;
|
||||
|
||||
padding(tmp, tmp_stride, dst, dst_stride, left, top, w, h, edges);
|
||||
|
||||
// run actual filter
|
||||
for (int y = 0; y < h; y++) {
|
||||
for (int x = 0; x < w; x++) {
|
||||
int sum = 0;
|
||||
const int px = dst[x];
|
||||
int max = px, min = px;
|
||||
int pri_tap_k = pri_tap;
|
||||
for (int k = 0; k < 2; k++) {
|
||||
const int off1 = cdef_directions[dir][k];
|
||||
const int p0 = tmp[x + off1];
|
||||
const int p1 = tmp[x - off1];
|
||||
sum += pri_tap_k * constrain(p0 - px, pri_strength, damping);
|
||||
sum += pri_tap_k * constrain(p1 - px, pri_strength, damping);
|
||||
// if pri_tap_k == 4 then it becomes 2 else it remains 3
|
||||
pri_tap_k -= (pri_tap_k << 1) - 6;
|
||||
if (p0 != INT16_MAX) max = imax(p0, max);
|
||||
if (p1 != INT16_MAX) max = imax(p1, max);
|
||||
min = imin(p0, min);
|
||||
min = imin(p1, min);
|
||||
const int off2 = cdef_directions[(dir + 2) & 7][k];
|
||||
const int s0 = tmp[x + off2];
|
||||
const int s1 = tmp[x - off2];
|
||||
const int off3 = cdef_directions[(dir + 6) & 7][k];
|
||||
const int s2 = tmp[x + off3];
|
||||
const int s3 = tmp[x - off3];
|
||||
if (s0 != INT16_MAX) max = imax(s0, max);
|
||||
if (s1 != INT16_MAX) max = imax(s1, max);
|
||||
if (s2 != INT16_MAX) max = imax(s2, max);
|
||||
if (s3 != INT16_MAX) max = imax(s3, max);
|
||||
min = imin(s0, min);
|
||||
min = imin(s1, min);
|
||||
min = imin(s2, min);
|
||||
min = imin(s3, min);
|
||||
// sec_tap starts at 2 and becomes 1
|
||||
const int sec_tap = 2 - k;
|
||||
sum += sec_tap * constrain(s0 - px, sec_strength, damping);
|
||||
sum += sec_tap * constrain(s1 - px, sec_strength, damping);
|
||||
sum += sec_tap * constrain(s2 - px, sec_strength, damping);
|
||||
sum += sec_tap * constrain(s3 - px, sec_strength, damping);
|
||||
}
|
||||
dst[x] = iclip(px + ((8 + sum - (sum < 0)) >> 4), min, max);
|
||||
if (pri_strength) {
|
||||
const int bitdepth_min_8 = bitdepth_from_max(bitdepth_max) - 8;
|
||||
const int pri_tap = 4 - ((pri_strength >> bitdepth_min_8) & 1);
|
||||
const int pri_shift = imax(0, damping - ulog2(pri_strength));
|
||||
if (sec_strength) {
|
||||
const int sec_shift = imax(0, damping - ulog2(sec_strength));
|
||||
do {
|
||||
for (int x = 0; x < w; x++) {
|
||||
const int px = dst[x];
|
||||
int sum = 0;
|
||||
int max = px, min = px;
|
||||
int pri_tap_k = pri_tap;
|
||||
for (int k = 0; k < 2; k++) {
|
||||
const int off1 = dav1d_cdef_directions[dir + 2][k]; // dir
|
||||
const int p0 = tmp[x + off1];
|
||||
const int p1 = tmp[x - off1];
|
||||
sum += pri_tap_k * constrain(p0 - px, pri_strength, pri_shift);
|
||||
sum += pri_tap_k * constrain(p1 - px, pri_strength, pri_shift);
|
||||
// if pri_tap_k == 4 then it becomes 2 else it remains 3
|
||||
pri_tap_k = (pri_tap_k & 3) | 2;
|
||||
min = umin(p0, min);
|
||||
max = imax(p0, max);
|
||||
min = umin(p1, min);
|
||||
max = imax(p1, max);
|
||||
const int off2 = dav1d_cdef_directions[dir + 4][k]; // dir + 2
|
||||
const int off3 = dav1d_cdef_directions[dir + 0][k]; // dir - 2
|
||||
const int s0 = tmp[x + off2];
|
||||
const int s1 = tmp[x - off2];
|
||||
const int s2 = tmp[x + off3];
|
||||
const int s3 = tmp[x - off3];
|
||||
// sec_tap starts at 2 and becomes 1
|
||||
const int sec_tap = 2 - k;
|
||||
sum += sec_tap * constrain(s0 - px, sec_strength, sec_shift);
|
||||
sum += sec_tap * constrain(s1 - px, sec_strength, sec_shift);
|
||||
sum += sec_tap * constrain(s2 - px, sec_strength, sec_shift);
|
||||
sum += sec_tap * constrain(s3 - px, sec_strength, sec_shift);
|
||||
min = umin(s0, min);
|
||||
max = imax(s0, max);
|
||||
min = umin(s1, min);
|
||||
max = imax(s1, max);
|
||||
min = umin(s2, min);
|
||||
max = imax(s2, max);
|
||||
min = umin(s3, min);
|
||||
max = imax(s3, max);
|
||||
}
|
||||
dst[x] = iclip(px + ((sum - (sum < 0) + 8) >> 4), min, max);
|
||||
}
|
||||
dst += PXSTRIDE(dst_stride);
|
||||
tmp += tmp_stride;
|
||||
} while (--h);
|
||||
} else { // pri_strength only
|
||||
do {
|
||||
for (int x = 0; x < w; x++) {
|
||||
const int px = dst[x];
|
||||
int sum = 0;
|
||||
int pri_tap_k = pri_tap;
|
||||
for (int k = 0; k < 2; k++) {
|
||||
const int off = dav1d_cdef_directions[dir + 2][k]; // dir
|
||||
const int p0 = tmp[x + off];
|
||||
const int p1 = tmp[x - off];
|
||||
sum += pri_tap_k * constrain(p0 - px, pri_strength, pri_shift);
|
||||
sum += pri_tap_k * constrain(p1 - px, pri_strength, pri_shift);
|
||||
pri_tap_k = (pri_tap_k & 3) | 2;
|
||||
}
|
||||
dst[x] = px + ((sum - (sum < 0) + 8) >> 4);
|
||||
}
|
||||
dst += PXSTRIDE(dst_stride);
|
||||
tmp += tmp_stride;
|
||||
} while (--h);
|
||||
}
|
||||
dst += PXSTRIDE(dst_stride);
|
||||
tmp += tmp_stride;
|
||||
} else { // sec_strength only
|
||||
assert(sec_strength);
|
||||
const int sec_shift = imax(0, damping - ulog2(sec_strength));
|
||||
do {
|
||||
for (int x = 0; x < w; x++) {
|
||||
const int px = dst[x];
|
||||
int sum = 0;
|
||||
for (int k = 0; k < 2; k++) {
|
||||
const int off1 = dav1d_cdef_directions[dir + 4][k]; // dir + 2
|
||||
const int off2 = dav1d_cdef_directions[dir + 0][k]; // dir - 2
|
||||
const int s0 = tmp[x + off1];
|
||||
const int s1 = tmp[x - off1];
|
||||
const int s2 = tmp[x + off2];
|
||||
const int s3 = tmp[x - off2];
|
||||
const int sec_tap = 2 - k;
|
||||
sum += sec_tap * constrain(s0 - px, sec_strength, sec_shift);
|
||||
sum += sec_tap * constrain(s1 - px, sec_strength, sec_shift);
|
||||
sum += sec_tap * constrain(s2 - px, sec_strength, sec_shift);
|
||||
sum += sec_tap * constrain(s3 - px, sec_strength, sec_shift);
|
||||
}
|
||||
dst[x] = px + ((sum - (sum < 0) + 8) >> 4);
|
||||
}
|
||||
dst += PXSTRIDE(dst_stride);
|
||||
tmp += tmp_stride;
|
||||
} while (--h);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -169,7 +210,7 @@ cdef_filter_block_c(pixel *dst, const ptrdiff_t dst_stride,
|
|||
static void cdef_filter_block_##w##x##h##_c(pixel *const dst, \
|
||||
const ptrdiff_t stride, \
|
||||
const pixel (*left)[2], \
|
||||
/*const*/ pixel *const top[2], \
|
||||
const pixel *const top, \
|
||||
const int pri_strength, \
|
||||
const int sec_strength, \
|
||||
const int dir, \
|
||||
|
@ -177,8 +218,8 @@ static void cdef_filter_block_##w##x##h##_c(pixel *const dst, \
|
|||
const enum CdefEdgeFlags edges \
|
||||
HIGHBD_DECL_SUFFIX) \
|
||||
{ \
|
||||
cdef_filter_block_c(dst, stride, left, top, w, h, pri_strength, sec_strength, \
|
||||
dir, damping, edges HIGHBD_TAIL_SUFFIX); \
|
||||
cdef_filter_block_c(dst, stride, left, top, pri_strength, sec_strength, \
|
||||
dir, damping, w, h, edges HIGHBD_TAIL_SUFFIX); \
|
||||
}
|
||||
|
||||
cdef_fn(4, 4);
|
||||
|
|
|
@ -30,24 +30,27 @@
|
|||
|
||||
#include "src/cpu.h"
|
||||
|
||||
static unsigned flags = 0;
|
||||
#if ARCH_X86
|
||||
/* Disable AVX-512 by default for the time being */
|
||||
static unsigned flags_mask = ~DAV1D_X86_CPU_FLAG_AVX512ICL;
|
||||
#else
|
||||
static unsigned flags_mask = -1;
|
||||
#endif
|
||||
|
||||
COLD void dav1d_init_cpu(void) {
|
||||
#if HAVE_ASM
|
||||
#if ARCH_AARCH64 || ARCH_ARM
|
||||
flags = dav1d_get_cpu_flags_arm();
|
||||
#elif ARCH_PPC64LE
|
||||
flags = dav1d_get_cpu_flags_ppc();
|
||||
#elif ARCH_X86
|
||||
flags = dav1d_get_cpu_flags_x86();
|
||||
#endif
|
||||
#endif
|
||||
}
|
||||
|
||||
COLD unsigned dav1d_get_cpu_flags(void) {
|
||||
static unsigned flags;
|
||||
static uint8_t checked = 0;
|
||||
|
||||
if (!checked) {
|
||||
#if (ARCH_AARCH64 || ARCH_ARM) && HAVE_ASM
|
||||
flags = dav1d_get_cpu_flags_arm();
|
||||
#elif ARCH_PPC64LE && HAVE_ASM
|
||||
flags = dav1d_get_cpu_flags_ppc();
|
||||
#elif ARCH_X86 && HAVE_ASM
|
||||
flags = dav1d_get_cpu_flags_x86();
|
||||
#else
|
||||
flags = 0;
|
||||
#endif
|
||||
checked = 1;
|
||||
}
|
||||
return flags & flags_mask;
|
||||
}
|
||||
|
||||
|
|
|
@ -42,7 +42,8 @@
|
|||
#include "src/x86/cpu.h"
|
||||
#endif
|
||||
|
||||
void dav1d_init_cpu(void);
|
||||
unsigned dav1d_get_cpu_flags(void);
|
||||
DAV1D_API void dav1d_set_cpu_flags_mask(const unsigned mask);
|
||||
DAV1D_API void dav1d_set_cpu_flags_mask(unsigned mask);
|
||||
|
||||
#endif /* DAV1D_SRC_CPU_H */
|
||||
|
|
|
@ -627,8 +627,8 @@ static void read_vartx_tree(Dav1dTileContext *const t,
|
|||
// var-tx tree coding
|
||||
b->tx_split[0] = b->tx_split[1] = 0;
|
||||
b->max_ytx = dav1d_max_txfm_size_for_bs[bs][0];
|
||||
if (f->frame_hdr->segmentation.lossless[b->seg_id] ||
|
||||
b->max_ytx == TX_4X4)
|
||||
if (!b->skip && (f->frame_hdr->segmentation.lossless[b->seg_id] ||
|
||||
b->max_ytx == TX_4X4))
|
||||
{
|
||||
b->max_ytx = b->uvtx = TX_4X4;
|
||||
if (f->frame_hdr->txfm_mode == DAV1D_TX_SWITCHABLE) {
|
||||
|
@ -645,8 +645,6 @@ static void read_vartx_tree(Dav1dTileContext *const t,
|
|||
case_set(bh4, l., 1, by4);
|
||||
case_set(bw4, a->, 0, bx4);
|
||||
#undef set_ctx
|
||||
} else {
|
||||
assert(f->frame_hdr->txfm_mode == DAV1D_TX_LARGEST);
|
||||
}
|
||||
b->uvtx = dav1d_max_txfm_size_for_bs[bs][f->cur.p.layout];
|
||||
} else {
|
||||
|
@ -1878,10 +1876,11 @@ static int decode_b(Dav1dTileContext *const t,
|
|||
b->inter_mode == (is_comp ? GLOBALMV_GLOBALMV : GLOBALMV);
|
||||
const uint8_t (*const lf_lvls)[8][2] = (const uint8_t (*)[8][2])
|
||||
&ts->lflvl[b->seg_id][0][b->ref[0] + 1][!is_globalmv];
|
||||
dav1d_create_lf_mask_inter(t->lf_mask, f->lf.level, f->b4_stride,
|
||||
lf_lvls, t->bx, t->by, f->w4, f->h4,
|
||||
b->skip, bs, b->tx_split, b->uvtx,
|
||||
f->cur.p.layout,
|
||||
dav1d_create_lf_mask_inter(t->lf_mask, f->lf.level, f->b4_stride, lf_lvls,
|
||||
t->bx, t->by, f->w4, f->h4, b->skip, bs,
|
||||
f->frame_hdr->segmentation.lossless[b->seg_id] ?
|
||||
(enum RectTxfmSize) TX_4X4 : b->max_ytx,
|
||||
b->tx_split, b->uvtx, f->cur.p.layout,
|
||||
&t->a->tx_lpf_y[bx4], &t->l.tx_lpf_y[by4],
|
||||
has_chroma ? &t->a->tx_lpf_uv[cbx4] : NULL,
|
||||
has_chroma ? &t->l.tx_lpf_uv[cby4] : NULL);
|
||||
|
@ -2350,7 +2349,7 @@ static void setup_tile(Dav1dTileState *const ts,
|
|||
|
||||
// Reference Restoration Unit (used for exp coding)
|
||||
int sb_idx, unit_idx;
|
||||
if (f->frame_hdr->super_res.enabled) {
|
||||
if (f->frame_hdr->width[0] != f->frame_hdr->width[1]) {
|
||||
// vertical components only
|
||||
sb_idx = (ts->tiling.row_start >> 5) * f->sr_sb128w;
|
||||
unit_idx = (ts->tiling.row_start & 16) >> 3;
|
||||
|
@ -2363,7 +2362,7 @@ static void setup_tile(Dav1dTileState *const ts,
|
|||
if (!((f->lf.restore_planes >> p) & 1U))
|
||||
continue;
|
||||
|
||||
if (f->frame_hdr->super_res.enabled) {
|
||||
if (f->frame_hdr->width[0] != f->frame_hdr->width[1]) {
|
||||
const int ss_hor = p && f->cur.p.layout != DAV1D_PIXEL_LAYOUT_I444;
|
||||
const int d = f->frame_hdr->super_res.width_scale_denominator;
|
||||
const int unit_size_log2 = f->frame_hdr->restoration.unit_size[!!p];
|
||||
|
@ -2543,7 +2542,7 @@ int dav1d_decode_tile_sbrow(Dav1dTileContext *const t) {
|
|||
|
||||
const enum Dav1dRestorationType frame_type = f->frame_hdr->restoration.type[p];
|
||||
|
||||
if (f->frame_hdr->super_res.enabled) {
|
||||
if (f->frame_hdr->width[0] != f->frame_hdr->width[1]) {
|
||||
const int w = (f->sr_cur.p.p.w + ss_hor) >> ss_hor;
|
||||
const int n_units = imax(1, (w + half_unit) >> unit_size_log2);
|
||||
|
||||
|
@ -2763,24 +2762,42 @@ int dav1d_decode_frame(Dav1dFrameContext *const f) {
|
|||
}
|
||||
|
||||
// update allocation of block contexts for above
|
||||
const int line_sz = (int)f->b4_stride << hbd;
|
||||
if (line_sz != f->lf.line_sz) {
|
||||
dav1d_freep_aligned(&f->lf.cdef_line[0][0][0]);
|
||||
uint8_t *ptr = dav1d_alloc_aligned(line_sz * 4 * 12, 32);
|
||||
const ptrdiff_t y_stride = f->cur.stride[0], uv_stride = f->cur.stride[1];
|
||||
if (y_stride != f->lf.cdef_line_sz[0] || uv_stride != f->lf.cdef_line_sz[1]) {
|
||||
dav1d_free_aligned(f->lf.cdef_line_buf);
|
||||
size_t alloc_sz = 64;
|
||||
alloc_sz += (y_stride < 0 ? -y_stride : y_stride ) * 4;
|
||||
alloc_sz += (uv_stride < 0 ? -uv_stride : uv_stride) * 8;
|
||||
uint8_t *ptr = f->lf.cdef_line_buf = dav1d_alloc_aligned(alloc_sz, 32);
|
||||
if (!ptr) {
|
||||
f->lf.line_sz = 0;
|
||||
f->lf.cdef_line_sz[0] = f->lf.cdef_line_sz[1] = 0;
|
||||
goto error;
|
||||
}
|
||||
|
||||
for (int pl = 0; pl <= 2; pl++) {
|
||||
f->lf.cdef_line[0][pl][0] = ptr + line_sz * 4 * 0;
|
||||
f->lf.cdef_line[0][pl][1] = ptr + line_sz * 4 * 1;
|
||||
f->lf.cdef_line[1][pl][0] = ptr + line_sz * 4 * 2;
|
||||
f->lf.cdef_line[1][pl][1] = ptr + line_sz * 4 * 3;
|
||||
ptr += line_sz * 4 * 4;
|
||||
ptr += 32;
|
||||
if (y_stride < 0) {
|
||||
f->lf.cdef_line[0][0] = ptr - y_stride * 1;
|
||||
f->lf.cdef_line[1][0] = ptr - y_stride * 3;
|
||||
ptr -= y_stride * 4;
|
||||
} else {
|
||||
f->lf.cdef_line[0][0] = ptr + y_stride * 0;
|
||||
f->lf.cdef_line[1][0] = ptr + y_stride * 2;
|
||||
ptr += y_stride * 4;
|
||||
}
|
||||
if (uv_stride < 0) {
|
||||
f->lf.cdef_line[0][1] = ptr - uv_stride * 1;
|
||||
f->lf.cdef_line[0][2] = ptr - uv_stride * 3;
|
||||
f->lf.cdef_line[1][1] = ptr - uv_stride * 5;
|
||||
f->lf.cdef_line[1][2] = ptr - uv_stride * 7;
|
||||
} else {
|
||||
f->lf.cdef_line[0][1] = ptr + uv_stride * 0;
|
||||
f->lf.cdef_line[0][2] = ptr + uv_stride * 2;
|
||||
f->lf.cdef_line[1][1] = ptr + uv_stride * 4;
|
||||
f->lf.cdef_line[1][2] = ptr + uv_stride * 6;
|
||||
}
|
||||
|
||||
f->lf.line_sz = line_sz;
|
||||
f->lf.cdef_line_sz[0] = (int) y_stride;
|
||||
f->lf.cdef_line_sz[1] = (int) uv_stride;
|
||||
}
|
||||
|
||||
const int lr_line_sz = ((f->sr_cur.p.p.w + 31) & ~31) << hbd;
|
||||
|
@ -2944,14 +2961,19 @@ int dav1d_decode_frame(Dav1dFrameContext *const f) {
|
|||
}
|
||||
}
|
||||
|
||||
// init loopfilter pointers
|
||||
/* Init loopfilter pointers. Increasing NULL pointers is technically UB,
|
||||
* so just point the chroma pointers in 4:0:0 to the luma plane here to
|
||||
* avoid having additional in-loop branches in various places. We never
|
||||
* dereference those pointers so it doesn't really matter what they
|
||||
* point at, as long as the pointers are valid. */
|
||||
const int has_chroma = f->cur.p.layout != DAV1D_PIXEL_LAYOUT_I400;
|
||||
f->lf.mask_ptr = f->lf.mask;
|
||||
f->lf.p[0] = f->cur.data[0];
|
||||
f->lf.p[1] = f->cur.data[1];
|
||||
f->lf.p[2] = f->cur.data[2];
|
||||
f->lf.p[1] = f->cur.data[has_chroma ? 1 : 0];
|
||||
f->lf.p[2] = f->cur.data[has_chroma ? 2 : 0];
|
||||
f->lf.sr_p[0] = f->sr_cur.p.data[0];
|
||||
f->lf.sr_p[1] = f->sr_cur.p.data[1];
|
||||
f->lf.sr_p[2] = f->sr_cur.p.data[2];
|
||||
f->lf.sr_p[1] = f->sr_cur.p.data[has_chroma ? 1 : 0];
|
||||
f->lf.sr_p[2] = f->sr_cur.p.data[has_chroma ? 2 : 0];
|
||||
f->lf.tile_row = 1;
|
||||
|
||||
dav1d_cdf_thread_wait(&f->in_cdf);
|
||||
|
@ -3220,7 +3242,7 @@ int dav1d_submit_frame(Dav1dContext *const c) {
|
|||
dav1d_intra_pred_dsp_init_##bd##bpc(&dsp->ipred); \
|
||||
dav1d_itx_dsp_init_##bd##bpc(&dsp->itx); \
|
||||
dav1d_loop_filter_dsp_init_##bd##bpc(&dsp->lf); \
|
||||
dav1d_loop_restoration_dsp_init_##bd##bpc(&dsp->lr); \
|
||||
dav1d_loop_restoration_dsp_init_##bd##bpc(&dsp->lr, bpc); \
|
||||
dav1d_mc_dsp_init_##bd##bpc(&dsp->mc); \
|
||||
dav1d_film_grain_dsp_init_##bd##bpc(&dsp->fg); \
|
||||
break
|
||||
|
@ -3301,7 +3323,8 @@ int dav1d_submit_frame(Dav1dContext *const c) {
|
|||
}
|
||||
f->gmv_warp_allowed[i] = f->frame_hdr->gmv[i].type > DAV1D_WM_TYPE_TRANSLATION &&
|
||||
!f->frame_hdr->force_integer_mv &&
|
||||
!dav1d_get_shear_params(&f->frame_hdr->gmv[i]);
|
||||
!dav1d_get_shear_params(&f->frame_hdr->gmv[i]) &&
|
||||
!f->svc[i][0].scale;
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -3338,14 +3361,14 @@ int dav1d_submit_frame(Dav1dContext *const c) {
|
|||
res = dav1d_thread_picture_alloc(c, f, bpc);
|
||||
if (res < 0) goto error;
|
||||
|
||||
if (f->frame_hdr->super_res.enabled) {
|
||||
if (f->frame_hdr->width[0] != f->frame_hdr->width[1]) {
|
||||
res = dav1d_picture_alloc_copy(c, &f->cur, f->frame_hdr->width[0], &f->sr_cur.p);
|
||||
if (res < 0) goto error;
|
||||
} else {
|
||||
dav1d_picture_ref(&f->cur, &f->sr_cur.p);
|
||||
}
|
||||
|
||||
if (f->frame_hdr->super_res.enabled) {
|
||||
if (f->frame_hdr->width[0] != f->frame_hdr->width[1]) {
|
||||
f->resize_step[0] = scale_fac(f->cur.p.w, f->sr_cur.p.p.w);
|
||||
const int ss_hor = f->cur.p.layout != DAV1D_PIXEL_LAYOUT_I444;
|
||||
const int in_cw = (f->cur.p.w + ss_hor) >> ss_hor;
|
||||
|
|
|
@ -651,8 +651,10 @@ DECLARE_ARG 7, 8, 9, 10, 11, 12, 13, 14
|
|||
|
||||
%if WIN64 == 0
|
||||
%macro WIN64_SPILL_XMM 1
|
||||
%assign xmm_regs_used %1
|
||||
%endmacro
|
||||
%macro WIN64_RESTORE_XMM 0
|
||||
%assign xmm_regs_used 0
|
||||
%endmacro
|
||||
%macro WIN64_PUSH_XMM 0
|
||||
%endmacro
|
||||
|
@ -824,33 +826,34 @@ BRANCH_INSTR jz, je, jnz, jne, jl, jle, jnl, jnle, jg, jge, jng, jnge, ja, jae,
|
|||
|
||||
; cpuflags
|
||||
|
||||
%assign cpuflags_mmx (1<<0)
|
||||
%assign cpuflags_mmx2 (1<<1) | cpuflags_mmx
|
||||
%assign cpuflags_3dnow (1<<2) | cpuflags_mmx
|
||||
%assign cpuflags_3dnowext (1<<3) | cpuflags_3dnow
|
||||
%assign cpuflags_sse (1<<4) | cpuflags_mmx2
|
||||
%assign cpuflags_sse2 (1<<5) | cpuflags_sse
|
||||
%assign cpuflags_sse2slow (1<<6) | cpuflags_sse2
|
||||
%assign cpuflags_lzcnt (1<<7) | cpuflags_sse2
|
||||
%assign cpuflags_sse3 (1<<8) | cpuflags_sse2
|
||||
%assign cpuflags_ssse3 (1<<9) | cpuflags_sse3
|
||||
%assign cpuflags_sse4 (1<<10)| cpuflags_ssse3
|
||||
%assign cpuflags_sse42 (1<<11)| cpuflags_sse4
|
||||
%assign cpuflags_aesni (1<<12)| cpuflags_sse42
|
||||
%assign cpuflags_gfni (1<<13)| cpuflags_sse42
|
||||
%assign cpuflags_avx (1<<14)| cpuflags_sse42
|
||||
%assign cpuflags_xop (1<<15)| cpuflags_avx
|
||||
%assign cpuflags_fma4 (1<<16)| cpuflags_avx
|
||||
%assign cpuflags_fma3 (1<<17)| cpuflags_avx
|
||||
%assign cpuflags_bmi1 (1<<18)| cpuflags_avx|cpuflags_lzcnt
|
||||
%assign cpuflags_bmi2 (1<<19)| cpuflags_bmi1
|
||||
%assign cpuflags_avx2 (1<<20)| cpuflags_fma3|cpuflags_bmi2
|
||||
%assign cpuflags_avx512 (1<<21)| cpuflags_avx2 ; F, CD, BW, DQ, VL
|
||||
%assign cpuflags_mmx (1<<0)
|
||||
%assign cpuflags_mmx2 (1<<1) | cpuflags_mmx
|
||||
%assign cpuflags_3dnow (1<<2) | cpuflags_mmx
|
||||
%assign cpuflags_3dnowext (1<<3) | cpuflags_3dnow
|
||||
%assign cpuflags_sse (1<<4) | cpuflags_mmx2
|
||||
%assign cpuflags_sse2 (1<<5) | cpuflags_sse
|
||||
%assign cpuflags_sse2slow (1<<6) | cpuflags_sse2
|
||||
%assign cpuflags_lzcnt (1<<7) | cpuflags_sse2
|
||||
%assign cpuflags_sse3 (1<<8) | cpuflags_sse2
|
||||
%assign cpuflags_ssse3 (1<<9) | cpuflags_sse3
|
||||
%assign cpuflags_sse4 (1<<10) | cpuflags_ssse3
|
||||
%assign cpuflags_sse42 (1<<11) | cpuflags_sse4
|
||||
%assign cpuflags_aesni (1<<12) | cpuflags_sse42
|
||||
%assign cpuflags_gfni (1<<13) | cpuflags_sse42
|
||||
%assign cpuflags_avx (1<<14) | cpuflags_sse42
|
||||
%assign cpuflags_xop (1<<15) | cpuflags_avx
|
||||
%assign cpuflags_fma4 (1<<16) | cpuflags_avx
|
||||
%assign cpuflags_fma3 (1<<17) | cpuflags_avx
|
||||
%assign cpuflags_bmi1 (1<<18) | cpuflags_avx|cpuflags_lzcnt
|
||||
%assign cpuflags_bmi2 (1<<19) | cpuflags_bmi1
|
||||
%assign cpuflags_avx2 (1<<20) | cpuflags_fma3|cpuflags_bmi2
|
||||
%assign cpuflags_avx512 (1<<21) | cpuflags_avx2 ; F, CD, BW, DQ, VL
|
||||
%assign cpuflags_avx512icl (1<<22) | cpuflags_avx512|cpuflags_gfni ; VNNI, IFMA, VBMI, VBMI2, VPOPCNTDQ, BITALG, VAES, VPCLMULQDQ
|
||||
|
||||
%assign cpuflags_cache32 (1<<22)
|
||||
%assign cpuflags_cache64 (1<<23)
|
||||
%assign cpuflags_aligned (1<<24) ; not a cpu feature, but a function variant
|
||||
%assign cpuflags_atom (1<<25)
|
||||
%assign cpuflags_cache32 (1<<23)
|
||||
%assign cpuflags_cache64 (1<<24)
|
||||
%assign cpuflags_aligned (1<<25) ; not a cpu feature, but a function variant
|
||||
%assign cpuflags_atom (1<<26)
|
||||
|
||||
; Returns a boolean value expressing whether or not the specified cpuflag is enabled.
|
||||
%define cpuflag(x) (((((cpuflags & (cpuflags_ %+ x)) ^ (cpuflags_ %+ x)) - 1) >> 31) & 1)
|
||||
|
|
|
@ -122,17 +122,32 @@ void bitfn(dav1d_apply_grain)(const Dav1dFilmGrainDSPContext *const dsp,
|
|||
// TODO: eliminate in favor of per-plane refs
|
||||
assert(out->stride[0] == in->stride[0]);
|
||||
if (!data->num_y_points) {
|
||||
memcpy(out->data[0], in->data[0], out->p.h * out->stride[0]);
|
||||
const ptrdiff_t stride = out->stride[0];
|
||||
const ptrdiff_t sz = out->p.h * stride;
|
||||
if (sz < 0)
|
||||
memcpy((uint8_t*) out->data[0] + sz - stride,
|
||||
(uint8_t*) in->data[0] + sz - stride, -sz);
|
||||
else
|
||||
memcpy(out->data[0], in->data[0], sz);
|
||||
}
|
||||
|
||||
if (in->p.layout != DAV1D_PIXEL_LAYOUT_I400) {
|
||||
if (in->p.layout != DAV1D_PIXEL_LAYOUT_I400 && !data->chroma_scaling_from_luma) {
|
||||
assert(out->stride[1] == in->stride[1]);
|
||||
for (int i = 0; i < 2; i++) {
|
||||
if (!data->num_uv_points[i] && !data->chroma_scaling_from_luma) {
|
||||
const int suby = in->p.layout == DAV1D_PIXEL_LAYOUT_I420;
|
||||
memcpy(out->data[1+i], in->data[1+i],
|
||||
(out->p.h >> suby) * out->stride[1]);
|
||||
}
|
||||
const int ss_ver = in->p.layout == DAV1D_PIXEL_LAYOUT_I420;
|
||||
const ptrdiff_t stride = out->stride[1];
|
||||
const ptrdiff_t sz = (out->p.h * stride) >> ss_ver;
|
||||
if (sz < 0) {
|
||||
if (!data->num_uv_points[0])
|
||||
memcpy((uint8_t*) out->data[1] + sz - stride,
|
||||
(uint8_t*) in->data[1] + sz - stride, -sz);
|
||||
if (!data->num_uv_points[1])
|
||||
memcpy((uint8_t*) out->data[2] + sz - stride,
|
||||
(uint8_t*) in->data[2] + sz - stride, -sz);
|
||||
} else {
|
||||
if (!data->num_uv_points[0])
|
||||
memcpy(out->data[1], in->data[1], sz);
|
||||
if (!data->num_uv_points[1])
|
||||
memcpy(out->data[2], in->data[2], sz);
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@ -43,7 +43,7 @@ static inline int get_random_number(const int bits, unsigned *const state) {
|
|||
return (*state >> (16 - bits)) & ((1 << bits) - 1);
|
||||
}
|
||||
|
||||
static inline int round2(const int x, const int shift) {
|
||||
static inline int round2(const int x, const uint64_t shift) {
|
||||
return (x + ((1 << shift) >> 1)) >> shift;
|
||||
}
|
||||
|
||||
|
|
|
@ -216,12 +216,14 @@ struct Dav1dFrameContext {
|
|||
Av1Filter *mask;
|
||||
Av1Restoration *lr_mask;
|
||||
int top_pre_cdef_toggle;
|
||||
int mask_sz /* w*h */, lr_mask_sz, line_sz /* w */, lr_line_sz, re_sz /* h */;
|
||||
int mask_sz /* w*h */, lr_mask_sz, cdef_line_sz[2] /* stride */;
|
||||
int lr_line_sz, re_sz /* h */;
|
||||
ALIGN(Av1FilterLUT lim_lut, 16);
|
||||
int last_sharpness;
|
||||
uint8_t lvl[8 /* seg_id */][4 /* dir */][8 /* ref */][2 /* is_gmv */];
|
||||
uint8_t *tx_lpf_right_edge[2];
|
||||
pixel *cdef_line[2 /* pre, post */][3 /* plane */][2 /* y */];
|
||||
uint8_t *cdef_line_buf;
|
||||
pixel *cdef_line[2 /* pre, post */][3 /* plane */];
|
||||
pixel *lr_lpf_line[3 /* plane */];
|
||||
|
||||
// in-loop filter per-frame state keeping
|
||||
|
@ -288,7 +290,7 @@ struct Dav1dTileContext {
|
|||
uint16_t al_pal[2 /* a/l */][32 /* bx/y4 */][3 /* plane */][8 /* palette_idx */];
|
||||
uint8_t pal_sz_uv[2 /* a/l */][32 /* bx4/by4 */];
|
||||
uint8_t txtp_map[32 * 32]; // inter-only
|
||||
ALIGN(union, 32) {
|
||||
ALIGN(union, 64) {
|
||||
struct {
|
||||
union {
|
||||
uint8_t lap_8bpc [128 * 32];
|
||||
|
|
|
@ -66,7 +66,7 @@
|
|||
* range, in the following order:
|
||||
* - [0] will be the top/left edge pixel;
|
||||
* - [1..w] will be the top edge pixels (1 being left-most, w being right-most);
|
||||
* - [w+1..w*w] will be the top/right edge pixels;
|
||||
* - [w+1..2*w] will be the top/right edge pixels;
|
||||
* - [-1..-w] will be the left edge pixels (-1 being top-most, -w being bottom-
|
||||
* most);
|
||||
* - [-w-1..-2*w] will be the bottom/left edge pixels.
|
||||
|
|
|
@ -1,6 +1,6 @@
/*
 * Copyright © 2018, VideoLAN and dav1d authors
 * Copyright © 2018, Two Orioles, LLC
 * Copyright © 2018-2019, VideoLAN and dav1d authors
 * Copyright © 2018-2019, Two Orioles, LLC
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
@ -30,7 +30,9 @@
#include <stddef.h>
#include <stdint.h>

#include "common/attributes.h"
#include "common/intops.h"

#include "src/itx_1d.h"

#define CLIP(a) iclip(a, min, max)
@ -60,41 +62,62 @@
 * wrap around.
 */

static void NOINLINE
inv_dct4_1d(const coef *const in, const ptrdiff_t in_s,
            coef *const out, const ptrdiff_t out_s, const int max)
static NOINLINE void
inv_dct4_1d_internal_c(int32_t *const c, const ptrdiff_t stride,
                       const int min, const int max, const int tx64)
{
    const int min = -max - 1;
    const int in0 = in[0 * in_s], in1 = in[1 * in_s];
    const int in2 = in[2 * in_s], in3 = in[3 * in_s];
    assert(stride > 0);
    const int in0 = c[0 * stride], in1 = c[1 * stride];

    int t0 = ((in0 + in2) * 181 + 128) >> 8;
    int t1 = ((in0 - in2) * 181 + 128) >> 8;
    int t2 = ((in1 * 1567 - in3 * (3784 - 4096) + 2048) >> 12) - in3;
    int t3 = ((in1 * (3784 - 4096) + in3 * 1567 + 2048) >> 12) + in1;
    int t0, t1, t2, t3;
    if (tx64) {
        t0 = t1 = (in0 * 181 + 128) >> 8;
        t2 = (in1 * 1567 + 2048) >> 12;
        t3 = (in1 * 3784 + 2048) >> 12;
    } else {
        const int in2 = c[2 * stride], in3 = c[3 * stride];

    out[0 * out_s] = CLIP(t0 + t3);
    out[1 * out_s] = CLIP(t1 + t2);
    out[2 * out_s] = CLIP(t1 - t2);
    out[3 * out_s] = CLIP(t0 - t3);
        t0 = ((in0 + in2) * 181 + 128) >> 8;
        t1 = ((in0 - in2) * 181 + 128) >> 8;
        t2 = ((in1 * 1567 - in3 * (3784 - 4096) + 2048) >> 12) - in3;
        t3 = ((in1 * (3784 - 4096) + in3 * 1567 + 2048) >> 12) + in1;
    }

    c[0 * stride] = CLIP(t0 + t3);
    c[1 * stride] = CLIP(t1 + t2);
    c[2 * stride] = CLIP(t1 - t2);
    c[3 * stride] = CLIP(t0 - t3);
}

static void NOINLINE
inv_dct8_1d(const coef *const in, const ptrdiff_t in_s,
            coef *const out, const ptrdiff_t out_s, const int max)
void dav1d_inv_dct4_1d_c(int32_t *const c, const ptrdiff_t stride,
                         const int min, const int max)
{
    const int min = -max - 1;
    coef tmp[4];
    inv_dct4_1d_internal_c(c, stride, min, max, 0);
}

    inv_dct4_1d(in, in_s * 2, tmp, 1, max);
static NOINLINE void
inv_dct8_1d_internal_c(int32_t *const c, const ptrdiff_t stride,
                       const int min, const int max, const int tx64)
{
    assert(stride > 0);
    inv_dct4_1d_internal_c(c, stride << 1, min, max, tx64);

    const int in1 = in[1 * in_s], in3 = in[3 * in_s];
    const int in5 = in[5 * in_s], in7 = in[7 * in_s];
    const int in1 = c[1 * stride], in3 = c[3 * stride];

    int t4a = ((in1 * 799 - in7 * (4017 - 4096) + 2048) >> 12) - in7;
    int t5a = (in5 * 1703 - in3 * 1138 + 1024) >> 11;
    int t6a = (in5 * 1138 + in3 * 1703 + 1024) >> 11;
    int t7a = ((in1 * (4017 - 4096) + in7 * 799 + 2048) >> 12) + in1;
    int t4a, t5a, t6a, t7a;
    if (tx64) {
        t4a = (in1 * 799 + 2048) >> 12;
        t5a = (in3 * -2276 + 2048) >> 12;
        t6a = (in3 * 3406 + 2048) >> 12;
        t7a = (in1 * 4017 + 2048) >> 12;
    } else {
        const int in5 = c[5 * stride], in7 = c[7 * stride];

        t4a = ((in1 * 799 - in7 * (4017 - 4096) + 2048) >> 12) - in7;
        t5a = (in5 * 1703 - in3 * 1138 + 1024) >> 11;
        t6a = (in5 * 1138 + in3 * 1703 + 1024) >> 11;
        t7a = ((in1 * (4017 - 4096) + in7 * 799 + 2048) >> 12) + in1;
    }

    int t4 = CLIP(t4a + t5a);
    t5a = CLIP(t4a - t5a);
@ -104,38 +127,60 @@ inv_dct8_1d(const coef *const in, const ptrdiff_t in_s,
    int t5 = ((t6a - t5a) * 181 + 128) >> 8;
    int t6 = ((t6a + t5a) * 181 + 128) >> 8;

    out[0 * out_s] = CLIP(tmp[0] + t7);
    out[1 * out_s] = CLIP(tmp[1] + t6);
    out[2 * out_s] = CLIP(tmp[2] + t5);
    out[3 * out_s] = CLIP(tmp[3] + t4);
    out[4 * out_s] = CLIP(tmp[3] - t4);
    out[5 * out_s] = CLIP(tmp[2] - t5);
    out[6 * out_s] = CLIP(tmp[1] - t6);
    out[7 * out_s] = CLIP(tmp[0] - t7);
    const int t0 = c[0 * stride];
    const int t1 = c[2 * stride];
    const int t2 = c[4 * stride];
    const int t3 = c[6 * stride];

    c[0 * stride] = CLIP(t0 + t7);
    c[1 * stride] = CLIP(t1 + t6);
    c[2 * stride] = CLIP(t2 + t5);
    c[3 * stride] = CLIP(t3 + t4);
    c[4 * stride] = CLIP(t3 - t4);
    c[5 * stride] = CLIP(t2 - t5);
    c[6 * stride] = CLIP(t1 - t6);
    c[7 * stride] = CLIP(t0 - t7);
}

static void NOINLINE
inv_dct16_1d(const coef *const in, const ptrdiff_t in_s,
             coef *const out, const ptrdiff_t out_s, const int max)
void dav1d_inv_dct8_1d_c(int32_t *const c, const ptrdiff_t stride,
                         const int min, const int max)
{
    const int min = -max - 1;
    coef tmp[8];
    inv_dct8_1d_internal_c(c, stride, min, max, 0);
}

    inv_dct8_1d(in, in_s * 2, tmp, 1, max);
static NOINLINE void
inv_dct16_1d_internal_c(int32_t *const c, const ptrdiff_t stride,
                        const int min, const int max, int tx64)
{
    assert(stride > 0);
    inv_dct8_1d_internal_c(c, stride << 1, min, max, tx64);

    const int in1 = in[ 1 * in_s], in3 = in[ 3 * in_s];
    const int in5 = in[ 5 * in_s], in7 = in[ 7 * in_s];
    const int in9 = in[ 9 * in_s], in11 = in[11 * in_s];
    const int in13 = in[13 * in_s], in15 = in[15 * in_s];
    const int in1 = c[1 * stride], in3 = c[3 * stride];
    const int in5 = c[5 * stride], in7 = c[7 * stride];

    int t8a = ((in1 * 401 - in15 * (4076 - 4096) + 2048) >> 12) - in15;
    int t15a = ((in1 * (4076 - 4096) + in15 * 401 + 2048) >> 12) + in1;
    int t9a = (in9 * 1583 - in7 * 1299 + 1024) >> 11;
    int t14a = (in9 * 1299 + in7 * 1583 + 1024) >> 11;
    int t10a = ((in5 * 1931 - in11 * (3612 - 4096) + 2048) >> 12) - in11;
    int t13a = ((in5 * (3612 - 4096) + in11 * 1931 + 2048) >> 12) + in5;
    int t11a = ((in13 * (3920 - 4096) - in3 * 1189 + 2048) >> 12) + in13;
    int t12a = ((in13 * 1189 + in3 * (3920 - 4096) + 2048) >> 12) + in3;
    int t8a, t9a, t10a, t11a, t12a, t13a, t14a, t15a;
    if (tx64) {
        t8a = (in1 * 401 + 2048) >> 12;
        t9a = (in7 * -2598 + 2048) >> 12;
        t10a = (in5 * 1931 + 2048) >> 12;
        t11a = (in3 * -1189 + 2048) >> 12;
        t12a = (in3 * 3920 + 2048) >> 12;
        t13a = (in5 * 3612 + 2048) >> 12;
        t14a = (in7 * 3166 + 2048) >> 12;
        t15a = (in1 * 4076 + 2048) >> 12;
    } else {
        const int in9 = c[ 9 * stride], in11 = c[11 * stride];
        const int in13 = c[13 * stride], in15 = c[15 * stride];

        t8a = ((in1 * 401 - in15 * (4076 - 4096) + 2048) >> 12) - in15;
        t9a = (in9 * 1583 - in7 * 1299 + 1024) >> 11;
        t10a = ((in5 * 1931 - in11 * (3612 - 4096) + 2048) >> 12) - in11;
        t11a = ((in13 * (3920 - 4096) - in3 * 1189 + 2048) >> 12) + in13;
        t12a = ((in13 * 1189 + in3 * (3920 - 4096) + 2048) >> 12) + in3;
        t13a = ((in5 * (3612 - 4096) + in11 * 1931 + 2048) >> 12) + in5;
        t14a = (in9 * 1299 + in7 * 1583 + 1024) >> 11;
        t15a = ((in1 * (4076 - 4096) + in15 * 401 + 2048) >> 12) + in1;
    }

    int t8 = CLIP(t8a + t9a);
    int t9 = CLIP(t8a - t9a);
@ -165,58 +210,93 @@ inv_dct16_1d(const coef *const in, const ptrdiff_t in_s,
    t11 = ((t12a - t11a) * 181 + 128) >> 8;
    t12 = ((t12a + t11a) * 181 + 128) >> 8;

    out[ 0 * out_s] = CLIP(tmp[0] + t15a);
    out[ 1 * out_s] = CLIP(tmp[1] + t14);
    out[ 2 * out_s] = CLIP(tmp[2] + t13a);
    out[ 3 * out_s] = CLIP(tmp[3] + t12);
    out[ 4 * out_s] = CLIP(tmp[4] + t11);
    out[ 5 * out_s] = CLIP(tmp[5] + t10a);
    out[ 6 * out_s] = CLIP(tmp[6] + t9);
    out[ 7 * out_s] = CLIP(tmp[7] + t8a);
    out[ 8 * out_s] = CLIP(tmp[7] - t8a);
    out[ 9 * out_s] = CLIP(tmp[6] - t9);
    out[10 * out_s] = CLIP(tmp[5] - t10a);
    out[11 * out_s] = CLIP(tmp[4] - t11);
    out[12 * out_s] = CLIP(tmp[3] - t12);
    out[13 * out_s] = CLIP(tmp[2] - t13a);
    out[14 * out_s] = CLIP(tmp[1] - t14);
    out[15 * out_s] = CLIP(tmp[0] - t15a);
    const int t0 = c[ 0 * stride];
    const int t1 = c[ 2 * stride];
    const int t2 = c[ 4 * stride];
    const int t3 = c[ 6 * stride];
    const int t4 = c[ 8 * stride];
    const int t5 = c[10 * stride];
    const int t6 = c[12 * stride];
    const int t7 = c[14 * stride];

    c[ 0 * stride] = CLIP(t0 + t15a);
    c[ 1 * stride] = CLIP(t1 + t14);
    c[ 2 * stride] = CLIP(t2 + t13a);
    c[ 3 * stride] = CLIP(t3 + t12);
    c[ 4 * stride] = CLIP(t4 + t11);
    c[ 5 * stride] = CLIP(t5 + t10a);
    c[ 6 * stride] = CLIP(t6 + t9);
    c[ 7 * stride] = CLIP(t7 + t8a);
    c[ 8 * stride] = CLIP(t7 - t8a);
    c[ 9 * stride] = CLIP(t6 - t9);
    c[10 * stride] = CLIP(t5 - t10a);
    c[11 * stride] = CLIP(t4 - t11);
    c[12 * stride] = CLIP(t3 - t12);
    c[13 * stride] = CLIP(t2 - t13a);
    c[14 * stride] = CLIP(t1 - t14);
    c[15 * stride] = CLIP(t0 - t15a);
}

static void NOINLINE
inv_dct32_1d(const coef *const in, const ptrdiff_t in_s,
             coef *const out, const ptrdiff_t out_s, const int max)
void dav1d_inv_dct16_1d_c(int32_t *const c, const ptrdiff_t stride,
                          const int min, const int max)
{
    const int min = -max - 1;
    coef tmp[16];
    inv_dct16_1d_internal_c(c, stride, min, max, 0);
}

    inv_dct16_1d(in, in_s * 2, tmp, 1, max);
static NOINLINE void
inv_dct32_1d_internal_c(int32_t *const c, const ptrdiff_t stride,
                        const int min, const int max, const int tx64)
{
    assert(stride > 0);
    inv_dct16_1d_internal_c(c, stride << 1, min, max, tx64);

    const int in1 = in[ 1 * in_s], in3 = in[ 3 * in_s];
    const int in5 = in[ 5 * in_s], in7 = in[ 7 * in_s];
    const int in9 = in[ 9 * in_s], in11 = in[11 * in_s];
    const int in13 = in[13 * in_s], in15 = in[15 * in_s];
    const int in17 = in[17 * in_s], in19 = in[19 * in_s];
    const int in21 = in[21 * in_s], in23 = in[23 * in_s];
    const int in25 = in[25 * in_s], in27 = in[27 * in_s];
    const int in29 = in[29 * in_s], in31 = in[31 * in_s];
    const int in1 = c[ 1 * stride], in3 = c[ 3 * stride];
    const int in5 = c[ 5 * stride], in7 = c[ 7 * stride];
    const int in9 = c[ 9 * stride], in11 = c[11 * stride];
    const int in13 = c[13 * stride], in15 = c[15 * stride];

    int t16a = ((in1 * 201 - in31 * (4091 - 4096) + 2048) >> 12) - in31;
    int t31a = ((in1 * (4091 - 4096) + in31 * 201 + 2048) >> 12) + in1;
    int t17a = ((in17 * (3035 - 4096) - in15 * 2751 + 2048) >> 12) + in17;
    int t30a = ((in17 * 2751 + in15 * (3035 - 4096) + 2048) >> 12) + in15;
    int t18a = ((in9 * 1751 - in23 * (3703 - 4096) + 2048) >> 12) - in23;
    int t29a = ((in9 * (3703 - 4096) + in23 * 1751 + 2048) >> 12) + in9;
    int t19a = ((in25 * (3857 - 4096) - in7 * 1380 + 2048) >> 12) + in25;
    int t28a = ((in25 * 1380 + in7 * (3857 - 4096) + 2048) >> 12) + in7;
    int t20a = ((in5 * 995 - in27 * (3973 - 4096) + 2048) >> 12) - in27;
    int t27a = ((in5 * (3973 - 4096) + in27 * 995 + 2048) >> 12) + in5;
    int t21a = ((in21 * (3513 - 4096) - in11 * 2106 + 2048) >> 12) + in21;
    int t26a = ((in21 * 2106 + in11 * (3513 - 4096) + 2048) >> 12) + in11;
    int t22a = (in13 * 1220 - in19 * 1645 + 1024) >> 11;
    int t25a = (in13 * 1645 + in19 * 1220 + 1024) >> 11;
    int t23a = ((in29 * (4052 - 4096) - in3 * 601 + 2048) >> 12) + in29;
    int t24a = ((in29 * 601 + in3 * (4052 - 4096) + 2048) >> 12) + in3;
    int t16a, t17a, t18a, t19a, t20a, t21a, t22a, t23a;
    int t24a, t25a, t26a, t27a, t28a, t29a, t30a, t31a;
    if (tx64) {
        t16a = (in1 * 201 + 2048) >> 12;
        t17a = (in15 * -2751 + 2048) >> 12;
        t18a = (in9 * 1751 + 2048) >> 12;
        t19a = (in7 * -1380 + 2048) >> 12;
        t20a = (in5 * 995 + 2048) >> 12;
        t21a = (in11 * -2106 + 2048) >> 12;
        t22a = (in13 * 2440 + 2048) >> 12;
        t23a = (in3 * -601 + 2048) >> 12;
        t24a = (in3 * 4052 + 2048) >> 12;
        t25a = (in13 * 3290 + 2048) >> 12;
        t26a = (in11 * 3513 + 2048) >> 12;
        t27a = (in5 * 3973 + 2048) >> 12;
        t28a = (in7 * 3857 + 2048) >> 12;
        t29a = (in9 * 3703 + 2048) >> 12;
        t30a = (in15 * 3035 + 2048) >> 12;
        t31a = (in1 * 4091 + 2048) >> 12;
    } else {
        const int in17 = c[17 * stride], in19 = c[19 * stride];
        const int in21 = c[21 * stride], in23 = c[23 * stride];
        const int in25 = c[25 * stride], in27 = c[27 * stride];
        const int in29 = c[29 * stride], in31 = c[31 * stride];

        t16a = ((in1 * 201 - in31 * (4091 - 4096) + 2048) >> 12) - in31;
        t17a = ((in17 * (3035 - 4096) - in15 * 2751 + 2048) >> 12) + in17;
        t18a = ((in9 * 1751 - in23 * (3703 - 4096) + 2048) >> 12) - in23;
        t19a = ((in25 * (3857 - 4096) - in7 * 1380 + 2048) >> 12) + in25;
        t20a = ((in5 * 995 - in27 * (3973 - 4096) + 2048) >> 12) - in27;
        t21a = ((in21 * (3513 - 4096) - in11 * 2106 + 2048) >> 12) + in21;
        t22a = (in13 * 1220 - in19 * 1645 + 1024) >> 11;
        t23a = ((in29 * (4052 - 4096) - in3 * 601 + 2048) >> 12) + in29;
        t24a = ((in29 * 601 + in3 * (4052 - 4096) + 2048) >> 12) + in3;
        t25a = (in13 * 1645 + in19 * 1220 + 1024) >> 11;
        t26a = ((in21 * 2106 + in11 * (3513 - 4096) + 2048) >> 12) + in11;
        t27a = ((in5 * (3973 - 4096) + in27 * 995 + 2048) >> 12) + in5;
        t28a = ((in25 * 1380 + in7 * (3857 - 4096) + 2048) >> 12) + in7;
        t29a = ((in9 * (3703 - 4096) + in23 * 1751 + 2048) >> 12) + in9;
        t30a = ((in17 * 2751 + in15 * (3035 - 4096) + 2048) >> 12) + in15;
        t31a = ((in1 * (4091 - 4096) + in31 * 201 + 2048) >> 12) + in1;
    }

    int t16 = CLIP(t16a + t17a);
    int t17 = CLIP(t16a - t17a);
@ -296,98 +376,110 @@ inv_dct32_1d(const coef *const in, const ptrdiff_t in_s,
    t23a = ((t24 - t23 ) * 181 + 128) >> 8;
    t24a = ((t24 + t23 ) * 181 + 128) >> 8;

    out[ 0 * out_s] = CLIP(tmp[ 0] + t31);
    out[ 1 * out_s] = CLIP(tmp[ 1] + t30a);
    out[ 2 * out_s] = CLIP(tmp[ 2] + t29);
    out[ 3 * out_s] = CLIP(tmp[ 3] + t28a);
    out[ 4 * out_s] = CLIP(tmp[ 4] + t27);
    out[ 5 * out_s] = CLIP(tmp[ 5] + t26a);
    out[ 6 * out_s] = CLIP(tmp[ 6] + t25);
    out[ 7 * out_s] = CLIP(tmp[ 7] + t24a);
    out[ 8 * out_s] = CLIP(tmp[ 8] + t23a);
    out[ 9 * out_s] = CLIP(tmp[ 9] + t22);
    out[10 * out_s] = CLIP(tmp[10] + t21a);
    out[11 * out_s] = CLIP(tmp[11] + t20);
    out[12 * out_s] = CLIP(tmp[12] + t19a);
    out[13 * out_s] = CLIP(tmp[13] + t18);
    out[14 * out_s] = CLIP(tmp[14] + t17a);
    out[15 * out_s] = CLIP(tmp[15] + t16);
    out[16 * out_s] = CLIP(tmp[15] - t16);
    out[17 * out_s] = CLIP(tmp[14] - t17a);
    out[18 * out_s] = CLIP(tmp[13] - t18);
    out[19 * out_s] = CLIP(tmp[12] - t19a);
    out[20 * out_s] = CLIP(tmp[11] - t20);
    out[21 * out_s] = CLIP(tmp[10] - t21a);
    out[22 * out_s] = CLIP(tmp[ 9] - t22);
    out[23 * out_s] = CLIP(tmp[ 8] - t23a);
    out[24 * out_s] = CLIP(tmp[ 7] - t24a);
    out[25 * out_s] = CLIP(tmp[ 6] - t25);
    out[26 * out_s] = CLIP(tmp[ 5] - t26a);
    out[27 * out_s] = CLIP(tmp[ 4] - t27);
    out[28 * out_s] = CLIP(tmp[ 3] - t28a);
    out[29 * out_s] = CLIP(tmp[ 2] - t29);
    out[30 * out_s] = CLIP(tmp[ 1] - t30a);
    out[31 * out_s] = CLIP(tmp[ 0] - t31);
    const int t0 = c[ 0 * stride];
    const int t1 = c[ 2 * stride];
    const int t2 = c[ 4 * stride];
    const int t3 = c[ 6 * stride];
    const int t4 = c[ 8 * stride];
    const int t5 = c[10 * stride];
    const int t6 = c[12 * stride];
    const int t7 = c[14 * stride];
    const int t8 = c[16 * stride];
    const int t9 = c[18 * stride];
    const int t10 = c[20 * stride];
    const int t11 = c[22 * stride];
    const int t12 = c[24 * stride];
    const int t13 = c[26 * stride];
    const int t14 = c[28 * stride];
    const int t15 = c[30 * stride];

    c[ 0 * stride] = CLIP(t0 + t31);
    c[ 1 * stride] = CLIP(t1 + t30a);
    c[ 2 * stride] = CLIP(t2 + t29);
    c[ 3 * stride] = CLIP(t3 + t28a);
    c[ 4 * stride] = CLIP(t4 + t27);
    c[ 5 * stride] = CLIP(t5 + t26a);
    c[ 6 * stride] = CLIP(t6 + t25);
    c[ 7 * stride] = CLIP(t7 + t24a);
    c[ 8 * stride] = CLIP(t8 + t23a);
    c[ 9 * stride] = CLIP(t9 + t22);
    c[10 * stride] = CLIP(t10 + t21a);
    c[11 * stride] = CLIP(t11 + t20);
    c[12 * stride] = CLIP(t12 + t19a);
    c[13 * stride] = CLIP(t13 + t18);
    c[14 * stride] = CLIP(t14 + t17a);
    c[15 * stride] = CLIP(t15 + t16);
    c[16 * stride] = CLIP(t15 - t16);
    c[17 * stride] = CLIP(t14 - t17a);
    c[18 * stride] = CLIP(t13 - t18);
    c[19 * stride] = CLIP(t12 - t19a);
    c[20 * stride] = CLIP(t11 - t20);
    c[21 * stride] = CLIP(t10 - t21a);
    c[22 * stride] = CLIP(t9 - t22);
    c[23 * stride] = CLIP(t8 - t23a);
    c[24 * stride] = CLIP(t7 - t24a);
    c[25 * stride] = CLIP(t6 - t25);
    c[26 * stride] = CLIP(t5 - t26a);
    c[27 * stride] = CLIP(t4 - t27);
    c[28 * stride] = CLIP(t3 - t28a);
    c[29 * stride] = CLIP(t2 - t29);
    c[30 * stride] = CLIP(t1 - t30a);
    c[31 * stride] = CLIP(t0 - t31);
}

static void NOINLINE
inv_dct64_1d(const coef *const in, const ptrdiff_t in_s,
             coef *const out, const ptrdiff_t out_s, const int max)
void dav1d_inv_dct32_1d_c(int32_t *const c, const ptrdiff_t stride,
                          const int min, const int max)
{
    const int min = -max - 1;
    coef tmp[32];
    inv_dct32_1d_internal_c(c, stride, min, max, 0);
}

    inv_dct32_1d(in, in_s * 2, tmp, 1, max);
void dav1d_inv_dct64_1d_c(int32_t *const c, const ptrdiff_t stride,
                          const int min, const int max)
{
    assert(stride > 0);
    inv_dct32_1d_internal_c(c, stride << 1, min, max, 1);

    const int in1 = in[ 1 * in_s], in3 = in[ 3 * in_s];
    const int in5 = in[ 5 * in_s], in7 = in[ 7 * in_s];
    const int in9 = in[ 9 * in_s], in11 = in[11 * in_s];
    const int in13 = in[13 * in_s], in15 = in[15 * in_s];
    const int in17 = in[17 * in_s], in19 = in[19 * in_s];
    const int in21 = in[21 * in_s], in23 = in[23 * in_s];
    const int in25 = in[25 * in_s], in27 = in[27 * in_s];
    const int in29 = in[29 * in_s], in31 = in[31 * in_s];
    const int in33 = in[33 * in_s], in35 = in[35 * in_s];
    const int in37 = in[37 * in_s], in39 = in[39 * in_s];
    const int in41 = in[41 * in_s], in43 = in[43 * in_s];
    const int in45 = in[45 * in_s], in47 = in[47 * in_s];
    const int in49 = in[49 * in_s], in51 = in[51 * in_s];
    const int in53 = in[53 * in_s], in55 = in[55 * in_s];
    const int in57 = in[57 * in_s], in59 = in[59 * in_s];
    const int in61 = in[61 * in_s], in63 = in[63 * in_s];
    const int in1 = c[ 1 * stride], in3 = c[ 3 * stride];
    const int in5 = c[ 5 * stride], in7 = c[ 7 * stride];
    const int in9 = c[ 9 * stride], in11 = c[11 * stride];
    const int in13 = c[13 * stride], in15 = c[15 * stride];
    const int in17 = c[17 * stride], in19 = c[19 * stride];
    const int in21 = c[21 * stride], in23 = c[23 * stride];
    const int in25 = c[25 * stride], in27 = c[27 * stride];
    const int in29 = c[29 * stride], in31 = c[31 * stride];

    int t32a = ((in1 * 101 - in63 * (4095 - 4096) + 2048) >> 12) - in63;
    int t33a = ((in33 * (2967 - 4096) - in31 * 2824 + 2048) >> 12) + in33;
    int t34a = ((in17 * 1660 - in47 * (3745 - 4096) + 2048) >> 12) - in47;
    int t35a = (in49 * 1911 - in15 * 737 + 1024) >> 11;
    int t36a = ((in9 * 897 - in55 * (3996 - 4096) + 2048) >> 12) - in55;
    int t37a = ((in41 * (3461 - 4096) - in23 * 2191 + 2048) >> 12) + in41;
    int t38a = ((in25 * 2359 - in39 * (3349 - 4096) + 2048) >> 12) - in39;
    int t39a = (in57 * 2018 - in7 * 350 + 1024) >> 11;
    int t40a = ((in5 * 501 - in59 * (4065 - 4096) + 2048) >> 12) - in59;
    int t41a = ((in37 * (3229 - 4096) - in27 * 2520 + 2048) >> 12) + in37;
    int t42a = ((in21 * 2019 - in43 * (3564 - 4096) + 2048) >> 12) - in43;
    int t43a = (in53 * 1974 - in11 * 546 + 1024) >> 11;
    int t44a = ((in13 * 1285 - in51 * (3889 - 4096) + 2048) >> 12) - in51;
    int t45a = ((in45 * (3659 - 4096) - in19 * 1842 + 2048) >> 12) + in45;
    int t46a = ((in29 * 2675 - in35 * (3102 - 4096) + 2048) >> 12) - in35;
    int t47a = ((in61 * (4085 - 4096) - in3 * 301 + 2048) >> 12) + in61;
    int t48a = ((in61 * 301 + in3 * (4085 - 4096) + 2048) >> 12) + in3;
    int t49a = ((in29 * (3102 - 4096) + in35 * 2675 + 2048) >> 12) + in29;
    int t50a = ((in45 * 1842 + in19 * (3659 - 4096) + 2048) >> 12) + in19;
    int t51a = ((in13 * (3889 - 4096) + in51 * 1285 + 2048) >> 12) + in13;
    int t52a = (in53 * 546 + in11 * 1974 + 1024) >> 11;
    int t53a = ((in21 * (3564 - 4096) + in43 * 2019 + 2048) >> 12) + in21;
    int t54a = ((in37 * 2520 + in27 * (3229 - 4096) + 2048) >> 12) + in27;
    int t55a = ((in5 * (4065 - 4096) + in59 * 501 + 2048) >> 12) + in5;
    int t56a = (in57 * 350 + in7 * 2018 + 1024) >> 11;
    int t57a = ((in25 * (3349 - 4096) + in39 * 2359 + 2048) >> 12) + in25;
    int t58a = ((in41 * 2191 + in23 * (3461 - 4096) + 2048) >> 12) + in23;
    int t59a = ((in9 * (3996 - 4096) + in55 * 897 + 2048) >> 12) + in9;
    int t60a = (in49 * 737 + in15 * 1911 + 1024) >> 11;
    int t61a = ((in17 * (3745 - 4096) + in47 * 1660 + 2048) >> 12) + in17;
    int t62a = ((in33 * 2824 + in31 * (2967 - 4096) + 2048) >> 12) + in31;
    int t63a = ((in1 * (4095 - 4096) + in63 * 101 + 2048) >> 12) + in1;
    int t32a = (in1 * 101 + 2048) >> 12;
    int t33a = (in31 * -2824 + 2048) >> 12;
    int t34a = (in17 * 1660 + 2048) >> 12;
    int t35a = (in15 * -1474 + 2048) >> 12;
    int t36a = (in9 * 897 + 2048) >> 12;
    int t37a = (in23 * -2191 + 2048) >> 12;
    int t38a = (in25 * 2359 + 2048) >> 12;
    int t39a = (in7 * -700 + 2048) >> 12;
    int t40a = (in5 * 501 + 2048) >> 12;
    int t41a = (in27 * -2520 + 2048) >> 12;
    int t42a = (in21 * 2019 + 2048) >> 12;
    int t43a = (in11 * -1092 + 2048) >> 12;
    int t44a = (in13 * 1285 + 2048) >> 12;
    int t45a = (in19 * -1842 + 2048) >> 12;
    int t46a = (in29 * 2675 + 2048) >> 12;
    int t47a = (in3 * -301 + 2048) >> 12;
    int t48a = (in3 * 4085 + 2048) >> 12;
    int t49a = (in29 * 3102 + 2048) >> 12;
    int t50a = (in19 * 3659 + 2048) >> 12;
    int t51a = (in13 * 3889 + 2048) >> 12;
    int t52a = (in11 * 3948 + 2048) >> 12;
    int t53a = (in21 * 3564 + 2048) >> 12;
    int t54a = (in27 * 3229 + 2048) >> 12;
    int t55a = (in5 * 4065 + 2048) >> 12;
    int t56a = (in7 * 4036 + 2048) >> 12;
    int t57a = (in25 * 3349 + 2048) >> 12;
    int t58a = (in23 * 3461 + 2048) >> 12;
    int t59a = (in9 * 3996 + 2048) >> 12;
    int t60a = (in15 * 3822 + 2048) >> 12;
    int t61a = (in17 * 3745 + 2048) >> 12;
    int t62a = (in31 * 2967 + 2048) >> 12;
    int t63a = (in1 * 4095 + 2048) >> 12;

    int t32 = CLIP(t32a + t33a);
    int t33 = CLIP(t32a - t33a);
@ -589,76 +681,111 @@ inv_dct64_1d(const coef *const in, const ptrdiff_t in_s,
    t54 = ((t41a + t54a) * 181 + 128) >> 8;
    t55a = ((t40 + t55 ) * 181 + 128) >> 8;

    out[ 0 * out_s] = CLIP(tmp[ 0] + t63a);
    out[ 1 * out_s] = CLIP(tmp[ 1] + t62);
    out[ 2 * out_s] = CLIP(tmp[ 2] + t61a);
    out[ 3 * out_s] = CLIP(tmp[ 3] + t60);
    out[ 4 * out_s] = CLIP(tmp[ 4] + t59a);
    out[ 5 * out_s] = CLIP(tmp[ 5] + t58);
    out[ 6 * out_s] = CLIP(tmp[ 6] + t57a);
    out[ 7 * out_s] = CLIP(tmp[ 7] + t56);
    out[ 8 * out_s] = CLIP(tmp[ 8] + t55a);
    out[ 9 * out_s] = CLIP(tmp[ 9] + t54);
    out[10 * out_s] = CLIP(tmp[10] + t53a);
    out[11 * out_s] = CLIP(tmp[11] + t52);
    out[12 * out_s] = CLIP(tmp[12] + t51a);
    out[13 * out_s] = CLIP(tmp[13] + t50);
    out[14 * out_s] = CLIP(tmp[14] + t49a);
    out[15 * out_s] = CLIP(tmp[15] + t48);
    out[16 * out_s] = CLIP(tmp[16] + t47);
    out[17 * out_s] = CLIP(tmp[17] + t46a);
    out[18 * out_s] = CLIP(tmp[18] + t45);
    out[19 * out_s] = CLIP(tmp[19] + t44a);
    out[20 * out_s] = CLIP(tmp[20] + t43);
    out[21 * out_s] = CLIP(tmp[21] + t42a);
    out[22 * out_s] = CLIP(tmp[22] + t41);
    out[23 * out_s] = CLIP(tmp[23] + t40a);
    out[24 * out_s] = CLIP(tmp[24] + t39);
    out[25 * out_s] = CLIP(tmp[25] + t38a);
    out[26 * out_s] = CLIP(tmp[26] + t37);
    out[27 * out_s] = CLIP(tmp[27] + t36a);
    out[28 * out_s] = CLIP(tmp[28] + t35);
    out[29 * out_s] = CLIP(tmp[29] + t34a);
    out[30 * out_s] = CLIP(tmp[30] + t33);
    out[31 * out_s] = CLIP(tmp[31] + t32a);
    out[32 * out_s] = CLIP(tmp[31] - t32a);
    out[33 * out_s] = CLIP(tmp[30] - t33);
    out[34 * out_s] = CLIP(tmp[29] - t34a);
    out[35 * out_s] = CLIP(tmp[28] - t35);
    out[36 * out_s] = CLIP(tmp[27] - t36a);
    out[37 * out_s] = CLIP(tmp[26] - t37);
    out[38 * out_s] = CLIP(tmp[25] - t38a);
    out[39 * out_s] = CLIP(tmp[24] - t39);
    out[40 * out_s] = CLIP(tmp[23] - t40a);
    out[41 * out_s] = CLIP(tmp[22] - t41);
    out[42 * out_s] = CLIP(tmp[21] - t42a);
    out[43 * out_s] = CLIP(tmp[20] - t43);
    out[44 * out_s] = CLIP(tmp[19] - t44a);
    out[45 * out_s] = CLIP(tmp[18] - t45);
    out[46 * out_s] = CLIP(tmp[17] - t46a);
    out[47 * out_s] = CLIP(tmp[16] - t47);
    out[48 * out_s] = CLIP(tmp[15] - t48);
    out[49 * out_s] = CLIP(tmp[14] - t49a);
    out[50 * out_s] = CLIP(tmp[13] - t50);
    out[51 * out_s] = CLIP(tmp[12] - t51a);
    out[52 * out_s] = CLIP(tmp[11] - t52);
    out[53 * out_s] = CLIP(tmp[10] - t53a);
    out[54 * out_s] = CLIP(tmp[ 9] - t54);
    out[55 * out_s] = CLIP(tmp[ 8] - t55a);
    out[56 * out_s] = CLIP(tmp[ 7] - t56);
    out[57 * out_s] = CLIP(tmp[ 6] - t57a);
    out[58 * out_s] = CLIP(tmp[ 5] - t58);
    out[59 * out_s] = CLIP(tmp[ 4] - t59a);
    out[60 * out_s] = CLIP(tmp[ 3] - t60);
    out[61 * out_s] = CLIP(tmp[ 2] - t61a);
    out[62 * out_s] = CLIP(tmp[ 1] - t62);
    out[63 * out_s] = CLIP(tmp[ 0] - t63a);
    const int t0 = c[ 0 * stride];
    const int t1 = c[ 2 * stride];
    const int t2 = c[ 4 * stride];
    const int t3 = c[ 6 * stride];
    const int t4 = c[ 8 * stride];
    const int t5 = c[10 * stride];
    const int t6 = c[12 * stride];
    const int t7 = c[14 * stride];
    const int t8 = c[16 * stride];
    const int t9 = c[18 * stride];
    const int t10 = c[20 * stride];
    const int t11 = c[22 * stride];
    const int t12 = c[24 * stride];
    const int t13 = c[26 * stride];
    const int t14 = c[28 * stride];
    const int t15 = c[30 * stride];
    const int t16 = c[32 * stride];
    const int t17 = c[34 * stride];
    const int t18 = c[36 * stride];
    const int t19 = c[38 * stride];
    const int t20 = c[40 * stride];
    const int t21 = c[42 * stride];
    const int t22 = c[44 * stride];
    const int t23 = c[46 * stride];
    const int t24 = c[48 * stride];
    const int t25 = c[50 * stride];
    const int t26 = c[52 * stride];
    const int t27 = c[54 * stride];
    const int t28 = c[56 * stride];
    const int t29 = c[58 * stride];
    const int t30 = c[60 * stride];
    const int t31 = c[62 * stride];

    c[ 0 * stride] = CLIP(t0 + t63a);
    c[ 1 * stride] = CLIP(t1 + t62);
    c[ 2 * stride] = CLIP(t2 + t61a);
    c[ 3 * stride] = CLIP(t3 + t60);
    c[ 4 * stride] = CLIP(t4 + t59a);
    c[ 5 * stride] = CLIP(t5 + t58);
    c[ 6 * stride] = CLIP(t6 + t57a);
    c[ 7 * stride] = CLIP(t7 + t56);
    c[ 8 * stride] = CLIP(t8 + t55a);
    c[ 9 * stride] = CLIP(t9 + t54);
    c[10 * stride] = CLIP(t10 + t53a);
    c[11 * stride] = CLIP(t11 + t52);
    c[12 * stride] = CLIP(t12 + t51a);
    c[13 * stride] = CLIP(t13 + t50);
    c[14 * stride] = CLIP(t14 + t49a);
    c[15 * stride] = CLIP(t15 + t48);
    c[16 * stride] = CLIP(t16 + t47);
    c[17 * stride] = CLIP(t17 + t46a);
    c[18 * stride] = CLIP(t18 + t45);
    c[19 * stride] = CLIP(t19 + t44a);
    c[20 * stride] = CLIP(t20 + t43);
    c[21 * stride] = CLIP(t21 + t42a);
    c[22 * stride] = CLIP(t22 + t41);
    c[23 * stride] = CLIP(t23 + t40a);
    c[24 * stride] = CLIP(t24 + t39);
    c[25 * stride] = CLIP(t25 + t38a);
    c[26 * stride] = CLIP(t26 + t37);
    c[27 * stride] = CLIP(t27 + t36a);
    c[28 * stride] = CLIP(t28 + t35);
    c[29 * stride] = CLIP(t29 + t34a);
    c[30 * stride] = CLIP(t30 + t33);
    c[31 * stride] = CLIP(t31 + t32a);
    c[32 * stride] = CLIP(t31 - t32a);
    c[33 * stride] = CLIP(t30 - t33);
    c[34 * stride] = CLIP(t29 - t34a);
    c[35 * stride] = CLIP(t28 - t35);
    c[36 * stride] = CLIP(t27 - t36a);
    c[37 * stride] = CLIP(t26 - t37);
    c[38 * stride] = CLIP(t25 - t38a);
    c[39 * stride] = CLIP(t24 - t39);
    c[40 * stride] = CLIP(t23 - t40a);
    c[41 * stride] = CLIP(t22 - t41);
    c[42 * stride] = CLIP(t21 - t42a);
    c[43 * stride] = CLIP(t20 - t43);
    c[44 * stride] = CLIP(t19 - t44a);
    c[45 * stride] = CLIP(t18 - t45);
    c[46 * stride] = CLIP(t17 - t46a);
    c[47 * stride] = CLIP(t16 - t47);
    c[48 * stride] = CLIP(t15 - t48);
    c[49 * stride] = CLIP(t14 - t49a);
    c[50 * stride] = CLIP(t13 - t50);
    c[51 * stride] = CLIP(t12 - t51a);
    c[52 * stride] = CLIP(t11 - t52);
    c[53 * stride] = CLIP(t10 - t53a);
    c[54 * stride] = CLIP(t9 - t54);
    c[55 * stride] = CLIP(t8 - t55a);
    c[56 * stride] = CLIP(t7 - t56);
    c[57 * stride] = CLIP(t6 - t57a);
    c[58 * stride] = CLIP(t5 - t58);
    c[59 * stride] = CLIP(t4 - t59a);
    c[60 * stride] = CLIP(t3 - t60);
    c[61 * stride] = CLIP(t2 - t61a);
    c[62 * stride] = CLIP(t1 - t62);
    c[63 * stride] = CLIP(t0 - t63a);
}

static void NOINLINE
inv_adst4_1d(const coef *const in, const ptrdiff_t in_s,
             coef *const out, const ptrdiff_t out_s, const int range)
static NOINLINE void
inv_adst4_1d_internal_c(const int32_t *const in, const ptrdiff_t in_s,
                        const int min, const int max,
                        int32_t *const out, const ptrdiff_t out_s)
{
    assert(in_s > 0 && out_s != 0);
    const int in0 = in[0 * in_s], in1 = in[1 * in_s];
    const int in2 = in[2 * in_s], in3 = in[3 * in_s];
@ -674,11 +801,12 @@ inv_adst4_1d(const coef *const in, const ptrdiff_t in_s,
                     in0 + in2 - in1;
}

static void NOINLINE
inv_adst8_1d(const coef *const in, const ptrdiff_t in_s,
             coef *const out, const ptrdiff_t out_s, const int max)
static NOINLINE void
inv_adst8_1d_internal_c(const int32_t *const in, const ptrdiff_t in_s,
                        const int min, const int max,
                        int32_t *const out, const ptrdiff_t out_s)
{
    const int min = -max - 1;
    assert(in_s > 0 && out_s != 0);
    const int in0 = in[0 * in_s], in1 = in[1 * in_s];
    const int in2 = in[2 * in_s], in3 = in[3 * in_s];
    const int in4 = in[4 * in_s], in5 = in[5 * in_s];
@ -707,15 +835,14 @@ inv_adst8_1d(const coef *const in, const ptrdiff_t in_s,
    t6a = (((3784 - 4096) * t7 - 1567 * t6 + 2048) >> 12) + t7;
    t7a = (( 1567 * t7 + (3784 - 4096) * t6 + 2048) >> 12) + t6;

    out[0 * out_s] = CLIP( t0 + t2);
    out[7 * out_s] = CLIP(-(t1 + t3));
    t2 = CLIP( t0 - t2);
    t3 = CLIP( t1 - t3);

    out[1 * out_s] = CLIP(-(t4a + t6a));
    out[6 * out_s] = CLIP( t5a + t7a );
    t6 = CLIP( t4a - t6a );
    t7 = CLIP( t5a - t7a );
    out[0 * out_s] = CLIP(t0 + t2 );
    out[7 * out_s] = -CLIP(t1 + t3 );
    t2 = CLIP(t0 - t2 );
    t3 = CLIP(t1 - t3 );
    out[1 * out_s] = -CLIP(t4a + t6a);
    out[6 * out_s] = CLIP(t5a + t7a);
    t6 = CLIP(t4a - t6a);
    t7 = CLIP(t5a - t7a);

    out[3 * out_s] = -(((t2 + t3) * 181 + 128) >> 8);
    out[4 * out_s] = ((t2 - t3) * 181 + 128) >> 8;
@ -723,11 +850,12 @@ inv_adst8_1d(const coef *const in, const ptrdiff_t in_s,
    out[5 * out_s] = -(((t6 - t7) * 181 + 128) >> 8);
}

static void NOINLINE
inv_adst16_1d(const coef *const in, const ptrdiff_t in_s,
              coef *const out, const ptrdiff_t out_s, const int max)
static NOINLINE void
inv_adst16_1d_internal_c(const int32_t *const in, const ptrdiff_t in_s,
                         const int min, const int max,
                         int32_t *const out, const ptrdiff_t out_s)
{
    const int min = -max - 1;
    assert(in_s > 0 && out_s != 0);
    const int in0 = in[ 0 * in_s], in1 = in[ 1 * in_s];
    const int in2 = in[ 2 * in_s], in3 = in[ 3 * in_s];
    const int in4 = in[ 4 * in_s], in5 = in[ 5 * in_s];
@ -806,22 +934,22 @@ inv_adst16_1d(const coef *const in, const ptrdiff_t in_s,
    t14 = ((t15a * (3784 - 4096) - t14a * 1567 + 2048) >> 12) + t15a;
    t15 = ((t15a * 1567 + t14a * (3784 - 4096) + 2048) >> 12) + t14a;

    out[ 0 * out_s] = CLIP( t0 + t2 );
    out[15 * out_s] = CLIP(-(t1 + t3) );
    t2a = CLIP( t0 - t2 );
    t3a = CLIP( t1 - t3 );
    out[ 3 * out_s] = CLIP(-(t4a + t6a) );
    out[12 * out_s] = CLIP( t5a + t7a );
    t6 = CLIP( t4a - t6a );
    t7 = CLIP( t5a - t7a );
    out[ 1 * out_s] = CLIP(-(t8a + t10a));
    out[14 * out_s] = CLIP( t9a + t11a );
    t10 = CLIP( t8a - t10a );
    t11 = CLIP( t9a - t11a );
    out[ 2 * out_s] = CLIP( t12 + t14 );
    out[13 * out_s] = CLIP(-(t13 + t15) );
    t14a = CLIP( t12 - t14 );
    t15a = CLIP( t13 - t15 );
    out[ 0 * out_s] = CLIP(t0 + t2 );
    out[15 * out_s] = -CLIP(t1 + t3 );
    t2a = CLIP(t0 - t2 );
    t3a = CLIP(t1 - t3 );
    out[ 3 * out_s] = -CLIP(t4a + t6a );
    out[12 * out_s] = CLIP(t5a + t7a );
    t6 = CLIP(t4a - t6a );
    t7 = CLIP(t5a - t7a );
    out[ 1 * out_s] = -CLIP(t8a + t10a);
    out[14 * out_s] = CLIP(t9a + t11a);
    t10 = CLIP(t8a - t10a);
    t11 = CLIP(t9a - t11a);
    out[ 2 * out_s] = CLIP(t12 + t14 );
    out[13 * out_s] = -CLIP(t13 + t15 );
    t14a = CLIP(t12 - t14 );
    t15a = CLIP(t13 - t15 );

    out[ 7 * out_s] = -(((t2a + t3a) * 181 + 128) >> 8);
    out[ 8 * out_s] = ((t2a - t3a) * 181 + 128) >> 8;
@ -833,67 +961,74 @@ inv_adst16_1d(const coef *const in, const ptrdiff_t in_s,
    out[10 * out_s] = ((t14a - t15a) * 181 + 128) >> 8;
}

#define flip_inv_adst(sz) \
static void inv_flipadst##sz##_1d(const coef *const in, const ptrdiff_t in_s, \
                                  coef *const out, const ptrdiff_t out_s, const int range) \
#define inv_adst_1d(sz) \
void dav1d_inv_adst##sz##_1d_c(int32_t *const c, const ptrdiff_t stride, \
                               const int min, const int max) \
{ \
    inv_adst##sz##_1d(in, in_s, &out[(sz - 1) * out_s], -out_s, range); \
    inv_adst##sz##_1d_internal_c(c, stride, min, max, c, stride); \
} \
void dav1d_inv_flipadst##sz##_1d_c(int32_t *const c, const ptrdiff_t stride, \
                                   const int min, const int max) \
{ \
    inv_adst##sz##_1d_internal_c(c, stride, min, max, \
                                 &c[(sz - 1) * stride], -stride); \
}

flip_inv_adst(4)
flip_inv_adst(8)
flip_inv_adst(16)
inv_adst_1d( 4)
inv_adst_1d( 8)
inv_adst_1d(16)

#undef flip_inv_adst
#undef inv_adst_1d
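For reference, the flipadst half of the macro expands (shown here for sz = 4) to a call that hands the shared ADST kernel a reversed output view, so the flipped transform costs nothing extra:

void dav1d_inv_flipadst4_1d_c(int32_t *const c, const ptrdiff_t stride,
                              const int min, const int max)
{
    /* write results from the last element backwards via a negative stride */
    inv_adst4_1d_internal_c(c, stride, min, max,
                            &c[(4 - 1) * stride], -stride);
}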
static void NOINLINE
inv_identity4_1d(const coef *const in, const ptrdiff_t in_s,
                 coef *const out, const ptrdiff_t out_s, const int range)
void dav1d_inv_identity4_1d_c(int32_t *const c, const ptrdiff_t stride,
                              const int min, const int max)
{
    for (int i = 0; i < 4; i++)
        out[out_s * i] = in[in_s * i] + ((in[in_s * i] * 1697 + 2048) >> 12);
    assert(stride > 0);
    for (int i = 0; i < 4; i++) {
        const int in = c[stride * i];
        c[stride * i] = in + ((in * 1697 + 2048) >> 12);
    }
}

static void NOINLINE
inv_identity8_1d(const coef *const in, const ptrdiff_t in_s,
                 coef *const out, const ptrdiff_t out_s, const int range)
void dav1d_inv_identity8_1d_c(int32_t *const c, const ptrdiff_t stride,
                              const int min, const int max)
{
    assert(stride > 0);
    for (int i = 0; i < 8; i++)
        out[out_s * i] = in[in_s * i] * 2;
        c[stride * i] *= 2;
}

static void NOINLINE
inv_identity16_1d(const coef *const in, const ptrdiff_t in_s,
                  coef *const out, const ptrdiff_t out_s, const int range)
void dav1d_inv_identity16_1d_c(int32_t *const c, const ptrdiff_t stride,
                               const int min, const int max)
{
    for (int i = 0; i < 16; i++)
        out[out_s * i] = 2 * in[in_s * i] + ((in[in_s * i] * 1697 + 1024) >> 11);
    assert(stride > 0);
    for (int i = 0; i < 16; i++) {
        const int in = c[stride * i];
        c[stride * i] = 2 * in + ((in * 1697 + 1024) >> 11);
    }
}

static void NOINLINE
inv_identity32_1d(const coef *const in, const ptrdiff_t in_s,
                  coef *const out, const ptrdiff_t out_s, const int range)
void dav1d_inv_identity32_1d_c(int32_t *const c, const ptrdiff_t stride,
                               const int min, const int max)
{
    assert(stride > 0);
    for (int i = 0; i < 32; i++)
        out[out_s * i] = in[in_s * i] * 4;
        c[stride * i] *= 4;
}

static void NOINLINE
inv_wht4_1d(const coef *const in, const ptrdiff_t in_s,
            coef *const out, const ptrdiff_t out_s,
            const int pass)
{
    const int sh = 2 * !pass;
    const int in0 = in[0 * in_s] >> sh, in1 = in[1 * in_s] >> sh;
    const int in2 = in[2 * in_s] >> sh, in3 = in[3 * in_s] >> sh;
void dav1d_inv_wht4_1d_c(int32_t *const c, const ptrdiff_t stride) {
    assert(stride > 0);
    const int in0 = c[0 * stride], in1 = c[1 * stride];
    const int in2 = c[2 * stride], in3 = c[3 * stride];

    const int t0 = in0 + in1;
    const int t2 = in2 - in3;
    const int t4 = (t0 - t2) >> 1;
    const int t3 = t4 - in3;
    const int t1 = t4 - in1;

    out[0 * out_s] = t0 - t3;
    out[1 * out_s] = t3;
    out[2 * out_s] = t1;
    out[3 * out_s] = t2 + t1;
    c[0 * stride] = t0 - t3;
    c[1 * stride] = t3;
    c[2 * stride] = t1;
    c[3 * stride] = t2 + t1;
}
@ -0,0 +1,59 @@
/*
 * Copyright © 2018-2019, VideoLAN and dav1d authors
 * Copyright © 2018-2019, Two Orioles, LLC
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice, this
 *    list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 *    this list of conditions and the following disclaimer in the documentation
 *    and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include <stddef.h>
#include <stdint.h>

#ifndef DAV1D_SRC_ITX_1D_H
#define DAV1D_SRC_ITX_1D_H

#define decl_itx_1d_fn(name) \
void (name)(int32_t *c, ptrdiff_t stride, int min, int max)
typedef decl_itx_1d_fn(*itx_1d_fn);

decl_itx_1d_fn(dav1d_inv_dct4_1d_c);
decl_itx_1d_fn(dav1d_inv_dct8_1d_c);
decl_itx_1d_fn(dav1d_inv_dct16_1d_c);
decl_itx_1d_fn(dav1d_inv_dct32_1d_c);
decl_itx_1d_fn(dav1d_inv_dct64_1d_c);

decl_itx_1d_fn(dav1d_inv_adst4_1d_c);
decl_itx_1d_fn(dav1d_inv_adst8_1d_c);
decl_itx_1d_fn(dav1d_inv_adst16_1d_c);

decl_itx_1d_fn(dav1d_inv_flipadst4_1d_c);
decl_itx_1d_fn(dav1d_inv_flipadst8_1d_c);
decl_itx_1d_fn(dav1d_inv_flipadst16_1d_c);

decl_itx_1d_fn(dav1d_inv_identity4_1d_c);
decl_itx_1d_fn(dav1d_inv_identity8_1d_c);
decl_itx_1d_fn(dav1d_inv_identity16_1d_c);
decl_itx_1d_fn(dav1d_inv_identity32_1d_c);

void dav1d_inv_wht4_1d_c(int32_t *c, ptrdiff_t stride);

#endif /* DAV1D_SRC_ITX_1D_H */
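A hedged sketch of how these entry points compose into a separable 2-D inverse transform: rows first at stride 1, then columns at the row width. The INT16 clip range and the missing inter-pass rounding are simplifications, not dav1d's actual driver (see inv_txfm_add_c below):

#include <stdint.h>
#include "src/itx_1d.h"

/* In-place 4x4 inverse DCT, omitting dav1d's inter-pass rounding. */
static void inv_dct_4x4_sketch(int32_t c[4 * 4]) {
    for (int y = 0; y < 4; y++)   /* one pass per row, elements adjacent */
        dav1d_inv_dct4_1d_c(&c[y * 4], 1, INT16_MIN, INT16_MAX);
    for (int x = 0; x < 4; x++)   /* one pass per column, stride = width */
        dav1d_inv_dct4_1d_c(&c[x], 4, INT16_MIN, INT16_MAX);
}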
@ -1,6 +1,6 @@
/*
 * Copyright © 2018, VideoLAN and dav1d authors
 * Copyright © 2018, Two Orioles, LLC
 * Copyright © 2018-2019, VideoLAN and dav1d authors
 * Copyright © 2018-2019, Two Orioles, LLC
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
@ -35,78 +35,68 @@
#include "common/intops.h"

#include "src/itx.h"
#include "src/itx_1d.h"

#include "src/itx_1d.c"

typedef void (*itx_1d_fn)(const coef *in, ptrdiff_t in_s,
                          coef *out, ptrdiff_t out_s, const int range);

static void NOINLINE
inv_txfm_add_c(pixel *dst, const ptrdiff_t stride,
               coef *const coeff, const int eob,
               const int w, const int h, const int shift,
static NOINLINE void
inv_txfm_add_c(pixel *dst, const ptrdiff_t stride, coef *const coeff,
               const int eob, const int w, const int h, const int shift,
               const itx_1d_fn first_1d_fn, const itx_1d_fn second_1d_fn,
               const int has_dconly HIGHBD_DECL_SUFFIX)
{
    int i, j;
    assert((h >= 4 && h <= 64) && (w >= 4 && w <= 64));
    assert(w >= 4 && w <= 64);
    assert(h >= 4 && h <= 64);
    assert(eob >= 0);

    const int is_rect2 = w * 2 == h || h * 2 == w;
    const int bitdepth = bitdepth_from_max(bitdepth_max);
    const int rnd = (1 << shift) >> 1;

    if (has_dconly && eob == 0) {
    if (eob < has_dconly) {
        int dc = coeff[0];
        coeff[0] = 0;
        if (is_rect2)
            dc = (dc * 2896 + 2048) >> 12;
        dc = (dc * 2896 + 2048) >> 12;
            dc = (dc * 181 + 128) >> 8;
        dc = (dc * 181 + 128) >> 8;
        dc = (dc + rnd) >> shift;
        dc = (dc * 2896 + 2048) >> 12;
        dc = (dc + 8) >> 4;
        for (j = 0; j < h; j++)
            for (i = 0; i < w; i++)
                dst[i + j * PXSTRIDE(stride)] =
                    iclip_pixel(dst[i + j * PXSTRIDE(stride)] + dc);
        dc = (dc * 181 + 128 + 2048) >> 12;
        for (int y = 0; y < h; y++, dst += PXSTRIDE(stride))
            for (int x = 0; x < w; x++)
                dst[x] = iclip_pixel(dst[x] + dc);
        return;
    }
    assert(eob > 0 || (eob == 0 && !has_dconly));

    const ptrdiff_t sh = imin(h, 32), sw = imin(w, 32);
    // Maximum value for h and w is 64
    coef tmp[4096 /* w * h */], out[64 /* h */], in_mem[64 /* w */];
    const int row_clip_max = (1 << (bitdepth + 8 - 1)) - 1;
    const int col_clip_max = (1 << (imax(bitdepth + 6, 16) - 1)) -1;

    if (w != sw) memset(&in_mem[sw], 0, (w - sw) * sizeof(*in_mem));
    for (i = 0; i < sh; i++) {
        if (w != sw || is_rect2) {
            for (j = 0; j < sw; j++) {
                in_mem[j] = coeff[i + j * sh];
                if (is_rect2)
                    in_mem[j] = (in_mem[j] * 2896 + 2048) >> 12;
            }
            first_1d_fn(in_mem, 1, &tmp[i * w], 1, row_clip_max);
        } else {
            first_1d_fn(&coeff[i], sh, &tmp[i * w], 1, row_clip_max);
        }
        for (j = 0; j < w; j++)
    const int sh = imin(h, 32), sw = imin(w, 32);
#if BITDEPTH == 8
            tmp[i * w + j] = (tmp[i * w + j] + rnd) >> shift;
    const int row_clip_min = INT16_MIN;
    const int col_clip_min = INT16_MIN;
#else
            tmp[i * w + j] = iclip((tmp[i * w + j] + rnd) >> shift,
                                   -col_clip_max - 1, col_clip_max);
    const int row_clip_min = (int) ((unsigned) ~bitdepth_max << 7);
    const int col_clip_min = (int) ((unsigned) ~bitdepth_max << 5);
#endif
    const int row_clip_max = ~row_clip_min;
    const int col_clip_max = ~col_clip_min;

    int32_t tmp[64 * 64], *c = tmp;
    for (int y = 0; y < sh; y++, c += w) {
        if (is_rect2)
            for (int x = 0; x < sw; x++)
                c[x] = (coeff[y + x * sh] * 181 + 128) >> 8;
        else
            for (int x = 0; x < sw; x++)
                c[x] = coeff[y + x * sh];
        first_1d_fn(c, 1, row_clip_min, row_clip_max);
    }

    if (h != sh) memset(&tmp[sh * w], 0, w * (h - sh) * sizeof(*tmp));
    for (i = 0; i < w; i++) {
        second_1d_fn(&tmp[i], w, out, 1, col_clip_max);
        for (j = 0; j < h; j++)
            dst[i + j * PXSTRIDE(stride)] =
                iclip_pixel(dst[i + j * PXSTRIDE(stride)] +
                            ((out[j] + 8) >> 4));
    }
    memset(coeff, 0, sizeof(*coeff) * sh * sw);
    memset(coeff, 0, sizeof(*coeff) * sw * sh);
    for (int i = 0; i < w * sh; i++)
        tmp[i] = iclip((tmp[i] + rnd) >> shift, col_clip_min, col_clip_max);

    for (int x = 0; x < w; x++)
        second_1d_fn(&tmp[x], w, col_clip_min, col_clip_max);

    c = tmp;
    for (int y = 0; y < h; y++, dst += PXSTRIDE(stride))
        for (int x = 0; x < w; x++)
            dst[x] = iclip_pixel(dst[x] + ((*c++ + 8) >> 4));
}
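The dc-only constants above are all the same 1/sqrt(2) ~ 181/256 approximation at different precisions (2896 = 16 * 181), and the final (dc * 181 + 128 + 2048) >> 12 fuses the >> 8 scale with the trailing (dc + 8) >> 4 down-shift. A quick standalone check on one sample value (illustrative, not a proof of bit-exactness for all inputs):

#include <assert.h>

int main(void) {
    assert(2896 == 16 * 181);  /* same sqrt(1/2) approximation, rescaled */
    const int dc = 1000;
    const int fused = (dc * 181 + 128 + 2048) >> 12;
    const int split = (((dc * 181 + 128) >> 8) + 8) >> 4;
    assert(fused == split);    /* the two formulations agree here */
    return 0;
}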
#define inv_txfm_fn(type1, type2, w, h, shift, has_dconly) \

@ -118,8 +108,8 @@ inv_txfm_add_##type1##_##type2##_##w##x##h##_c(pixel *dst, \
                       HIGHBD_DECL_SUFFIX) \
{ \
    inv_txfm_add_c(dst, stride, coeff, eob, w, h, shift, \
                   inv_##type1##w##_1d, inv_##type2##h##_1d, has_dconly \
                   HIGHBD_TAIL_SUFFIX); \
                   dav1d_inv_##type1##w##_1d_c, dav1d_inv_##type2##h##_1d_c, \
                   has_dconly HIGHBD_TAIL_SUFFIX); \
}

#define inv_txfm_fn64(w, h, shift) \
@ -173,23 +163,21 @@ static void inv_txfm_add_wht_wht_4x4_c(pixel *dst, const ptrdiff_t stride,
                                       coef *const coeff, const int eob
                                       HIGHBD_DECL_SUFFIX)
{
    const int bitdepth = bitdepth_from_max(bitdepth_max);
    const int col_clip_max = (1 << (imax(bitdepth + 6, 16) - 1)) -1;
    const int col_clip_min = -col_clip_max - 1;
    coef tmp[4 * 4], out[4];

    for (int i = 0; i < 4; i++)
        inv_wht4_1d(&coeff[i], 4, &tmp[i * 4], 1, 0);
    for (int k = 0; k < 4 * 4; k++)
        tmp[k] = iclip(tmp[k], col_clip_min, col_clip_max);

    for (int i = 0; i < 4; i++) {
        inv_wht4_1d(&tmp[i], 4, out, 1, 1);
        for (int j = 0; j < 4; j++)
            dst[i + j * PXSTRIDE(stride)] =
                iclip_pixel(dst[i + j * PXSTRIDE(stride)] + out[j]);
    int32_t tmp[4 * 4], *c = tmp;
    for (int y = 0; y < 4; y++, c += 4) {
        for (int x = 0; x < 4; x++)
            c[x] = coeff[y + x * 4] >> 2;
        dav1d_inv_wht4_1d_c(c, 1);
    }
    memset(coeff, 0, sizeof(*coeff) * 4 * 4);

    for (int x = 0; x < 4; x++)
        dav1d_inv_wht4_1d_c(&tmp[x], 4);

    c = tmp;
    for (int y = 0; y < 4; y++, dst += PXSTRIDE(stride))
        for (int x = 0; x < 4; x++)
            dst[x] = iclip_pixel(dst[x] + *c++);
}

COLD void bitfn(dav1d_itx_dsp_init)(Dav1dInvTxfmDSPContext *const c) {
@ -43,8 +43,8 @@ static void decomp_tx(uint8_t (*const txa)[2 /* txsz, step */][32 /* y */][32 /*
                      const uint16_t *const tx_masks)
{
    const TxfmInfo *const t_dim = &dav1d_txfm_dimensions[from];
    const int is_split =
        depth > 1 ? 0 : (tx_masks[depth] >> (y_off * 4 + x_off)) & 1;
    const int is_split = (from == (int) TX_4X4 || depth > 1) ? 0 :
        (tx_masks[depth] >> (y_off * 4 + x_off)) & 1;

    if (is_split) {
        const enum RectTxfmSize sub = t_dim->sub;
@ -350,6 +350,7 @@ void dav1d_create_lf_mask_inter(Av1Filter *const lflvl,
                                const int bx, const int by,
                                const int iw, const int ih,
                                const int skip, const enum BlockSize bs,
                                const enum RectTxfmSize max_ytx,
                                const uint16_t *const tx_masks,
                                const enum RectTxfmSize uvtx,
                                const enum Dav1dPixelLayout layout,

@ -373,7 +374,7 @@ void dav1d_create_lf_mask_inter(Av1Filter *const lflvl,
        }

        mask_edges_inter(lflvl->filter_y, by4, bx4, bw4, bh4, skip,
                         dav1d_max_txfm_size_for_bs[bs][0], tx_masks, ay, ly);
                         max_ytx, tx_masks, ay, ly);
    }

    if (!auv) return;
@ -72,8 +72,8 @@ void dav1d_create_lf_mask_inter(Av1Filter *lflvl, uint8_t (*level_cache)[4],
                                const ptrdiff_t b4_stride,
                                const uint8_t (*level)[8][2], int bx, int by,
                                int iw, int ih, int skip_inter,
                                enum BlockSize bs, const uint16_t *tx_mask,
                                enum RectTxfmSize uvtx,
                                enum BlockSize bs, enum RectTxfmSize max_ytx,
                                const uint16_t *tx_mask, enum RectTxfmSize uvtx,
                                enum Dav1dPixelLayout layout, uint8_t *ay,
                                uint8_t *ly, uint8_t *auv, uint8_t *luv);
void dav1d_calc_eih(Av1FilterLUT *lim_lut, int filter_sharpness);
@ -31,12 +31,17 @@
#include <errno.h>
#include <string.h>

#ifdef __linux__
#include <dlfcn.h>
#endif

#include "dav1d/dav1d.h"
#include "dav1d/data.h"

#include "common/mem.h"
#include "common/validate.h"

#include "src/cpu.h"
#include "src/fg_apply.h"
#include "src/internal.h"
#include "src/log.h"

@ -47,10 +52,11 @@
#include "src/wedge.h"

static COLD void init_internal(void) {
    dav1d_init_wedge_masks();
    dav1d_init_cpu();
    dav1d_init_interintra_masks();
    dav1d_init_qm_tables();
    dav1d_init_thread();
    dav1d_init_wedge_masks();
}

COLD const char *dav1d_version(void) {
@ -73,6 +79,22 @@ COLD void dav1d_default_settings(Dav1dSettings *const s) {

static void close_internal(Dav1dContext **const c_out, int flush);

NO_SANITIZE("cfi-icall") // CFI is broken with dlsym()
static COLD size_t get_stack_size_internal(const pthread_attr_t *const thread_attr) {
#if defined(__linux__) && defined(HAVE_DLSYM)
    /* glibc has an issue where the size of the TLS is subtracted from the stack
     * size instead of allocated separately. As a result the specified stack
     * size may be insufficient when used in an application with large amounts
     * of TLS data. The following is a workaround to compensate for that.
     * See https://sourceware.org/bugzilla/show_bug.cgi?id=11787 */
    size_t (*const get_minstack)(const pthread_attr_t*) =
        dlsym(RTLD_DEFAULT, "__pthread_get_minstack");
    if (get_minstack)
        return get_minstack(thread_attr) - PTHREAD_STACK_MIN;
#endif
    return 0;
}

COLD int dav1d_open(Dav1dContext **const c_out, const Dav1dSettings *const s) {
    static pthread_once_t initted = PTHREAD_ONCE_INIT;
    pthread_once(&initted, init_internal);
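pthread_once() guarantees init_internal() runs exactly once even when several threads race into dav1d_open(); the pattern in isolation, with illustrative names:

#include <pthread.h>
#include <stdio.h>

static pthread_once_t once = PTHREAD_ONCE_INIT;

static void init(void) {
    puts("runs exactly once, even with concurrent callers");
}

void open_context(void) {       /* hypothetical stand-in for dav1d_open */
    pthread_once(&once, init);  /* later calls are no-ops */
}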
@ -92,7 +114,9 @@ COLD int dav1d_open(Dav1dContext **const c_out, const Dav1dSettings *const s) {

    pthread_attr_t thread_attr;
    if (pthread_attr_init(&thread_attr)) return DAV1D_ERR(ENOMEM);
    pthread_attr_setstacksize(&thread_attr, 1024 * 1024);
    size_t stack_size = 1024 * 1024 + get_stack_size_internal(&thread_attr);

    pthread_attr_setstacksize(&thread_attr, stack_size);

    Dav1dContext *const c = *c_out = dav1d_alloc_aligned(sizeof(*c), 32);
    if (!c) goto error;
@ -124,17 +148,15 @@ COLD int dav1d_open(Dav1dContext **const c_out, const Dav1dSettings *const s) {
    memset(c->fc, 0, sizeof(*c->fc) * s->n_frame_threads);
    if (c->n_fc > 1) {
        c->frame_thread.out_delayed =
            malloc(sizeof(*c->frame_thread.out_delayed) * c->n_fc);
            calloc(c->n_fc, sizeof(*c->frame_thread.out_delayed));
        if (!c->frame_thread.out_delayed) goto error;
        memset(c->frame_thread.out_delayed, 0,
               sizeof(*c->frame_thread.out_delayed) * c->n_fc);
    }
    for (int n = 0; n < s->n_frame_threads; n++) {
        Dav1dFrameContext *const f = &c->fc[n];
        f->c = c;
        f->lf.last_sharpness = -1;
        f->n_tc = s->n_tile_threads;
        f->tc = dav1d_alloc_aligned(sizeof(*f->tc) * s->n_tile_threads, 32);
        f->tc = dav1d_alloc_aligned(sizeof(*f->tc) * s->n_tile_threads, 64);
        if (!f->tc) goto error;
        memset(f->tc, 0, sizeof(*f->tc) * s->n_tile_threads);
        if (f->n_tc > 1) {

@ -512,7 +534,7 @@ static COLD void close_internal(Dav1dContext **const c_out, int flush) {
        free(f->lf.level);
        free(f->lf.tx_lpf_right_edge[0]);
        if (f->libaom_cm) dav1d_free_ref_mv_common(f->libaom_cm);
        dav1d_free_aligned(f->lf.cdef_line[0][0][0]);
        dav1d_free_aligned(f->lf.cdef_line_buf);
        dav1d_free_aligned(f->lf.lr_lpf_line[0]);
    }
    dav1d_free_aligned(c->fc);
@ -72,8 +72,8 @@ typedef struct Dav1dLoopRestorationDSPContext {
    selfguided_fn selfguided;
} Dav1dLoopRestorationDSPContext;

bitfn_decls(void dav1d_loop_restoration_dsp_init, Dav1dLoopRestorationDSPContext *c);
bitfn_decls(void dav1d_loop_restoration_dsp_init_arm, Dav1dLoopRestorationDSPContext *c);
bitfn_decls(void dav1d_loop_restoration_dsp_init, Dav1dLoopRestorationDSPContext *c, int bpc);
bitfn_decls(void dav1d_loop_restoration_dsp_init_arm, Dav1dLoopRestorationDSPContext *c, int bpc);
bitfn_decls(void dav1d_loop_restoration_dsp_init_x86, Dav1dLoopRestorationDSPContext *c);
bitfn_decls(void dav1d_loop_restoration_dsp_init_ppc, Dav1dLoopRestorationDSPContext *c);
@ -573,13 +573,13 @@ static void selfguided_c(pixel *p, const ptrdiff_t p_stride,
    }
}

COLD void bitfn(dav1d_loop_restoration_dsp_init)(Dav1dLoopRestorationDSPContext *const c) {
COLD void bitfn(dav1d_loop_restoration_dsp_init)(Dav1dLoopRestorationDSPContext *const c, int bpc) {
    c->wiener = wiener_c;
    c->selfguided = selfguided_c;

#if HAVE_ASM
#if ARCH_AARCH64 || ARCH_ARM
    bitfn(dav1d_loop_restoration_dsp_init_arm)(c);
    bitfn(dav1d_loop_restoration_dsp_init_arm)(c, bpc);
#elif ARCH_PPC64LE
    bitfn(dav1d_loop_restoration_dsp_init_ppc)(c);
#elif ARCH_X86
@ -35,6 +35,7 @@ libdav1d_sources = files(
    'dequant_tables.c',
    'getbits.c',
    'intra_edge.c',
    'itx_1d.c',
    'lf_mask.c',
    'log.c',
    'msac.c',
@ -101,22 +102,47 @@ if is_asm_enabled
        )
        if host_machine.cpu_family() == 'aarch64'
            libdav1d_sources += files(
                'arm/64/cdef.S',
                'arm/64/ipred.S',
                'arm/64/itx.S',
                'arm/64/loopfilter.S',
                'arm/64/looprestoration.S',
                'arm/64/mc.S',
                'arm/64/looprestoration_common.S',
                'arm/64/msac.S',
            )

            if dav1d_bitdepths.contains('8')
                libdav1d_sources += files(
                    'arm/64/cdef.S',
                    'arm/64/ipred.S',
                    'arm/64/itx.S',
                    'arm/64/loopfilter.S',
                    'arm/64/looprestoration.S',
                    'arm/64/mc.S',
                )
            endif

            if dav1d_bitdepths.contains('16')
                libdav1d_sources += files(
                    'arm/64/cdef16.S',
                    'arm/64/loopfilter16.S',
                    'arm/64/looprestoration16.S',
                    'arm/64/mc16.S',
                )
            endif
        elif host_machine.cpu_family().startswith('arm')
            libdav1d_sources += files(
                'arm/32/cdef.S',
                'arm/32/ipred.S',
                'arm/32/loopfilter.S',
                'arm/32/looprestoration.S',
                'arm/32/mc.S',
            )

            if dav1d_bitdepths.contains('8')
                libdav1d_sources += files(
                    'arm/32/cdef.S',
                    'arm/32/ipred.S',
                    'arm/32/loopfilter.S',
                    'arm/32/looprestoration.S',
                    'arm/32/mc.S',
                )
            endif

            if dav1d_bitdepths.contains('16')
                libdav1d_sources += files(
                )
            endif
        endif
    elif host_machine.cpu_family().startswith('x86')
@ -124,6 +150,12 @@ if is_asm_enabled
|
|||
'x86/cpu.c',
|
||||
)
|
||||
|
||||
if host_machine.cpu_family() == 'x86_64'
|
||||
libdav1d_sources += files(
|
||||
'x86/msac_init.c',
|
||||
)
|
||||
endif
|
||||
|
||||
libdav1d_tmpl_sources += files(
|
||||
'x86/cdef_init_tmpl.c',
|
||||
'x86/film_grain_init_tmpl.c',
|
||||
|
@ -150,6 +182,7 @@ if is_asm_enabled
|
|||
'x86/looprestoration.asm',
|
||||
'x86/mc.asm',
|
||||
'x86/cdef_sse.asm',
|
||||
'x86/film_grain_ssse3.asm',
|
||||
'x86/ipred_ssse3.asm',
|
||||
'x86/itx_ssse3.asm',
|
||||
'x86/loopfilter_ssse3.asm',
|
||||
|
@ -278,6 +311,7 @@ libdav1d = library('dav1d',
|
|||
stdatomic_dependency,
|
||||
thread_dependency,
|
||||
thread_compat_dep,
|
||||
libdl_dependency,
|
||||
],
|
||||
c_args : [stackalign_flag, api_export_flags],
|
||||
version : dav1d_soname_version,
|
||||
|
@ -285,6 +319,10 @@ libdav1d = library('dav1d',
|
|||
install : true,
|
||||
)
|
||||
|
||||
dav1d_dep = declare_dependency(link_with: libdav1d,
|
||||
include_directories : include_directories('../include/dav1d')
|
||||
)
|
||||
|
||||
#
|
||||
# Generate pkg-config .pc file
|
||||
#
|
||||
|
|
|
@ -196,5 +196,12 @@ void dav1d_msac_init(MsacContext *const s, const uint8_t *const data,
s->rng = 0x8000;
s->cnt = -15;
s->allow_update_cdf = !disable_cdf_update_flag;

#if ARCH_X86_64 && HAVE_ASM
s->symbol_adapt16 = dav1d_msac_decode_symbol_adapt_c;

dav1d_msac_init_x86(s);
#endif

ctx_refill(s);
}
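A compact model of the dispatch set up above: the C fallback is installed unconditionally, then dav1d_msac_init_x86() may overwrite the pointer. Only the symbol_adapt16 field name comes from the diff; the rest is illustrative:

    #include <stdint.h>
    #include <stddef.h>

    typedef struct Msac Msac;
    struct Msac {
        unsigned (*symbol_adapt16)(Msac *s, uint16_t *cdf, size_t n_symbols);
    };

    static unsigned adapt16_c(Msac *s, uint16_t *cdf, size_t n_symbols) {
        (void)s; (void)cdf; (void)n_symbols;
        return 0; /* stand-in for the scalar decoder */
    }

    static void msac_init_model(Msac *s) {
        s->symbol_adapt16 = adapt16_c;
        /* a CPU-specific init would swap in a SIMD implementation here
         * when the required extension is detected at runtime */
    }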
@ -31,7 +31,7 @@
#include <stdint.h>
#include <stdlib.h>

#include "common/attributes.h"
#include "common/intops.h"

typedef size_t ec_win;

@ -42,6 +42,10 @@ typedef struct MsacContext {
unsigned rng;
int cnt;
int allow_update_cdf;

#if ARCH_X86_64 && HAVE_ASM
unsigned (*symbol_adapt16)(struct MsacContext *s, uint16_t *cdf, size_t n_symbols);
#endif
} MsacContext;

#if HAVE_ASM
@ -917,10 +917,9 @@ static int parse_frame_hdr(Dav1dContext *const c, GetBits *const gb) {
hdr->skip_mode_allowed = 0;
if (hdr->switchable_comp_refs && hdr->frame_type & 1 && seqhdr->order_hint) {
const unsigned poc = hdr->frame_offset;
unsigned off_before[2] = { 0xFFFFFFFF, 0xFFFFFFFF };
unsigned off_before = 0xFFFFFFFFU;
int off_after = -1;
int off_before_idx[2], off_after_idx;
off_before_idx[0] = 0;
int off_before_idx, off_after_idx;
for (int i = 0; i < 7; i++) {
if (!c->refs[hdr->refidx[i]].p.p.data[0]) return DAV1D_ERR(EINVAL);
const unsigned refpoc = c->refs[hdr->refidx[i]].p.p.frame_hdr->frame_offset;

@ -933,36 +932,42 @@ static int parse_frame_hdr(Dav1dContext *const c, GetBits *const gb) {
off_after = refpoc;
off_after_idx = i;
}
} else if (diff < 0) {
if (off_before[0] == 0xFFFFFFFFU ||
get_poc_diff(seqhdr->order_hint_n_bits,
refpoc, off_before[0]) > 0)
{
off_before[1] = off_before[0];
off_before[0] = refpoc;
off_before_idx[1] = off_before_idx[0];
off_before_idx[0] = i;
} else if (refpoc != off_before[0] &&
(off_before[1] == 0xFFFFFFFFU ||
get_poc_diff(seqhdr->order_hint_n_bits,
refpoc, off_before[1]) > 0))
{
off_before[1] = refpoc;
off_before_idx[1] = i;
}
} else if (diff < 0 && (off_before == 0xFFFFFFFFU ||
get_poc_diff(seqhdr->order_hint_n_bits,
refpoc, off_before) > 0))
{
off_before = refpoc;
off_before_idx = i;
}
}

if (off_before[0] != 0xFFFFFFFFU && off_after != -1) {
hdr->skip_mode_refs[0] = imin(off_before_idx[0], off_after_idx);
hdr->skip_mode_refs[1] = imax(off_before_idx[0], off_after_idx);
hdr->skip_mode_allowed = 1;
} else if (off_before[0] != 0xFFFFFFFFU &&
off_before[1] != 0xFFFFFFFFU)
{
hdr->skip_mode_refs[0] = imin(off_before_idx[0], off_before_idx[1]);
hdr->skip_mode_refs[1] = imax(off_before_idx[0], off_before_idx[1]);
if (off_before != 0xFFFFFFFFU && off_after != -1) {
hdr->skip_mode_refs[0] = imin(off_before_idx, off_after_idx);
hdr->skip_mode_refs[1] = imax(off_before_idx, off_after_idx);
hdr->skip_mode_allowed = 1;
} else if (off_before != 0xFFFFFFFFU) {
unsigned off_before2 = 0xFFFFFFFFU;
int off_before2_idx;
for (int i = 0; i < 7; i++) {
if (!c->refs[hdr->refidx[i]].p.p.data[0]) return DAV1D_ERR(EINVAL);
const unsigned refpoc = c->refs[hdr->refidx[i]].p.p.frame_hdr->frame_offset;
if (get_poc_diff(seqhdr->order_hint_n_bits,
refpoc, off_before) < 0) {
if (off_before2 == 0xFFFFFFFFU ||
get_poc_diff(seqhdr->order_hint_n_bits,
refpoc, off_before2) > 0)
{
off_before2 = refpoc;
off_before2_idx = i;
}
}
}

if (off_before2 != 0xFFFFFFFFU) {
hdr->skip_mode_refs[0] = imin(off_before_idx, off_before2_idx);
hdr->skip_mode_refs[1] = imax(off_before_idx, off_before2_idx);
hdr->skip_mode_allowed = 1;
}
}
}
hdr->skip_mode_enabled = hdr->skip_mode_allowed ? dav1d_get_bits(gb, 1) : 0;
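A hedged C model of the selection logic after this rewrite, with poc_diff() standing in for get_poc_diff() and plain ints for order hints: take the nearest future reference plus the nearest past one; failing that, fall back to the two nearest past references, found in a second pass.

    /* Sketch: refpoc[] holds the 7 reference order hints, cur the current
     * frame's. Returns 1 and fills out[2] with the chosen ref indices. */
    static int poc_diff(int a, int b) { return a - b; }

    static int pick_skip_refs(int cur, const int refpoc[7], int out[2]) {
        int before_idx = -1, after_idx = -1;
        for (int i = 0; i < 7; i++) {
            const int d = poc_diff(refpoc[i], cur);
            if (d > 0 && (after_idx < 0 ||
                          poc_diff(refpoc[i], refpoc[after_idx]) < 0))
                after_idx = i;  /* nearest ref after cur */
            else if (d < 0 && (before_idx < 0 ||
                               poc_diff(refpoc[i], refpoc[before_idx]) > 0))
                before_idx = i; /* nearest ref before cur */
        }
        if (before_idx < 0) return 0;
        if (after_idx < 0) { /* second pass: next-nearest past ref */
            for (int i = 0; i < 7; i++)
                if (poc_diff(refpoc[i], refpoc[before_idx]) < 0 &&
                    (after_idx < 0 ||
                     poc_diff(refpoc[i], refpoc[after_idx]) > 0))
                    after_idx = i;
            if (after_idx < 0) return 0;
        }
        out[0] = before_idx < after_idx ? before_idx : after_idx;
        out[1] = before_idx < after_idx ? after_idx : before_idx;
        return 1;
    }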
@ -53,7 +53,7 @@ static inline i16x8 vconstrain(const i16x8 diff, const int16_t threshold,

static inline void copy4xN(uint16_t *tmp, const ptrdiff_t tmp_stride,
const uint8_t *src, const ptrdiff_t src_stride,
const uint8_t (*left)[2], uint8_t *const top[2],
const uint8_t (*left)[2], const uint8_t *const top,
const int w, const int h,
const enum CdefEdgeFlags edges)
{

@ -70,8 +70,8 @@ static inline void copy4xN(uint16_t *tmp, const ptrdiff_t tmp_stride,
l1 = fill;
y_start = 0;
} else {
l0 = u8h_to_u16(vec_vsx_ld(0, top[0] - 2));
l1 = u8h_to_u16(vec_vsx_ld(0, top[1] - 2));
l0 = u8h_to_u16(vec_vsx_ld(0, top + 0 * src_stride - 2));
l1 = u8h_to_u16(vec_vsx_ld(0, top + 1 * src_stride - 2));
}

vec_st(l0, 0, tmp - 2 * 8);

@ -115,7 +115,7 @@ static inline void copy4xN(uint16_t *tmp, const ptrdiff_t tmp_stride,

static inline void copy8xN(uint16_t *tmp, const ptrdiff_t tmp_stride,
const uint8_t *src, const ptrdiff_t src_stride,
const uint8_t (*left)[2], uint8_t *const top[2],
const uint8_t (*left)[2], const uint8_t *const top,
const int w, const int h,
const enum CdefEdgeFlags edges)
{

@ -134,8 +134,8 @@ static inline void copy8xN(uint16_t *tmp, const ptrdiff_t tmp_stride,
l1l = fill;
y_start = 0;
} else {
u8x16 l0 = vec_vsx_ld(0, top[0] - 2);
u8x16 l1 = vec_vsx_ld(0, top[1] - 2);
u8x16 l0 = vec_vsx_ld(0, top + 0 * src_stride - 2);
u8x16 l1 = vec_vsx_ld(0, top + 1 * src_stride - 2);
l0h = u8h_to_u16(l0);
l0l = u8l_to_u16(l0);
l1h = u8h_to_u16(l1);

@ -275,7 +275,7 @@ static inline i16x8 max_mask(i16x8 a, i16x8 b) {

static inline void
filter_4xN(pixel *dst, const ptrdiff_t dst_stride,
const pixel (*left)[2], /*const*/ pixel *const top[2],
const pixel (*left)[2], const pixel *const top,
const int w, const int h, const int pri_strength,
const int sec_strength, const int dir,
const int damping, const enum CdefEdgeFlags edges,

@ -364,7 +364,7 @@ filter_4xN(pixel *dst, const ptrdiff_t dst_stride,

static inline void
filter_8xN(pixel *dst, const ptrdiff_t dst_stride,
const pixel (*left)[2], /*const*/ pixel *const top[2],
const pixel (*left)[2], const pixel *const top,
const int w, const int h, const int pri_strength,
const int sec_strength, const int dir,
const int damping, const enum CdefEdgeFlags edges,

@ -456,7 +456,7 @@ filter_8xN(pixel *dst, const ptrdiff_t dst_stride,
static void cdef_filter_##w##x##h##_vsx(pixel *const dst, \
const ptrdiff_t dst_stride, \
const pixel (*left)[2], \
/*const*/ pixel *const top[2], \
const pixel *const top, \
const int pri_strength, \
const int sec_strength, \
const int dir, \
@ -70,10 +70,10 @@ static inline unsigned get_skip_ctx(const TxfmInfo *const t_dim,
const int ss_hor = layout != DAV1D_PIXEL_LAYOUT_I444;
const int not_one_blk = b_dim[2] - (!!b_dim[2] && ss_hor) > t_dim->lw ||
b_dim[3] - (!!b_dim[3] && ss_ver) > t_dim->lh;
int ca, cl;
unsigned ca, cl;

#define MERGE_CTX(dir, type, mask) \
c##dir = !!((*(const type *) dir) & mask); \
#define MERGE_CTX(dir, type, no_val) \
c##dir = *(const type *) dir != no_val; \
break

switch (t_dim->lw) {

@ -83,17 +83,17 @@ static inline unsigned get_skip_ctx(const TxfmInfo *const t_dim,
* and will therefore complain about the use of uninitialized variables
* when compiled in debug mode if we put the default case at the end. */
default: assert(0); /* fall-through */
case TX_4X4: MERGE_CTX(a, uint8_t, 0x3F);
case TX_8X8: MERGE_CTX(a, uint16_t, 0x3F3F);
case TX_16X16: MERGE_CTX(a, uint32_t, 0x3F3F3F3FU);
case TX_32X32: MERGE_CTX(a, uint64_t, 0x3F3F3F3F3F3F3F3FULL);
case TX_4X4: MERGE_CTX(a, uint8_t, 0x40);
case TX_8X8: MERGE_CTX(a, uint16_t, 0x4040);
case TX_16X16: MERGE_CTX(a, uint32_t, 0x40404040U);
case TX_32X32: MERGE_CTX(a, uint64_t, 0x4040404040404040ULL);
}
switch (t_dim->lh) {
default: assert(0); /* fall-through */
case TX_4X4: MERGE_CTX(l, uint8_t, 0x3F);
case TX_8X8: MERGE_CTX(l, uint16_t, 0x3F3F);
case TX_16X16: MERGE_CTX(l, uint32_t, 0x3F3F3F3FU);
case TX_32X32: MERGE_CTX(l, uint64_t, 0x3F3F3F3F3F3F3F3FULL);
case TX_4X4: MERGE_CTX(l, uint8_t, 0x40);
case TX_8X8: MERGE_CTX(l, uint16_t, 0x4040);
case TX_16X16: MERGE_CTX(l, uint32_t, 0x40404040U);
case TX_32X32: MERGE_CTX(l, uint64_t, 0x4040404040404040ULL);
}
#undef MERGE_CTX
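A small model of the widened sentinel test introduced above, assuming 0x40 is a byte value that never occurs in live context entries (the diff encodes that assumption in its no_val constants):

    #include <stdint.h>
    #include <string.h>

    /* Test four context bytes at once: nonzero iff any byte differs from
     * the 0x40 sentinel. memcpy is the portable spelling of the
     * type-punned load the macro performs with a cast. */
    static unsigned any_ctx_set4(const uint8_t *dir) {
        uint32_t v;
        memcpy(&v, dir, sizeof(v));
        return v != 0x40404040U;
    }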
@ -352,13 +352,17 @@ static int decode_coefs(Dav1dTileContext *const t,
if (lossless) {
assert(t_dim->max == TX_4X4);
*txtp = WHT_WHT;
} else if (!f->frame_hdr->segmentation.qidx[b->seg_id] ||
t_dim->max + intra >= TX_64X64)
{
} else if (t_dim->max + intra >= TX_64X64) {
*txtp = DCT_DCT;
} else if (chroma) {
// inferred from either the luma txtp (inter) or a LUT (intra)
*txtp = intra ? dav1d_txtp_from_uvmode[b->uv_mode] :
get_uv_inter_txtp(t_dim, *txtp);
} else if (!f->frame_hdr->segmentation.qidx[b->seg_id]) {
// In libaom, lossless is checked by a literal qidx == 0, but not all
// such blocks are actually lossless. The remainder gets an implicit
// transform type (for luma)
*txtp = DCT_DCT;
} else {
unsigned idx;
if (intra) {
@ -1993,7 +1997,7 @@ void bytefn(dav1d_filter_sbrow)(Dav1dFrameContext *const f, const int sby) {
bytefn(dav1d_cdef_brow)(f, f->lf.p, f->lf.mask_ptr, sby * sbsz,
imin(sby * sbsz + n_blks, f->bh));
}
if (f->frame_hdr->super_res.enabled) {
if (f->frame_hdr->width[0] != f->frame_hdr->width[1]) {
const int has_chroma = f->cur.p.layout != DAV1D_PIXEL_LAYOUT_I400;
for (int pl = 0; pl < 1 + 2 * has_chroma; pl++) {
const int ss_ver = pl && f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I420;
@ -2094,10 +2094,7 @@ void dav1d_init_ref_mv_tile_row(AV1_COMMON *cm,

AV1_COMMON *dav1d_alloc_ref_mv_common(void);
AV1_COMMON *dav1d_alloc_ref_mv_common(void) {
AV1_COMMON *cm = malloc(sizeof(*cm));
if (!cm) return NULL;
memset(cm, 0, sizeof(*cm));
return cm;
return calloc(1, sizeof(AV1_COMMON));
}

void dav1d_free_ref_mv_common(AV1_COMMON *cm);
@ -397,6 +397,21 @@ const Dav1dWarpedMotionParams dav1d_default_wm_params = {
.delta = 0,
};

const int8_t dav1d_cdef_directions[2 + 8 + 2 /* dir */][2 /* pass */] = {
{  1 * 12 + 0,  2 * 12 + 0 }, // 6
{  1 * 12 + 0,  2 * 12 - 1 }, // 7
{ -1 * 12 + 1, -2 * 12 + 2 }, // 0
{  0 * 12 + 1, -1 * 12 + 2 }, // 1
{  0 * 12 + 1,  0 * 12 + 2 }, // 2
{  0 * 12 + 1,  1 * 12 + 2 }, // 3
{  1 * 12 + 1,  2 * 12 + 2 }, // 4
{  1 * 12 + 0,  2 * 12 + 1 }, // 5
{  1 * 12 + 0,  2 * 12 + 0 }, // 6
{  1 * 12 + 0,  2 * 12 - 1 }, // 7
{ -1 * 12 + 1, -2 * 12 + 2 }, // 0
{  0 * 12 + 1, -1 * 12 + 2 }, // 1
};

const int16_t dav1d_sgr_params[16][4] = { // r0, r1, e0, e1
{ 2, 1, 140, 3236 }, { 2, 1, 112, 2158 }, { 2, 1, 93, 1618 },
{ 2, 1,  80, 1438 }, { 2, 1,  70, 1295 }, { 2, 1, 58, 1177 },
@ -105,6 +105,8 @@ static const unsigned interintra_allowed_mask =

extern const Dav1dWarpedMotionParams dav1d_default_wm_params;

extern const int8_t dav1d_cdef_directions[12][2];

extern const int16_t dav1d_sgr_params[16][4];
extern const uint8_t dav1d_sgr_x_by_x[256];
@ -30,6 +30,7 @@

#if defined(_WIN32)

#include <limits.h>
#include <windows.h>

#define PTHREAD_ONCE_INIT INIT_ONCE_STATIC_INIT

@ -72,8 +73,10 @@ static inline int pthread_attr_destroy(pthread_attr_t *const attr) {
}

static inline int pthread_attr_setstacksize(pthread_attr_t *const attr,
const unsigned stack_size)
const size_t stack_size)
{
attr->stack_size = stack_size;
if (stack_size > UINT_MAX) return 1;
attr->stack_size = (unsigned) stack_size;
return 0;
}
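A usage sketch of the stricter Win32 shim above, with the attr type reduced to its one relevant field; the bound check mirrors the hunk, since the stack size ultimately has to fit the unsigned value Win32 thread creation accepts:

    #include <limits.h>
    #include <stddef.h>

    typedef struct { unsigned stack_size; } attr_sketch_t;

    /* Fail loudly instead of silently truncating an oversized request. */
    static int setstacksize_sketch(attr_sketch_t *attr, size_t stack_size) {
        if (stack_size > UINT_MAX) return 1;
        attr->stack_size = (unsigned) stack_size;
        return 0;
    }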
(Diff for this file not shown because of its size.)
@ -28,20 +28,16 @@

#include "src/cpu.h"
#include "src/cdef.h"

decl_cdef_fn(dav1d_cdef_filter_8x8_avx2);
decl_cdef_fn(dav1d_cdef_filter_8x8_sse4);
decl_cdef_fn(dav1d_cdef_filter_8x8_ssse3);
decl_cdef_fn(dav1d_cdef_filter_8x8_sse2);
#define decl_cdef_size_fn(sz) \
decl_cdef_fn(dav1d_cdef_filter_##sz##_avx512icl); \
decl_cdef_fn(dav1d_cdef_filter_##sz##_avx2); \
decl_cdef_fn(dav1d_cdef_filter_##sz##_sse4); \
decl_cdef_fn(dav1d_cdef_filter_##sz##_ssse3); \
decl_cdef_fn(dav1d_cdef_filter_##sz##_sse2)

decl_cdef_fn(dav1d_cdef_filter_4x8_avx2);
decl_cdef_fn(dav1d_cdef_filter_4x8_sse4);
decl_cdef_fn(dav1d_cdef_filter_4x8_ssse3);
decl_cdef_fn(dav1d_cdef_filter_4x8_sse2);

decl_cdef_fn(dav1d_cdef_filter_4x4_avx2);
decl_cdef_fn(dav1d_cdef_filter_4x4_sse4);
decl_cdef_fn(dav1d_cdef_filter_4x4_ssse3);
decl_cdef_fn(dav1d_cdef_filter_4x4_sse2);
decl_cdef_size_fn(4x4);
decl_cdef_size_fn(4x8);
decl_cdef_size_fn(8x8);

decl_cdef_dir_fn(dav1d_cdef_dir_avx2);
decl_cdef_dir_fn(dav1d_cdef_dir_sse4);

@ -76,12 +72,21 @@ COLD void bitfn(dav1d_cdef_dsp_init_x86)(Dav1dCdefDSPContext *const c) {
c->fb[2] = dav1d_cdef_filter_4x4_sse4;
#endif

#if ARCH_X86_64
if (!(flags & DAV1D_X86_CPU_FLAG_AVX2)) return;

#if BITDEPTH == 8 && ARCH_X86_64
#if BITDEPTH == 8
c->dir = dav1d_cdef_dir_avx2;
c->fb[0] = dav1d_cdef_filter_8x8_avx2;
c->fb[1] = dav1d_cdef_filter_4x8_avx2;
c->fb[2] = dav1d_cdef_filter_4x4_avx2;
#endif

if (!(flags & DAV1D_X86_CPU_FLAG_AVX512ICL)) return;

#if BITDEPTH == 8
c->fb[2] = dav1d_cdef_filter_4x4_avx512icl;
#endif

#endif
}
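For reference, decl_cdef_size_fn(4x4) from the hunk above expands (written out by hand) to the per-ISA declarations it replaces:

    /* Hand-expanded decl_cdef_size_fn(4x4): */
    decl_cdef_fn(dav1d_cdef_filter_4x4_avx512icl);
    decl_cdef_fn(dav1d_cdef_filter_4x4_avx2);
    decl_cdef_fn(dav1d_cdef_filter_4x4_sse4);
    decl_cdef_fn(dav1d_cdef_filter_4x4_ssse3);
    decl_cdef_fn(dav1d_cdef_filter_4x4_sse2);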
@ -364,26 +364,19 @@ cglobal cdef_filter_%1x%2, 2, 7, 8, - 7 * 16 - (%2+4)*%3, \
.body_done:

; top
%if ARCH_X86_64
DEFINE_ARGS dst, stride, left, top2, pri, sec, stride3, top1, edge
%else
DEFINE_ARGS dst, stride, left, top2, stride3, top1, edge
%endif
LOAD_ARG32 top
test edged, 4 ; have_top
jz .no_top
mov top1q, [top2q+0*gprsize]
mov top2q, [top2q+1*gprsize]
test edged, 1 ; have_left
jz .top_no_left
test edged, 2 ; have_right
jz .top_no_right
%if %1 == 4
PMOVZXBW m0, [top1q-2]
PMOVZXBW m1, [top2q-2]
PMOVZXBW m0, [topq+strideq*0-2]
PMOVZXBW m1, [topq+strideq*1-2]
%else
movu m0, [top1q-4]
movu m1, [top2q-4]
movu m0, [topq+strideq*0-4]
movu m1, [topq+strideq*1-4]
punpckhbw m2, m0, m15
punpcklbw m0, m15
punpckhbw m3, m1, m15

@ -396,13 +389,13 @@ cglobal cdef_filter_%1x%2, 2, 7, 8, - 7 * 16 - (%2+4)*%3, \
jmp .top_done
.top_no_right:
%if %1 == 4
PMOVZXBW m0, [top1q-%1]
PMOVZXBW m1, [top2q-%1]
PMOVZXBW m0, [topq+strideq*0-%1]
PMOVZXBW m1, [topq+strideq*1-%1]
movu [px-2*%3-4*2], m0
movu [px-1*%3-4*2], m1
%else
movu m0, [top1q-%1]
movu m1, [top2q-%2]
movu m0, [topq+strideq*0-%1]
movu m1, [topq+strideq*1-%2]
punpckhbw m2, m0, m15
punpcklbw m0, m15
punpckhbw m3, m1, m15

@ -419,11 +412,11 @@ cglobal cdef_filter_%1x%2, 2, 7, 8, - 7 * 16 - (%2+4)*%3, \
test edged, 2 ; have_right
jz .top_no_left_right
%if %1 == 4
PMOVZXBW m0, [top1q]
PMOVZXBW m1, [top2q]
PMOVZXBW m0, [topq+strideq*0]
PMOVZXBW m1, [topq+strideq*1]
%else
movu m0, [top1q]
movu m1, [top2q]
movu m0, [topq+strideq*0]
movu m1, [topq+strideq*1]
punpckhbw m2, m0, m15
punpcklbw m0, m15
punpckhbw m3, m1, m15

@ -437,8 +430,8 @@ cglobal cdef_filter_%1x%2, 2, 7, 8, - 7 * 16 - (%2+4)*%3, \
mov dword [px-1*%3-4], OUT_OF_BOUNDS
jmp .top_done
.top_no_left_right:
PMOVZXBW m0, [top1q], %1 == 4
PMOVZXBW m1, [top2q], %1 == 4
PMOVZXBW m0, [topq+strideq*0], %1 == 4
PMOVZXBW m1, [topq+strideq*1], %1 == 4
mova [px-2*%3], m0
mova [px-1*%3], m1
mov dword [px-2*%3+%1*2], OUT_OF_BOUNDS

@ -630,9 +623,9 @@ cglobal cdef_filter_%1x%2, 2, 7, 8, - 7 * 16 - (%2+4)*%3, \
sub secdmpd, dampingd
xor dampingd, dampingd
neg pridmpd
cmovl pridmpd, dampingd
cmovs pridmpd, dampingd
neg secdmpd
cmovl secdmpd, dampingd
cmovs secdmpd, dampingd
%if ARCH_X86_64
mov [rsp+ 0], pridmpq ; pri_shift
mov [rsp+16], secdmpq ; sec_shift
@ -33,37 +33,44 @@

#include "src/x86/cpu.h"

void dav1d_cpu_cpuid(uint32_t *info, int leaf);
uint64_t dav1d_cpu_xgetbv(int xcr);
typedef struct {
uint32_t eax, ebx, ecx, edx;
} CpuidRegisters;

void dav1d_cpu_cpuid(CpuidRegisters *regs, unsigned leaf, unsigned subleaf);
uint64_t dav1d_cpu_xgetbv(unsigned xcr);

#define X(reg, mask) (((reg) & (mask)) == (mask))

COLD unsigned dav1d_get_cpu_flags_x86(void) {
uint32_t info[4] = {0}, n_ids;
CpuidRegisters r = { 0 };
dav1d_cpu_cpuid(&r, 0, 0);
const unsigned max_leaf = r.eax;
unsigned flags = 0;

dav1d_cpu_cpuid(info, 0);
n_ids = info[0];

if (n_ids >= 1) {
dav1d_cpu_cpuid(info, 1);
if (info[3] & (1 << 25)) flags |= DAV1D_X86_CPU_FLAG_SSE;
if (info[3] & (1 << 26)) flags |= DAV1D_X86_CPU_FLAG_SSE2;
if (info[2] & (1 << 0)) flags |= DAV1D_X86_CPU_FLAG_SSE3;
if (info[2] & (1 << 9)) flags |= DAV1D_X86_CPU_FLAG_SSSE3;
if (info[2] & (1 << 19)) flags |= DAV1D_X86_CPU_FLAG_SSE41;
if (info[2] & (1 << 20)) flags |= DAV1D_X86_CPU_FLAG_SSE42;
if (max_leaf >= 1) {
dav1d_cpu_cpuid(&r, 1, 0);
if (X(r.edx, 0x06008000)) /* CMOV/SSE/SSE2 */ {
flags |= DAV1D_X86_CPU_FLAG_SSE2;
if (X(r.ecx, 0x00000201)) /* SSE3/SSSE3 */ {
flags |= DAV1D_X86_CPU_FLAG_SSSE3;
if (X(r.ecx, 0x00080000)) /* SSE4.1 */
flags |= DAV1D_X86_CPU_FLAG_SSE41;
}
}
#if ARCH_X86_64
/* We only support >128-bit SIMD on x86-64. */
if (info[2] & (1 << 27)) /* OSXSAVE */ {
uint64_t xcr = dav1d_cpu_xgetbv(0);
if ((xcr & 0x00000006) == 0x00000006) /* XMM/YMM */ {
if (info[2] & (1 << 28)) flags |= DAV1D_X86_CPU_FLAG_AVX;
if (n_ids >= 7) {
dav1d_cpu_cpuid(info, 7);
if ((info[1] & 0x00000128) == 0x00000128)
if (X(r.ecx, 0x18000000)) /* OSXSAVE/AVX */ {
const uint64_t xcr0 = dav1d_cpu_xgetbv(0);
if (X(xcr0, 0x00000006)) /* XMM/YMM */ {
if (max_leaf >= 7) {
dav1d_cpu_cpuid(&r, 7, 0);
if (X(r.ebx, 0x00000128)) /* BMI1/BMI2/AVX2 */ {
flags |= DAV1D_X86_CPU_FLAG_AVX2;
if ((xcr & 0x000000e0) == 0x000000e0) /* ZMM/OPMASK */ {
if ((info[1] & 0xd0030000) == 0xd0030000)
flags |= DAV1D_X86_CPU_FLAG_AVX512;
if (X(xcr0, 0x000000e0)) /* ZMM/OPMASK */ {
if (X(r.ebx, 0xd0230000) && X(r.ecx, 0x00005f42))
flags |= DAV1D_X86_CPU_FLAG_AVX512ICL;
}
}
}
}
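A minimal, runnable model of the X() all-bits-set test used above; the mask shown is the real leaf-1 EDX combination from the hunk (CMOV bit 15, SSE bit 25, SSE2 bit 26):

    #include <stdint.h>

    #define X(reg, mask) (((reg) & (mask)) == (mask))

    int main(void) {
        const uint32_t edx = (1u << 15) | (1u << 25) | (1u << 26);
        /* passes only when every bit in the mask is set */
        return X(edx, 0x06008000u) ? 0 : 1;
    }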
@ -29,15 +29,12 @@
#define DAV1D_SRC_X86_CPU_H

enum CpuFlags {
DAV1D_X86_CPU_FLAG_SSE = 1 << 0,
DAV1D_X86_CPU_FLAG_SSE2 = 1 << 1,
DAV1D_X86_CPU_FLAG_SSE3 = 1 << 2,
DAV1D_X86_CPU_FLAG_SSSE3 = 1 << 3,
DAV1D_X86_CPU_FLAG_SSE41 = 1 << 4,
DAV1D_X86_CPU_FLAG_SSE42 = 1 << 5,
DAV1D_X86_CPU_FLAG_AVX = 1 << 6,
DAV1D_X86_CPU_FLAG_AVX2 = 1 << 7,
DAV1D_X86_CPU_FLAG_AVX512 = 1 << 8, /* F + CD + BW + DQ + VL */
DAV1D_X86_CPU_FLAG_SSE2 = 1 << 0,
DAV1D_X86_CPU_FLAG_SSSE3 = 1 << 1,
DAV1D_X86_CPU_FLAG_SSE41 = 1 << 2,
DAV1D_X86_CPU_FLAG_AVX2 = 1 << 3,
DAV1D_X86_CPU_FLAG_AVX512ICL = 1 << 4, /* F/CD/BW/DQ/VL/VNNI/IFMA/VBMI/VBMI2/
* VPOPCNTDQ/BITALG/GFNI/VAES/VPCLMULQDQ */
};

unsigned dav1d_get_cpu_flags_x86(void);
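The trimmed enum pairs with the tiered early-return style visible in the init functions elsewhere in this patch; a sketch of that pattern (kernel installation elided, and the tier ordering is an assumption of this sketch rather than a documented dav1d guarantee):

    enum { SSSE3 = 1 << 1, AVX2 = 1 << 3, AVX512ICL = 1 << 4 };

    static void dsp_init_sketch(unsigned flags) {
        if (!(flags & SSSE3)) return;
        /* ... install SSSE3 kernels ... */
        if (!(flags & AVX2)) return;
        /* ... install AVX2 kernels ... */
        if (!(flags & AVX512ICL)) return;
        /* ... install AVX-512 ICL kernels ... */
    }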
@ -27,12 +27,12 @@

SECTION .text

cglobal cpu_cpuid, 0, 5, 0, info, leaf
mov r4, infomp
cglobal cpu_cpuid, 0, 5, 0, regs, leaf, subleaf
mov r4, regsmp
mov eax, leafm
xor ecx, ecx
mov ecx, subleafm
%if ARCH_X86_64
push rbx
mov r5, rbx
%endif
cpuid
mov [r4+4*0], eax

@ -40,7 +40,7 @@ cglobal cpu_cpuid, 0, 5, 0, info, leaf
mov [r4+4*2], ecx
mov [r4+4*3], edx
%if ARCH_X86_64
pop rbx
mov rbx, r5
%endif
RET
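From the C side, the new subleaf argument matters because CPUID leaf 7 returns its base feature bits only when ECX is 0. A sketch using the prototype from the header change above (linking against dav1d's asm helper is assumed):

    #include <stdint.h>

    typedef struct { uint32_t eax, ebx, ecx, edx; } CpuidRegisters;
    void dav1d_cpu_cpuid(CpuidRegisters *regs, unsigned leaf, unsigned subleaf);

    static uint32_t leaf7_ebx(void) {
        CpuidRegisters r = { 0 };
        dav1d_cpu_cpuid(&r, 7, 0); /* subleaf 0: base feature flags */
        return r.ebx;              /* BMI1/BMI2/AVX2/AVX-512 bits */
    }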
@ -44,6 +44,7 @@ round_vals: dw 32, 64, 128, 256, 512
max: dw 255, 240, 235
min: dw 0, 16
pb_27_17_17_27: db 27, 17, 17, 27
pw_1: dw 1

%macro JMP_TABLE 1-*
%xdefine %1_table %%table

@ -56,6 +57,7 @@ pb_27_17_17_27: db 27, 17, 17, 27
%endrep
%endmacro

ALIGN 4
JMP_TABLE generate_grain_y_avx2, 0, 1, 2, 3
JMP_TABLE generate_grain_uv_420_avx2, 0, 1, 2, 3

@ -69,8 +71,8 @@ struc FGData
.scaling_shift: resd 1
.ar_coeff_lag: resd 1
.ar_coeffs_y: resb 24
.ar_coeffs_uv: resb 2 * 26 ; includes padding
.ar_coeff_shift: resd 1
.ar_coeffs_uv: resb 2 * 28 ; includes padding
.ar_coeff_shift: resq 1
.grain_scale_shift: resd 1
.uv_mult: resd 2
.uv_luma_mult: resd 2

@ -169,9 +171,9 @@ cglobal generate_grain_y, 2, 9, 16, buf, fg_data
movsx val0d, byte [bufq+xq]
add val3d, val0d
cmp val3d, maxd
cmovg val3d, maxd
cmovns val3d, maxd
cmp val3d, mind
cmovl val3d, mind
cmovs val3d, mind
mov byte [bufq+xq], val3b
; keep val3d in-place as left for next x iteration
inc xq

@ -190,18 +192,19 @@ cglobal generate_grain_y, 2, 9, 16, buf, fg_data
.ar2:
DEFINE_ARGS buf, fg_data, shift
mov shiftd, [fg_dataq+FGData.ar_coeff_shift]
movd xm14, [base+hmul_bits-10+shiftq*2]
vpbroadcastw xm14, [base+round_vals-12+shiftq*2]
movq xm15, [base+byte_blend+1]
pmovsxbw xm8, [fg_dataq+FGData.ar_coeffs_y+0] ; cf0-7
movd xm9, [fg_dataq+FGData.ar_coeffs_y+8] ; cf8-11
pmovsxbw xm9, xm9
DEFINE_ARGS buf, h, x
DEFINE_ARGS buf, fg_data, h, x
pshufd xm12, xm9, q0000
pshufd xm13, xm9, q1111
pshufd xm11, xm8, q3333
pshufd xm10, xm8, q2222
pshufd xm9, xm8, q1111
pshufd xm8, xm8, q0000
pmovzxwd xm14, xm14
sub bufq, 82*73-(82*3+79)
mov hd, 70
.y_loop_ar2:

@ -233,6 +236,7 @@ cglobal generate_grain_y, 2, 9, 16, buf, fg_data
paddd xm4, xm6
paddd xm2, xm7
paddd xm2, xm4
paddd xm2, xm14

movq xm0, [bufq+xq-2] ; y=0,x=[-2,+5]
.x_loop_ar2_inner:

@ -241,9 +245,8 @@ cglobal generate_grain_y, 2, 9, 16, buf, fg_data
paddd xm3, xm2
psrldq xm1, 4 ; y=0,x=0
psrldq xm2, 4 ; shift top to next pixel
psrad xm3, 5
packssdw xm3, xm3
pmulhrsw xm3, xm14
psrad xm3, [fg_dataq+FGData.ar_coeff_shift]
; don't packssdw since we only care about one value
paddw xm3, xm1
packsswb xm3, xm3
pextrb [bufq+xq], xm3, 0

@ -274,7 +277,7 @@ cglobal generate_grain_y, 2, 9, 16, buf, fg_data
ALLOC_STACK 16*12
%endif
mov shiftd, [fg_dataq+FGData.ar_coeff_shift]
movd xm14, [base+hmul_bits-10+shiftq*2]
vpbroadcastw xm14, [base+round_vals-12+shiftq*2]
movq xm15, [base+byte_blend]
pmovsxbw xm0, [fg_dataq+FGData.ar_coeffs_y+ 0] ; cf0-7
pmovsxbw xm1, [fg_dataq+FGData.ar_coeffs_y+ 8] ; cf8-15

@ -288,10 +291,11 @@ cglobal generate_grain_y, 2, 9, 16, buf, fg_data
pshufd xm8, xm1, q3333
pshufd xm1, xm1, q0000
pshufd xm3, xm2, q1111
psrldq xm13, xm2, 10
pinsrw xm2, [pw_1], 5
pshufd xm4, xm2, q2222
psrldq xm5, xm2, 10
pshufd xm2, xm2, q0000
pinsrw xm5, [base+round_vals+shiftq*2-10], 3
pinsrw xm13, [base+round_vals+shiftq*2-10], 3
mova [rsp+ 0*16], xm0
mova [rsp+ 1*16], xm9
mova [rsp+ 2*16], xm10

@ -303,9 +307,7 @@ cglobal generate_grain_y, 2, 9, 16, buf, fg_data
mova [rsp+ 8*16], xm2
mova [rsp+ 9*16], xm3
mova [rsp+10*16], xm4
mova [rsp+11*16], xm5
pxor xm13, xm13
DEFINE_ARGS buf, h, x
DEFINE_ARGS buf, fg_data, h, x
sub bufq, 82*73-(82*3+79)
mov hd, 70
.y_loop_ar3:

@ -374,7 +376,7 @@ cglobal generate_grain_y, 2, 9, 16, buf, fg_data

punpcklwd xm6, xm7
punpcklwd xm8, xm9
punpcklwd xm5, xm13
punpcklwd xm5, xm14
pmaddwd xm6, [rsp+ 8*16]
pmaddwd xm8, [rsp+ 9*16]
pmaddwd xm5, [rsp+10*16]

@ -385,14 +387,13 @@ cglobal generate_grain_y, 2, 9, 16, buf, fg_data
movq xm1, [bufq+xq-3] ; y=0,x=[-3,+4]
.x_loop_ar3_inner:
pmovsxbw xm2, xm1
pmaddwd xm2, [rsp+16*11]
pmaddwd xm2, xm13
pshufd xm3, xm2, q1111
paddd xm2, xm3 ; left+cur
paddd xm2, xm0 ; add top
psrldq xm0, 4
psrad xm2, 5
packssdw xm2, xm2
pmulhrsw xm2, xm14
psrad xm2, [fg_dataq+FGData.ar_coeff_shift]
; don't packssdw since we only care about one value
packsswb xm2, xm2
pextrb [bufq+xq], xm2, 0
pslldq xm2, 3

@ -468,7 +469,7 @@ cglobal generate_grain_uv_420, 4, 10, 16, buf, bufy, fg_data, uv
.ar0:
INIT_YMM avx2
DEFINE_ARGS buf, bufy, fg_data, uv, unused, shift
imul uvd, 25
imul uvd, 28
mov shiftd, [fg_dataq+FGData.ar_coeff_shift]
movd xm4, [fg_dataq+FGData.ar_coeffs_uv+uvq]
movd xm3, [base+hmul_bits+shiftq*2]

@ -538,7 +539,7 @@ cglobal generate_grain_uv_420, 4, 10, 16, buf, bufy, fg_data, uv
.ar1:
INIT_XMM avx2
DEFINE_ARGS buf, bufy, fg_data, uv, val3, cf3, min, max, x, shift
imul uvd, 25
imul uvd, 28
mov shiftd, [fg_dataq+FGData.ar_coeff_shift]
movsx cf3d, byte [fg_dataq+FGData.ar_coeffs_uv+uvq+3]
movd xm4, [fg_dataq+FGData.ar_coeffs_uv+uvq]

@ -584,9 +585,9 @@ cglobal generate_grain_uv_420, 4, 10, 16, buf, bufy, fg_data, uv
movsx val0d, byte [bufq+xq]
add val3d, val0d
cmp val3d, maxd
cmovg val3d, maxd
cmovns val3d, maxd
cmp val3d, mind
cmovl val3d, mind
cmovs val3d, mind
mov byte [bufq+xq], val3b
; keep val3d in-place as left for next x iteration
inc xq

@ -605,18 +606,17 @@ cglobal generate_grain_uv_420, 4, 10, 16, buf, bufy, fg_data, uv
.ar2:
DEFINE_ARGS buf, bufy, fg_data, uv, unused, shift
mov shiftd, [fg_dataq+FGData.ar_coeff_shift]
imul uvd, 25
movd xm15, [base+hmul_bits-10+shiftq*2]
imul uvd, 28
vpbroadcastw xm15, [base+round_vals-12+shiftq*2]
pmovsxbw xm8, [fg_dataq+FGData.ar_coeffs_uv+uvq+0] ; cf0-7
pmovsxbw xm9, [fg_dataq+FGData.ar_coeffs_uv+uvq+8] ; cf8-12
pinsrw xm9, [base+pw_1], 5
vpbroadcastw xm7, [base+hmul_bits+4]
vpbroadcastd xm6, [base+pb_1]
DEFINE_ARGS buf, bufy, h, x
DEFINE_ARGS buf, bufy, fg_data, h, unused, x
pshufd xm12, xm9, q0000
pshufd xm13, xm9, q1111
pshufd xm14, xm9, q2222
pxor xm10, xm10
vpblendw xm14, xm10, 10101010b
pshufd xm11, xm8, q3333
pshufd xm10, xm8, q2222
pshufd xm9, xm8, q1111

@ -660,7 +660,7 @@ cglobal generate_grain_uv_420, 4, 10, 16, buf, bufy, fg_data, uv
pmaddubsw xm3, xm6, xm3
paddw xm0, xm3
pmulhrsw xm0, xm7
punpcklwd xm0, xm0
punpcklwd xm0, xm15
pmaddwd xm0, xm14
paddd xm2, xm0

@ -670,9 +670,7 @@ cglobal generate_grain_uv_420, 4, 10, 16, buf, bufy, fg_data, uv
pmaddwd xm3, xm0, xm13
paddd xm3, xm2
psrldq xm2, 4 ; shift top to next pixel
psrad xm3, 5
packssdw xm3, xm3
pmulhrsw xm3, xm15
psrad xm3, [fg_dataq+FGData.ar_coeff_shift]
pslldq xm3, 2
psrldq xm0, 2
paddw xm3, xm0

@ -698,8 +696,8 @@ cglobal generate_grain_uv_420, 4, 10, 16, buf, bufy, fg_data, uv
%assign stack_size_padded (stack_size_padded+16*12)
%assign stack_size (stack_size+16*12)
mov shiftd, [fg_dataq+FGData.ar_coeff_shift]
imul uvd, 25
movd xm14, [base+hmul_bits-10+shiftq*2]
imul uvd, 28
vpbroadcastw xm14, [base+round_vals-12+shiftq*2]
pmovsxbw xm0, [fg_dataq+FGData.ar_coeffs_uv+uvq+ 0] ; cf0-7
pmovsxbw xm1, [fg_dataq+FGData.ar_coeffs_uv+uvq+ 8] ; cf8-15
pmovsxbw xm2, [fg_dataq+FGData.ar_coeffs_uv+uvq+16] ; cf16-23

@ -719,6 +717,7 @@ cglobal generate_grain_uv_420, 4, 10, 16, buf, bufy, fg_data, uv
psrldq xm5, xm2, 10
pshufd xm2, xm2, q0000
pinsrw xm5, [base+round_vals+shiftq*2-10], 3
pmovzxwd xm14, xm14
mova [rsp+ 0*16], xm0
mova [rsp+ 1*16], xm9
mova [rsp+ 2*16], xm10

@ -733,7 +732,7 @@ cglobal generate_grain_uv_420, 4, 10, 16, buf, bufy, fg_data, uv
mova [rsp+11*16], xm5
vpbroadcastd xm13, [base+pb_1]
vpbroadcastw xm15, [base+hmul_bits+4]
DEFINE_ARGS buf, bufy, h, x
DEFINE_ARGS buf, bufy, fg_data, h, unused, x
sub bufq, 82*38+44-(82*3+41)
add bufyq, 79+82*3
mov hd, 35

@ -817,6 +816,7 @@ cglobal generate_grain_uv_420, 4, 10, 16, buf, bufy, fg_data, uv
paddd xm0, xm6
paddd xm8, xm5
paddd xm0, xm8
paddd xm0, xm14

movq xm1, [bufq+xq-3] ; y=0,x=[-3,+4]
.x_loop_ar3_inner:

@ -826,9 +826,8 @@ cglobal generate_grain_uv_420, 4, 10, 16, buf, bufy, fg_data, uv
paddd xm2, xm3 ; left+cur
paddd xm2, xm0 ; add top
psrldq xm0, 4
psrad xm2, 5
packssdw xm2, xm2
pmulhrsw xm2, xm14
psrad xm2, [fg_dataq+FGData.ar_coeff_shift]
; don't packssdw, we only care about one value
pslldq xm2, 6
vpblendw xm1, xm2, 1000b
packsswb xm1, xm1
@ -28,6 +28,11 @@
#include "src/cpu.h"
#include "src/film_grain.h"

decl_generate_grain_y_fn(dav1d_generate_grain_y_ssse3);
decl_generate_grain_uv_fn(dav1d_generate_grain_uv_420_ssse3);
decl_fgy_32x32xn_fn(dav1d_fgy_32x32xn_ssse3);
decl_fguv_32x32xn_fn(dav1d_fguv_32x32xn_i420_ssse3);

decl_generate_grain_y_fn(dav1d_generate_grain_y_avx2);
decl_generate_grain_uv_fn(dav1d_generate_grain_uv_420_avx2);
decl_fgy_32x32xn_fn(dav1d_fgy_32x32xn_avx2);

@ -36,6 +41,15 @@ decl_fguv_32x32xn_fn(dav1d_fguv_32x32xn_i420_avx2);
COLD void bitfn(dav1d_film_grain_dsp_init_x86)(Dav1dFilmGrainDSPContext *const c) {
const unsigned flags = dav1d_get_cpu_flags();

if (!(flags & DAV1D_X86_CPU_FLAG_SSSE3)) return;

#if BITDEPTH == 8
c->generate_grain_y = dav1d_generate_grain_y_ssse3;
c->generate_grain_uv[DAV1D_PIXEL_LAYOUT_I420 - 1] = dav1d_generate_grain_uv_420_ssse3;
c->fgy_32x32xn = dav1d_fgy_32x32xn_ssse3;
c->fguv_32x32xn[DAV1D_PIXEL_LAYOUT_I420 - 1] = dav1d_fguv_32x32xn_i420_ssse3;
#endif

if (!(flags & DAV1D_X86_CPU_FLAG_AVX2)) return;

#if BITDEPTH == 8 && ARCH_X86_64
(Diff for this file not shown because of its size.)
@ -308,7 +308,7 @@ ALIGN function_align
mov r6d, 0x5556
mov r2d, 0x3334
cmp hd, 32
cmovz r6d, r2d
cmove r6d, r2d
movd xm1, r6d
pmulhuw xm0, xm1
.w8_end:

@ -1441,7 +1441,7 @@ ALIGN function_align
mov r3d, 9
mov tlq, rsp
cmp hd, 4
cmova maxbased, r3d
cmovne maxbased, r3d
vextracti128 xm1, m0, 1
packuswb xm0, xm1
mova [tlq], xm0

@ -1628,8 +1628,8 @@ ALIGN function_align
sar r5d, 1
mov tlq, rsp
add r5d, 17 ; w*2 + (filter_strength == 3)
cmp hd, 8
cmova maxbased, r5d
cmp hd, 16
cmovns maxbased, r5d
mov [tlq+r5], r3b
vextracti128 xm0, m1, 1
packuswb xm0, xm1

@ -1745,8 +1745,8 @@ ALIGN function_align
sar r5d, 1
mov tlq, rsp
add r5d, 33
cmp hd, 16
cmova maxbased, r5d
cmp hd, 32
cmovns maxbased, r5d
mov [tlq+r5], r3b
packuswb m0, m1
vpermq m0, m0, q3120

@ -1812,7 +1812,7 @@ ALIGN function_align
lea r3d, [hq+31]
mov maxbased, 63
cmp hd, 32
cmovb maxbased, r3d
cmovs maxbased, r3d
test angled, 0x400 ; !enable_intra_edge_filter
jnz .w32_main
vbroadcasti128 m0, [pb_0to15]

@ -1889,8 +1889,8 @@ ALIGN function_align
mov tlq, rsp
mov [tlq+65], r3b
mov r3d, 65
cmp hd, 32
cmova maxbased, r3d
cmp hd, 64
cmove maxbased, r3d
packuswb m0, m2
packuswb m1, m6
mova [tlq+ 0], m0

@ -2294,7 +2294,7 @@ ALIGN function_align
cmp hd, 16
movu xm2, [rsp+49]
vinserti128 m2, [rsp+43], 1
cmovl r5d, hd
cmovs r5d, hd
xor r5d, 15 ; h == 16 ? 5 : 15 - h
movd xm0, r5d
vbroadcasti128 m1, [base+z_filter_s+12]

@ -2501,7 +2501,7 @@ ALIGN function_align
.w8_filter_left_h16:
mov r5d, 10
cmp hd, 16
cmovl r5d, hd
cmovs r5d, hd
xor r5d, 15 ; h == 16 ? 5 : 15 - h
movd xm0, r5d
vpbroadcastb m0, xm0

@ -2742,7 +2742,7 @@ ALIGN function_align
.w16_filter_left_h16:
mov r5d, 10
cmp hd, 16
cmovl r5d, hd
cmovs r5d, hd
xor r5d, 15 ; h == 16 ? 5 : 15 - h
movd xm0, r5d
vpbroadcastb m0, xm0

@ -3115,7 +3115,7 @@ ALIGN function_align
mov r4d, 9
lea tlq, [rsp+15]
cmp wd, 4
cmova maxbased, r4d
cmovne maxbased, r4d
vextracti128 xm1, m0, 1
packuswb xm0, xm1
mova [rsp], xm0

@ -3321,8 +3321,8 @@ ALIGN function_align
sar r5d, 1
lea tlq, [rsp+31]
add r5d, 17
cmp wd, 8
cmova maxbased, r5d
cmp wd, 16
cmovns maxbased, r5d
neg r5
mov [tlq+r5], r4b
vextracti128 xm1, m0, 1

@ -3385,7 +3385,7 @@ ALIGN function_align
sub org_wd, 8
lea r2, [strideq*3]
lea r6, [dstq+org_wq]
cmovg dstq, r6
cmovns dstq, r6
punpcklwd xm1, xm2, xm0
punpckhwd xm2, xm0
lea r6, [dstq+strideq*4]

@ -3493,8 +3493,8 @@ ALIGN function_align
sar r5d, 1
lea tlq, [rsp+63]
add r5d, 33
cmp wd, 16
cmova maxbased, r5d
cmp wd, 32
cmovns maxbased, r5d
neg r5
mov [tlq+r5], r4b
packuswb m0, m1

@ -3563,7 +3563,7 @@ ALIGN function_align
sub org_wd, 8
lea r2, [strideq*3]
lea r6, [dstq+org_wq]
cmovg dstq, r6
cmovns dstq, r6
punpcklbw m1, m2, m0
punpckhbw m2, m0
lea r3, [strideq*5]

@ -3652,7 +3652,7 @@ ALIGN function_align
movu xm11, [tlq-66] ; 56-63
vinserti128 m11, [tlq-52], 1 ; 40-47
sub r4d, wd ; 21-w
cmovg r5d, r4d
cmovns r5d, r4d
movu xm12, [tlq-58] ; 48-55
vinserti128 m12, [tlq-44], 1 ; 32-39
sub r4d, 8 ; 13-w

@ -3721,8 +3721,8 @@ ALIGN function_align
lea tlq, [rsp+95]
mov [tlq-65], r4b
mov r4d, 65
cmp wd, 32
cmova maxbased, r4d
cmp wd, 64
cmove maxbased, r4d
packuswb m0, m2
packuswb m1, m6
mova [tlq-63], m0

@ -4553,7 +4553,7 @@ ALIGN function_align
mov r6d, 0x5556
mov r2d, 0x3334
cmp hd, 32
cmovz r6d, r2d
cmove r6d, r2d
movd xm1, r6d
pmulhuw xm0, xm1
.w8_end:
@ -60,7 +60,6 @@ pw_16384: times 2 dw 16384
pw_1697x16: times 2 dw 1697*16
pw_1697x8: times 2 dw 1697*8
pw_2896x8: times 2 dw 2896*8
pw_5793x4: times 2 dw 5793*4

pd_2048: dd 2048

@ -393,7 +392,7 @@ ALIGN function_align
pmulhrsw m0, [cq]
vpbroadcastd m1, [o(pw_1697x8)]
pmulhrsw m1, m0
paddw m0, m1
paddsw m0, m1
punpcklwd m0, m0
punpckhdq m1, m0, m0
punpckldq m0, m0

@ -405,7 +404,7 @@ ALIGN function_align
vpbroadcastd m2, [o(pw_2896x8)]
packusdw m0, m0
pmulhrsw m1, m0
paddw m0, m1
paddsw m0, m1
pmulhrsw m0, m2
mova m1, m0
jmp m(iadst_4x4_internal).end

@ -561,8 +560,8 @@ cglobal iidentity_4x4_internal, 0, 5, 6, dst, stride, c, eob, tx2
vpbroadcastd m3, [o(pw_1697x8)]
pmulhrsw m2, m3, m0
pmulhrsw m3, m1
paddw m0, m2
paddw m1, m3
paddsw m0, m2
paddsw m1, m3
punpckhwd m2, m0, m1
punpcklwd m0, m1
punpckhwd m1, m0, m2

@ -572,8 +571,8 @@ cglobal iidentity_4x4_internal, 0, 5, 6, dst, stride, c, eob, tx2
vpbroadcastd m3, [o(pw_1697x8)]
pmulhrsw m2, m3, m0
pmulhrsw m3, m1
paddw m0, m2
paddw m1, m3
paddsw m0, m2
paddsw m1, m3
jmp m(iadst_4x4_internal).end

%macro WRITE_4X8 2 ; coefs[1-2]

@ -626,7 +625,7 @@ cglobal iidentity_4x4_internal, 0, 5, 6, dst, stride, c, eob, tx2
punpckldq xm0, xm1
pmulhrsw xm0, xm2
pmulhrsw xm3, xm0
paddw xm0, xm3
paddsw xm0, xm3
pmulhrsw xm0, xm2
pmulhrsw xm0, xm4
vpbroadcastq m0, xm0

@ -907,8 +906,8 @@ cglobal iidentity_4x8_internal, 0, 5, 7, dst, stride, c, eob, tx2
punpckhwd m1, m2
pmulhrsw m2, m4, m0
pmulhrsw m4, m1
paddw m0, m2
paddw m1, m4
paddsw m0, m2
paddsw m1, m4
jmp tx2q
.pass2:
vpbroadcastd m4, [o(pw_4096)]

@ -925,8 +924,8 @@ cglobal iidentity_4x8_internal, 0, 5, 7, dst, stride, c, eob, tx2
vpbroadcastd m3, [o(pw_2048)]
pmulhrsw m0, m1
pmulhrsw m2, m0
paddw m0, m0
paddw m0, m2
paddsw m0, m0
paddsw m0, m2
pmulhrsw m3, m0
punpcklwd m1, m3, m3
punpckhwd m3, m3

@ -941,15 +940,16 @@ cglobal iidentity_4x8_internal, 0, 5, 7, dst, stride, c, eob, tx2
movd xm1, [cq+32*2]
punpcklwd xm1, [cq+32*3]
vpbroadcastd xm2, [o(pw_1697x8)]
vpbroadcastd xm3, [o(pw_16384)]
vpbroadcastd xm4, [o(pw_2896x8)]
vpbroadcastd xm3, [o(pw_2896x8)]
vpbroadcastd xm4, [o(pw_2048)]
punpckldq xm0, xm1
pcmpeqw xm1, xm1
pmulhrsw xm2, xm0
paddw xm0, xm2
pcmpeqw xm1, xm0
pxor xm0, xm1
pavgw xm0, xm2
pmulhrsw xm0, xm3
psrlw xm3, 3 ; pw_2048
pmulhrsw xm0, xm4
pmulhrsw xm0, xm3
vpbroadcastq m0, xm0
mova m1, m0
mova m2, m0

@ -1283,26 +1283,33 @@ cglobal iidentity_4x16_internal, 0, 5, 11, dst, stride, c, eob, tx2
mova m3, [cq+32*0]
mova m2, [cq+32*1]
mova m4, [cq+32*2]
mova m0, [cq+32*3]
vpbroadcastd m5, [o(pw_1697x8)]
mova m5, [cq+32*3]
vpbroadcastd m8, [o(pw_1697x8)]
pcmpeqw m0, m0 ; -1
punpcklwd m1, m3, m2
punpckhwd m3, m2
punpcklwd m2, m4, m0
punpckhwd m4, m0
pmulhrsw m0, m5, m1
pmulhrsw m6, m5, m2
pmulhrsw m7, m5, m3
pmulhrsw m5, m4
paddw m1, m0
paddw m2, m6
paddw m3, m7
paddw m4, m5
vpbroadcastd m5, [o(pw_16384)]
punpcklwd m2, m4, m5
punpckhwd m4, m5
pmulhrsw m5, m8, m1
pmulhrsw m6, m8, m2
pmulhrsw m7, m8, m3
pmulhrsw m8, m4
pcmpeqw m9, m0, m1 ; we want to do a signed avg, but pavgw is
pxor m1, m9 ; unsigned. as long as both signs are equal
pcmpeqw m9, m0, m2 ; it still works, but if the input is -1 the
pxor m2, m9 ; pmulhrsw result will become 0 which causes
pcmpeqw m9, m0, m3 ; pavgw to output -32768 instead of 0 unless
pxor m3, m9 ; we explicitly deal with that case here.
pcmpeqw m0, m4
pxor m4, m0
pavgw m1, m5
pavgw m2, m6
pavgw m3, m7
pavgw m4, m8
punpckldq m0, m1, m2
punpckhdq m1, m2
punpckldq m2, m3, m4
punpckhdq m3, m4
REPX {pmulhrsw x, m5}, m0, m1, m2, m3
jmp tx2q
.pass2:
vpbroadcastd m8, [o(pw_1697x16)]
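The pcmpeqw/pxor/pavgw sequence above computes a rounded signed average with an unsigned instruction. A C model of why it works, assuming 16-bit lanes:

    #include <stdint.h>

    /* pavgw computes (a + b + 1) >> 1 on unsigned 16-bit lanes. When a
     * and b carry the same sign bit, the wrapped unsigned result equals
     * the signed (a + b + 1) >> 1; an input of -1 paired with a 0 result
     * is the one case the asm patches up with pcmpeqw/pxor. */
    static int16_t avg16_same_sign(int16_t a, int16_t b) {
        const uint32_t ua = (uint16_t)a, ub = (uint16_t)b;
        return (int16_t)(uint16_t)((ua + ub + 1) >> 1);
    }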
@ -1311,11 +1318,11 @@ cglobal iidentity_4x16_internal, 0, 5, 11, dst, stride, c, eob, tx2
pmulhrsw m6, m8, m1
pmulhrsw m7, m8, m2
pmulhrsw m8, m3
REPX {paddw x, x}, m0, m1, m2, m3
paddw m0, m4
paddw m1, m6
paddw m2, m7
paddw m3, m8
REPX {paddsw x, x}, m0, m1, m2, m3
paddsw m0, m4
paddsw m1, m6
paddsw m2, m7
paddsw m3, m8
jmp m(iadst_4x16_internal).end2

%macro WRITE_8X4 4-7 strideq*1, strideq*2, r3, ; coefs[1-2], tmp[1-2], off[1-3]

@ -1353,7 +1360,7 @@ cglobal iidentity_4x16_internal, 0, 5, 11, dst, stride, c, eob, tx2
vpbroadcastd xm3, [o(pw_2048)]
pmulhrsw xm1, xm0
pmulhrsw xm2, xm1
paddw xm1, xm2
paddsw xm1, xm2
pmulhrsw xm1, xm3
punpcklwd xm1, xm1
punpckldq xm0, xm1, xm1

@ -1369,7 +1376,7 @@ cglobal iidentity_4x16_internal, 0, 5, 11, dst, stride, c, eob, tx2
vpbroadcastd xm3, [o(pw_2048)]
packusdw xm0, xm1
pmulhrsw xm0, xm2
paddw xm0, xm0
paddsw xm0, xm0
pmulhrsw xm0, xm2
pmulhrsw xm0, xm3
vinserti128 m0, m0, xm0, 1

@ -1447,7 +1454,7 @@ cglobal iadst_8x4_internal, 0, 5, 7, dst, stride, c, eob, tx2
punpckhwd m2, m0, m1
punpcklwd m0, m1
pxor m3, m3
psubw m3, m2
psubsw m3, m2
punpckhwd m1, m0, m3
punpcklwd m0, m3
jmp tx2q

@ -1492,7 +1499,7 @@ cglobal iflipadst_8x4_internal, 0, 5, 7, dst, stride, c, eob, tx2
punpckhwd m1, m3, m2
punpcklwd m3, m2
pxor m0, m0
psubw m0, m1
psubsw m0, m1
punpckhwd m1, m0, m3
punpcklwd m0, m3
jmp tx2q

@ -1520,15 +1527,15 @@ cglobal iidentity_8x4_internal, 0, 5, 7, dst, stride, c, eob, tx2
pmulhrsw m2, m3
punpcklwd m0, m1, m2
punpckhwd m1, m2
paddw m0, m0
paddw m1, m1
paddsw m0, m0
paddsw m1, m1
jmp tx2q
.pass2:
vpbroadcastd m3, [o(pw_1697x8)]
pmulhrsw m2, m3, m0
pmulhrsw m3, m1
paddw m0, m2
paddw m1, m3
paddsw m0, m2
paddsw m1, m3
jmp m(iadst_8x4_internal).end

%macro INV_TXFM_8X8_FN 2-3 -1 ; type1, type2, fast_thresh

@ -1796,8 +1803,8 @@ cglobal iidentity_8x8_internal, 0, 5, 7, dst, stride, c, eob, tx2
pmulhrsw m7, m1
psrlw m1, 3 ; pw_2048
pmulhrsw m2, m7
paddw m7, m7
paddw m7, m2
paddsw m7, m7
paddsw m7, m2
pmulhrsw m7, m1
punpcklwd m5, m7, m7
punpckhwd m7, m7

@ -2120,12 +2127,12 @@ INV_TXFM_8X16_FN identity, identity

%macro IDTX16 3-4 ; src/dst, tmp, pw_1697x16, [pw_16394]
pmulhrsw m%2, m%3, m%1
%if %0 == 4 ; if we're going to downshift by 1 doing so here eliminates the paddw
%if %0 == 4 ; if downshifting by 1
pmulhrsw m%2, m%4
%else
paddw m%1, m%1
paddsw m%1, m%1
%endif
paddw m%1, m%2
paddsw m%1, m%2
%endmacro

cglobal iidentity_8x16_internal, 0, 5, 13, dst, stride, c, eob, tx2

@ -2201,7 +2208,7 @@ cglobal iidentity_8x16_internal, 0, 5, 13, dst, stride, c, eob, tx2
pmulhrsw xm3, xm0
psrlw xm0, 3 ; pw_2048
pmulhrsw xm1, xm3
paddw xm3, xm1
paddsw xm3, xm1
pmulhrsw xm3, xm0
punpcklwd xm3, xm3
punpckldq xm1, xm3, xm3

@ -2228,7 +2235,7 @@ cglobal iidentity_8x16_internal, 0, 5, 13, dst, stride, c, eob, tx2
vpbroadcastd m1, [o(pw_2896x8)]
pmulhrsw m4, m0
pmulhrsw m4, m5
paddw m0, m4
paddsw m0, m4
psrlw m5, 3 ; pw_2048
pmulhrsw m0, m1
pmulhrsw m0, m5

@ -2503,10 +2510,10 @@ cglobal iidentity_16x4_internal, 0, 5, 11, dst, stride, c, eob, tx2
pmulhrsw m6, m7, m3
pmulhrsw m7, m4
REPX {pmulhrsw x, m8}, m0, m5, m6, m7
paddw m1, m0
paddw m2, m5
paddw m3, m6
paddw m4, m7
paddsw m1, m0
paddsw m2, m5
paddsw m3, m6
paddsw m4, m7
punpcklqdq m0, m1, m2
punpckhqdq m1, m2
punpcklqdq m2, m3, m4

@ -2518,10 +2525,10 @@ cglobal iidentity_16x4_internal, 0, 5, 11, dst, stride, c, eob, tx2
pmulhrsw m5, m7, m1
pmulhrsw m6, m7, m2
pmulhrsw m7, m3
paddw m0, m4
paddw m1, m5
paddw m2, m6
paddw m3, m7
paddsw m0, m4
paddsw m1, m5
paddsw m2, m6
paddsw m3, m7
jmp m(iadst_16x4_internal).end

%macro INV_TXFM_16X8_FN 2-3 -1 ; type1, type2, fast_thresh

@ -2581,7 +2588,7 @@ cglobal iidentity_16x4_internal, 0, 5, 11, dst, stride, c, eob, tx2
pmulhrsw m0, m4
pmulhrsw m5, m0
pmulhrsw m5, m2
paddw m0, m5
paddsw m0, m5
psrlw m2, 3 ; pw_2048
pmulhrsw m0, m4
pmulhrsw m0, m2

@ -2903,7 +2910,7 @@ cglobal iidentity_16x8_internal, 0, 5, 13, dst, stride, c, eob, tx2
vpbroadcastd m3, [o(pw_2896x8)]
pmulhrsw m3, [cq]
vpbroadcastd m0, [o(pw_8192)]
vpbroadcastd m1, [o(pw_5793x4)]
vpbroadcastd m1, [o(pw_1697x16)]
vpbroadcastw m4, [o(deint_shuf)] ; pb_0_1
pcmpeqb m5, m5
pxor m6, m6

@ -2911,8 +2918,7 @@ cglobal iidentity_16x8_internal, 0, 5, 13, dst, stride, c, eob, tx2
paddb m5, m5 ; pb_m2
pmulhrsw m3, m0
psrlw m0, 2 ; pw_2048
psllw m3, 2
pmulhrsw m3, m1
IDTX16 3, 1, 1
pmulhrsw m3, m0
mov r3d, 8
.loop:

@ -2954,17 +2960,15 @@ cglobal iidentity_16x8_internal, 0, 5, 13, dst, stride, c, eob, tx2
punpcklwd m1, m3
vpbroadcastd m3, [o(pw_1697x16)]
punpcklwd m2, m4
vpbroadcastd m4, [o(pw_8192)]
vpbroadcastd m4, [o(pw_2896x8)]
punpckldq m1, m2
vpbroadcastd m2, [o(pw_2896x8)]
vpbroadcastd m2, [o(pw_2048)]
punpcklqdq m0, m1
pmulhrsw m3, m0
paddw m0, m0
paddw m0, m3
psraw m3, 1
pavgw m0, m3
pmulhrsw m0, m4
psrlw m4, 2 ; pw_2048
pmulhrsw m0, m2
pmulhrsw m0, m4
mov r3d, 8
jmp m(inv_txfm_add_identity_dct_16x4).end
%endif

@ -3385,6 +3389,12 @@ cglobal iflipadst_16x16_internal, 0, 5, 16, 32*3, dst, stride, c, eob, tx2
WRITE_16X2 7, [rsp+32*2], 0, 1, strideq*2, r3
jmp m(idct_16x16_internal).end3

%macro IDTX16B 3 ; src/dst, tmp, pw_1697x16
pmulhrsw m%2, m%3, m%1
psraw m%2, 1
pavgw m%1, m%2 ; signs are guaranteed to be equal
%endmacro

INV_TXFM_16X16_FN identity, dct, 15
INV_TXFM_16X16_FN identity, identity

@ -3419,22 +3429,17 @@ cglobal iidentity_16x16_internal, 0, 5, 16, 32*3, dst, stride, c, eob, tx2
vinserti128 m13, [cq+16*13], 1
mova xm14, [cq-16* 1]
vinserti128 m14, [cq+16*15], 1
REPX {IDTX16 x, 6, 7}, 0, 15, 1, 8, 2, 9, 3, \
REPX {IDTX16B x, 6, 7}, 0, 15, 1, 8, 2, 9, 3, \
10, 4, 11, 5, 12, 13, 14
mova xm6, [cq-16* 4]
vinserti128 m6, [cq+16*12], 1
mova [rsp], m1
IDTX16 6, 1, 7
mova xm1, [cq-16* 2]
vinserti128 m1, [cq+16*14], 1
pmulhrsw m7, m1
paddw m1, m1
paddw m7, m1
vpbroadcastd m1, [o(pw_8192)]
REPX {pmulhrsw x, m1}, m0, m2, m3, m4, m5, m6, m7, \
m8, m9, m10, m11, m12, m13, m14, m15
pmulhrsw m1, [rsp]
mova [rsp], m0
IDTX16B 6, 0, 7
mova xm0, [cq-16* 2]
vinserti128 m0, [cq+16*14], 1
pmulhrsw m7, m0
psraw m7, 1
pavgw m7, m0
jmp m(idct_16x16_internal).pass1_end3
ALIGN function_align
.pass2:

@ -3447,8 +3452,8 @@ ALIGN function_align
IDTX16 0, 1, 15
mova m1, [rsp+32*0]
pmulhrsw m15, m1
paddw m1, m1
paddw m15, m1
paddsw m1, m1
paddsw m15, m1
jmp m(idct_16x16_internal).end

%define o_base iadst4_dconly2a + 128

@ -3963,7 +3968,7 @@ cglobal inv_txfm_add_identity_identity_8x32, 4, 5, 11, dst, stride, c, eob
vinserti128 m6, m6, [cq+16* 9], 1
vinserti128 m7, m7, [cq+16*13], 1
REPX {mova [cq+32*x], m8}, -4, -2, 0, 2, 4, 6
REPX {paddw x, m9}, m0, m1, m2, m3, m4, m5, m6, m7
REPX {paddsw x, m9}, m0, m1, m2, m3, m4, m5, m6, m7
call .transpose8x8
REPX {psraw x, 3 }, m0, m1, m2, m3, m4, m5, m6, m7
WRITE_8X4 0, 4, 8, 10, strideq*8, strideq*4, r4*4

@ -4572,12 +4577,12 @@ ALIGN function_align
IDCT32_PASS1_END 1, 9, 6, 7
ret

cglobal inv_txfm_add_identity_identity_16x32, 4, 5, 12, dst, stride, c, eob
cglobal inv_txfm_add_identity_identity_16x32, 4, 5, 13, dst, stride, c, eob
%undef cmp
lea rax, [o_base]
vpbroadcastd m9, [o(pw_2896x8)]
vpbroadcastd m10, [o(pw_5793x4)]
vpbroadcastd m11, [o(pw_5)]
vpbroadcastd m10, [o(pw_1697x16)]
vpbroadcastd m12, [o(pw_8192)]
cmp eobd, 43 ; if (eob > 43)
setg r4b ; iteration_count++
cmp eobd, 150 ; if (eob > 150)

@ -4586,6 +4591,7 @@ cglobal inv_txfm_add_identity_identity_16x32, 4, 5, 13, dst, stride, c, eob
adc r4b, al ; iteration_count++
lea r3, [strideq*3]
mov rax, cq
paddw m11, m12, m12 ; pw_16384
.loop:
mova xm0, [cq+64* 0]
mova xm1, [cq+64* 1]

@ -4604,11 +4610,9 @@ cglobal inv_txfm_add_identity_identity_16x32, 4, 5, 13, dst, stride, c, eob
vinserti128 m6, m6, [cq+64*14], 1
vinserti128 m7, m7, [cq+64*15], 1
REPX {pmulhrsw x, m9 }, m0, m1, m2, m3, m4, m5, m6, m7
REPX {psllw x, 2 }, m0, m1, m2, m3, m4, m5, m6, m7
REPX {IDTX16 x, 8, 10, 11}, 0, 1, 2, 3, 4, 5, 6, 7
call m(inv_txfm_add_identity_identity_8x32).transpose8x8
REPX {pmulhrsw x, m10}, m0, m1, m2, m3, m4, m5, m6, m7
REPX {paddw x, m11}, m0, m1, m2, m3, m4, m5, m6, m7
REPX {psraw x, 3 }, m0, m1, m2, m3, m4, m5, m6, m7
REPX {pmulhrsw x, m12}, m0, m1, m2, m3, m4, m5, m6, m7
WRITE_16X2 0, 1, 8, 0, strideq*0, strideq*1
WRITE_16X2 2, 3, 0, 1, strideq*2, r3
lea dstq, [dstq+strideq*4]

@ -4622,13 +4626,13 @@ cglobal inv_txfm_add_identity_identity_16x32, 4, 5, 13, dst, stride, c, eob
pxor m0, m0
mov r0d, 8
cmp cq, rax
jg .zero_loop
ja .zero_loop
.zero_loop_half:
mova [rax+64*0], m0
mova [rax+64*1], m0
mova [rax+64*2], m0
mova [rax+64*3], m0
add rax, 64*4
mova [rax-64*2], m0
mova [rax-64*1], m0
sub r0d, 2
jg .zero_loop_half
RET

@ -4646,7 +4650,7 @@ cglobal inv_txfm_add_identity_identity_32x16, 4, 6, 12, dst, stride, c, eob
%undef cmp
lea rax, [o_base]
vpbroadcastd m9, [o(pw_2896x8)]
vpbroadcastd m10, [o(pw_1697x8)]
vpbroadcastd m10, [o(pw_1697x16)]
vpbroadcastd m11, [o(pw_2048)]
cmp eobd, 35 ; if (eob > 35)
setg r4b ; iteration_count++

@ -4674,24 +4678,9 @@ cglobal inv_txfm_add_identity_identity_32x16, 4, 6, 12, dst, stride, c, eob
vinserti128 m6, m6, [cq+32*14], 1
vinserti128 m7, m7, [cq+32*15], 1
REPX {pmulhrsw x, m9 }, m0, m1, m2, m3, m4, m5, m6, m7
REPX {psllw x, 2 }, m0, m1, m2, m3, m4, m5, m6, m7
REPX {paddsw x, x }, m0, m1, m2, m3, m4, m5, m6, m7
call m(inv_txfm_add_identity_identity_8x32).transpose8x8
pmulhrsw m8, m10, m0
paddw m0, m8
pmulhrsw m8, m10, m1
paddw m1, m8
pmulhrsw m8, m10, m2
paddw m2, m8
pmulhrsw m8, m10, m3
paddw m3, m8
pmulhrsw m8, m10, m4
paddw m4, m8
pmulhrsw m8, m10, m5
paddw m5, m8
pmulhrsw m8, m10, m6
paddw m6, m8
pmulhrsw m8, m10, m7
paddw m7, m8
REPX {IDTX16 x, 8, 10}, 0, 1, 2, 3, 4, 5, 6, 7
REPX {pmulhrsw x, m11}, m0, m1, m2, m3, m4, m5, m6, m7
WRITE_16X2 0, 1, 8, 0, strideq*0, strideq*1
WRITE_16X2 2, 3, 0, 1, strideq*2, r3

@ -4708,20 +4697,17 @@ cglobal inv_txfm_add_identity_identity_32x16, 4, 6, 12, dst, stride, c, eob
lea dstq, [r5+16]
jmp .loop
.ret:
sub cq, 32
sub cd, eax
pxor m0, m0
mov r0d, 4
mov r1d, 8
cmp cq, rax
cmovg r0d, r1d
add cd, 384
.zero_loop:
mova [rax+32*0], m0
mova [rax+32*1], m0
mova [rax+32*2], m0
mova [rax+32*3], m0
add rax, 32*4
dec r0d
jg .zero_loop
sub cd, 128
jge .zero_loop
RET

cglobal inv_txfm_add_dct_dct_32x32, 4, 4, 0, dst, stride, c, eob

@ -4859,7 +4845,7 @@ cglobal inv_txfm_add_dct_dct_32x32, 4, 4, 0, dst, stride, c, eob
call m(inv_txfm_add_dct_dct_16x32).pass2_end
lea tmp3q, [tmp1q-32*32]
cmp tmp2q, tmp3q
jl .ret
jb .ret
sub tmp2q, 32*32
sub dstq, r3
lea r2, [r2+r3+16]
Разница между файлами не показана из-за своего большого размера
Загрузить разницу
|
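A pattern worth noting across these hunks: signed branches on pointer compares (jg, jl, jge) become their unsigned counterparts (ja, jb, jae). A minimal C sketch of the bug class this avoids (function names hypothetical, not from the source):

    #include <stdint.h>

    /* Hypothetical illustration: buffer-end checks must compare addresses
     * as unsigned. With a signed compare, a pointer with the top bit set
     * orders below a small one, so the branch goes the wrong way. */
    int past_end_signed(const char *p, const char *end) {
        return (int64_t)(uintptr_t)p > (int64_t)(uintptr_t)end; /* like jg */
    }
    int past_end_unsigned(const char *p, const char *end) {
        return (uintptr_t)p > (uintptr_t)end;                   /* like ja */
    }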
@@ -347,7 +347,7 @@ cglobal sgr_box3_h, 8, 11, 8, sumsq, sum, left, src, stride, w, h, edge, x, xlim
    punpckhbw xm0, xm1

    ; when we reach this, xm0 contains left two px in highest words
    cmp xq, -16
    cmp xd, -16
    jle .loop_x
.partial_load_and_extend:
    vpbroadcastb m3, [srcq-1]

@@ -396,17 +396,17 @@ cglobal sgr_box3_h, 8, 11, 8, sumsq, sum, left, src, stride, w, h, edge, x, xlim
    ; else if x < xlimd we extend from previous load (this implies have_right=0)
    ; else we are done

    cmp xq, -16
    cmp xd, -16
    jle .loop_x
    test xq, xq
    test xd, xd
    jl .partial_load_and_extend
    cmp xq, xlimq
    cmp xd, xlimd
    jl .right_extend

    add sumsqq, (384+16)*4
    add sumq, (384+16)*2
    add srcq, strideq
    dec hd
    dec hd
    jg .loop_y
    RET

@@ -418,7 +418,7 @@ cglobal sgr_box3_v, 5, 10, 9, sumsq, sum, w, h, edge, x, y, sumsq_ptr, sum_ptr,
    shr ylimd, 2
    sub ylimd, 2 ; -2 if have_bottom=0, else 0
.loop_x:
    lea yd, [hd+ylimd+2]
    lea yd, [hq+ylimq+2]
    lea sumsq_ptrq, [sumsqq+xq*4+4-(384+16)*4]
    lea sum_ptrq, [sumq+xq*2+2-(384+16)*2]
    test edged, 4 ; have_top

@@ -720,9 +720,9 @@ cglobal sgr_box5_h, 8, 11, 10, sumsq, sum, left, src, stride, w, h, edge, x, xlim
    punpckhbw xm0, xm1

    ; when we reach this, xm0 contains left two px in highest words
    cmp xq, -16
    cmp xd, -16
    jle .loop_x
    test xq, xq
    test xd, xd
    jge .right_extend
.partial_load_and_extend:
    vpbroadcastb m3, [srcq-1]

@@ -781,11 +781,11 @@ cglobal sgr_box5_h, 8, 11, 10, sumsq, sum, left, src, stride, w, h, edge, x, xlim
    ; else if x < xlimd we extend from previous load (this implies have_right=0)
    ; else we are done

    cmp xq, -16
    cmp xd, -16
    jle .loop_x
    test xq, xq
    test xd, xd
    jl .partial_load_and_extend
    cmp xq, xlimq
    cmp xd, xlimd
    jl .right_extend

    add sumsqq, (384+16)*4

@@ -803,7 +803,7 @@ cglobal sgr_box5_v, 5, 10, 15, sumsq, sum, w, h, edge, x, y, sumsq_ptr, sum_ptr,
    shr ylimd, 2
    sub ylimd, 3 ; -3 if have_bottom=0, else -1
.loop_x:
    lea yd, [hd+ylimd+2]
    lea yd, [hq+ylimq+2]
    lea sumsq_ptrq, [sumsqq+xq*4+4-(384+16)*4]
    lea sum_ptrq, [sumq+xq*2+2-(384+16)*2]
    test edged, 4 ; have_top


@@ -725,7 +725,7 @@ cglobal sgr_box3_h, 6, 7, 8, sumsq, sum, left, src, stride, x, h, edge, w, xlim
    punpckhbw xm0, xm1

    ; when we reach this, m0 contains left two px in highest words
    cmp xq, -8
    cmp xd, -8
    jle .loop_x
.partial_load_and_extend:
    movd m3, [srcq-4]

@@ -1299,9 +1299,9 @@ cglobal sgr_box5_h, 7, 7, 8, sumsq, sum, left, src, xlim, x, h, edge
    punpckhbw m0, m1

    ; when we reach this, m0 contains left two px in highest words
    cmp xq, -8
    cmp xd, -8
    jle .loop_x
    test xq, xq
    test xd, xd
    jge .right_extend
.partial_load_and_extend:
    XCHG_PIC_REG

@@ -1394,11 +1394,11 @@ cglobal sgr_box5_h, 7, 7, 8, sumsq, sum, left, src, xlim, x, h, edge
    ; else if x < xlimd we extend from previous load (this implies have_right=0)
    ; else we are done

    cmp xq, -8
    cmp xd, -8
    jle .loop_x
    test xq, xq
    test xd, xd
    jl .partial_load_and_extend
    cmp xq, xlimq
    cmp xd, xlimd
    jl .right_extend

    add sumsqq, (384+16)*4

(File diff suppressed because it is too large)
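The "; else if x < xlimd ..." comments describe a three-way row tail; a rough C rendering of that control flow, under the assumption that x counts upward toward 0 in vector-width steps (helper names hypothetical):

    #define VEC 16  /* load width in pixels: 16 for AVX2, 8 for SSSE3 */
    static void load_full(int x), load_partial(int x), extend_right(int x); /* stubs */

    static void box_row_tail(int x, const int xlim) {
        for (;;) {
            if (x <= -VEC)     load_full(x);     /* jle .loop_x                 */
            else if (x < 0)    load_partial(x);  /* jl .partial_load_and_extend */
            else if (x < xlim) extend_right(x);  /* jl .right_extend            */
            else break;                          /* row done                    */
            x += VEC;
        }
    }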
@@ -49,36 +49,52 @@ decl_mc_fn(dav1d_put_8tap_sharp_smooth_ssse3);
decl_mc_fn(dav1d_put_bilin_avx2);
decl_mc_fn(dav1d_put_bilin_ssse3);

decl_mct_fn(dav1d_prep_8tap_regular_avx512icl);
decl_mct_fn(dav1d_prep_8tap_regular_avx2);
decl_mct_fn(dav1d_prep_8tap_regular_ssse3);
decl_mct_fn(dav1d_prep_8tap_regular_smooth_avx512icl);
decl_mct_fn(dav1d_prep_8tap_regular_smooth_avx2);
decl_mct_fn(dav1d_prep_8tap_regular_smooth_ssse3);
decl_mct_fn(dav1d_prep_8tap_regular_sharp_avx512icl);
decl_mct_fn(dav1d_prep_8tap_regular_sharp_avx2);
decl_mct_fn(dav1d_prep_8tap_regular_sharp_ssse3);
decl_mct_fn(dav1d_prep_8tap_smooth_avx512icl);
decl_mct_fn(dav1d_prep_8tap_smooth_avx2);
decl_mct_fn(dav1d_prep_8tap_smooth_ssse3);
decl_mct_fn(dav1d_prep_8tap_smooth_regular_avx512icl);
decl_mct_fn(dav1d_prep_8tap_smooth_regular_avx2);
decl_mct_fn(dav1d_prep_8tap_smooth_regular_ssse3);
decl_mct_fn(dav1d_prep_8tap_smooth_sharp_avx512icl);
decl_mct_fn(dav1d_prep_8tap_smooth_sharp_avx2);
decl_mct_fn(dav1d_prep_8tap_smooth_sharp_ssse3);
decl_mct_fn(dav1d_prep_8tap_sharp_avx512icl);
decl_mct_fn(dav1d_prep_8tap_sharp_avx2);
decl_mct_fn(dav1d_prep_8tap_sharp_ssse3);
decl_mct_fn(dav1d_prep_8tap_sharp_regular_avx512icl);
decl_mct_fn(dav1d_prep_8tap_sharp_regular_avx2);
decl_mct_fn(dav1d_prep_8tap_sharp_regular_ssse3);
decl_mct_fn(dav1d_prep_8tap_sharp_smooth_avx512icl);
decl_mct_fn(dav1d_prep_8tap_sharp_smooth_avx2);
decl_mct_fn(dav1d_prep_8tap_sharp_smooth_ssse3);
decl_mct_fn(dav1d_prep_bilin_avx512icl);
decl_mct_fn(dav1d_prep_bilin_avx2);
decl_mct_fn(dav1d_prep_bilin_ssse3);

decl_avg_fn(dav1d_avg_avx512icl);
decl_avg_fn(dav1d_avg_avx2);
decl_avg_fn(dav1d_avg_ssse3);
decl_w_avg_fn(dav1d_w_avg_avx512icl);
decl_w_avg_fn(dav1d_w_avg_avx2);
decl_w_avg_fn(dav1d_w_avg_ssse3);
decl_mask_fn(dav1d_mask_avx512icl);
decl_mask_fn(dav1d_mask_avx2);
decl_mask_fn(dav1d_mask_ssse3);
decl_w_mask_fn(dav1d_w_mask_420_avx512icl);
decl_w_mask_fn(dav1d_w_mask_420_avx2);
decl_w_mask_fn(dav1d_w_mask_420_ssse3);
decl_w_mask_fn(dav1d_w_mask_422_avx512icl);
decl_w_mask_fn(dav1d_w_mask_422_avx2);
decl_w_mask_fn(dav1d_w_mask_444_avx512icl);
decl_w_mask_fn(dav1d_w_mask_444_avx2);
decl_blend_fn(dav1d_blend_avx2);
decl_blend_fn(dav1d_blend_ssse3);

@@ -162,10 +178,11 @@ COLD void bitfn(dav1d_mc_dsp_init_x86)(Dav1dMCDSPContext *const c) {
    c->warp8x8t = dav1d_warp_affine_8x8t_sse4;
#endif

#if ARCH_X86_64
    if (!(flags & DAV1D_X86_CPU_FLAG_AVX2))
        return;

#if BITDEPTH == 8 && ARCH_X86_64
#if BITDEPTH == 8
    init_mc_fn (FILTER_2D_8TAP_REGULAR, 8tap_regular, avx2);
    init_mc_fn (FILTER_2D_8TAP_REGULAR_SMOOTH, 8tap_regular_smooth, avx2);
    init_mc_fn (FILTER_2D_8TAP_REGULAR_SHARP, 8tap_regular_sharp, avx2);

@@ -202,5 +219,29 @@ COLD void bitfn(dav1d_mc_dsp_init_x86)(Dav1dMCDSPContext *const c) {
    c->warp8x8t = dav1d_warp_affine_8x8t_avx2;

    c->emu_edge = dav1d_emu_edge_avx2;
#endif

    if (!(flags & DAV1D_X86_CPU_FLAG_AVX512ICL))
        return;

#if BITDEPTH == 8
    init_mct_fn(FILTER_2D_8TAP_REGULAR, 8tap_regular, avx512icl);
    init_mct_fn(FILTER_2D_8TAP_REGULAR_SMOOTH, 8tap_regular_smooth, avx512icl);
    init_mct_fn(FILTER_2D_8TAP_REGULAR_SHARP, 8tap_regular_sharp, avx512icl);
    init_mct_fn(FILTER_2D_8TAP_SMOOTH_REGULAR, 8tap_smooth_regular, avx512icl);
    init_mct_fn(FILTER_2D_8TAP_SMOOTH, 8tap_smooth, avx512icl);
    init_mct_fn(FILTER_2D_8TAP_SMOOTH_SHARP, 8tap_smooth_sharp, avx512icl);
    init_mct_fn(FILTER_2D_8TAP_SHARP_REGULAR, 8tap_sharp_regular, avx512icl);
    init_mct_fn(FILTER_2D_8TAP_SHARP_SMOOTH, 8tap_sharp_smooth, avx512icl);
    init_mct_fn(FILTER_2D_8TAP_SHARP, 8tap_sharp, avx512icl);
    init_mct_fn(FILTER_2D_BILINEAR, bilin, avx512icl);

    c->avg = dav1d_avg_avx512icl;
    c->w_avg = dav1d_w_avg_avx512icl;
    c->mask = dav1d_mask_avx512icl;
    c->w_mask[0] = dav1d_w_mask_444_avx512icl;
    c->w_mask[1] = dav1d_w_mask_422_avx512icl;
    c->w_mask[2] = dav1d_w_mask_420_avx512icl;
#endif
#endif
}
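The init routine follows dav1d's usual tiered pattern: each CPU-flag check either returns or falls through to the next ISA level, so a later tier overwrites the pointers installed by an earlier one. A condensed sketch under that reading (function name hypothetical, bodies elided):

    /* Sketch only; "..." stands for the init_mc_fn/init_mct_fn blocks above. */
    void mc_dsp_init_x86_sketch(Dav1dMCDSPContext *const c, const unsigned flags) {
        (void)c;
        if (!(flags & DAV1D_X86_CPU_FLAG_SSSE3)) return;
        /* ... install ssse3 entry points ... */
        if (!(flags & DAV1D_X86_CPU_FLAG_AVX2)) return;
        /* ... avx2 entry points overwrite the ssse3 ones ... */
        if (!(flags & DAV1D_X86_CPU_FLAG_AVX512ICL)) return;
        /* ... avx512icl entry points win when the CPU supports them ... */
    }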
@@ -1425,7 +1425,7 @@ cglobal put_8tap, 1, 9, 0, dst, ds, src, ss, w, h, mx, my, ss3
    jmp wq
.h_w2:
%if ARCH_X86_32
    and mxd, 0xff
    and mxd, 0x7f
%else
    movzx mxd, mxb
%endif

@@ -1455,7 +1455,7 @@ cglobal put_8tap, 1, 9, 0, dst, ds, src, ss, w, h, mx, my, ss3
    RET
.h_w4:
%if ARCH_X86_32
    and mxd, 0xff
    and mxd, 0x7f
%else
    movzx mxd, mxb
%endif

@@ -1564,16 +1564,16 @@ cglobal put_8tap, 1, 9, 0, dst, ds, src, ss, w, h, mx, my, ss3
%if ARCH_X86_32
    movzx mxd, ssb
    shr ssd, 16
    cmp hd, 4
    cmovle ssd, mxd
    cmp hd, 6
    cmovs ssd, mxd
    lea ssq, [base_reg+ssq*8+subpel_filters-put_ssse3]
%else
%assign stack_offset org_stack_offset
    WIN64_SPILL_XMM 16
    movzx mxd, myb
    shr myd, 16
    cmp hd, 4
    cmovle myd, mxd
    cmp hd, 6
    cmovs myd, mxd
    lea myq, [base_reg+myq*8+subpel_filters-put_ssse3]
%endif
    tzcnt r6d, wd

@@ -1850,14 +1850,18 @@ cglobal put_8tap, 1, 9, 0, dst, ds, src, ss, w, h, mx, my, ss3
%assign stack_offset org_stack_offset
    cmp wd, 4
    jg .hv_w8
    and mxd, 0xff
%if ARCH_X86_32
    and mxd, 0x7f
%else
    movzx mxd, mxb
%endif
    dec srcq
    movd m1, [base_reg+mxq*8+subpel_filters-put_ssse3+2]
%if ARCH_X86_32
    movzx mxd, ssb
    shr ssd, 16
    cmp hd, 4
    cmovle ssd, mxd
    cmp hd, 6
    cmovs ssd, mxd
    movq m0, [base_reg+ssq*8+subpel_filters-put_ssse3]
    W32_RESTORE_SSQ
    lea r6, [ssq*3]

@@ -1886,8 +1890,8 @@ cglobal put_8tap, 1, 9, 0, dst, ds, src, ss, w, h, mx, my, ss3
%else
    movzx mxd, myb
    shr myd, 16
    cmp hd, 4
    cmovle myd, mxd
    cmp hd, 6
    cmovs myd, mxd
    movq m0, [base_reg+myq*8+subpel_filters-put_ssse3]
    ALLOC_STACK mmsize*14, 14
    lea ss3q, [ssq*3]

@@ -2202,8 +2206,8 @@ cglobal put_8tap, 1, 9, 0, dst, ds, src, ss, w, h, mx, my, ss3
    movq m1, [base_reg+mxq*8+subpel_filters-put_ssse3]
    movzx mxd, ssb
    shr ssd, 16
    cmp hd, 4
    cmovle ssd, mxd
    cmp hd, 6
    cmovs ssd, mxd
    movq m5, [base_reg+ssq*8+subpel_filters-put_ssse3]
    mov ssq, ssmp
    ALLOC_STACK -mmsize*13

@@ -2243,8 +2247,8 @@ cglobal put_8tap, 1, 9, 0, dst, ds, src, ss, w, h, mx, my, ss3
    movq m0, [base_reg+mxq*8+subpel_filters-put_ssse3]
    movzx mxd, myb
    shr myd, 16
    cmp hd, 4
    cmovle myd, mxd
    cmp hd, 6
    cmovs myd, mxd
    movq m1, [base_reg+myq*8+subpel_filters-put_ssse3]
    pshufd subpelh0, m0, q0000
    pshufd subpelh1, m0, q1111

@@ -2511,7 +2515,7 @@ cglobal prep_8tap, 1, 9, 0, tmp, src, stride, w, h, mx, my, stride3
    jmp wq
.h_w4:
%if ARCH_X86_32
    and mxd, 0xff
    and mxd, 0x7f
%else
    movzx mxd, mxb
%endif

@@ -2635,15 +2639,15 @@ cglobal prep_8tap, 1, 9, 0, tmp, src, stride, w, h, mx, my, stride3
.v:
%if ARCH_X86_32
    mov mxd, myd
    and mxd, 0xff
    and mxd, 0x7f
%else
%assign stack_offset org_stack_offset
    WIN64_SPILL_XMM 16
    movzx mxd, myb
%endif
    shr myd, 16
    cmp hd, 4
    cmovle myd, mxd
    cmp hd, 6
    cmovs myd, mxd
    lea myq, [base_reg+myq*8+subpel_filters-prep_ssse3]
    mova m2, [base+pw_512]
    psrlw m2, m2, 1 ; 0x0100

@@ -2849,14 +2853,14 @@ cglobal prep_8tap, 1, 9, 0, tmp, src, stride, w, h, mx, my, stride3
%assign stack_offset org_stack_offset
    cmp wd, 4
    jg .hv_w8
    and mxd, 0xff
    and mxd, 0x7f
    movd m1, [base_reg+mxq*8+subpel_filters-prep_ssse3+2]
%if ARCH_X86_32
    mov mxd, myd
    and mxd, 0xff
    shr myd, 16
    cmp hd, 4
    cmovle myd, mxd
    and mxd, 0x7f
    cmp hd, 6
    cmovs myd, mxd
    movq m0, [base_reg+myq*8+subpel_filters-prep_ssse3]
    mov r5, r2; use as new base
%define base_reg r5

@@ -2885,8 +2889,8 @@ cglobal prep_8tap, 1, 9, 0, tmp, src, stride, w, h, mx, my, stride3
%else
    movzx mxd, myb
    shr myd, 16
    cmp hd, 4
    cmovle myd, mxd
    cmp hd, 6
    cmovs myd, mxd
    movq m0, [base_reg+myq*8+subpel_filters-prep_ssse3]
    ALLOC_STACK mmsize*14, 14
    lea stride3q, [strideq*3]

@@ -3101,11 +3105,11 @@ cglobal prep_8tap, 1, 9, 0, tmp, src, stride, w, h, mx, my, stride3
%define accuv0 [rsp+mmsize*11]
%define accuv1 [rsp+mmsize*12]
    movq m1, [base_reg+mxq*8+subpel_filters-prep_ssse3]
    movzx mxd, myw
    and mxd, 0xff
    mov mxd, myd
    shr myd, 16
    cmp hd, 4
    cmovle myd, mxd
    and mxd, 0x7f
    cmp hd, 6
    cmovs myd, mxd
    movq m5, [base_reg+myq*8+subpel_filters-prep_ssse3]
    ALLOC_STACK -mmsize*13
%if STACK_ALIGNMENT < mmsize

@@ -3150,8 +3154,8 @@ cglobal prep_8tap, 1, 9, 0, tmp, src, stride, w, h, mx, my, stride3
    movq m0, [base_reg+mxq*8+subpel_filters-prep_ssse3]
    movzx mxd, myb
    shr myd, 16
    cmp hd, 4
    cmovle myd, mxd
    cmp hd, 6
    cmovs myd, mxd
    movq m1, [base_reg+myq*8+subpel_filters-prep_ssse3]
    pshufd subpelh0, m0, q0000
    pshufd subpelh1, m0, q1111

@@ -4743,9 +4747,9 @@ cglobal emu_edge, 10, 13, 2, bw, bh, iw, ih, x, \
    xor reg_zero, reg_zero
    lea reg_tmp, [ihq-1]
    cmp yq, ihq
    cmovl reg_tmp, yq
    cmovs reg_tmp, yq
    test yq, yq
    cmovl reg_tmp, reg_zero
    cmovs reg_tmp, reg_zero
%if ARCH_X86_64
    imul reg_tmp, sstrideq
    add srcq, reg_tmp

@@ -4758,9 +4762,9 @@ cglobal emu_edge, 10, 13, 2, bw, bh, iw, ih, x, \
    ; ref += iclip(x, 0, iw - 1)
    lea reg_tmp, [iwq-1]
    cmp xq, iwq
    cmovl reg_tmp, xq
    cmovs reg_tmp, xq
    test xq, xq
    cmovl reg_tmp, reg_zero
    cmovs reg_tmp, reg_zero
    add reg_src, reg_tmp
%if ARCH_X86_32
    mov srcm, reg_src

@@ -4773,7 +4777,7 @@ cglobal emu_edge, 10, 13, 2, bw, bh, iw, ih, x, \
    lea reg_bottomext, [yq+bhq]
    sub reg_bottomext, ihq
    lea r3, [bhq-1]
    cmovl reg_bottomext, reg_zero
    cmovs reg_bottomext, reg_zero
    ;

    DEFINE_ARGS bw, bh, iw, ih, x, \

@@ -4782,9 +4786,9 @@ cglobal emu_edge, 10, 13, 2, bw, bh, iw, ih, x, \

    ; top_ext = iclip(-y, 0, bh - 1)
    neg topextq
    cmovl topextq, reg_zero
    cmovs topextq, reg_zero
    cmp reg_bottomext, bhq
    cmovge reg_bottomext, r3
    cmovns reg_bottomext, r3
    cmp topextq, bhq
    cmovg topextq, r3
%if ARCH_X86_32

@@ -4796,7 +4800,7 @@ cglobal emu_edge, 10, 13, 2, bw, bh, iw, ih, x, \
    lea reg_rightext, [xq+bwq]
    sub reg_rightext, iwq
    lea r2, [bwq-1]
    cmovl reg_rightext, reg_zero
    cmovs reg_rightext, reg_zero

    DEFINE_ARGS bw, bh, iw, ih, leftext, \
                topext, dst, dstride, src, sstride, \

@@ -4804,14 +4808,14 @@ cglobal emu_edge, 10, 13, 2, bw, bh, iw, ih, x, \

    ; left_ext = iclip(-x, 0, bw - 1)
    neg leftextq
    cmovl leftextq, reg_zero
    cmovs leftextq, reg_zero
    cmp reg_rightext, bwq
    cmovge reg_rightext, r2
    cmovns reg_rightext, r2
%if ARCH_X86_32
    mov r3m, r1
%endif
    cmp leftextq, bwq
    cmovge leftextq, r2
    cmovns leftextq, r2

%undef reg_zero
%undef reg_tmp
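Two scalar idioms recur above: "and mxd, 0x7f" masks the packed filter-index byte down to 7 bits, and the cmp/cmov pairs are branchless selects of the short-block vertical filter. A sketch of the condition-code change (names hypothetical):

    /* Old: h <= 4 (cmp hd, 4; cmovle). New: h - 6 < 0 (cmp hd, 6; cmovs).
     * Both pick the short filter for h <= 4 and the full one for h >= 8,
     * so they should agree for the block heights these paths handle. */
    static int select_vfilter(const int h, const int idx_small, const int idx_full) {
        return h < 6 ? idx_small : idx_full;
    }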
@@ -67,7 +67,7 @@ struc msac
    .update_cdf: resd 1
endstruc

%define m(x) mangle(private_prefix %+ _ %+ x %+ SUFFIX)
%define m(x, y) mangle(private_prefix %+ _ %+ x %+ y)

SECTION .text

@@ -167,7 +167,7 @@ cglobal msac_decode_symbol_adapt4, 0, 6, 6
%endif
    lea t5, [t2+gprsize]
    cmp t5, rcx
    jg .refill_eob
    ja .refill_eob
    mov t2, [t2]
    lea ecx, [t1+23]
    add t1d, 16

@@ -195,7 +195,7 @@ cglobal msac_decode_symbol_adapt4, 0, 6, 6
    sub ecx, t1d ; c
.refill_eob_loop:
    cmp t2, t5
    jge .refill_eob_end ; eob reached
    jae .refill_eob_end ; eob reached
    movzx t1d, byte [t2]
    inc t2
    shl t1, cl

@@ -240,7 +240,7 @@ cglobal msac_decode_symbol_adapt8, 0, 6, 6
    pcmpeqw m1, m2
    pmovmskb eax, m1
    test t3d, t3d
    jz m(msac_decode_symbol_adapt4).renorm
    jz m(msac_decode_symbol_adapt4, SUFFIX).renorm
    movzx t3d, word [t1+t4*2]
    pcmpeqw m2, m2
    mov t2d, t3d

@@ -257,7 +257,7 @@ cglobal msac_decode_symbol_adapt8, 0, 6, 6
    paddw m0, m2
    mova [t1], m0
    mov [t1+t4*2], t2w
    jmp m(msac_decode_symbol_adapt4).renorm
    jmp m(msac_decode_symbol_adapt4, SUFFIX).renorm

cglobal msac_decode_symbol_adapt16, 0, 6, 6
    DECODE_SYMBOL_ADAPT_INIT

@@ -330,7 +330,7 @@ cglobal msac_decode_symbol_adapt16, 0, 6, 6
%if WIN64
    add rsp, 48
%endif
    jmp m(msac_decode_symbol_adapt4).renorm2
    jmp m(msac_decode_symbol_adapt4, SUFFIX).renorm2

cglobal msac_decode_bool_adapt, 0, 6, 0
    movifnidn t1, r1mp

@@ -366,7 +366,7 @@ cglobal msac_decode_bool_adapt, 0, 6, 0
%endif
    not t4
    test t3d, t3d
    jz m(msac_decode_symbol_adapt4).renorm3
    jz m(msac_decode_symbol_adapt4, SUFFIX).renorm3
%if UNIX64 == 0
    push t6
%endif

@@ -390,13 +390,13 @@ cglobal msac_decode_bool_adapt, 0, 6, 0
%if WIN64
    mov t1d, [t7+msac.cnt]
    pop t6
    jmp m(msac_decode_symbol_adapt4).renorm4
    jmp m(msac_decode_symbol_adapt4, SUFFIX).renorm4
%else
%if ARCH_X86_64 == 0
    pop t5
    pop t6
%endif
    jmp m(msac_decode_symbol_adapt4).renorm3
    jmp m(msac_decode_symbol_adapt4, SUFFIX).renorm3
%endif

cglobal msac_decode_bool_equi, 0, 6, 0

@@ -418,7 +418,7 @@ cglobal msac_decode_bool_equi, 0, 6, 0
%if ARCH_X86_64 == 0
    movzx eax, al
%endif
    jmp m(msac_decode_symbol_adapt4).renorm3
    jmp m(msac_decode_symbol_adapt4, SUFFIX).renorm3

cglobal msac_decode_bool, 0, 6, 0
    movifnidn t0, r0mp

@@ -442,7 +442,7 @@ cglobal msac_decode_bool, 0, 6, 0
%if ARCH_X86_64 == 0
    movzx eax, al
%endif
    jmp m(msac_decode_symbol_adapt4).renorm3
    jmp m(msac_decode_symbol_adapt4, SUFFIX).renorm3

%macro HI_TOK 1 ; update_cdf
%if ARCH_X86_64 == 0

@@ -598,3 +598,71 @@ cglobal msac_decode_hi_tok, 0, 7 + ARCH_X86_64, 6
    HI_TOK 1
.no_update_cdf:
    HI_TOK 0

%if ARCH_X86_64
INIT_YMM avx2
cglobal msac_decode_symbol_adapt16, 3, 6, 6
    lea rax, [pw_0xff00]
    vpbroadcastw m2, [t0+msac.rng]
    mova m0, [t1]
    vpbroadcastw m3, [t0+msac.dif+6]
    vbroadcasti128 m4, [rax]
    mov t3d, [t0+msac.update_cdf]
    mov t4d, t2d
    not t2
%if STACK_ALIGNMENT < 32
    mov r5, rsp
%if WIN64
    and rsp, ~31
    sub rsp, 40
%else
    and r5, ~31
%define buf r5-32
%endif
%elif WIN64
    sub rsp, 64
%else
%define buf rsp-56
%endif
    psrlw m1, m0, 6
    movd [buf-4], xm2
    pand m2, m4
    psllw m1, 7
    pmulhuw m1, m2
    paddw m1, [rax+t2*2]
    mova [buf], m1
    pmaxuw m1, m3
    pcmpeqw m1, m3
    pmovmskb eax, m1
    test t3d, t3d
    jz .renorm
    movzx t3d, word [t1+t4*2]
    pcmpeqw m2, m2
    lea t2d, [t3+80]
    shr t2d, 4
    cmp t3d, 32
    adc t3d, 0
    movd xm3, t2d
    pavgw m2, m1
    psubw m2, m0
    psubw m0, m1
    psraw m2, xm3
    paddw m0, m2
    mova [t1], m0
    mov [t1+t4*2], t3w
.renorm:
    tzcnt eax, eax
    mov t4, [t0+msac.dif]
    movzx t1d, word [buf+rax-0]
    movzx t2d, word [buf+rax-2]
    shr eax, 1
%if WIN64
%if STACK_ALIGNMENT < 32
    mov rsp, r5
%else
    add rsp, 64
%endif
%endif
    vzeroupper
    jmp m(msac_decode_symbol_adapt4, _sse2).renorm2
%endif
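In the new AVX2 path, "lea t2d, [t3+80] / shr t2d, 4" computes the adaptation rate ((count >> 4) + 5) and "cmp t3d, 32 / adc t3d, 0" bumps the count while saturating at 32. A rough scalar rendering of the rule the pavgw/psubw/psraw sequence vectorizes, following the AV1 CDF-adaptation scheme (my reconstruction, not copied from the source):

    #include <stdint.h>

    static void update_cdf_sketch(uint16_t *const cdf, const unsigned val,
                                  const unsigned n_symbols) {
        const unsigned count = cdf[n_symbols];
        const unsigned rate = (count >> 4) + 5;     /* lea t2d, [t3+80]; shr t2d, 4 */
        for (unsigned i = 0; i < n_symbols - 1; i++)
            if (i < val)
                cdf[i] += (32768 - cdf[i]) >> rate; /* raise symbols below val */
            else
                cdf[i] -= cdf[i] >> rate;           /* lower the rest */
        cdf[n_symbols] = count + (count < 32);      /* cmp t3d, 32; adc t3d, 0 */
    }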
@@ -39,10 +39,13 @@ unsigned dav1d_msac_decode_bool_equi_sse2(MsacContext *s);
unsigned dav1d_msac_decode_bool_sse2(MsacContext *s, unsigned f);
unsigned dav1d_msac_decode_hi_tok_sse2(MsacContext *s, uint16_t *cdf);

/* Needed for checkasm */
unsigned dav1d_msac_decode_symbol_adapt16_avx2(MsacContext *s, uint16_t *cdf,
                                               size_t n_symbols);

#if ARCH_X86_64 || defined(__SSE2__) || (defined(_M_IX86_FP) && _M_IX86_FP >= 2)
#define dav1d_msac_decode_symbol_adapt4 dav1d_msac_decode_symbol_adapt4_sse2
#define dav1d_msac_decode_symbol_adapt8 dav1d_msac_decode_symbol_adapt8_sse2
#define dav1d_msac_decode_symbol_adapt16 dav1d_msac_decode_symbol_adapt16_sse2
#define dav1d_msac_decode_hi_tok dav1d_msac_decode_hi_tok_sse2
#endif

@@ -50,4 +53,12 @@ unsigned dav1d_msac_decode_hi_tok_sse2(MsacContext *s, uint16_t *cdf);
#define dav1d_msac_decode_bool_equi dav1d_msac_decode_bool_equi_sse2
#define dav1d_msac_decode_bool dav1d_msac_decode_bool_sse2

#if ARCH_X86_64
#define dav1d_msac_decode_symbol_adapt16(ctx, cdf, symb) ((ctx)->symbol_adapt16(ctx, cdf, symb))
#elif defined(__SSE2__) || (defined(_M_IX86_FP) && _M_IX86_FP >= 2)
#define dav1d_msac_decode_symbol_adapt16 dav1d_msac_decode_symbol_adapt16_sse2
#endif

void dav1d_msac_init_x86(MsacContext *const s);

#endif /* DAV1D_SRC_X86_MSAC_H */
@@ -0,0 +1,42 @@
/*
 * Copyright © 2020, VideoLAN and dav1d authors
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice, this
 *    list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 *    this list of conditions and the following disclaimer in the documentation
 *    and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include "src/cpu.h"
#include "src/msac.h"
#include "src/x86/msac.h"

void dav1d_msac_init_x86(MsacContext *const s) {
    const unsigned flags = dav1d_get_cpu_flags();

    if (flags & DAV1D_X86_CPU_FLAG_SSE2) {
        s->symbol_adapt16 = dav1d_msac_decode_symbol_adapt16_sse2;
    }

    if (flags & DAV1D_X86_CPU_FLAG_AVX2) {
        s->symbol_adapt16 = dav1d_msac_decode_symbol_adapt16_avx2;
    }
}
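Because the SSE2-vs-AVX2 choice for symbol_adapt16 is only known at runtime, the context carries a function pointer and the public name becomes a macro that calls through it (see msac.h above). A hedged sketch of how a caller sees this, assuming the generic msac init has already invoked the x86 hook:

    unsigned decode_one(MsacContext *const s, uint16_t *const cdf,
                        const size_t n_symbols) {
        /* on x86-64 this expands to s->symbol_adapt16(s, cdf, n_symbols) */
        return dav1d_msac_decode_symbol_adapt16(s, cdf, n_symbols);
    }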
@@ -35,72 +35,82 @@
#include "src/levels.h"
#include "src/cdef.h"

static void init_tmp(pixel *buf, int n, const int bitdepth_max) {
    while (n--)
        *buf++ = rnd() & bitdepth_max;
static int to_binary(int x) { /* 0-15 -> 0000-1111 */
    return (x & 1) + 5 * (x & 2) + 25 * (x & 4) + 125 * (x & 8);
}

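to_binary() renders a 4-bit edge mask as its binary digits in decimal, so the %04d in the failure message below prints e.g. edges = 5 as 0101. A worked check (my arithmetic, not from the source): for x = 13 = 0b1101,

    (13 & 1) + 5 * (13 & 2) + 25 * (13 & 4) + 125 * (13 & 8)
        = 1 + 5*0 + 25*4 + 125*8 = 1 + 0 + 100 + 1000 = 1101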
static void check_cdef_filter(const cdef_fn fn, const int w, const int h,
                              const char *const name)
{
    ALIGN_STK_32(pixel, src, 10 * 16 + 8, );
    ALIGN_STK_32(pixel, c_src, 10 * 16 + 8, ), *const c_src_ptr = c_src + 8;
    ALIGN_STK_32(pixel, a_src, 10 * 16 + 8, ), *const a_src_ptr = a_src + 8;
    ALIGN_STK_32(pixel, top, 16 * 2 + 8, ), *const top_ptr = top + 8;
    pixel left[8][2];
static void init_tmp(pixel *buf, int n, const int bitdepth_max) {
    const int fill_type = rnd() & 7;
    if (fill_type == 0)
        while (n--) /* check for cdef_filter underflows */
            *buf++ = rnd() & 1;
    else if (fill_type == 1)
        while (n--) /* check for cdef_filter overflows */
            *buf++ = bitdepth_max - (rnd() & 1);
    else
        while (n--)
            *buf++ = rnd() & bitdepth_max;
}

static void check_cdef_filter(const cdef_fn fn, const int w, const int h) {
    ALIGN_STK_64(pixel, c_src, 16 * 10 + 16, ), *const c_dst = c_src + 8;
    ALIGN_STK_64(pixel, a_src, 16 * 10 + 16, ), *const a_dst = a_src + 8;
    ALIGN_STK_64(pixel, top_buf, 16 * 2 + 16, ), *const top = top_buf + 8;
    ALIGN_STK_16(pixel, left, 8,[2]);
    const ptrdiff_t stride = 16 * sizeof(pixel);

    declare_func(void, pixel *dst, ptrdiff_t dst_stride, const pixel (*left)[2],
                 pixel *const top[2], int pri_strength, int sec_strength,
                 const pixel *top, int pri_strength, int sec_strength,
                 int dir, int damping, enum CdefEdgeFlags edges HIGHBD_DECL_SUFFIX);

    if (check_func(fn, "%s_%dbpc", name, BITDEPTH)) {
    if (check_func(fn, "cdef_filter_%dx%d_%dbpc", w, h, BITDEPTH)) {
        for (int dir = 0; dir < 8; dir++) {
            for (enum CdefEdgeFlags edges = 0; edges <= 0xf; edges++) {
            for (enum CdefEdgeFlags edges = 0x0; edges <= 0xf; edges++) {
#if BITDEPTH == 16
                const int bitdepth_max = rnd() & 1 ? 0x3ff : 0xfff;
#else
                const int bitdepth_max = 0xff;
#endif
                const int bitdepth_min_8 = bitdepth_from_max(bitdepth_max) - 8;
                init_tmp(src, 10 * 16 + 8, bitdepth_max);
                init_tmp(top, 16 * 2 + 8, bitdepth_max);
                init_tmp((pixel *) left,8 * 2, bitdepth_max);

                memcpy(a_src, src, (10 * 16 + 8) * sizeof(pixel));
                memcpy(c_src, src, (10 * 16 + 8) * sizeof(pixel));
                init_tmp(c_src, 16 * 10 + 16, bitdepth_max);
                init_tmp(top_buf, 16 * 2 + 16, bitdepth_max);
                init_tmp((pixel *) left, 8 * 2, bitdepth_max);
                memcpy(a_src, c_src, (16 * 10 + 16) * sizeof(pixel));

                const int lvl = 1 + (rnd() % 62);
                const int damping = 3 + (rnd() & 3) + bitdepth_min_8 - (w == 4 || (rnd() & 1));
                const int pri_strength = (lvl >> 2) << bitdepth_min_8;
                int pri_strength = (lvl >> 2) << bitdepth_min_8;
                int sec_strength = lvl & 3;
                sec_strength += sec_strength == 3;
                sec_strength <<= bitdepth_min_8;
                call_ref(c_src_ptr, 16 * sizeof(pixel), left,
                         (pixel *[2]) { top_ptr, top_ptr + 16 },
                         pri_strength, sec_strength, dir, damping, edges
                         HIGHBD_TAIL_SUFFIX);
                call_new(a_src_ptr, 16 * sizeof(pixel), left,
                         (pixel *[2]) { top_ptr, top_ptr + 16 },
                         pri_strength, sec_strength, dir, damping, edges
                         HIGHBD_TAIL_SUFFIX);
                checkasm_check_pixel(c_src, 16 * sizeof(pixel),
                                     a_src, 16 * sizeof(pixel),
                                     16, 10, "src");
                checkasm_check_pixel(c_src + 16 * 10, 16 * sizeof(pixel),
                                     a_src + 16 * 10, 16 * sizeof(pixel),
                                     8, 1, "src last row");
                bench_new(a_src_ptr, 16 * sizeof(pixel), left,
                          (pixel *[2]) { top_ptr, top_ptr + 16 },
                          pri_strength, sec_strength, dir, damping, edges
                          HIGHBD_TAIL_SUFFIX);
                call_ref(c_dst, stride, left, top, pri_strength, sec_strength,
                         dir, damping, edges HIGHBD_TAIL_SUFFIX);
                call_new(a_dst, stride, left, top, pri_strength, sec_strength,
                         dir, damping, edges HIGHBD_TAIL_SUFFIX);
                if (checkasm_check_pixel(c_dst, stride, a_dst, stride, w, h, "dst")) {
                    fprintf(stderr, "strength = %d:%d, dir = %d, damping = %d, edges = %04d\n",
                            pri_strength, sec_strength, dir, damping, to_binary(edges));
                    return;
                }
                if (dir == 7 && (edges == 0x5 || edges == 0xa || edges == 0xf)) {
                    /* Benchmark a fixed set of cases to get consistent results:
                     * 1) top/left edges and pri_strength only
                     * 2) bottom/right edges and sec_strength only
                     * 3) all edges and both pri_strength and sec_strength
                     */
                    pri_strength = (edges & 1) << bitdepth_min_8;
                    sec_strength = (edges & 2) << bitdepth_min_8;
                    bench_new(a_dst, stride, left, top, pri_strength, sec_strength,
                              dir, damping, edges HIGHBD_TAIL_SUFFIX);
                }
            }
        }
    }
    report(name);
}

static void check_cdef_direction(const cdef_dir_fn fn) {
    ALIGN_STK_32(pixel, src, 8 * 8,);
    ALIGN_STK_64(pixel, src, 8 * 8,);

    declare_func(int, pixel *src, ptrdiff_t dst_stride, unsigned *var
                 HIGHBD_DECL_SUFFIX);

@@ -129,11 +139,12 @@ static void check_cdef_direction(const cdef_dir_fn fn) {

void bitfn(checkasm_check_cdef)(void) {
    Dav1dCdefDSPContext c;

    bitfn(dav1d_cdef_dsp_init)(&c);

    check_cdef_direction(c.dir);
    check_cdef_filter(c.fb[0], 8, 8, "cdef_filter_8x8");
    check_cdef_filter(c.fb[1], 4, 8, "cdef_filter_4x8");
    check_cdef_filter(c.fb[2], 4, 4, "cdef_filter_4x4");

    check_cdef_filter(c.fb[0], 8, 8);
    check_cdef_filter(c.fb[1], 4, 8);
    check_cdef_filter(c.fb[2], 4, 4);
    report("cdef_filter");
}
@@ -98,19 +98,15 @@ static const struct {
    unsigned flag;
} cpus[] = {
#if ARCH_X86
    { "SSE",               "sse",       DAV1D_X86_CPU_FLAG_SSE },
    { "SSE2",              "sse2",      DAV1D_X86_CPU_FLAG_SSE2 },
    { "SSE3",              "sse3",      DAV1D_X86_CPU_FLAG_SSE3 },
    { "SSSE3",             "ssse3",     DAV1D_X86_CPU_FLAG_SSSE3 },
    { "SSE4.1",            "sse4",      DAV1D_X86_CPU_FLAG_SSE41 },
    { "SSE4.2",            "sse42",     DAV1D_X86_CPU_FLAG_SSE42 },
    { "AVX",               "avx",       DAV1D_X86_CPU_FLAG_AVX },
    { "AVX2",              "avx2",      DAV1D_X86_CPU_FLAG_AVX2 },
    { "AVX-512",           "avx512",    DAV1D_X86_CPU_FLAG_AVX512 },
    { "SSE2",               "sse2",      DAV1D_X86_CPU_FLAG_SSE2 },
    { "SSSE3",              "ssse3",     DAV1D_X86_CPU_FLAG_SSSE3 },
    { "SSE4.1",             "sse4",      DAV1D_X86_CPU_FLAG_SSE41 },
    { "AVX2",               "avx2",      DAV1D_X86_CPU_FLAG_AVX2 },
    { "AVX-512 (Ice Lake)", "avx512icl", DAV1D_X86_CPU_FLAG_AVX512ICL },
#elif ARCH_AARCH64 || ARCH_ARM
    { "NEON", "neon", DAV1D_ARM_CPU_FLAG_NEON },
    { "NEON", "neon", DAV1D_ARM_CPU_FLAG_NEON },
#elif ARCH_PPC64LE
    { "VSX", "vsx", DAV1D_PPC_CPU_FLAG_VSX },
    { "VSX", "vsx", DAV1D_PPC_CPU_FLAG_VSX },
#endif
    { 0 }
};

@@ -150,6 +146,9 @@ static struct {
    int bench_c;
    int verbose;
    int function_listing;
#if ARCH_X86_64
    void (*simd_warmup)(void);
#endif
} state;

/* float compare support code */

@@ -569,13 +568,26 @@ int main(int argc, char *argv[]) {

    fprintf(stderr, "checkasm: using random seed %u\n", state.seed);

    dav1d_init_cpu();
#if ARCH_X86_64
    void checkasm_warmup_avx2(void);
    void checkasm_warmup_avx512(void);
    unsigned cpu_flags = dav1d_get_cpu_flags();
    if (cpu_flags & DAV1D_X86_CPU_FLAG_AVX512ICL)
        state.simd_warmup = checkasm_warmup_avx512;
    else if (cpu_flags & DAV1D_X86_CPU_FLAG_AVX2)
        state.simd_warmup = checkasm_warmup_avx2;
    else
        state.simd_warmup = NULL;
    checkasm_simd_warmup();
#endif
    check_cpu_flag(NULL, 0);

    if (state.function_listing) {
        print_functions(state.funcs);
    } else {
        for (int i = 0; cpus[i].flag; i++)
            check_cpu_flag(cpus[i].name, cpus[i].flag);

        if (!state.num_checked) {
            fprintf(stderr, "checkasm: no tests to perform\n");
        } else if (state.num_failed) {

@@ -774,3 +786,11 @@ DEF_CHECKASM_CHECK_FUNC(uint8_t,  "%02x")
DEF_CHECKASM_CHECK_FUNC(uint16_t, "%04x")
DEF_CHECKASM_CHECK_FUNC(int16_t,  "%6d")
DEF_CHECKASM_CHECK_FUNC(int32_t,  "%9d")

#if ARCH_X86_64
void checkasm_simd_warmup(void)
{
    if (state.simd_warmup)
        state.simd_warmup();
}
#endif
@@ -193,12 +193,20 @@ void checkasm_checked_call(void *func, ...);
 * not guaranteed and false negatives are theoretically possible, but there
 * can never be any false positives. */
void checkasm_stack_clobber(uint64_t clobber, ...);
/* YMM and ZMM registers on x86 are turned off to save power when they haven't
 * been used for some period of time. When they are used there will be a
 * "warmup" period during which performance will be reduced and inconsistent,
 * which is problematic when trying to benchmark individual functions. We can
 * work around this by periodically issuing "dummy" instructions that use
 * those registers to keep them powered on. */
void checkasm_simd_warmup(void);
#define declare_new(ret, ...)\
    ret (*checked_call)(void *, int, int, int, int, int, __VA_ARGS__) =\
    (void *)checkasm_checked_call;
#define CLOB (UINT64_C(0xdeadbeefdeadbeef))
#define call_new(...)\
    (checkasm_set_signal_handler_state(1),\
     checkasm_simd_warmup(),\
     checkasm_stack_clobber(CLOB, CLOB, CLOB, CLOB, CLOB, CLOB, CLOB,\
                            CLOB, CLOB, CLOB, CLOB, CLOB, CLOB, CLOB,\
                            CLOB, CLOB, CLOB, CLOB, CLOB, CLOB, CLOB),\
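dav1d's real warmup routines are the asm functions declared in checkasm.c above. A hypothetical C-intrinsics equivalent, purely to illustrate the idea of keeping the wide register file powered before a timed call:

    #include <immintrin.h>

    /* Hypothetical sketch, not dav1d's implementation. */
    void warmup_avx2_sketch(void) {
        volatile __m256i x = _mm256_set1_epi32(1);
        for (int i = 0; i < 64; i++)
            x = _mm256_add_epi32(x, x); /* keep the 256-bit units busy */
    }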
@@ -49,29 +49,29 @@ static void check_gen_grny(const Dav1dFilmGrainDSPContext *const dsp) {

    for (int i = 0; i < 4; i++) {
        if (check_func(dsp->generate_grain_y, "gen_grain_y_ar%d_%dbpc", i, BITDEPTH)) {
            Dav1dFilmGrainData fg_data;
            fg_data.seed = rnd() & 0xFFFF;
            ALIGN_STK_16(Dav1dFilmGrainData, fg_data, 1,);
            fg_data[0].seed = rnd() & 0xFFFF;

#if BITDEPTH == 16
            const int bitdepth_max = rnd() & 1 ? 0x3ff : 0xfff;
#endif

            fg_data.grain_scale_shift = rnd() & 3;
            fg_data.ar_coeff_shift = (rnd() & 3) + 6;
            fg_data.ar_coeff_lag = i;
            const int num_y_pos = 2 * fg_data.ar_coeff_lag * (fg_data.ar_coeff_lag + 1);
            fg_data[0].grain_scale_shift = rnd() & 3;
            fg_data[0].ar_coeff_shift = (rnd() & 3) + 6;
            fg_data[0].ar_coeff_lag = i;
            const int num_y_pos = 2 * fg_data[0].ar_coeff_lag * (fg_data[0].ar_coeff_lag + 1);
            for (int n = 0; n < num_y_pos; n++)
                fg_data.ar_coeffs_y[n] = (rnd() & 0xff) - 128;
                fg_data[0].ar_coeffs_y[n] = (rnd() & 0xff) - 128;

            call_ref(grain_lut_c, &fg_data HIGHBD_TAIL_SUFFIX);
            call_new(grain_lut_a, &fg_data HIGHBD_TAIL_SUFFIX);
            call_ref(grain_lut_c, fg_data HIGHBD_TAIL_SUFFIX);
            call_new(grain_lut_a, fg_data HIGHBD_TAIL_SUFFIX);
            if (memcmp(grain_lut_c, grain_lut_a,
                       GRAIN_WIDTH * GRAIN_HEIGHT * sizeof(entry)))
            {
                fail();
            }

            bench_new(grain_lut_a, &fg_data HIGHBD_TAIL_SUFFIX);
            bench_new(grain_lut_a, fg_data HIGHBD_TAIL_SUFFIX);
        }
    }

@@ -97,38 +97,38 @@ static void check_gen_grnuv(const Dav1dFilmGrainDSPContext *const dsp) {
                           "gen_grain_uv_ar%d_%dbpc_%s",
                           i, BITDEPTH, ss_name[layout_idx]))
            {
                Dav1dFilmGrainData fg_data;
                fg_data.seed = rnd() & 0xFFFF;
                ALIGN_STK_16(Dav1dFilmGrainData, fg_data, 1,);
                fg_data[0].seed = rnd() & 0xFFFF;

#if BITDEPTH == 16
                const int bitdepth_max = rnd() & 1 ? 0x3ff : 0xfff;
#endif

                fg_data.num_y_points = rnd() & 1;
                fg_data.grain_scale_shift = rnd() & 3;
                fg_data.ar_coeff_shift = (rnd() & 3) + 6;
                fg_data.ar_coeff_lag = i;
                const int num_y_pos = 2 * fg_data.ar_coeff_lag * (fg_data.ar_coeff_lag + 1);
                fg_data[0].num_y_points = rnd() & 1;
                fg_data[0].grain_scale_shift = rnd() & 3;
                fg_data[0].ar_coeff_shift = (rnd() & 3) + 6;
                fg_data[0].ar_coeff_lag = i;
                const int num_y_pos = 2 * fg_data[0].ar_coeff_lag * (fg_data[0].ar_coeff_lag + 1);
                for (int n = 0; n < num_y_pos; n++)
                    fg_data.ar_coeffs_y[n] = (rnd() & 0xff) - 128;
                dsp->generate_grain_y(grain_lut_y, &fg_data HIGHBD_TAIL_SUFFIX);
                    fg_data[0].ar_coeffs_y[n] = (rnd() & 0xff) - 128;
                dsp->generate_grain_y(grain_lut_y, fg_data HIGHBD_TAIL_SUFFIX);

                const int uv = rnd() & 1;
                const int num_uv_pos = num_y_pos + !!fg_data.num_y_points;
                const int num_uv_pos = num_y_pos + !!fg_data[0].num_y_points;
                for (int n = 0; n < num_uv_pos; n++)
                    fg_data.ar_coeffs_uv[uv][n] = (rnd() & 0xff) - 128;
                if (!fg_data.num_y_points)
                    fg_data.ar_coeffs_uv[uv][num_uv_pos] = 0;
                    fg_data[0].ar_coeffs_uv[uv][n] = (rnd() & 0xff) - 128;
                if (!fg_data[0].num_y_points)
                    fg_data[0].ar_coeffs_uv[uv][num_uv_pos] = 0;
                memset(grain_lut_c, 0xff, sizeof(grain_lut_c));
                memset(grain_lut_a, 0xff, sizeof(grain_lut_a));
                call_ref(grain_lut_c, grain_lut_y, &fg_data, uv HIGHBD_TAIL_SUFFIX);
                call_new(grain_lut_a, grain_lut_y, &fg_data, uv HIGHBD_TAIL_SUFFIX);
                call_ref(grain_lut_c, grain_lut_y, fg_data, uv HIGHBD_TAIL_SUFFIX);
                call_new(grain_lut_a, grain_lut_y, fg_data, uv HIGHBD_TAIL_SUFFIX);
                int diff = 0, w = ss_x ? 44 : GRAIN_WIDTH;
                for (int y = 0; y < (ss_y ? 38 : GRAIN_HEIGHT); y++)
                    diff |= memcmp(grain_lut_a[y], grain_lut_c[y], w * sizeof(entry));
                if (diff) fail();

                bench_new(grain_lut_a, grain_lut_y, &fg_data, uv HIGHBD_TAIL_SUFFIX);
                bench_new(grain_lut_a, grain_lut_y, fg_data, uv HIGHBD_TAIL_SUFFIX);
            }
        }
    }

@@ -137,9 +137,9 @@ static void check_gen_grnuv(const Dav1dFilmGrainDSPContext *const dsp) {
}

static void check_fgy_sbrow(const Dav1dFilmGrainDSPContext *const dsp) {
    ALIGN_STK_32(pixel, c_dst, 128 * 32,);
    ALIGN_STK_32(pixel, a_dst, 128 * 32,);
    ALIGN_STK_32(pixel, src, 128 * 32,);
    ALIGN_STK_64(pixel, c_dst, 128 * 32,);
    ALIGN_STK_64(pixel, a_dst, 128 * 32,);
    ALIGN_STK_64(pixel, src, 128 * 32,);
    const ptrdiff_t stride = 128 * sizeof(pixel);

    declare_func(void, pixel *dst_row, const pixel *src_row, ptrdiff_t stride,

@@ -149,8 +149,8 @@ static void check_fgy_sbrow(const Dav1dFilmGrainDSPContext *const dsp) {
                 int bh, int row_num HIGHBD_DECL_SUFFIX);

    if (check_func(dsp->fgy_32x32xn, "fgy_32x32xn_%dbpc", BITDEPTH)) {
        Dav1dFilmGrainData fg_data;
        fg_data.seed = rnd() & 0xFFFF;
        ALIGN_STK_16(Dav1dFilmGrainData, fg_data, 16,);
        fg_data[0].seed = rnd() & 0xFFFF;

#if BITDEPTH == 16
        const int bitdepth_max = rnd() & 1 ? 0x3ff : 0xfff;

@@ -160,23 +160,23 @@ static void check_fgy_sbrow(const Dav1dFilmGrainDSPContext *const dsp) {

        uint8_t scaling[SCALING_SIZE];
        entry grain_lut[GRAIN_HEIGHT + 1][GRAIN_WIDTH];
        fg_data.grain_scale_shift = rnd() & 3;
        fg_data.ar_coeff_shift = (rnd() & 3) + 6;
        fg_data.ar_coeff_lag = rnd() & 3;
        const int num_y_pos = 2 * fg_data.ar_coeff_lag * (fg_data.ar_coeff_lag + 1);
        fg_data[0].grain_scale_shift = rnd() & 3;
        fg_data[0].ar_coeff_shift = (rnd() & 3) + 6;
        fg_data[0].ar_coeff_lag = rnd() & 3;
        const int num_y_pos = 2 * fg_data[0].ar_coeff_lag * (fg_data[0].ar_coeff_lag + 1);
        for (int n = 0; n < num_y_pos; n++)
            fg_data.ar_coeffs_y[n] = (rnd() & 0xff) - 128;
        dsp->generate_grain_y(grain_lut, &fg_data HIGHBD_TAIL_SUFFIX);
            fg_data[0].ar_coeffs_y[n] = (rnd() & 0xff) - 128;
        dsp->generate_grain_y(grain_lut, fg_data HIGHBD_TAIL_SUFFIX);

        fg_data.num_y_points = 2 + (rnd() % 13);
        const int pad = 0xff / fg_data.num_y_points;
        for (int n = 0; n < fg_data.num_y_points; n++) {
            fg_data.y_points[n][0] = 0xff * n / fg_data.num_y_points;
            fg_data.y_points[n][0] += rnd() % pad;
            fg_data.y_points[n][1] = rnd() & 0xff;
        fg_data[0].num_y_points = 2 + (rnd() % 13);
        const int pad = 0xff / fg_data[0].num_y_points;
        for (int n = 0; n < fg_data[0].num_y_points; n++) {
            fg_data[0].y_points[n][0] = 0xff * n / fg_data[0].num_y_points;
            fg_data[0].y_points[n][0] += rnd() % pad;
            fg_data[0].y_points[n][1] = rnd() & 0xff;
        }
        generate_scaling(bitdepth_from_max(bitdepth_max), fg_data.y_points,
                         fg_data.num_y_points, scaling);
        generate_scaling(bitdepth_from_max(bitdepth_max), fg_data[0].y_points,
                         fg_data[0].num_y_points, scaling);

        const int w = 1 + (rnd() & 127);
        const int h = 1 + (rnd() & 31);

@@ -186,20 +186,20 @@ static void check_fgy_sbrow(const Dav1dFilmGrainDSPContext *const dsp) {
                src[y * PXSTRIDE(stride) + x] = rnd() & bitdepth_max;
        const int row_num = rnd() & 1 ? rnd() & 0x7ff : 0;

        fg_data.clip_to_restricted_range = rnd() & 1;
        fg_data.scaling_shift = (rnd() & 3) + 8;
        for (fg_data.overlap_flag = 0; fg_data.overlap_flag <= 1;
             fg_data.overlap_flag++)
        fg_data[0].clip_to_restricted_range = rnd() & 1;
        fg_data[0].scaling_shift = (rnd() & 3) + 8;
        for (fg_data[0].overlap_flag = 0; fg_data[0].overlap_flag <= 1;
             fg_data[0].overlap_flag++)
        {
            call_ref(c_dst, src, stride, &fg_data, w, scaling, grain_lut, h,
            call_ref(c_dst, src, stride, fg_data, w, scaling, grain_lut, h,
                     row_num HIGHBD_TAIL_SUFFIX);
            call_new(a_dst, src, stride, &fg_data, w, scaling, grain_lut, h,
            call_new(a_dst, src, stride, fg_data, w, scaling, grain_lut, h,
                     row_num HIGHBD_TAIL_SUFFIX);

            checkasm_check_pixel(c_dst, stride, a_dst, stride, w, h, "dst");
        }
        fg_data.overlap_flag = 1;
        bench_new(a_dst, src, stride, &fg_data, 64, scaling, grain_lut, 32,
        fg_data[0].overlap_flag = 1;
        bench_new(a_dst, src, stride, fg_data, 64, scaling, grain_lut, 32,
                  row_num HIGHBD_TAIL_SUFFIX);
    }

@@ -207,10 +207,10 @@ static void check_fgy_sbrow(const Dav1dFilmGrainDSPContext *const dsp) {
}

static void check_fguv_sbrow(const Dav1dFilmGrainDSPContext *const dsp) {
    ALIGN_STK_32(pixel, c_dst, 128 * 32,);
    ALIGN_STK_32(pixel, a_dst, 128 * 32,);
    ALIGN_STK_32(pixel, src, 128 * 32,);
    ALIGN_STK_32(pixel, luma_src, 128 * 32,);
    ALIGN_STK_64(pixel, c_dst, 128 * 32,);
    ALIGN_STK_64(pixel, a_dst, 128 * 32,);
    ALIGN_STK_64(pixel, src, 128 * 32,);
    ALIGN_STK_64(pixel, luma_src, 128 * 32,);
    const ptrdiff_t lstride = 128 * sizeof(pixel);

    declare_func(void, pixel *dst_row, const pixel *src_row, ptrdiff_t stride,

@@ -231,9 +231,9 @@ static void check_fguv_sbrow(const Dav1dFilmGrainDSPContext *const dsp) {
                           "fguv_32x32xn_%dbpc_%s_csfl%d",
                           BITDEPTH, ss_name[layout_idx], csfl))
            {
                Dav1dFilmGrainData fg_data;
                ALIGN_STK_16(Dav1dFilmGrainData, fg_data, 1,);

                fg_data.seed = rnd() & 0xFFFF;
                fg_data[0].seed = rnd() & 0xFFFF;

#if BITDEPTH == 16
                const int bitdepth_max = rnd() & 1 ? 0x3ff : 0xfff;

@@ -245,15 +245,18 @@ static void check_fguv_sbrow(const Dav1dFilmGrainDSPContext *const dsp) {

                uint8_t scaling[SCALING_SIZE];
                entry grain_lut[2][GRAIN_HEIGHT + 1][GRAIN_WIDTH];
                fg_data.grain_scale_shift = rnd() & 3;
                fg_data.ar_coeff_shift = (rnd() & 3) + 6;
                fg_data.ar_coeff_lag = rnd() & 3;
                const int num_y_pos = 2 * fg_data.ar_coeff_lag * (fg_data.ar_coeff_lag + 1);
                fg_data[0].grain_scale_shift = rnd() & 3;
                fg_data[0].ar_coeff_shift = (rnd() & 3) + 6;
                fg_data[0].ar_coeff_lag = rnd() & 3;
                const int num_y_pos = 2 * fg_data[0].ar_coeff_lag * (fg_data[0].ar_coeff_lag + 1);
                for (int n = 0; n < num_y_pos; n++)
                    fg_data.ar_coeffs_y[n] = (rnd() & 0xff) - 128;
                dsp->generate_grain_y(grain_lut[0], &fg_data HIGHBD_TAIL_SUFFIX);
                    fg_data[0].ar_coeffs_y[n] = (rnd() & 0xff) - 128;
                const int num_uv_pos = num_y_pos + 1;
                for (int n = 0; n < num_uv_pos; n++)
                    fg_data[0].ar_coeffs_uv[uv_pl][n] = (rnd() & 0xff) - 128;
                dsp->generate_grain_y(grain_lut[0], fg_data HIGHBD_TAIL_SUFFIX);
                dsp->generate_grain_uv[layout_idx](grain_lut[1], grain_lut[0],
                                                   &fg_data, uv_pl HIGHBD_TAIL_SUFFIX);
                                                   fg_data, uv_pl HIGHBD_TAIL_SUFFIX);

                const int w = 1 + (rnd() & (127 >> ss_x));
                const int h = 1 + (rnd() & (31 >> ss_y));

@@ -268,47 +271,47 @@ static void check_fguv_sbrow(const Dav1dFilmGrainDSPContext *const dsp) {
                const int row_num = rnd() & 1 ? rnd() & 0x7ff : 0;

                if (csfl) {
                    fg_data.num_y_points = 2 + (rnd() % 13);
                    const int pad = 0xff / fg_data.num_y_points;
                    for (int n = 0; n < fg_data.num_y_points; n++) {
                        fg_data.y_points[n][0] = 0xff * n / fg_data.num_y_points;
                        fg_data.y_points[n][0] += rnd() % pad;
                        fg_data.y_points[n][1] = rnd() & 0xff;
                    fg_data[0].num_y_points = 2 + (rnd() % 13);
                    const int pad = 0xff / fg_data[0].num_y_points;
                    for (int n = 0; n < fg_data[0].num_y_points; n++) {
                        fg_data[0].y_points[n][0] = 0xff * n / fg_data[0].num_y_points;
                        fg_data[0].y_points[n][0] += rnd() % pad;
                        fg_data[0].y_points[n][1] = rnd() & 0xff;
                    }
                    generate_scaling(bitdepth_from_max(bitdepth_max), fg_data.y_points,
                                     fg_data.num_y_points, scaling);
                    generate_scaling(bitdepth_from_max(bitdepth_max), fg_data[0].y_points,
                                     fg_data[0].num_y_points, scaling);
                } else {
                    fg_data.num_uv_points[uv_pl] = 2 + (rnd() % 9);
                    const int pad = 0xff / fg_data.num_uv_points[uv_pl];
                    for (int n = 0; n < fg_data.num_uv_points[uv_pl]; n++) {
                        fg_data.uv_points[uv_pl][n][0] = 0xff * n / fg_data.num_uv_points[uv_pl];
                        fg_data.uv_points[uv_pl][n][0] += rnd() % pad;
                        fg_data.uv_points[uv_pl][n][1] = rnd() & 0xff;
                    fg_data[0].num_uv_points[uv_pl] = 2 + (rnd() % 9);
                    const int pad = 0xff / fg_data[0].num_uv_points[uv_pl];
                    for (int n = 0; n < fg_data[0].num_uv_points[uv_pl]; n++) {
                        fg_data[0].uv_points[uv_pl][n][0] = 0xff * n / fg_data[0].num_uv_points[uv_pl];
                        fg_data[0].uv_points[uv_pl][n][0] += rnd() % pad;
                        fg_data[0].uv_points[uv_pl][n][1] = rnd() & 0xff;
                    }
                    generate_scaling(bitdepth_from_max(bitdepth_max), fg_data.uv_points[uv_pl],
                                     fg_data.num_uv_points[uv_pl], scaling);
                    generate_scaling(bitdepth_from_max(bitdepth_max), fg_data[0].uv_points[uv_pl],
                                     fg_data[0].num_uv_points[uv_pl], scaling);

                    fg_data.uv_mult[uv_pl] = (rnd() & 0xff) - 128;
                    fg_data.uv_luma_mult[uv_pl] = (rnd() & 0xff) - 128;
                    fg_data.uv_offset[uv_pl] = (rnd() & 0x1ff) - 256;
                    fg_data[0].uv_mult[uv_pl] = (rnd() & 0xff) - 128;
                    fg_data[0].uv_luma_mult[uv_pl] = (rnd() & 0xff) - 128;
                    fg_data[0].uv_offset[uv_pl] = (rnd() & 0x1ff) - 256;
                }

                fg_data.clip_to_restricted_range = rnd() & 1;
                fg_data.scaling_shift = (rnd() & 3) + 8;
                fg_data.chroma_scaling_from_luma = csfl;
                for (fg_data.overlap_flag = 0; fg_data.overlap_flag <= 1;
                     fg_data.overlap_flag++)
                fg_data[0].clip_to_restricted_range = rnd() & 1;
                fg_data[0].scaling_shift = (rnd() & 3) + 8;
                fg_data[0].chroma_scaling_from_luma = csfl;
                for (fg_data[0].overlap_flag = 0; fg_data[0].overlap_flag <= 1;
                     fg_data[0].overlap_flag++)
                {
                    call_ref(c_dst, src, stride, &fg_data, w, scaling, grain_lut[1], h,
                    call_ref(c_dst, src, stride, fg_data, w, scaling, grain_lut[1], h,
                             row_num, luma_src, lstride, uv_pl, is_identity HIGHBD_TAIL_SUFFIX);
                    call_new(a_dst, src, stride, &fg_data, w, scaling, grain_lut[1], h,
                    call_new(a_dst, src, stride, fg_data, w, scaling, grain_lut[1], h,
                             row_num, luma_src, lstride, uv_pl, is_identity HIGHBD_TAIL_SUFFIX);

                    checkasm_check_pixel(c_dst, stride, a_dst, stride, w, h, "dst");
                }

                fg_data.overlap_flag = 1;
                bench_new(a_dst, src, stride, &fg_data, 32, scaling, grain_lut[1], 16,
                fg_data[0].overlap_flag = 1;
                bench_new(a_dst, src, stride, fg_data, 32, scaling, grain_lut[1], 16,
                          row_num, luma_src, lstride, uv_pl, is_identity HIGHBD_TAIL_SUFFIX);
            }
        }
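The mechanical change through this file swaps a plain Dav1dFilmGrainData stack variable for a one-element array declared with ALIGN_STK_16, giving the struct the stack alignment the SIMD code can rely on; members are then reached through fg_data[0] and the array decays to the pointer the DSP functions take. A minimal sketch of the shape (the real macro lives in dav1d's common attribute header; use() is a hypothetical stand-in for the DSP calls):

    #define ALIGN_STK_16_SKETCH(type, name) \
        __attribute__((aligned(16))) type name[1]

    void fg_data_pattern(void) {
        ALIGN_STK_16_SKETCH(Dav1dFilmGrainData, fg_data);
        fg_data[0].seed = 0; /* members are reached through element 0   */
        use(fg_data);        /* the array decays to Dav1dFilmGrainData* */
    }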
@@ -66,9 +66,9 @@ static const uint8_t z_angles[27] = {
};

static void check_intra_pred(Dav1dIntraPredDSPContext *const c) {
    ALIGN_STK_32(pixel, c_dst, 64 * 64,);
    ALIGN_STK_32(pixel, a_dst, 64 * 64,);
    ALIGN_STK_32(pixel, topleft_buf, 257,);
    ALIGN_STK_64(pixel, c_dst, 64 * 64,);
    ALIGN_STK_64(pixel, a_dst, 64 * 64,);
    ALIGN_STK_64(pixel, topleft_buf, 257,);
    pixel *const topleft = topleft_buf + 128;

    declare_func(void, pixel *dst, ptrdiff_t stride, const pixel *topleft,

@@ -132,9 +132,9 @@ static void check_intra_pred(Dav1dIntraPredDSPContext *const c) {
}

static void check_cfl_ac(Dav1dIntraPredDSPContext *const c) {
    ALIGN_STK_32(int16_t, c_dst, 32 * 32,);
    ALIGN_STK_32(int16_t, a_dst, 32 * 32,);
    ALIGN_STK_32(pixel, luma, 32 * 32,);
    ALIGN_STK_64(int16_t, c_dst, 32 * 32,);
    ALIGN_STK_64(int16_t, a_dst, 32 * 32,);
    ALIGN_STK_64(pixel, luma, 32 * 32,);

    declare_func(void, int16_t *ac, const pixel *y, ptrdiff_t stride,
                 int w_pad, int h_pad, int cw, int ch);

@@ -175,10 +175,10 @@ static void check_cfl_ac(Dav1dIntraPredDSPContext *const c) {
}

static void check_cfl_pred(Dav1dIntraPredDSPContext *const c) {
    ALIGN_STK_32(pixel, c_dst, 32 * 32,);
    ALIGN_STK_32(pixel, a_dst, 32 * 32,);
    ALIGN_STK_32(int16_t, ac, 32 * 32,);
    ALIGN_STK_32(pixel, topleft_buf, 257,);
    ALIGN_STK_64(pixel, c_dst, 32 * 32,);
    ALIGN_STK_64(pixel, a_dst, 32 * 32,);
    ALIGN_STK_64(int16_t, ac, 32 * 32,);
    ALIGN_STK_64(pixel, topleft_buf, 257,);
    pixel *const topleft = topleft_buf + 128;

    declare_func(void, pixel *dst, ptrdiff_t stride, const pixel *topleft,

@@ -227,9 +227,9 @@ static void check_cfl_pred(Dav1dIntraPredDSPContext *const c) {
}

static void check_pal_pred(Dav1dIntraPredDSPContext *const c) {
    ALIGN_STK_32(pixel, c_dst, 64 * 64,);
    ALIGN_STK_32(pixel, a_dst, 64 * 64,);
    ALIGN_STK_32(uint8_t, idx, 64 * 64,);
    ALIGN_STK_64(pixel, c_dst, 64 * 64,);
    ALIGN_STK_64(pixel, a_dst, 64 * 64,);
    ALIGN_STK_64(uint8_t, idx, 64 * 64,);
    ALIGN_STK_16(uint16_t, pal, 8,);

    declare_func(void, pixel *dst, ptrdiff_t stride, const uint16_t *pal,
@@ -226,9 +226,9 @@ void bitfn(checkasm_check_itx)(void) {
    Dav1dInvTxfmDSPContext c;
    bitfn(dav1d_itx_dsp_init)(&c);

    ALIGN_STK_32(coef, coeff, 2, [32 * 32]);
    ALIGN_STK_32(pixel, c_dst, 64 * 64,);
    ALIGN_STK_32(pixel, a_dst, 64 * 64,);
    ALIGN_STK_64(coef, coeff, 2, [32 * 32]);
    ALIGN_STK_64(pixel, c_dst, 64 * 64,);
    ALIGN_STK_64(pixel, a_dst, 64 * 64,);

    static const uint8_t txfm_size_order[N_RECT_TX_SIZES] = {
        TX_4X4, RTX_4X8, RTX_4X16,
@@ -95,8 +95,8 @@ static void check_lpf_sb(loopfilter_sb_fn fn, const char *const name,
                         const int n_blks, const int lf_idx,
                         const int is_chroma, const int dir)
{
    ALIGN_STK_32(pixel, c_dst_mem, 128 * 16,);
    ALIGN_STK_32(pixel, a_dst_mem, 128 * 16,);
    ALIGN_STK_64(pixel, c_dst_mem, 128 * 16,);
    ALIGN_STK_64(pixel, a_dst_mem, 128 * 16,);

    declare_func(void, pixel *dst, ptrdiff_t dst_stride, const uint32_t *mask,
                 const uint8_t (*l)[4], ptrdiff_t b4_stride,
@@ -43,10 +43,10 @@ static void init_tmp(pixel *buf, const ptrdiff_t stride,
     }
 }

-static void check_wiener(Dav1dLoopRestorationDSPContext *const c) {
-    ALIGN_STK_32(pixel, c_dst, 448 * 64,);
-    ALIGN_STK_32(pixel, a_dst, 448 * 64,);
-    ALIGN_STK_32(pixel, h_edge, 448 * 8,);
+static void check_wiener(Dav1dLoopRestorationDSPContext *const c, const int bpc) {
+    ALIGN_STK_64(pixel, c_dst, 448 * 64,);
+    ALIGN_STK_64(pixel, a_dst, 448 * 64,);
+    ALIGN_STK_64(pixel, h_edge, 448 * 8,);
     pixel left[64][4];

     declare_func(void, pixel *dst, ptrdiff_t dst_stride,
@@ -58,7 +58,7 @@ static void check_wiener(Dav1dLoopRestorationDSPContext *const c) {

     for (int pl = 0; pl < 2; pl++) {
         if (check_func(c->wiener, "wiener_%s_%dbpc",
-                       pl ? "chroma" : "luma", BITDEPTH))
+                       pl ? "chroma" : "luma", bpc))
         {
             int16_t filter[2][3], filter_v[7], filter_h[7];

@@ -81,11 +81,7 @@ static void check_wiener(Dav1dLoopRestorationDSPContext *const c) {

             const int base_w = 1 + (rnd() % 384);
             const int base_h = 1 + (rnd() & 63);
-#if BITDEPTH == 16
-            const int bitdepth_max = rnd() & 1 ? 0x3ff : 0xfff;
-#else
-            const int bitdepth_max = 0xff;
-#endif
+            const int bitdepth_max = (1 << bpc) - 1;

             init_tmp(c_dst, 448 * sizeof(pixel), 448, 64, bitdepth_max);
             init_tmp(h_edge, 448 * sizeof(pixel), 448, 8, bitdepth_max);
@@ -112,13 +108,12 @@ static void check_wiener(Dav1dLoopRestorationDSPContext *const c) {
                       256, 64, filter_h, filter_v, 0xf HIGHBD_TAIL_SUFFIX);
         }
     }
-    report("wiener");
 }

-static void check_sgr(Dav1dLoopRestorationDSPContext *const c) {
-    ALIGN_STK_32(pixel, c_dst, 448 * 64,);
-    ALIGN_STK_32(pixel, a_dst, 448 * 64,);
-    ALIGN_STK_32(pixel, h_edge, 448 * 8,);
+static void check_sgr(Dav1dLoopRestorationDSPContext *const c, const int bpc) {
+    ALIGN_STK_64(pixel, c_dst, 448 * 64,);
+    ALIGN_STK_64(pixel, a_dst, 448 * 64,);
+    ALIGN_STK_64(pixel, h_edge, 448 * 8,);
     pixel left[64][4];

     declare_func(void, pixel *dst, ptrdiff_t dst_stride,
@@ -130,7 +125,7 @@ static void check_sgr(Dav1dLoopRestorationDSPContext *const c) {

     for (int sgr_idx = 14; sgr_idx >= 6; sgr_idx -= 4) {
         if (check_func(c->selfguided, "selfguided_%s_%dbpc",
-                       sgr_idx == 6 ? "mix" : sgr_idx == 10 ? "3x3" : "5x5", BITDEPTH))
+                       sgr_idx == 6 ? "mix" : sgr_idx == 10 ? "3x3" : "5x5", bpc))
         {
             int16_t sgr_wt[2];

@@ -140,11 +135,7 @@ static void check_sgr(Dav1dLoopRestorationDSPContext *const c) {

             const int base_w = 1 + (rnd() % 384);
             const int base_h = 1 + (rnd() & 63);
-#if BITDEPTH == 16
-            const int bitdepth_max = rnd() & 1 ? 0x3ff : 0xfff;
-#else
-            const int bitdepth_max = 0xff;
-#endif
+            const int bitdepth_max = (1 << bpc) - 1;

             init_tmp(c_dst, 448 * sizeof(pixel), 448, 64, bitdepth_max);
             init_tmp(h_edge, 448 * sizeof(pixel), 448, 8, bitdepth_max);
@@ -171,14 +162,24 @@ static void check_sgr(Dav1dLoopRestorationDSPContext *const c) {
                       256, 64, sgr_idx, sgr_wt, 0xf HIGHBD_TAIL_SUFFIX);
         }
     }
-    report("sgr");
 }

 void bitfn(checkasm_check_looprestoration)(void) {
-    Dav1dLoopRestorationDSPContext c;
-
-    bitfn(dav1d_loop_restoration_dsp_init)(&c);
-
-    check_wiener(&c);
-    check_sgr(&c);
+#if BITDEPTH == 16
+    const int bpc_min = 10, bpc_max = 12;
+#else
+    const int bpc_min = 8, bpc_max = 8;
+#endif
+    for (int bpc = bpc_min; bpc <= bpc_max; bpc += 2) {
+        Dav1dLoopRestorationDSPContext c;
+        bitfn(dav1d_loop_restoration_dsp_init)(&c, bpc);
+        check_wiener(&c, bpc);
+    }
+    report("wiener");
+    for (int bpc = bpc_min; bpc <= bpc_max; bpc += 2) {
+        Dav1dLoopRestorationDSPContext c;
+        bitfn(dav1d_loop_restoration_dsp_init)(&c, bpc);
+        check_sgr(&c, bpc);
+    }
+    report("sgr");
 }
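Note on the hunks above: the loop-restoration tests now take the bit depth as a runtime bpc parameter instead of deriving it from the compile-time BITDEPTH constant, so a single 16-bit build exercises both the 10- and 12-bit code paths. A standalone sketch of the arithmetic (illustrative only, not dav1d API):

    #include <stdio.h>

    /* (1 << bpc) - 1 yields the max pixel value, reproducing the constants
     * the old "#if BITDEPTH == 16" block hardcoded: 0xff, 0x3ff, 0xfff.
     * The real tests loop over 10/12 in 16-bit builds and just 8 otherwise. */
    int main(void) {
        for (int bpc = 8; bpc <= 12; bpc += 2) {
            const int bitdepth_max = (1 << bpc) - 1;
            printf("bpc=%2d -> bitdepth_max=0x%x\n", bpc, bitdepth_max);
        }
        return 0;
    }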
@@ -55,9 +55,9 @@ static int mc_h_next(const int h) {
 }

 static void check_mc(Dav1dMCDSPContext *const c) {
-    ALIGN_STK_32(pixel, src_buf, 135 * 135,);
-    ALIGN_STK_32(pixel, c_dst, 128 * 128,);
-    ALIGN_STK_32(pixel, a_dst, 128 * 128,);
+    ALIGN_STK_64(pixel, src_buf, 135 * 135,);
+    ALIGN_STK_64(pixel, c_dst, 128 * 128,);
+    ALIGN_STK_64(pixel, a_dst, 128 * 128,);
     const pixel *src = src_buf + 135 * 3 + 3;
     const ptrdiff_t src_stride = 135 * sizeof(pixel);

@@ -118,9 +118,9 @@ static void generate_mct_input(pixel *const buf, const int bitdepth_max) {
 }

 static void check_mct(Dav1dMCDSPContext *const c) {
-    ALIGN_STK_32(pixel, src_buf, 135 * 135,);
-    ALIGN_STK_32(int16_t, c_tmp, 128 * 128,);
-    ALIGN_STK_32(int16_t, a_tmp, 128 * 128,);
+    ALIGN_STK_64(pixel, src_buf, 135 * 135,);
+    ALIGN_STK_64(int16_t, c_tmp, 128 * 128,);
+    ALIGN_STK_64(int16_t, a_tmp, 128 * 128,);
     const pixel *src = src_buf + 135 * 3 + 3;
     const ptrdiff_t src_stride = 135 * sizeof(pixel);

@@ -173,9 +173,9 @@ static void init_tmp(Dav1dMCDSPContext *const c, pixel *const buf,
 }

 static void check_avg(Dav1dMCDSPContext *const c) {
-    ALIGN_STK_32(int16_t, tmp, 2, [128 * 128]);
-    ALIGN_STK_32(pixel, c_dst, 135 * 135,);
-    ALIGN_STK_32(pixel, a_dst, 128 * 128,);
+    ALIGN_STK_64(int16_t, tmp, 2, [128 * 128]);
+    ALIGN_STK_64(pixel, c_dst, 135 * 135,);
+    ALIGN_STK_64(pixel, a_dst, 128 * 128,);

     declare_func(void, pixel *dst, ptrdiff_t dst_stride, const int16_t *tmp1,
                  const int16_t *tmp2, int w, int h HIGHBD_DECL_SUFFIX);
@@ -204,9 +204,9 @@ static void check_avg(Dav1dMCDSPContext *const c) {
 }

 static void check_w_avg(Dav1dMCDSPContext *const c) {
-    ALIGN_STK_32(int16_t, tmp, 2, [128 * 128]);
-    ALIGN_STK_32(pixel, c_dst, 135 * 135,);
-    ALIGN_STK_32(pixel, a_dst, 128 * 128,);
+    ALIGN_STK_64(int16_t, tmp, 2, [128 * 128]);
+    ALIGN_STK_64(pixel, c_dst, 135 * 135,);
+    ALIGN_STK_64(pixel, a_dst, 128 * 128,);

     declare_func(void, pixel *dst, ptrdiff_t dst_stride, const int16_t *tmp1,
                  const int16_t *tmp2, int w, int h, int weight HIGHBD_DECL_SUFFIX);
@@ -236,10 +236,10 @@ static void check_w_avg(Dav1dMCDSPContext *const c) {
 }

 static void check_mask(Dav1dMCDSPContext *const c) {
-    ALIGN_STK_32(int16_t, tmp, 2, [128 * 128]);
-    ALIGN_STK_32(pixel, c_dst, 135 * 135,);
-    ALIGN_STK_32(pixel, a_dst, 128 * 128,);
-    ALIGN_STK_32(uint8_t, mask, 128 * 128,);
+    ALIGN_STK_64(int16_t, tmp, 2, [128 * 128]);
+    ALIGN_STK_64(pixel, c_dst, 135 * 135,);
+    ALIGN_STK_64(pixel, a_dst, 128 * 128,);
+    ALIGN_STK_64(uint8_t, mask, 128 * 128,);

     for (int i = 0; i < 128 * 128; i++)
         mask[i] = rnd() % 65;
@@ -271,11 +271,11 @@ static void check_mask(Dav1dMCDSPContext *const c) {
 }

 static void check_w_mask(Dav1dMCDSPContext *const c) {
-    ALIGN_STK_32(int16_t, tmp, 2, [128 * 128]);
-    ALIGN_STK_32(pixel, c_dst, 135 * 135,);
-    ALIGN_STK_32(pixel, a_dst, 128 * 128,);
-    ALIGN_STK_32(uint8_t, c_mask, 128 * 128,);
-    ALIGN_STK_32(uint8_t, a_mask, 128 * 128,);
+    ALIGN_STK_64(int16_t, tmp, 2, [128 * 128]);
+    ALIGN_STK_64(pixel, c_dst, 135 * 135,);
+    ALIGN_STK_64(pixel, a_dst, 128 * 128,);
+    ALIGN_STK_64(uint8_t, c_mask, 128 * 128,);
+    ALIGN_STK_64(uint8_t, a_mask, 128 * 128,);

     declare_func(void, pixel *dst, ptrdiff_t dst_stride, const int16_t *tmp1,
                  const int16_t *tmp2, int w, int h, uint8_t *mask, int sign
@@ -321,10 +321,10 @@ static void check_w_mask(Dav1dMCDSPContext *const c) {
 }

 static void check_blend(Dav1dMCDSPContext *const c) {
-    ALIGN_STK_32(pixel, tmp, 32 * 32,);
-    ALIGN_STK_32(pixel, c_dst, 32 * 32,);
-    ALIGN_STK_32(pixel, a_dst, 32 * 32,);
-    ALIGN_STK_32(uint8_t, mask, 32 * 32,);
+    ALIGN_STK_64(pixel, tmp, 32 * 32,);
+    ALIGN_STK_64(pixel, c_dst, 32 * 32,);
+    ALIGN_STK_64(pixel, a_dst, 32 * 32,);
+    ALIGN_STK_64(uint8_t, mask, 32 * 32,);

     declare_func(void, pixel *dst, ptrdiff_t dst_stride, const pixel *tmp,
                  int w, int h, const uint8_t *mask);
@@ -357,9 +357,9 @@ static void check_blend(Dav1dMCDSPContext *const c) {
 }

 static void check_blend_v(Dav1dMCDSPContext *const c) {
-    ALIGN_STK_32(pixel, tmp, 32 * 128,);
-    ALIGN_STK_32(pixel, c_dst, 32 * 128,);
-    ALIGN_STK_32(pixel, a_dst, 32 * 128,);
+    ALIGN_STK_64(pixel, tmp, 32 * 128,);
+    ALIGN_STK_64(pixel, c_dst, 32 * 128,);
+    ALIGN_STK_64(pixel, a_dst, 32 * 128,);

     declare_func(void, pixel *dst, ptrdiff_t dst_stride, const pixel *tmp,
                  int w, int h);
@@ -391,9 +391,9 @@ static void check_blend_v(Dav1dMCDSPContext *const c) {
 }

 static void check_blend_h(Dav1dMCDSPContext *const c) {
-    ALIGN_STK_32(pixel, tmp, 128 * 32,);
-    ALIGN_STK_32(pixel, c_dst, 128 * 32,);
-    ALIGN_STK_32(pixel, a_dst, 128 * 32,);
+    ALIGN_STK_64(pixel, tmp, 128 * 32,);
+    ALIGN_STK_64(pixel, c_dst, 128 * 32,);
+    ALIGN_STK_64(pixel, a_dst, 128 * 32,);

     declare_func(void, pixel *dst, ptrdiff_t dst_stride, const pixel *tmp,
                  int w, int h);
@@ -424,9 +424,9 @@ static void check_blend_h(Dav1dMCDSPContext *const c) {
 }

 static void check_warp8x8(Dav1dMCDSPContext *const c) {
-    ALIGN_STK_32(pixel, src_buf, 15 * 15,);
-    ALIGN_STK_32(pixel, c_dst, 8 * 8,);
-    ALIGN_STK_32(pixel, a_dst, 8 * 8,);
+    ALIGN_STK_64(pixel, src_buf, 15 * 15,);
+    ALIGN_STK_64(pixel, c_dst, 8 * 8,);
+    ALIGN_STK_64(pixel, a_dst, 8 * 8,);
     int16_t abcd[4];
     const pixel *src = src_buf + 15 * 3 + 3;
     const ptrdiff_t dst_stride = 8 * sizeof(pixel);
@@ -462,9 +462,9 @@ static void check_warp8x8(Dav1dMCDSPContext *const c) {
 }

 static void check_warp8x8t(Dav1dMCDSPContext *const c) {
-    ALIGN_STK_32(pixel, src_buf, 15 * 15,);
-    ALIGN_STK_32(int16_t, c_tmp, 8 * 8,);
-    ALIGN_STK_32(int16_t, a_tmp, 8 * 8,);
+    ALIGN_STK_64(pixel, src_buf, 15 * 15,);
+    ALIGN_STK_64(int16_t, c_tmp, 8 * 8,);
+    ALIGN_STK_64(int16_t, a_tmp, 8 * 8,);
     int16_t abcd[4];
     const pixel *src = src_buf + 15 * 3 + 3;
     const ptrdiff_t src_stride = 15 * sizeof(pixel);
@@ -534,9 +534,9 @@ static void random_offset_for_edge(int *const x, int *const y,
 }

 static void check_emuedge(Dav1dMCDSPContext *const c) {
-    ALIGN_STK_32(pixel, c_dst, 135 * 192,);
-    ALIGN_STK_32(pixel, a_dst, 135 * 192,);
-    ALIGN_STK_32(pixel, src, 160 * 160,);
+    ALIGN_STK_64(pixel, c_dst, 135 * 192,);
+    ALIGN_STK_64(pixel, a_dst, 135 * 192,);
+    ALIGN_STK_64(pixel, src, 160 * 160,);

     for (int i = 0; i < 160 * 160; i++)
         src[i] = rnd() & ((1U << BITDEPTH) - 1);
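The blanket ALIGN_STK_32 -> ALIGN_STK_64 change across these test files appears to prepare for the AVX-512 code paths added in this release: a full ZMM load/store wants a 64-byte-aligned buffer. A rough C11 stand-in for what such a macro does (the real one lives in dav1d's common headers and takes a trailing subscript argument; names here are mine):

    #include <stdalign.h>
    #include <stdint.h>
    #include <stdio.h>

    /* Illustrative sketch of a 64-byte-aligned stack buffer, so SIMD code
     * using 512-bit registers never faults or slows down on alignment. */
    #define ALIGN_STK_64_SKETCH(type, name, count) \
        alignas(64) type name[count]

    int main(void) {
        ALIGN_STK_64_SKETCH(uint8_t, dst, 128 * 128);
        dst[0] = 1; /* touch the buffer so it isn't optimized away */
        printf("aligned: %d\n", ((uintptr_t)dst & 63) == 0);
        return 0;
    }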
@@ -258,6 +258,12 @@ void checkasm_check_msac(void) {
         c.bool = dav1d_msac_decode_bool_sse2;
         c.hi_tok = dav1d_msac_decode_hi_tok_sse2;
     }
+
+#if ARCH_X86_64
+    if (dav1d_get_cpu_flags() & DAV1D_X86_CPU_FLAG_AVX2) {
+        c.symbol_adapt16 = dav1d_msac_decode_symbol_adapt16_avx2;
+    }
+#endif
 #endif

     uint8_t buf[BUF_SIZE];
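The msac hunk follows dav1d's usual runtime-dispatch pattern: entry points start out pointing at the portable C implementation, and individual ones are overridden when the detected CPU flags allow. A self-contained sketch of the pattern (names and flag values are illustrative, not dav1d's):

    #include <stdio.h>

    typedef unsigned (*decode_fn)(unsigned state);

    static unsigned decode_c(unsigned state)    { return state + 1; }
    static unsigned decode_avx2(unsigned state) { return state + 2; }

    enum { CPU_FLAG_AVX2_SKETCH = 1 << 0 };

    typedef struct { decode_fn symbol_adapt16; } MsacDSPSketch;

    static void msac_dsp_init_sketch(MsacDSPSketch *const c, const unsigned flags) {
        c->symbol_adapt16 = decode_c;          /* portable fallback first */
        if (flags & CPU_FLAG_AVX2_SKETCH)
            c->symbol_adapt16 = decode_avx2;   /* override when supported */
    }

    int main(void) {
        MsacDSPSketch c;
        msac_dsp_init_sketch(&c, CPU_FLAG_AVX2_SKETCH);
        printf("%u\n", c.symbol_adapt16(0));   /* prints 2: AVX2 path chosen */
        return 0;
    }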
@@ -170,6 +170,19 @@ cglobal checked_call, 2,15,16,max_args*8+8
 .ok:
     RET

+; trigger a warmup of vector units
+%macro WARMUP 0
+cglobal warmup, 0, 0
+    xorps m0, m0
+    mulps m0, m0
+    RET
+%endmacro
+
+INIT_YMM avx2
+WARMUP
+INIT_ZMM avx512
+WARMUP
+
 %else

 ; just random numbers to reduce the chance of incidental match
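The WARMUP macro presumably exists because the wide vector units can power down when idle, so the first AVX2/AVX-512 instructions of a benchmark run slower than steady state; issuing a dummy multiply beforehand keeps that transition out of the timings. An equivalent sketch in C intrinsics (compile with -mavx2; the function name is mine, not checkasm's):

    #include <immintrin.h>

    /* One throwaway 256-bit multiply to wake the upper vector units before
     * any timed SIMD call; the volatile sink keeps the ops from being
     * eliminated. */
    static void warmup_avx2(void) {
        __m256 x = _mm256_setzero_ps();
        x = _mm256_mul_ps(x, x);
        volatile float sink = _mm256_cvtss_f32(x);
        (void)sink;
    }

    int main(void) {
        warmup_avx2();
        return 0;
    }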
@@ -0,0 +1,98 @@
+# Copyright © 2020, VideoLAN and dav1d authors
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# 1. Redistributions of source code must retain the above copyright notice, this
+#    list of conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the above copyright notice,
+#    this list of conditions and the following disclaimer in the documentation
+#    and/or other materials provided with the distribution.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+# ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+#
+# Build definition for the dav1d fuzzing binaries
+#
+
+dav1d_fuzzer_sources = files('dav1d_fuzzer.c')
+fuzzer_ldflags = []
+fuzzer_link_lang = {}
+
+if get_option('fuzzer_ldflags') != ''
+    fuzzer_ldflags += [get_option('fuzzer_ldflags')]
+endif
+
+if fuzzing_engine == 'none'
+    dav1d_fuzzer_sources += files('main.c')
+elif fuzzing_engine == 'libfuzzer'
+    fuzzer_ldflags += ['-fsanitize=fuzzer']
+elif fuzzing_engine == 'oss-fuzz'
+    # libFuzzingEngine needs c++
+    add_languages('cpp')
+    fuzzer_link_lang = {'link_language': 'cpp'}
+endif
+
+dav1d_fuzzer = executable('dav1d_fuzzer',
+    dav1d_fuzzer_sources,
+    include_directories: dav1d_inc_dirs,
+    c_args: [stackalign_flag, stackrealign_flag],
+    link_args: fuzzer_ldflags,
+    link_with : libdav1d,
+    build_by_default: true,
+    dependencies : [thread_dependency],
+    kwargs: fuzzer_link_lang
+    )
+
+dav1d_fuzzer_mt = executable('dav1d_fuzzer_mt',
+    dav1d_fuzzer_sources,
+    include_directories: dav1d_inc_dirs,
+    c_args: [stackalign_flag, stackrealign_flag, '-DDAV1D_MT_FUZZING'],
+    link_args: fuzzer_ldflags,
+    link_with : libdav1d,
+    build_by_default: true,
+    dependencies : [thread_dependency],
+    kwargs: fuzzer_link_lang
+    )
+
+objcopy = find_program('objcopy',
+                       required: false)
+if (objcopy.found() and
+    not get_option('b_lto') and
+    get_option('default_library') == 'static' and
+    cc.has_function('posix_memalign', prefix : '#include <stdlib.h>', args : test_args))
+
+    libdav1d_af = custom_target('libdav1d_af',
+        input: libdav1d,
+        output: 'libdav1d_af.a',
+        depends: libdav1d,
+        command: [objcopy,
+                  '--redefine-sym', 'malloc=__wrap_malloc',
+                  '--redefine-sym', 'posix_memalign=__wrap_posix_memalign',
+                  '--redefine-sym', 'pthread_create=__wrap_pthread_create',
+                  '--redefine-sym', 'pthread_cond_init=__wrap_pthread_cond_init',
+                  '--redefine-sym', 'pthread_mutex_init=__wrap_pthread_mutex_init',
+                  '@INPUT@', '@OUTPUT@'])
+
+    dav1d_fuzzer_mem = executable('dav1d_fuzzer_mem',
+        dav1d_fuzzer_sources + ['alloc_fail.c'],
+        include_directories: dav1d_inc_dirs,
+        c_args: [stackalign_flag, stackrealign_flag, '-DDAV1D_ALLOC_FAIL'],
+        link_args: fuzzer_ldflags + [join_paths(libdav1d_af.full_path())],
+        link_depends: libdav1d_af,
+        build_by_default: false,
+        dependencies : [thread_dependency],
+        kwargs: fuzzer_link_lang
+        )
+endif
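The 'libfuzzer' and 'oss-fuzz' engines configured above both drive the harness through the standard libFuzzer entry point; dav1d_fuzzer.c implements it, and main.c supplies a main() only for the 'none' engine. The shape of that entry point, with the actual decoding elided (the body here is a placeholder, not the real harness):

    #include <stddef.h>
    #include <stdint.h>

    /* Called once per generated input; linking with -fsanitize=fuzzer (or
     * OSS-Fuzz's libFuzzingEngine) supplies the driving main(). */
    int LLVMFuzzerTestOneInput(const uint8_t *data, size_t size) {
        (void)data;
        (void)size;  /* the real harness feeds `data` to the AV1 decoder */
        return 0;    /* return value is ignored by libFuzzer */
    }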
@@ -90,76 +90,20 @@ if is_asm_enabled
         include_directories: dav1d_inc_dirs,
         c_args: [stackalign_flag, stackrealign_flag],
         build_by_default: false,
-        dependencies : [thread_dependency, rt_dependency, m_lib],
+        dependencies : [
+            thread_dependency,
+            rt_dependency,
+            libdl_dependency,
+            m_lib,
+            ],
         )

     test('checkasm', checkasm, is_parallel: false)
 endif

-dav1d_fuzzer_sources = files('libfuzzer/dav1d_fuzzer.c')
-fuzzer_ldflags = []
-
-if get_option('fuzzer_ldflags') != ''
-    fuzzer_ldflags += [get_option('fuzzer_ldflags')]
-endif
-
-if fuzzing_engine == 'none'
-    dav1d_fuzzer_sources += files('libfuzzer/main.c')
-elif fuzzing_engine == 'libfuzzer'
-    fuzzer_ldflags += ['-fsanitize=fuzzer']
-elif fuzzing_engine == 'oss-fuzz'
-    # libFuzzingEngine needs libc++
-    fuzzer_ldflags += ['-lc++']
-endif
-
-dav1d_fuzzer = executable('dav1d_fuzzer',
-    dav1d_fuzzer_sources,
-    include_directories: dav1d_inc_dirs,
-    c_args: [stackalign_flag, stackrealign_flag],
-    link_args: fuzzer_ldflags,
-    link_with : libdav1d,
-    build_by_default: true,
-    dependencies : [thread_dependency],
-    )
-
-dav1d_fuzzer_mt = executable('dav1d_fuzzer_mt',
-    dav1d_fuzzer_sources,
-    include_directories: dav1d_inc_dirs,
-    c_args: [stackalign_flag, stackrealign_flag, '-DDAV1D_MT_FUZZING'],
-    link_args: fuzzer_ldflags,
-    link_with : libdav1d,
-    build_by_default: true,
-    dependencies : [thread_dependency],
-    )
-
-objcopy = find_program('objcopy',
-                       required: false)
-if (objcopy.found() and
-    not get_option('b_lto') and
-    get_option('default_library') == 'static' and
-    cc.has_function('posix_memalign', prefix : '#include <stdlib.h>', args : test_args))
-
-    libdav1d_af = custom_target('libdav1d_af',
-        input: libdav1d,
-        output: 'libdav1d_af.a',
-        depends: libdav1d,
-        command: [objcopy,
-                  '--redefine-sym', 'malloc=__wrap_malloc',
-                  '--redefine-sym', 'posix_memalign=__wrap_posix_memalign',
-                  '--redefine-sym', 'pthread_create=__wrap_pthread_create',
-                  '--redefine-sym', 'pthread_cond_init=__wrap_pthread_cond_init',
-                  '--redefine-sym', 'pthread_mutex_init=__wrap_pthread_mutex_init',
-                  '@INPUT@', '@OUTPUT@'])
-
-    dav1d_fuzzer_mem = executable('dav1d_fuzzer_mem',
-        dav1d_fuzzer_sources + ['libfuzzer/alloc_fail.c'],
-        include_directories: dav1d_inc_dirs,
-        c_args: [stackalign_flag, stackrealign_flag, '-DDAV1D_ALLOC_FAIL'],
-        link_args: fuzzer_ldflags + [join_paths(libdav1d_af.full_path())],
-        link_depends: libdav1d_af,
-        build_by_default: false,
-        dependencies : [thread_dependency],
-        )
+# fuzzing binaries
+if meson.version().version_compare('>=0.49')
+    subdir('libfuzzer')
 endif

 # Include dav1d test data repository with additional tests
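The objcopy --redefine-sym block, carried over into the new libfuzzer/meson.build, renames allocation symbols inside a static libdav1d so the harness can interpose them; alloc_fail.c provides the real wrappers. An illustrative sketch of what such a wrapper can do (the counters and fail policy here are made up, not dav1d's):

    #include <stdlib.h>

    /* After objcopy, every malloc call inside libdav1d_af.a resolves to
     * __wrap_malloc; the wrapper itself still sees the real malloc, so it
     * can pass calls through until the chosen one, then inject an OOM. */
    static unsigned long alloc_seen, alloc_fail_after;

    void *__wrap_malloc(size_t sz) {
        if (alloc_fail_after && ++alloc_seen >= alloc_fail_after)
            return NULL;    /* simulated allocation failure */
        return malloc(sz);
    }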
@@ -113,18 +113,24 @@ static void synchronize(const int realtime, const unsigned cache,
 static void print_stats(const int istty, const unsigned n, const unsigned num,
                         const uint64_t elapsed, const double i_fps)
 {
-    if (istty) fputs("\r", stderr);
-    const double d_fps = 1e9 * n / elapsed;
-    const double speed = d_fps / i_fps;
-    if (num == 0xFFFFFFFF) {
-        fprintf(stderr, "Decoded %u frames", n);
-    } else {
-        fprintf(stderr, "Decoded %u/%u frames (%.1lf%%)", n, num,
-                100.0 * n / num);
-    }
-    if (i_fps)
-        fprintf(stderr, " - %.2lf/%.2lf fps (%.2lfx)", d_fps, i_fps, speed);
-    if (!istty) fputs("\n", stderr);
+    char buf[80], *b = buf, *const end = buf + 80;
+
+    if (istty)
+        *b++ = '\r';
+    if (num == 0xFFFFFFFF)
+        b += snprintf(b, end - b, "Decoded %u frames", n);
+    else
+        b += snprintf(b, end - b, "Decoded %u/%u frames (%.1lf%%)",
+                      n, num, 100.0 * n / num);
+    if (i_fps && b < end) {
+        const double d_fps = 1e9 * n / elapsed;
+        const double speed = d_fps / i_fps;
+        b += snprintf(b, end - b, " - %.2lf/%.2lf fps (%.2lfx)",
+                      d_fps, i_fps, speed);
+    }
+    if (!istty)
+        strcpy(b > end - 2 ? end - 2 : b, "\n");
+    fputs(buf, stderr);
 }

 int main(const int argc, char *const *const argv) {
@@ -149,8 +155,6 @@ int main(const int argc, char *const *const argv) {
         return EXIT_FAILURE;
     }

-    init_demuxers();
-    init_muxers();
     parse(argc, argv, &cli_settings, &lib_settings);

     if ((res = input_open(&in, cli_settings.demuxer,
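The print_stats rewrite formats the whole status line into one buffer and emits it with a single fputs, so a '\r'-refreshed progress line can't be torn by interleaved stderr writes, and it now also computes the fps figures only when i_fps is nonzero (avoiding a division by zero for speed). A trimmed, standalone sketch of the same pattern (not the real function):

    #include <stdio.h>

    static void status_line(const unsigned n, const unsigned num) {
        char buf[80], *b = buf, *const end = buf + sizeof(buf);
        *b++ = '\r';  /* rewind to column 0 on a terminal */
        b += snprintf(b, end - b, "Decoded %u/%u frames (%.1f%%)",
                      n, num, 100.0 * n / num);
        if (b < end)          /* snprintf reports the would-be length */
            fputs(buf, stderr);  /* one write: no torn output */
    }

    int main(void) {
        status_line(42, 100);
        fputs("\n", stderr);
        return 0;
    }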
@@ -86,7 +86,7 @@ static const struct option long_opts[] = {
 #define ALLOWED_CPU_MASKS " or 'neon'"
 #elif ARCH_X86
 #define ALLOWED_CPU_MASKS \
-    ", 'sse2', 'ssse3', 'sse41', 'avx2' or 'avx512'"
+    ", 'sse2', 'ssse3', 'sse41', 'avx2', 'avx512' or 'avx512icl'"
 #else
 #define ALLOWED_CPU_MASKS "not yet implemented for this architecture"
 #endif
@@ -176,15 +176,11 @@ typedef struct EnumParseTable {

 #if ARCH_X86
 enum CpuMask {
-    X86_CPU_MASK_SSE    = DAV1D_X86_CPU_FLAG_SSE,
-    X86_CPU_MASK_SSE2   = DAV1D_X86_CPU_FLAG_SSE2   | X86_CPU_MASK_SSE,
-    X86_CPU_MASK_SSE3   = DAV1D_X86_CPU_FLAG_SSE3   | X86_CPU_MASK_SSE2,
-    X86_CPU_MASK_SSSE3  = DAV1D_X86_CPU_FLAG_SSSE3  | X86_CPU_MASK_SSE3,
-    X86_CPU_MASK_SSE41  = DAV1D_X86_CPU_FLAG_SSE41  | X86_CPU_MASK_SSSE3,
-    X86_CPU_MASK_SSE42  = DAV1D_X86_CPU_FLAG_SSE42  | X86_CPU_MASK_SSE41,
-    X86_CPU_MASK_AVX    = DAV1D_X86_CPU_FLAG_AVX    | X86_CPU_MASK_SSE42,
-    X86_CPU_MASK_AVX2   = DAV1D_X86_CPU_FLAG_AVX2   | X86_CPU_MASK_AVX,
-    X86_CPU_MASK_AVX512 = DAV1D_X86_CPU_FLAG_AVX512 | X86_CPU_MASK_AVX2,
+    X86_CPU_MASK_SSE2      = DAV1D_X86_CPU_FLAG_SSE2,
+    X86_CPU_MASK_SSSE3     = DAV1D_X86_CPU_FLAG_SSSE3     | X86_CPU_MASK_SSE2,
+    X86_CPU_MASK_SSE41     = DAV1D_X86_CPU_FLAG_SSE41     | X86_CPU_MASK_SSSE3,
+    X86_CPU_MASK_AVX2      = DAV1D_X86_CPU_FLAG_AVX2      | X86_CPU_MASK_SSE41,
+    X86_CPU_MASK_AVX512ICL = DAV1D_X86_CPU_FLAG_AVX512ICL | X86_CPU_MASK_AVX2,
 };
 #endif

@@ -192,11 +188,11 @@ static const EnumParseTable cpu_mask_tbl[] = {
 #if ARCH_AARCH64 || ARCH_ARM
     { "neon", DAV1D_ARM_CPU_FLAG_NEON },
 #elif ARCH_X86
-    { "sse2",   X86_CPU_MASK_SSE2   },
-    { "ssse3",  X86_CPU_MASK_SSSE3  },
-    { "sse41",  X86_CPU_MASK_SSE41  },
-    { "avx2",   X86_CPU_MASK_AVX2   },
-    { "avx512", X86_CPU_MASK_AVX512 },
+    { "sse2",      X86_CPU_MASK_SSE2      },
+    { "ssse3",     X86_CPU_MASK_SSSE3     },
+    { "sse41",     X86_CPU_MASK_SSE41     },
+    { "avx2",      X86_CPU_MASK_AVX2      },
+    { "avx512icl", X86_CPU_MASK_AVX512ICL },
 #endif
     { 0 },
 };
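The reworked CpuMask enum drops tiers dav1d has no dedicated assembly for (SSE, SSE3, SSE42, AVX, plain AVX512) while keeping each remaining mask cumulative, so limiting dav1d to one tier still permits every tier below it. Illustrative flag values (not dav1d's actual constants) showing the arithmetic:

    #include <stdio.h>

    /* Each tier ORs its own flag with the previous mask, so e.g. an "avx2"
     * limit still enables the SSE kernels the AVX2 code assumes. */
    enum {
        FLAG_SSE2  = 1 << 0,
        FLAG_SSSE3 = 1 << 1,
        FLAG_SSE41 = 1 << 2,
        FLAG_AVX2  = 1 << 3,
    };
    enum {
        MASK_SSE2  = FLAG_SSE2,
        MASK_SSSE3 = FLAG_SSSE3 | MASK_SSE2,
        MASK_SSE41 = FLAG_SSE41 | MASK_SSSE3,
        MASK_AVX2  = FLAG_AVX2  | MASK_SSE41,
    };

    int main(void) {
        printf("avx2 mask = 0x%x\n", MASK_AVX2); /* 0xf: all four tiers */
        return 0;
    }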
@@ -43,21 +43,15 @@ struct DemuxerContext {
     const Demuxer *impl;
 };

-#define MAX_NUM_DEMUXERS 3
-static const Demuxer *demuxers[MAX_NUM_DEMUXERS];
-static int num_demuxers = 0;
-
-#define register_demuxer(impl) { \
-    extern const Demuxer impl; \
-    assert(num_demuxers < MAX_NUM_DEMUXERS); \
-    demuxers[num_demuxers++] = &impl; \
-}
-
-void init_demuxers(void) {
-    register_demuxer(ivf_demuxer);
-    register_demuxer(annexb_demuxer);
-    register_demuxer(section5_demuxer);
-}
+extern const Demuxer ivf_demuxer;
+extern const Demuxer annexb_demuxer;
+extern const Demuxer section5_demuxer;
+static const Demuxer *const demuxers[] = {
+    &ivf_demuxer,
+    &annexb_demuxer,
+    &section5_demuxer,
+    NULL
+};

 int input_open(DemuxerContext **const c_out,
                const char *const name, const char *const filename,
@@ -68,19 +62,19 @@ int input_open(DemuxerContext **const c_out,
     int res, i;

     if (name) {
-        for (i = 0; i < num_demuxers; i++) {
+        for (i = 0; demuxers[i]; i++) {
             if (!strcmp(demuxers[i]->name, name)) {
                 impl = demuxers[i];
                 break;
             }
         }
-        if (i == num_demuxers) {
+        if (!demuxers[i]) {
             fprintf(stderr, "Failed to find demuxer named \"%s\"\n", name);
             return DAV1D_ERR(ENOPROTOOPT);
         }
     } else {
         int probe_sz = 0;
-        for (i = 0; i < num_demuxers; i++)
+        for (i = 0; demuxers[i]; i++)
             probe_sz = imax(probe_sz, demuxers[i]->probe_sz);
         uint8_t *const probe_data = malloc(probe_sz);
         if (!probe_data) {
@@ -96,14 +90,14 @@ int input_open(DemuxerContext **const c_out,
             return errno ? DAV1D_ERR(errno) : DAV1D_ERR(EIO);
         }

-        for (i = 0; i < num_demuxers; i++) {
+        for (i = 0; demuxers[i]; i++) {
             if (demuxers[i]->probe(probe_data)) {
                 impl = demuxers[i];
                 break;
             }
         }
         free(probe_data);
-        if (i == num_demuxers) {
+        if (!demuxers[i]) {
             fprintf(stderr,
                     "Failed to probe demuxer for file %s\n",
                     filename);
@@ -111,11 +105,10 @@ int input_open(DemuxerContext **const c_out,
         }
     }

-    if (!(c = malloc(sizeof(DemuxerContext) + impl->priv_data_size))) {
+    if (!(c = calloc(1, sizeof(DemuxerContext) + impl->priv_data_size))) {
         fprintf(stderr, "Failed to allocate memory\n");
         return DAV1D_ERR(ENOMEM);
     }
-    memset(c, 0, sizeof(DemuxerContext) + impl->priv_data_size);
     c->impl = impl;
     c->data = (DemuxerPriv *) &c[1];
     if ((res = impl->open(c->data, filename, fps, num_frames, timebase)) < 0) {
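This demuxer rework (and the matching init_demuxers()/init_muxers() removal in dav1d.c above) replaces runtime registration with a const, NULL-terminated table: no init call, no MAX_NUM_DEMUXERS bound, and the loops simply walk until the sentinel. A reduced sketch of the pattern (types and entries here are illustrative):

    #include <stdio.h>
    #include <string.h>

    typedef struct { const char *name; } DemuxerSketch;

    static const DemuxerSketch ivf = { "ivf" }, annexb = { "annexb" };
    /* NULL sentinel terminates the table, so no separate count is needed. */
    static const DemuxerSketch *const demuxers_sketch[] = { &ivf, &annexb, NULL };

    static const DemuxerSketch *find_demuxer(const char *const name) {
        for (int i = 0; demuxers_sketch[i]; i++)
            if (!strcmp(demuxers_sketch[i]->name, name))
                return demuxers_sketch[i];
        return NULL; /* reached the sentinel: no match */
    }

    int main(void) {
        printf("%s\n", find_demuxer("ivf") ? "found" : "missing");
        return 0;
    }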