Mirror of https://github.com/mozilla/gecko-dev.git
Bug 1916282 - Update libdav1d to 79db1624878fa0f37841ddc2caf86f06738ae275 r=media-playback-reviewers,padenot
This patch updates the libdav1d source by running `./mach vendor media/libdav1d/moz.yaml`
Differential Revision: https://phabricator.services.mozilla.com/D221340
This commit is contained in:
Parent ac8a2e1e1b
Commit 1498592440
@ -20,11 +20,11 @@ origin:
|
|||
|
||||
# Human-readable identifier for this version/release
|
||||
# Generally "version NNN", "tag SSS", "bookmark SSS"
|
||||
release: 5ef6b241f05a2b9058b58136da4b25842aefba96 (2024-08-04T17:55:20.000-04:00).
|
||||
release: 79db1624878fa0f37841ddc2caf86f06738ae275 (2024-09-06T09:04:24.000+00:00).
|
||||
|
||||
# Revision to pull in
|
||||
# Must be a long or short commit SHA (long preferred)
|
||||
revision: 5ef6b241f05a2b9058b58136da4b25842aefba96
|
||||
revision: 79db1624878fa0f37841ddc2caf86f06738ae275
|
||||
|
||||
# The package's license, where possible using the mnemonic from
|
||||
# https://spdx.org/licenses/
|
||||
|
|
|
@ -1,2 +1,2 @@
|
|||
/* auto-generated, do not edit */
|
||||
#define DAV1D_VERSION "5ef6b241f05a2b9058b58136da4b25842aefba96"
|
||||
#define DAV1D_VERSION "79db1624878fa0f37841ddc2caf86f06738ae275"
|
||||
|
|
|
@ -189,9 +189,13 @@ static inline int clzll(const unsigned long long mask) {
|
|||
#ifndef static_assert
|
||||
#define CHECK_OFFSET(type, field, name) \
|
||||
struct check_##type##_##field { int x[(name == offsetof(type, field)) ? 1 : -1]; }
|
||||
#define CHECK_SIZE(type, size) \
|
||||
struct check_##type##_size { int x[(size == sizeof(type)) ? 1 : -1]; }
|
||||
#else
|
||||
#define CHECK_OFFSET(type, field, name) \
|
||||
static_assert(name == offsetof(type, field), #field)
|
||||
#define CHECK_SIZE(type, size) \
|
||||
static_assert(size == sizeof(type), #type)
|
||||
#endif
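As a readability aid only, here is a minimal, self-contained C sketch of how such layout checks are typically used; the Demo struct and its expected offsets are hypothetical and assume a typical LP64 ABI, and are not taken from dav1d. With C11 static_assert the check is a real assertion; without it, the negative array size in the fallback above turns a wrong assumption into a compile error.

#include <assert.h>   /* may provide static_assert (C11) */
#include <stddef.h>   /* offsetof */
#include <stdint.h>

#ifndef static_assert
#define CHECK_OFFSET(type, field, name) \
    struct check_##type##_##field { int x[(name == offsetof(type, field)) ? 1 : -1]; }
#define CHECK_SIZE(type, size) \
    struct check_##type##_size { int x[(size == sizeof(type)) ? 1 : -1]; }
#else
#define CHECK_OFFSET(type, field, name) \
    static_assert(name == offsetof(type, field), #field)
#define CHECK_SIZE(type, size) \
    static_assert(size == sizeof(type), #type)
#endif

/* Hypothetical struct used only for illustration. */
typedef struct Demo {
    int32_t a;   /* expected at offset 0 */
    int64_t b;   /* expected at offset 8, after padding */
} Demo;

CHECK_OFFSET(Demo, a, 0);
CHECK_OFFSET(Demo, b, 8);
CHECK_SIZE(Demo, 16);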
|
||||
|
||||
#ifdef _MSC_VER
|
||||
|
|
|
@ -13,7 +13,9 @@
|
|||
#define __GETOPT_H__
|
||||
|
||||
/* All the headers include this file. */
|
||||
#ifdef _WIN32
|
||||
#include <crtdefs.h>
|
||||
#endif
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
|
|
|
@ -31,10 +31,10 @@
|
|||
#include <errno.h>
|
||||
#include <stdarg.h>
|
||||
|
||||
#include "common.h"
|
||||
#include "picture.h"
|
||||
#include "data.h"
|
||||
#include "version.h"
|
||||
#include "dav1d/common.h"
|
||||
#include "dav1d/picture.h"
|
||||
#include "dav1d/data.h"
|
||||
#include "dav1d/version.h"
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
|
|
|
@ -157,6 +157,12 @@ else
|
|||
if cc.has_function('posix_memalign', prefix : '#include <stdlib.h>', args : test_args)
|
||||
cdata.set('HAVE_POSIX_MEMALIGN', 1)
|
||||
endif
|
||||
if cc.has_function('memalign', prefix : '#include <malloc.h>', args : test_args)
|
||||
cdata.set('HAVE_MEMALIGN', 1)
|
||||
endif
|
||||
if cc.has_function('aligned_alloc', prefix : '#include <stdlib.h>', args : test_args)
|
||||
cdata.set('HAVE_ALIGNED_ALLOC', 1)
|
||||
endif
|
||||
endif
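The checks above only record HAVE_* defines; as a hedged illustration (not dav1d's actual allocator), the detected function is typically consumed through a fallback chain along these lines, assuming align is a power of two:

#include <stdlib.h>
#ifdef HAVE_MEMALIGN
#include <malloc.h>
#endif

/* Illustrative fallback chain over the allocators probed above. */
static void *alloc_aligned_sketch(const size_t sz, const size_t align) {
#if defined(HAVE_POSIX_MEMALIGN)
    void *ptr;
    if (posix_memalign(&ptr, align, sz)) return NULL;
    return ptr;
#elif defined(HAVE_ALIGNED_ALLOC)
    /* C11 aligned_alloc wants a size that is a multiple of align. */
    return aligned_alloc(align, (sz + align - 1) & ~(align - 1));
#elif defined(HAVE_MEMALIGN)
    return memalign(align, sz);
#else
    return malloc(sz);  /* no extra alignment guarantee */
#endif
}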
|
||||
|
||||
# check for fseeko on android. It is not always available if _FILE_OFFSET_BITS is defined to 64
|
||||
|
@ -209,6 +215,10 @@ if host_machine.cpu_family().startswith('wasm')
|
|||
stdatomic_dependencies += thread_dependency.partial_dependency(compile_args: true)
|
||||
endif
|
||||
|
||||
if cc.check_header('sys/types.h')
|
||||
cdata.set('HAVE_SYS_TYPES_H', 1)
|
||||
endif
|
||||
|
||||
if cc.check_header('unistd.h')
|
||||
cdata.set('HAVE_UNISTD_H', 1)
|
||||
endif
|
||||
|
@ -259,6 +269,12 @@ endif
|
|||
if cc.has_function('pthread_setaffinity_np', prefix : pthread_np_prefix, args : test_args, dependencies : thread_dependency)
|
||||
cdata.set('HAVE_PTHREAD_SETAFFINITY_NP', 1)
|
||||
endif
|
||||
if cc.has_function('pthread_setname_np', prefix : pthread_np_prefix, args : test_args, dependencies : thread_dependency)
|
||||
cdata.set('HAVE_PTHREAD_SETNAME_NP', 1)
|
||||
endif
|
||||
if cc.has_function('pthread_set_name_np', prefix : pthread_np_prefix, args : test_args, dependencies : thread_dependency)
|
||||
cdata.set('HAVE_PTHREAD_SET_NAME_NP', 1)
|
||||
endif
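For context, a hedged sketch of how a HAVE_PTHREAD_SETNAME_NP result is commonly consumed; this is not dav1d's actual worker-thread code, and the non-Linux variants differ (macOS takes only a name, the BSDs use pthread_set_name_np() from <pthread_np.h>, which is what the pthread_np_prefix probe accounts for):

#define _GNU_SOURCE          /* pthread_setname_np() declaration on glibc */
#include <pthread.h>

static void name_this_thread(const char *const name) {
#if defined(HAVE_PTHREAD_SETNAME_NP) && defined(__linux__)
    /* glibc/bionic: int pthread_setname_np(pthread_t, const char *) */
    pthread_setname_np(pthread_self(), name);
#else
    (void)name;   /* other platforms omitted in this sketch */
#endif
}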
|
||||
|
||||
if cc.compiles('int x = _Generic(0, default: 0);', name: '_Generic', args: test_args)
|
||||
cdata.set('HAVE_C11_GENERIC', 1)
|
||||
|
|
|
@ -884,12 +884,12 @@ function generate_grain_\type\()_8bpc_neon, export=1
|
|||
.else
|
||||
add x4, x1, #FGD_AR_COEFFS_UV
|
||||
.endif
|
||||
adr x16, L(gen_grain_\type\()_tbl)
|
||||
movrel x16, gen_grain_\type\()_tbl
|
||||
ldr w17, [x1, #FGD_AR_COEFF_LAG]
|
||||
add w9, w9, #4
|
||||
ldrh w17, [x16, w17, uxtw #1]
|
||||
ldrsw x17, [x16, w17, uxtw #2]
|
||||
dup v31.8h, w9 // 4 + data->grain_scale_shift
|
||||
sub x16, x16, w17, uxtw
|
||||
add x16, x16, x17
|
||||
neg v31.8h, v31.8h
|
||||
|
||||
.ifc \type, uv_444
|
||||
|
@ -1075,13 +1075,14 @@ L(generate_grain_\type\()_lag3):
|
|||
ldp x30, x19, [sp], #96
|
||||
AARCH64_VALIDATE_LINK_REGISTER
|
||||
ret
|
||||
|
||||
L(gen_grain_\type\()_tbl):
|
||||
.hword L(gen_grain_\type\()_tbl) - L(generate_grain_\type\()_lag0)
|
||||
.hword L(gen_grain_\type\()_tbl) - L(generate_grain_\type\()_lag1)
|
||||
.hword L(gen_grain_\type\()_tbl) - L(generate_grain_\type\()_lag2)
|
||||
.hword L(gen_grain_\type\()_tbl) - L(generate_grain_\type\()_lag3)
|
||||
endfunc
|
||||
|
||||
jumptable gen_grain_\type\()_tbl
|
||||
.word L(generate_grain_\type\()_lag0) - gen_grain_\type\()_tbl
|
||||
.word L(generate_grain_\type\()_lag1) - gen_grain_\type\()_tbl
|
||||
.word L(generate_grain_\type\()_lag2) - gen_grain_\type\()_tbl
|
||||
.word L(generate_grain_\type\()_lag3) - gen_grain_\type\()_tbl
|
||||
endjumptable
|
||||
.endm
|
||||
|
||||
gen_grain_82 y
|
||||
|
@ -1118,12 +1119,12 @@ function generate_grain_\type\()_8bpc_neon, export=1
|
|||
ldr w2, [x1, #FGD_SEED]
|
||||
ldr w9, [x1, #FGD_GRAIN_SCALE_SHIFT]
|
||||
add x4, x1, #FGD_AR_COEFFS_UV
|
||||
adr x16, L(gen_grain_\type\()_tbl)
|
||||
movrel x16, gen_grain_\type\()_tbl
|
||||
ldr w17, [x1, #FGD_AR_COEFF_LAG]
|
||||
add w9, w9, #4
|
||||
ldrh w17, [x16, w17, uxtw #1]
|
||||
ldrsw x17, [x16, w17, uxtw #2]
|
||||
dup v31.8h, w9 // 4 + data->grain_scale_shift
|
||||
sub x16, x16, w17, uxtw
|
||||
add x16, x16, x17
|
||||
neg v31.8h, v31.8h
|
||||
|
||||
cmp w13, #0
|
||||
|
@ -1272,13 +1273,14 @@ L(generate_grain_\type\()_lag3):
|
|||
ldp x30, x19, [sp], #96
|
||||
AARCH64_VALIDATE_LINK_REGISTER
|
||||
ret
|
||||
|
||||
L(gen_grain_\type\()_tbl):
|
||||
.hword L(gen_grain_\type\()_tbl) - L(generate_grain_\type\()_lag0)
|
||||
.hword L(gen_grain_\type\()_tbl) - L(generate_grain_\type\()_lag1)
|
||||
.hword L(gen_grain_\type\()_tbl) - L(generate_grain_\type\()_lag2)
|
||||
.hword L(gen_grain_\type\()_tbl) - L(generate_grain_\type\()_lag3)
|
||||
endfunc
|
||||
|
||||
jumptable gen_grain_\type\()_tbl
|
||||
.word L(generate_grain_\type\()_lag0) - gen_grain_\type\()_tbl
|
||||
.word L(generate_grain_\type\()_lag1) - gen_grain_\type\()_tbl
|
||||
.word L(generate_grain_\type\()_lag2) - gen_grain_\type\()_tbl
|
||||
.word L(generate_grain_\type\()_lag3) - gen_grain_\type\()_tbl
|
||||
endjumptable
|
||||
.endm
|
||||
|
||||
gen_grain_44 uv_420
|
||||
|
@ -1407,18 +1409,18 @@ function fgy_32x32_8bpc_neon, export=1
|
|||
add_offset x5, w6, x10, x5, x9
|
||||
|
||||
ldr w11, [sp, #24] // type
|
||||
adr x13, L(fgy_loop_tbl)
|
||||
movrel x13, fgy_loop_tbl
|
||||
|
||||
add x4, x12, #32 // grain_lut += FG_BLOCK_SIZE * bx
|
||||
add x6, x14, x9, lsl #5 // grain_lut += grain_stride * FG_BLOCK_SIZE * by
|
||||
|
||||
tst w11, #1
|
||||
ldrh w11, [x13, w11, uxtw #1]
|
||||
ldrsw x11, [x13, w11, uxtw #2]
|
||||
|
||||
add x8, x16, x9, lsl #5 // grain_lut += grain_stride * FG_BLOCK_SIZE * by
|
||||
add x8, x8, #32 // grain_lut += FG_BLOCK_SIZE * bx
|
||||
|
||||
sub x11, x13, w11, uxtw
|
||||
add x11, x13, x11
|
||||
|
||||
b.eq 1f
|
||||
// y overlap
|
||||
|
@ -1555,14 +1557,15 @@ L(loop_\ox\oy):
|
|||
fgy 0, 1
|
||||
fgy 1, 0
|
||||
fgy 1, 1
|
||||
|
||||
L(fgy_loop_tbl):
|
||||
.hword L(fgy_loop_tbl) - L(loop_00)
|
||||
.hword L(fgy_loop_tbl) - L(loop_01)
|
||||
.hword L(fgy_loop_tbl) - L(loop_10)
|
||||
.hword L(fgy_loop_tbl) - L(loop_11)
|
||||
endfunc
|
||||
|
||||
jumptable fgy_loop_tbl
|
||||
.word L(loop_00) - fgy_loop_tbl
|
||||
.word L(loop_01) - fgy_loop_tbl
|
||||
.word L(loop_10) - fgy_loop_tbl
|
||||
.word L(loop_11) - fgy_loop_tbl
|
||||
endjumptable
|
||||
|
||||
// void dav1d_fguv_32x32_420_8bpc_neon(pixel *const dst,
|
||||
// const pixel *const src,
|
||||
// const ptrdiff_t stride,
|
||||
|
@ -1646,11 +1649,11 @@ function fguv_32x32_\layout\()_8bpc_neon, export=1
|
|||
ldr w13, [sp, #64] // type
|
||||
|
||||
movrel x16, overlap_coeffs_\sx
|
||||
adr x14, L(fguv_loop_sx\sx\()_tbl)
|
||||
movrel x14, fguv_loop_sx\sx\()_tbl
|
||||
|
||||
ld1 {v27.8b, v28.8b}, [x16] // overlap_coeffs
|
||||
tst w13, #1
|
||||
ldrh w13, [x14, w13, uxtw #1]
|
||||
ldrsw x13, [x14, w13, uxtw #2]
|
||||
|
||||
b.eq 1f
|
||||
// y overlap
|
||||
|
@ -1658,7 +1661,7 @@ function fguv_32x32_\layout\()_8bpc_neon, export=1
|
|||
mov w9, #(2 >> \sy)
|
||||
|
||||
1:
|
||||
sub x13, x14, w13, uxtw
|
||||
add x13, x14, x13
|
||||
|
||||
.if \sy
|
||||
movi v25.16b, #23
|
||||
|
@ -1848,18 +1851,19 @@ L(fguv_loop_sx0_csfl\csfl\()_\ox\oy):
|
|||
ldr x30, [sp], #32
|
||||
AARCH64_VALIDATE_LINK_REGISTER
|
||||
ret
|
||||
|
||||
L(fguv_loop_sx0_tbl):
|
||||
.hword L(fguv_loop_sx0_tbl) - L(fguv_loop_sx0_csfl0_00)
|
||||
.hword L(fguv_loop_sx0_tbl) - L(fguv_loop_sx0_csfl0_01)
|
||||
.hword L(fguv_loop_sx0_tbl) - L(fguv_loop_sx0_csfl0_10)
|
||||
.hword L(fguv_loop_sx0_tbl) - L(fguv_loop_sx0_csfl0_11)
|
||||
.hword L(fguv_loop_sx0_tbl) - L(fguv_loop_sx0_csfl1_00)
|
||||
.hword L(fguv_loop_sx0_tbl) - L(fguv_loop_sx0_csfl1_01)
|
||||
.hword L(fguv_loop_sx0_tbl) - L(fguv_loop_sx0_csfl1_10)
|
||||
.hword L(fguv_loop_sx0_tbl) - L(fguv_loop_sx0_csfl1_11)
|
||||
endfunc
|
||||
|
||||
jumptable fguv_loop_sx0_tbl
|
||||
.word L(fguv_loop_sx0_csfl0_00) - fguv_loop_sx0_tbl
|
||||
.word L(fguv_loop_sx0_csfl0_01) - fguv_loop_sx0_tbl
|
||||
.word L(fguv_loop_sx0_csfl0_10) - fguv_loop_sx0_tbl
|
||||
.word L(fguv_loop_sx0_csfl0_11) - fguv_loop_sx0_tbl
|
||||
.word L(fguv_loop_sx0_csfl1_00) - fguv_loop_sx0_tbl
|
||||
.word L(fguv_loop_sx0_csfl1_01) - fguv_loop_sx0_tbl
|
||||
.word L(fguv_loop_sx0_csfl1_10) - fguv_loop_sx0_tbl
|
||||
.word L(fguv_loop_sx0_csfl1_11) - fguv_loop_sx0_tbl
|
||||
endjumptable
|
||||
|
||||
function fguv_loop_sx1_neon
|
||||
.macro fguv_loop_sx1 csfl, ox, oy
|
||||
L(fguv_loop_sx1_csfl\csfl\()_\ox\oy):
|
||||
|
@ -1997,14 +2001,15 @@ L(fguv_loop_sx1_csfl\csfl\()_\ox\oy):
|
|||
ldr x30, [sp], #32
|
||||
AARCH64_VALIDATE_LINK_REGISTER
|
||||
ret
|
||||
|
||||
L(fguv_loop_sx1_tbl):
|
||||
.hword L(fguv_loop_sx1_tbl) - L(fguv_loop_sx1_csfl0_00)
|
||||
.hword L(fguv_loop_sx1_tbl) - L(fguv_loop_sx1_csfl0_01)
|
||||
.hword L(fguv_loop_sx1_tbl) - L(fguv_loop_sx1_csfl0_10)
|
||||
.hword L(fguv_loop_sx1_tbl) - L(fguv_loop_sx1_csfl0_11)
|
||||
.hword L(fguv_loop_sx1_tbl) - L(fguv_loop_sx1_csfl1_00)
|
||||
.hword L(fguv_loop_sx1_tbl) - L(fguv_loop_sx1_csfl1_01)
|
||||
.hword L(fguv_loop_sx1_tbl) - L(fguv_loop_sx1_csfl1_10)
|
||||
.hword L(fguv_loop_sx1_tbl) - L(fguv_loop_sx1_csfl1_11)
|
||||
endfunc
|
||||
|
||||
jumptable fguv_loop_sx1_tbl
|
||||
.word L(fguv_loop_sx1_csfl0_00) - fguv_loop_sx1_tbl
|
||||
.word L(fguv_loop_sx1_csfl0_01) - fguv_loop_sx1_tbl
|
||||
.word L(fguv_loop_sx1_csfl0_10) - fguv_loop_sx1_tbl
|
||||
.word L(fguv_loop_sx1_csfl0_11) - fguv_loop_sx1_tbl
|
||||
.word L(fguv_loop_sx1_csfl1_00) - fguv_loop_sx1_tbl
|
||||
.word L(fguv_loop_sx1_csfl1_01) - fguv_loop_sx1_tbl
|
||||
.word L(fguv_loop_sx1_csfl1_10) - fguv_loop_sx1_tbl
|
||||
.word L(fguv_loop_sx1_csfl1_11) - fguv_loop_sx1_tbl
|
||||
endjumptable
|
||||
|
|
|
@ -740,12 +740,12 @@ function generate_grain_\type\()_16bpc_neon, export=1
|
|||
add x4, x1, #FGD_AR_COEFFS_UV
|
||||
.endif
|
||||
add w9, w9, w15 // grain_scale_shift - bitdepth_min_8
|
||||
adr x16, L(gen_grain_\type\()_tbl)
|
||||
movrel x16, gen_grain_\type\()_tbl
|
||||
ldr w17, [x1, #FGD_AR_COEFF_LAG]
|
||||
add w9, w9, #4
|
||||
ldrh w17, [x16, w17, uxtw #1]
|
||||
ldrsw x17, [x16, w17, uxtw #2]
|
||||
dup v31.8h, w9 // 4 - bitdepth_min_8 + data->grain_scale_shift
|
||||
sub x16, x16, w17, uxtw
|
||||
add x16, x16, x17
|
||||
neg v31.8h, v31.8h
|
||||
|
||||
.ifc \type, uv_444
|
||||
|
@ -945,13 +945,14 @@ L(generate_grain_\type\()_lag3):
|
|||
ldp x30, x19, [sp], #96
|
||||
AARCH64_VALIDATE_LINK_REGISTER
|
||||
ret
|
||||
|
||||
L(gen_grain_\type\()_tbl):
|
||||
.hword L(gen_grain_\type\()_tbl) - L(generate_grain_\type\()_lag0)
|
||||
.hword L(gen_grain_\type\()_tbl) - L(generate_grain_\type\()_lag1)
|
||||
.hword L(gen_grain_\type\()_tbl) - L(generate_grain_\type\()_lag2)
|
||||
.hword L(gen_grain_\type\()_tbl) - L(generate_grain_\type\()_lag3)
|
||||
endfunc
|
||||
|
||||
jumptable gen_grain_\type\()_tbl
|
||||
.word L(generate_grain_\type\()_lag0) - gen_grain_\type\()_tbl
|
||||
.word L(generate_grain_\type\()_lag1) - gen_grain_\type\()_tbl
|
||||
.word L(generate_grain_\type\()_lag2) - gen_grain_\type\()_tbl
|
||||
.word L(generate_grain_\type\()_lag3) - gen_grain_\type\()_tbl
|
||||
endjumptable
|
||||
.endm
|
||||
|
||||
gen_grain_82 y
|
||||
|
@ -991,12 +992,12 @@ function generate_grain_\type\()_16bpc_neon, export=1
|
|||
ldr w9, [x1, #FGD_GRAIN_SCALE_SHIFT]
|
||||
add x4, x1, #FGD_AR_COEFFS_UV
|
||||
add w9, w9, w15 // grain_scale_shift - bitdepth_min_8
|
||||
adr x16, L(gen_grain_\type\()_tbl)
|
||||
movrel x16, gen_grain_\type\()_tbl
|
||||
ldr w17, [x1, #FGD_AR_COEFF_LAG]
|
||||
add w9, w9, #4
|
||||
ldrh w17, [x16, w17, uxtw #1]
|
||||
ldrsw x17, [x16, w17, uxtw #2]
|
||||
dup v31.8h, w9 // 4 - bitdepth_min_8 + data->grain_scale_shift
|
||||
sub x16, x16, w17, uxtw
|
||||
add x16, x16, x17
|
||||
neg v31.8h, v31.8h
|
||||
|
||||
cmp w13, #0
|
||||
|
@ -1155,13 +1156,14 @@ L(generate_grain_\type\()_lag3):
|
|||
ldp x30, x19, [sp], #96
|
||||
AARCH64_VALIDATE_LINK_REGISTER
|
||||
ret
|
||||
|
||||
L(gen_grain_\type\()_tbl):
|
||||
.hword L(gen_grain_\type\()_tbl) - L(generate_grain_\type\()_lag0)
|
||||
.hword L(gen_grain_\type\()_tbl) - L(generate_grain_\type\()_lag1)
|
||||
.hword L(gen_grain_\type\()_tbl) - L(generate_grain_\type\()_lag2)
|
||||
.hword L(gen_grain_\type\()_tbl) - L(generate_grain_\type\()_lag3)
|
||||
endfunc
|
||||
|
||||
jumptable gen_grain_\type\()_tbl
|
||||
.word L(generate_grain_\type\()_lag0) - gen_grain_\type\()_tbl
|
||||
.word L(generate_grain_\type\()_lag1) - gen_grain_\type\()_tbl
|
||||
.word L(generate_grain_\type\()_lag2) - gen_grain_\type\()_tbl
|
||||
.word L(generate_grain_\type\()_lag3) - gen_grain_\type\()_tbl
|
||||
endjumptable
|
||||
.endm
|
||||
|
||||
gen_grain_44 uv_420
|
||||
|
@ -1306,18 +1308,18 @@ function fgy_32x32_16bpc_neon, export=1
|
|||
add_offset x5, w6, x10, x5, x9
|
||||
|
||||
ldr w11, [sp, #88] // type
|
||||
adr x13, L(fgy_loop_tbl)
|
||||
movrel x13, fgy_loop_tbl
|
||||
|
||||
add x4, x12, #32*2 // grain_lut += FG_BLOCK_SIZE * bx
|
||||
add x6, x14, x9, lsl #5 // grain_lut += grain_stride * FG_BLOCK_SIZE * by
|
||||
|
||||
tst w11, #1
|
||||
ldrh w11, [x13, w11, uxtw #1]
|
||||
ldrsw x11, [x13, w11, uxtw #2]
|
||||
|
||||
add x8, x16, x9, lsl #5 // grain_lut += grain_stride * FG_BLOCK_SIZE * by
|
||||
add x8, x8, #32*2 // grain_lut += FG_BLOCK_SIZE * bx
|
||||
|
||||
sub x11, x13, w11, uxtw
|
||||
add x11, x13, x11
|
||||
|
||||
b.eq 1f
|
||||
// y overlap
|
||||
|
@ -1480,14 +1482,15 @@ L(loop_\ox\oy):
|
|||
fgy 0, 1
|
||||
fgy 1, 0
|
||||
fgy 1, 1
|
||||
|
||||
L(fgy_loop_tbl):
|
||||
.hword L(fgy_loop_tbl) - L(loop_00)
|
||||
.hword L(fgy_loop_tbl) - L(loop_01)
|
||||
.hword L(fgy_loop_tbl) - L(loop_10)
|
||||
.hword L(fgy_loop_tbl) - L(loop_11)
|
||||
endfunc
|
||||
|
||||
jumptable fgy_loop_tbl
|
||||
.word L(loop_00) - fgy_loop_tbl
|
||||
.word L(loop_01) - fgy_loop_tbl
|
||||
.word L(loop_10) - fgy_loop_tbl
|
||||
.word L(loop_11) - fgy_loop_tbl
|
||||
endjumptable
|
||||
|
||||
// void dav1d_fguv_32x32_420_16bpc_neon(pixel *const dst,
|
||||
// const pixel *const src,
|
||||
// const ptrdiff_t stride,
|
||||
|
@ -1589,11 +1592,11 @@ function fguv_32x32_\layout\()_16bpc_neon, export=1
|
|||
ldr w13, [sp, #112] // type
|
||||
|
||||
movrel x16, overlap_coeffs_\sx
|
||||
adr x14, L(fguv_loop_sx\sx\()_tbl)
|
||||
movrel x14, fguv_loop_sx\sx\()_tbl
|
||||
|
||||
ld1 {v27.4h, v28.4h}, [x16] // overlap_coeffs
|
||||
tst w13, #1
|
||||
ldrh w13, [x14, w13, uxtw #1]
|
||||
ldrsw x13, [x14, w13, uxtw #2]
|
||||
|
||||
b.eq 1f
|
||||
// y overlap
|
||||
|
@ -1601,7 +1604,7 @@ function fguv_32x32_\layout\()_16bpc_neon, export=1
|
|||
mov w9, #(2 >> \sy)
|
||||
|
||||
1:
|
||||
sub x13, x14, w13, uxtw
|
||||
add x13, x14, x13
|
||||
|
||||
.if \sy
|
||||
movi v25.8h, #23
|
||||
|
@ -1818,18 +1821,19 @@ L(fguv_loop_sx0_csfl\csfl\()_\ox\oy):
|
|||
ldr x30, [sp], #80
|
||||
AARCH64_VALIDATE_LINK_REGISTER
|
||||
ret
|
||||
|
||||
L(fguv_loop_sx0_tbl):
|
||||
.hword L(fguv_loop_sx0_tbl) - L(fguv_loop_sx0_csfl0_00)
|
||||
.hword L(fguv_loop_sx0_tbl) - L(fguv_loop_sx0_csfl0_01)
|
||||
.hword L(fguv_loop_sx0_tbl) - L(fguv_loop_sx0_csfl0_10)
|
||||
.hword L(fguv_loop_sx0_tbl) - L(fguv_loop_sx0_csfl0_11)
|
||||
.hword L(fguv_loop_sx0_tbl) - L(fguv_loop_sx0_csfl1_00)
|
||||
.hword L(fguv_loop_sx0_tbl) - L(fguv_loop_sx0_csfl1_01)
|
||||
.hword L(fguv_loop_sx0_tbl) - L(fguv_loop_sx0_csfl1_10)
|
||||
.hword L(fguv_loop_sx0_tbl) - L(fguv_loop_sx0_csfl1_11)
|
||||
endfunc
|
||||
|
||||
jumptable fguv_loop_sx0_tbl
|
||||
.word L(fguv_loop_sx0_csfl0_00) - fguv_loop_sx0_tbl
|
||||
.word L(fguv_loop_sx0_csfl0_01) - fguv_loop_sx0_tbl
|
||||
.word L(fguv_loop_sx0_csfl0_10) - fguv_loop_sx0_tbl
|
||||
.word L(fguv_loop_sx0_csfl0_11) - fguv_loop_sx0_tbl
|
||||
.word L(fguv_loop_sx0_csfl1_00) - fguv_loop_sx0_tbl
|
||||
.word L(fguv_loop_sx0_csfl1_01) - fguv_loop_sx0_tbl
|
||||
.word L(fguv_loop_sx0_csfl1_10) - fguv_loop_sx0_tbl
|
||||
.word L(fguv_loop_sx0_csfl1_11) - fguv_loop_sx0_tbl
|
||||
endjumptable
|
||||
|
||||
function fguv_loop_sx1_neon
|
||||
.macro fguv_loop_sx1 csfl, ox, oy
|
||||
L(fguv_loop_sx1_csfl\csfl\()_\ox\oy):
|
||||
|
@ -1984,14 +1988,15 @@ L(fguv_loop_sx1_csfl\csfl\()_\ox\oy):
|
|||
ldr x30, [sp], #80
|
||||
AARCH64_VALIDATE_LINK_REGISTER
|
||||
ret
|
||||
|
||||
L(fguv_loop_sx1_tbl):
|
||||
.hword L(fguv_loop_sx1_tbl) - L(fguv_loop_sx1_csfl0_00)
|
||||
.hword L(fguv_loop_sx1_tbl) - L(fguv_loop_sx1_csfl0_01)
|
||||
.hword L(fguv_loop_sx1_tbl) - L(fguv_loop_sx1_csfl0_10)
|
||||
.hword L(fguv_loop_sx1_tbl) - L(fguv_loop_sx1_csfl0_11)
|
||||
.hword L(fguv_loop_sx1_tbl) - L(fguv_loop_sx1_csfl1_00)
|
||||
.hword L(fguv_loop_sx1_tbl) - L(fguv_loop_sx1_csfl1_01)
|
||||
.hword L(fguv_loop_sx1_tbl) - L(fguv_loop_sx1_csfl1_10)
|
||||
.hword L(fguv_loop_sx1_tbl) - L(fguv_loop_sx1_csfl1_11)
|
||||
endfunc
|
||||
|
||||
jumptable fguv_loop_sx1_tbl
|
||||
.word L(fguv_loop_sx1_csfl0_00) - fguv_loop_sx1_tbl
|
||||
.word L(fguv_loop_sx1_csfl0_01) - fguv_loop_sx1_tbl
|
||||
.word L(fguv_loop_sx1_csfl0_10) - fguv_loop_sx1_tbl
|
||||
.word L(fguv_loop_sx1_csfl0_11) - fguv_loop_sx1_tbl
|
||||
.word L(fguv_loop_sx1_csfl1_00) - fguv_loop_sx1_tbl
|
||||
.word L(fguv_loop_sx1_csfl1_01) - fguv_loop_sx1_tbl
|
||||
.word L(fguv_loop_sx1_csfl1_10) - fguv_loop_sx1_tbl
|
||||
.word L(fguv_loop_sx1_csfl1_11) - fguv_loop_sx1_tbl
|
||||
endjumptable
|
||||
|
|
The diffs for two files are not shown because of their large size.
|
@ -28,14 +28,77 @@
|
|||
#include "src/arm/asm.S"
|
||||
#include "util.S"
|
||||
|
||||
// Series of LUTs for efficiently computing sgr's 1 - x/(x+1) table.
|
||||
// In the comments, let RefTable denote the original, reference table.
|
||||
const x_by_x_tables
|
||||
// RangeMins
|
||||
//
|
||||
// Min(RefTable[i*8:i*8+8])
|
||||
// First two values are zeroed.
|
||||
//
|
||||
// Lookup using RangeMins[(x >> 3)]
|
||||
.byte 0, 0, 11, 8, 6, 5, 5, 4, 4, 3, 3, 3, 2, 2, 2, 2
|
||||
.byte 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0
|
||||
|
||||
// DiffMasks
|
||||
//
|
||||
// This contains a bit pattern, indicating at which index positions the value of RefTable changes. For each range
|
||||
// in the RangeMins table (covering 8 RefTable entries), we have one byte; each bit indicates whether the value of
|
||||
// RefTable changes at that particular index.
|
||||
// Using popcount, we can integrate the diff bit field. By shifting away bits in a byte, we can refine the range of
|
||||
// the integral. Finally, adding the integral to RangeMins[(x>>3)] reconstructs RefTable (for x > 15).
|
||||
//
|
||||
// Lookup using DiffMasks[(x >> 3)]
|
||||
.byte 0x00, 0x00, 0xD4, 0x44
|
||||
.byte 0x42, 0x04, 0x00, 0x00
|
||||
.byte 0x00, 0x80, 0x00, 0x00
|
||||
.byte 0x04, 0x00, 0x00, 0x00
|
||||
.byte 0x00, 0x00, 0x00, 0x00
|
||||
.byte 0x00, 0x40, 0x00, 0x00
|
||||
.byte 0x00, 0x00, 0x00, 0x00
|
||||
.byte 0x00, 0x00, 0x00, 0x02
|
||||
// Binary form:
|
||||
// 0b00000000, 0b00000000, 0b11010100, 0b01000100
|
||||
// 0b01000010, 0b00000100, 0b00000000, 0b00000000
|
||||
// 0b00000000, 0b10000000, 0b00000000, 0b00000000
|
||||
// 0b00000100, 0b00000000, 0b00000000, 0b00000000
|
||||
// 0b00000000, 0b00000000, 0b00000000, 0b00000000
|
||||
// 0b00000000, 0b01000000, 0b00000000, 0b00000000
|
||||
// 0b00000000, 0b00000000, 0b00000000, 0b00000000
|
||||
// 0b00000000, 0b00000000, 0b00000000, 0b00000010
|
||||
|
||||
// RefLo
|
||||
//
|
||||
// RefTable[0:16]
|
||||
// i.e. First 16 elements of the original table.
|
||||
// Add to the sum obtained in the rest of the other lut logic to include the first 16 bytes of RefTable.
|
||||
//
|
||||
// Lookup using RangeMins[x] (tbl will replace x > 15 with 0)
|
||||
.byte 255, 128, 85, 64, 51, 43, 37, 32, 28, 26, 23, 21, 20, 18, 17, 16
|
||||
|
||||
// Pseudo assembly
|
||||
//
|
||||
// hi_bits = x >> 3
|
||||
// tbl ref, {RefLo}, x
|
||||
// tbl diffs, {DiffMasks[0:16], DiffMasks[16:32]}, hi_bits
|
||||
// tbl min, {RangeMins[0:16], RangeMins[16:32]}, hi_bits
|
||||
// lo_bits = x & 0x7
|
||||
// diffs = diffs << lo_bits
|
||||
// ref = ref + min
|
||||
// integral = popcnt(diffs)
|
||||
// ref = ref + integral
|
||||
// return ref
|
||||
endconst
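As a readability aid, a scalar C transcription of the pseudo assembly above; the function name is descriptive only, and the three byte arrays are the RangeMins, DiffMasks and RefLo data from this const block:

#include <stdint.h>

static uint8_t x_by_x_lookup(const uint8_t range_mins[32],
                             const uint8_t diff_masks[32],
                             const uint8_t ref_lo[16], const uint8_t x)
{
    const unsigned hi_bits = x >> 3;
    const unsigned lo_bits = x & 0x7;
    /* tbl yields 0 for out-of-range indices, so RefLo only contributes
     * for x < 16. */
    unsigned ref = (x < 16) ? ref_lo[x] : 0;
    uint8_t diffs = diff_masks[hi_bits];
    ref += range_mins[hi_bits];
    /* A byte-wise left shift drops the bits that leave the byte, like ushl;
     * the popcount of what remains integrates the "value changes here"
     * flags that still apply to index x. (__builtin_popcount: GCC/Clang.) */
    diffs = (uint8_t)(diffs << lo_bits);
    ref += (unsigned)__builtin_popcount(diffs);
    return (uint8_t)ref;
}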
|
||||
|
||||
// void dav1d_sgr_box3_vert_neon(int32_t **sumsq, int16_t **sum,
|
||||
// int32_t *AA, int16_t *BB,
|
||||
// const int w, const int s,
|
||||
// const int bitdepth_max);
|
||||
function sgr_box3_vert_neon, export=1
|
||||
stp d8, d9, [sp, #-0x30]!
|
||||
stp d8, d9, [sp, #-0x40]!
|
||||
stp d10, d11, [sp, #0x10]
|
||||
stp d12, d13, [sp, #0x20]
|
||||
stp d14, d15, [sp, #0x30]
|
||||
|
||||
add w4, w4, #2
|
||||
clz w9, w6 // bitdepth_max
|
||||
|
@ -49,41 +112,176 @@ function sgr_box3_vert_neon, export=1
|
|||
movi v31.4s, #9 // n
|
||||
|
||||
sub w9, w9, #24 // -bitdepth_min_8
|
||||
movrel x12, X(sgr_x_by_x)
|
||||
movrel x12, x_by_x_tables
|
||||
mov w13, #455 // one_by_x
|
||||
ld1 {v16.16b, v17.16b, v18.16b}, [x12]
|
||||
ld1 {v24.16b, v25.16b, v26.16b, v27.16b}, [x12] // RangeMins, DiffMasks
|
||||
movi v22.16b, #0x7
|
||||
ldr q23, [x12, #64] //RefLo
|
||||
dup v6.8h, w9 // -bitdepth_min_8
|
||||
movi v19.16b, #5
|
||||
movi v20.8b, #55 // idx of last 5
|
||||
movi v21.8b, #72 // idx of last 4
|
||||
movi v22.8b, #101 // idx of last 3
|
||||
movi v23.8b, #169 // idx of last 2
|
||||
movi v24.8b, #254 // idx of last 1
|
||||
saddl v7.4s, v6.4h, v6.4h // -2*bitdepth_min_8
|
||||
movi v29.8h, #1, lsl #8
|
||||
dup v30.4s, w13 // one_by_x
|
||||
|
||||
sub v16.16b, v16.16b, v19.16b
|
||||
sub v17.16b, v17.16b, v19.16b
|
||||
sub v18.16b, v18.16b, v19.16b
|
||||
ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [x5], #64
|
||||
ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [x6], #64
|
||||
ld1 {v16.4s, v17.4s, v18.4s, v19.4s}, [x0], #64
|
||||
ld1 {v20.8h, v21.8h}, [x8], #32
|
||||
ld1 {v0.8h, v1.8h}, [x7], #32
|
||||
1:
|
||||
ld1 {v2.8h, v3.8h}, [x1], #32
|
||||
add v8.4s, v8.4s, v12.4s
|
||||
add v9.4s, v9.4s, v13.4s
|
||||
add v10.4s, v10.4s, v14.4s
|
||||
add v11.4s, v11.4s, v15.4s
|
||||
add v0.8h, v0.8h, v20.8h
|
||||
add v1.8h, v1.8h, v21.8h
|
||||
|
||||
add v16.4s, v16.4s, v8.4s
|
||||
add v17.4s, v17.4s, v9.4s
|
||||
add v18.4s, v18.4s, v10.4s
|
||||
add v19.4s, v19.4s, v11.4s
|
||||
add v4.8h, v2.8h, v0.8h
|
||||
add v5.8h, v3.8h, v1.8h
|
||||
|
||||
srshl v16.4s, v16.4s, v7.4s
|
||||
srshl v17.4s, v17.4s, v7.4s
|
||||
srshl v18.4s, v18.4s, v7.4s
|
||||
srshl v19.4s, v19.4s, v7.4s
|
||||
srshl v9.8h, v4.8h, v6.8h
|
||||
srshl v13.8h, v5.8h, v6.8h
|
||||
mul v16.4s, v16.4s, v31.4s // a * n
|
||||
mul v17.4s, v17.4s, v31.4s // a * n
|
||||
mul v18.4s, v18.4s, v31.4s // a * n
|
||||
mul v19.4s, v19.4s, v31.4s // a * n
|
||||
umull v8.4s, v9.4h, v9.4h // b * b
|
||||
umull2 v9.4s, v9.8h, v9.8h // b * b
|
||||
umull v12.4s, v13.4h, v13.4h // b * b
|
||||
umull2 v13.4s, v13.8h, v13.8h // b * b
|
||||
uqsub v16.4s, v16.4s, v8.4s // imax(a * n - b * b, 0)
|
||||
uqsub v17.4s, v17.4s, v9.4s // imax(a * n - b * b, 0)
|
||||
uqsub v18.4s, v18.4s, v12.4s // imax(a * n - b * b, 0)
|
||||
uqsub v19.4s, v19.4s, v13.4s // imax(a * n - b * b, 0)
|
||||
mul v16.4s, v16.4s, v28.4s // p * s
|
||||
mul v17.4s, v17.4s, v28.4s // p * s
|
||||
mul v18.4s, v18.4s, v28.4s // p * s
|
||||
mul v19.4s, v19.4s, v28.4s // p * s
|
||||
uqshrn v16.4h, v16.4s, #16
|
||||
uqshrn2 v16.8h, v17.4s, #16
|
||||
uqshrn v18.4h, v18.4s, #16
|
||||
uqshrn2 v18.8h, v19.4s, #16
|
||||
uqrshrn v1.8b, v16.8h, #4 // imin(z, 255)
|
||||
uqrshrn2 v1.16b, v18.8h, #4 // imin(z, 255)
|
||||
|
||||
ld1 {v16.4s, v17.4s}, [x0], #32
|
||||
subs w4, w4, #16
|
||||
|
||||
ushr v0.16b, v1.16b, #3
|
||||
ld1 {v8.4s, v9.4s}, [x5], #32
|
||||
tbl v2.16b, {v26.16b, v27.16b}, v0.16b // RangeMins
|
||||
tbl v0.16b, {v24.16b, v25.16b}, v0.16b // DiffMasks
|
||||
tbl v3.16b, {v23.16b}, v1.16b // RefLo
|
||||
and v1.16b, v1.16b, v22.16b
|
||||
ld1 {v12.4s, v13.4s}, [x6], #32
|
||||
ushl v1.16b, v2.16b, v1.16b
|
||||
ld1 {v20.8h, v21.8h}, [x8], #32
|
||||
add v3.16b, v3.16b, v0.16b
|
||||
cnt v1.16b, v1.16b
|
||||
ld1 {v18.4s, v19.4s}, [x0], #32
|
||||
add v3.16b, v3.16b, v1.16b
|
||||
ld1 {v10.4s, v11.4s}, [x5], #32
|
||||
uxtl v0.8h, v3.8b // x
|
||||
uxtl2 v1.8h, v3.16b // x
|
||||
|
||||
ld1 {v14.4s, v15.4s}, [x6], #32
|
||||
|
||||
umull v2.4s, v0.4h, v4.4h // x * BB[i]
|
||||
umull2 v3.4s, v0.8h, v4.8h // x * BB[i]
|
||||
umull v4.4s, v1.4h, v5.4h // x * BB[i]
|
||||
umull2 v5.4s, v1.8h, v5.8h // x * BB[i]
|
||||
sub v0.8h, v29.8h, v0.8h // 256 - x
|
||||
sub v1.8h, v29.8h, v1.8h // 256 - x
|
||||
mul v2.4s, v2.4s, v30.4s // x * BB[i] * sgr_one_by_x
|
||||
mul v3.4s, v3.4s, v30.4s // x * BB[i] * sgr_one_by_x
|
||||
mul v4.4s, v4.4s, v30.4s // x * BB[i] * sgr_one_by_x
|
||||
mul v5.4s, v5.4s, v30.4s // x * BB[i] * sgr_one_by_x
|
||||
st1 {v0.8h, v1.8h}, [x3], #32
|
||||
ld1 {v0.8h, v1.8h}, [x7], #32
|
||||
srshr v2.4s, v2.4s, #12 // AA[i]
|
||||
srshr v3.4s, v3.4s, #12 // AA[i]
|
||||
srshr v4.4s, v4.4s, #12 // AA[i]
|
||||
srshr v5.4s, v5.4s, #12 // AA[i]
|
||||
|
||||
st1 {v2.4s, v3.4s, v4.4s, v5.4s}, [x2], #64
|
||||
b.gt 1b
|
||||
|
||||
ldp d14, d15, [sp, #0x30]
|
||||
ldp d12, d13, [sp, #0x20]
|
||||
ldp d10, d11, [sp, #0x10]
|
||||
ldp d8, d9, [sp], 0x40
|
||||
ret
|
||||
endfunc
|
||||
|
||||
// void dav1d_sgr_box5_vert_neon(int32_t **sumsq, int16_t **sum,
|
||||
// int32_t *AA, int16_t *BB,
|
||||
// const int w, const int s,
|
||||
// const int bitdepth_max);
|
||||
function sgr_box5_vert_neon, export=1
|
||||
stp d8, d9, [sp, #-0x30]!
|
||||
stp d10, d11, [sp, #0x10]
|
||||
stp d12, d13, [sp, #0x20]
|
||||
|
||||
add w4, w4, #2
|
||||
clz w15, w6 // bitdepth_max
|
||||
dup v28.4s, w5 // strength
|
||||
|
||||
ldp x5, x6, [x0]
|
||||
ldp x7, x8, [x0, #16]
|
||||
ldr x0, [x0, #32]
|
||||
ldp x9, x10, [x1]
|
||||
ldp x11, x12, [x1, #16]
|
||||
ldr x1, [x1, #32]
|
||||
|
||||
movi v31.4s, #25 // n
|
||||
|
||||
sub w15, w15, #24 // -bitdepth_min_8
|
||||
movrel x13, x_by_x_tables
|
||||
movi v30.4s, #164
|
||||
ld1 {v24.16b, v25.16b, v26.16b, v27.16b}, [x13] // RangeMins, DiffMasks
|
||||
dup v6.8h, w15 // -bitdepth_min_8
|
||||
movi v19.8b, #0x7
|
||||
ldr q18, [x13, #64] // RefLo
|
||||
saddl v7.4s, v6.4h, v6.4h // -2*bitdepth_min_8
|
||||
movi v29.8h, #1, lsl #8
|
||||
|
||||
ld1 {v8.4s, v9.4s}, [x5], #32
|
||||
ld1 {v10.4s, v11.4s}, [x6], #32
|
||||
ld1 {v12.8h}, [x7], #16
|
||||
ld1 {v13.8h}, [x8], #16
|
||||
ld1 {v0.4s, v1.4s}, [x0], #32
|
||||
ld1 {v12.4s, v13.4s}, [x7], #32
|
||||
ld1 {v16.4s, v17.4s}, [x8], #32
|
||||
ld1 {v20.8h}, [x9], #16
|
||||
ld1 {v21.8h}, [x10], #16
|
||||
ld1 {v22.8h}, [x11], #16
|
||||
ld1 {v23.8h}, [x12], #16
|
||||
ld1 {v0.4s, v1.4s}, [x0], #32
|
||||
ld1 {v2.8h}, [x1], #16
|
||||
1:
|
||||
|
||||
1:
|
||||
add v8.4s, v8.4s, v10.4s
|
||||
add v9.4s, v9.4s, v11.4s
|
||||
add v12.4s, v12.4s, v16.4s
|
||||
add v13.4s, v13.4s, v17.4s
|
||||
|
||||
add v12.8h, v12.8h, v13.8h
|
||||
add v20.8h, v20.8h, v21.8h
|
||||
add v22.8h, v22.8h, v23.8h
|
||||
|
||||
subs w4, w4, #8
|
||||
add v0.4s, v0.4s, v8.4s
|
||||
add v1.4s, v1.4s, v9.4s
|
||||
add v2.8h, v2.8h, v12.8h
|
||||
add v2.8h, v2.8h, v20.8h
|
||||
|
||||
add v0.4s, v0.4s, v12.4s
|
||||
add v1.4s, v1.4s, v13.4s
|
||||
add v2.8h, v2.8h, v22.8h
|
||||
|
||||
subs w4, w4, #8
|
||||
|
||||
srshl v0.4s, v0.4s, v7.4s
|
||||
srshl v1.4s, v1.4s, v7.4s
|
||||
|
@ -102,24 +300,25 @@ function sgr_box3_vert_neon, export=1
|
|||
ld1 {v10.4s, v11.4s}, [x6], #32
|
||||
uqrshrn v0.8b, v0.8h, #4 // imin(z, 255)
|
||||
|
||||
ld1 {v12.8h}, [x7], #16
|
||||
ld1 {v12.4s, v13.4s}, [x7], #32
|
||||
|
||||
cmhi v25.8b, v0.8b, v20.8b // = -1 if sgr_x_by_x[v0] < 5
|
||||
cmhi v26.8b, v0.8b, v21.8b // = -1 if sgr_x_by_x[v0] < 4
|
||||
tbl v1.8b, {v16.16b, v17.16b, v18.16b}, v0.8b
|
||||
cmhi v27.8b, v0.8b, v22.8b // = -1 if sgr_x_by_x[v0] < 3
|
||||
cmhi v4.8b, v0.8b, v23.8b // = -1 if sgr_x_by_x[v0] < 2
|
||||
add v25.8b, v25.8b, v26.8b
|
||||
cmhi v5.8b, v0.8b, v24.8b // = -1 if sgr_x_by_x[v0] < 1
|
||||
add v27.8b, v27.8b, v4.8b
|
||||
add v5.8b, v5.8b, v19.8b
|
||||
add v25.8b, v25.8b, v27.8b
|
||||
add v5.8b, v1.8b, v5.8b
|
||||
ld1 {v13.8h}, [x8], #16
|
||||
add v5.8b, v5.8b, v25.8b
|
||||
ld1 {v0.4s, v1.4s}, [x0], #32
|
||||
ushr v1.8b, v0.8b, #3
|
||||
ld1 {v16.4s, v17.4s}, [x8], #32
|
||||
tbl v5.8b, {v26.16b, v27.16b}, v1.8b // RangeMins
|
||||
tbl v1.8b, {v24.16b, v25.16b}, v1.8b // DiffMasks
|
||||
tbl v4.8b, {v18.16b}, v0.8b // RefLo
|
||||
and v0.8b, v0.8b, v19.8b
|
||||
ld1 {v20.8h}, [x9], #16
|
||||
ushl v5.8b, v5.8b, v0.8b
|
||||
add v4.8b, v4.8b, v1.8b
|
||||
ld1 {v21.8h}, [x10], #16
|
||||
cnt v5.8b, v5.8b
|
||||
ld1 {v22.8h}, [x11], #16
|
||||
add v5.8b, v4.8b, v5.8b
|
||||
ld1 {v23.8h}, [x12], #16
|
||||
uxtl v5.8h, v5.8b // x
|
||||
|
||||
ld1 {v0.4s, v1.4s}, [x0], #32
|
||||
umull v3.4s, v5.4h, v2.4h // x * BB[i]
|
||||
umull2 v4.4s, v5.8h, v2.8h // x * BB[i]
|
||||
mul v3.4s, v3.4s, v30.4s // x * BB[i] * sgr_one_by_x
|
||||
|
@ -138,135 +337,3 @@ function sgr_box3_vert_neon, export=1
|
|||
ldp d8, d9, [sp], 0x30
|
||||
ret
|
||||
endfunc
|
||||
|
||||
// void dav1d_sgr_box5_vert_neon(int32_t **sumsq, int16_t **sum,
|
||||
// int32_t *AA, int16_t *BB,
|
||||
// const int w, const int s,
|
||||
// const int bitdepth_max);
|
||||
function sgr_box5_vert_neon, export=1
|
||||
stp d8, d9, [sp, #-0x40]!
|
||||
stp d10, d11, [sp, #0x10]
|
||||
stp d12, d13, [sp, #0x20]
|
||||
stp d14, d15, [sp, #0x30]
|
||||
|
||||
add w4, w4, #2
|
||||
clz w15, w6 // bitdepth_max
|
||||
dup v28.4s, w5 // strength
|
||||
|
||||
ldp x5, x6, [x0]
|
||||
ldp x7, x8, [x0, #16]
|
||||
ldr x0, [x0, #32]
|
||||
ldp x9, x10, [x1]
|
||||
ldp x11, x12, [x1, #16]
|
||||
ldr x1, [x1, #32]
|
||||
|
||||
movi v31.4s, #25 // n
|
||||
|
||||
sub w15, w15, #24 // -bitdepth_min_8
|
||||
movrel x13, X(sgr_x_by_x)
|
||||
mov w14, #164 // one_by_x
|
||||
ld1 {v16.16b, v17.16b, v18.16b}, [x13]
|
||||
dup v6.8h, w15 // -bitdepth_min_8
|
||||
movi v19.16b, #5
|
||||
movi v24.8b, #254 // idx of last 1
|
||||
saddl v7.4s, v6.4h, v6.4h // -2*bitdepth_min_8
|
||||
movi v29.8h, #1, lsl #8
|
||||
dup v30.4s, w14 // one_by_x
|
||||
|
||||
sub v16.16b, v16.16b, v19.16b
|
||||
sub v17.16b, v17.16b, v19.16b
|
||||
sub v18.16b, v18.16b, v19.16b
|
||||
|
||||
ld1 {v8.4s, v9.4s}, [x5], #32
|
||||
ld1 {v10.4s, v11.4s}, [x6], #32
|
||||
ld1 {v12.4s, v13.4s}, [x7], #32
|
||||
ld1 {v14.4s, v15.4s}, [x8], #32
|
||||
ld1 {v20.8h}, [x9], #16
|
||||
ld1 {v21.8h}, [x10], #16
|
||||
ld1 {v22.8h}, [x11], #16
|
||||
ld1 {v23.8h}, [x12], #16
|
||||
ld1 {v0.4s, v1.4s}, [x0], #32
|
||||
ld1 {v2.8h}, [x1], #16
|
||||
|
||||
1:
|
||||
add v8.4s, v8.4s, v10.4s
|
||||
add v9.4s, v9.4s, v11.4s
|
||||
add v12.4s, v12.4s, v14.4s
|
||||
add v13.4s, v13.4s, v15.4s
|
||||
|
||||
add v20.8h, v20.8h, v21.8h
|
||||
add v22.8h, v22.8h, v23.8h
|
||||
|
||||
add v0.4s, v0.4s, v8.4s
|
||||
add v1.4s, v1.4s, v9.4s
|
||||
add v2.8h, v2.8h, v20.8h
|
||||
|
||||
add v0.4s, v0.4s, v12.4s
|
||||
add v1.4s, v1.4s, v13.4s
|
||||
add v2.8h, v2.8h, v22.8h
|
||||
|
||||
subs w4, w4, #8
|
||||
|
||||
movi v20.8b, #55 // idx of last 5
|
||||
movi v21.8b, #72 // idx of last 4
|
||||
movi v22.8b, #101 // idx of last 3
|
||||
movi v23.8b, #169 // idx of last 2
|
||||
|
||||
srshl v0.4s, v0.4s, v7.4s
|
||||
srshl v1.4s, v1.4s, v7.4s
|
||||
srshl v4.8h, v2.8h, v6.8h
|
||||
mul v0.4s, v0.4s, v31.4s // a * n
|
||||
mul v1.4s, v1.4s, v31.4s // a * n
|
||||
umull v3.4s, v4.4h, v4.4h // b * b
|
||||
umull2 v4.4s, v4.8h, v4.8h // b * b
|
||||
uqsub v0.4s, v0.4s, v3.4s // imax(a * n - b * b, 0)
|
||||
uqsub v1.4s, v1.4s, v4.4s // imax(a * n - b * b, 0)
|
||||
mul v0.4s, v0.4s, v28.4s // p * s
|
||||
mul v1.4s, v1.4s, v28.4s // p * s
|
||||
ld1 {v8.4s, v9.4s}, [x5], #32
|
||||
uqshrn v0.4h, v0.4s, #16
|
||||
uqshrn2 v0.8h, v1.4s, #16
|
||||
ld1 {v10.4s, v11.4s}, [x6], #32
|
||||
uqrshrn v0.8b, v0.8h, #4 // imin(z, 255)
|
||||
|
||||
ld1 {v12.4s, v13.4s}, [x7], #32
|
||||
|
||||
cmhi v25.8b, v0.8b, v20.8b // = -1 if sgr_x_by_x[v0] < 5
|
||||
cmhi v26.8b, v0.8b, v21.8b // = -1 if sgr_x_by_x[v0] < 4
|
||||
tbl v1.8b, {v16.16b, v17.16b, v18.16b}, v0.8b
|
||||
cmhi v27.8b, v0.8b, v22.8b // = -1 if sgr_x_by_x[v0] < 3
|
||||
cmhi v4.8b, v0.8b, v23.8b // = -1 if sgr_x_by_x[v0] < 2
|
||||
ld1 {v14.4s, v15.4s}, [x8], #32
|
||||
add v25.8b, v25.8b, v26.8b
|
||||
cmhi v5.8b, v0.8b, v24.8b // = -1 if sgr_x_by_x[v0] < 1
|
||||
add v27.8b, v27.8b, v4.8b
|
||||
ld1 {v20.8h}, [x9], #16
|
||||
add v5.8b, v5.8b, v19.8b
|
||||
add v25.8b, v25.8b, v27.8b
|
||||
ld1 {v21.8h}, [x10], #16
|
||||
add v5.8b, v1.8b, v5.8b
|
||||
ld1 {v22.8h}, [x11], #16
|
||||
add v5.8b, v5.8b, v25.8b
|
||||
ld1 {v23.8h}, [x12], #16
|
||||
uxtl v5.8h, v5.8b // x
|
||||
|
||||
ld1 {v0.4s, v1.4s}, [x0], #32
|
||||
umull v3.4s, v5.4h, v2.4h // x * BB[i]
|
||||
umull2 v4.4s, v5.8h, v2.8h // x * BB[i]
|
||||
mul v3.4s, v3.4s, v30.4s // x * BB[i] * sgr_one_by_x
|
||||
mul v4.4s, v4.4s, v30.4s // x * BB[i] * sgr_one_by_x
|
||||
srshr v3.4s, v3.4s, #12 // AA[i]
|
||||
srshr v4.4s, v4.4s, #12 // AA[i]
|
||||
sub v5.8h, v29.8h, v5.8h // 256 - x
|
||||
ld1 {v2.8h}, [x1], #16
|
||||
|
||||
st1 {v3.4s, v4.4s}, [x2], #32
|
||||
st1 {v5.8h}, [x3], #16
|
||||
b.gt 1b
|
||||
|
||||
ldp d14, d15, [sp, #0x30]
|
||||
ldp d12, d13, [sp, #0x20]
|
||||
ldp d10, d11, [sp, #0x10]
|
||||
ldp d8, d9, [sp], 0x40
|
||||
ret
|
||||
endfunc
|
||||
|
|
The diffs for three files are not shown because of their large size.
|
@ -54,8 +54,14 @@ const h_tbl_neon_dotprod, align=4
|
|||
.byte 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10
|
||||
.byte 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14
|
||||
|
||||
// Shuffle indices to permute horizontal samples in preparation for
|
||||
// input to USMMLA instructions.
|
||||
#define OFFSET_USMMLA 48
|
||||
.byte 0, 1, 2, 3, 4, 5, 6, 7, 2, 3, 4, 5, 6, 7, 8, 9
|
||||
.byte 4, 5, 6, 7, 8, 9, 10, 11, 6, 7, 8, 9, 10, 11, 12, 13
|
||||
|
||||
// Lookup table used to help conversion of shifted 32-bit values to 8-bit.
|
||||
#define OFFSET_CVT_32_8 48
|
||||
#define OFFSET_CVT_32_8 80
|
||||
.byte 1, 2, 5, 6, 9, 10, 13, 14, 17, 18, 21, 22, 25, 26, 29, 30
|
||||
endconst
|
||||
|
||||
|
@ -114,10 +120,10 @@ L(\type\()_8tap_v_\isa):
|
|||
sub \src, \src, \s_strd
|
||||
.ifc \isa, neon_dotprod
|
||||
.ifc \type, prep
|
||||
mov w8, 0x2002 // FILTER_WEIGHT * 128 + rounding
|
||||
mov w8, #0x2002 // FILTER_WEIGHT * 128 + rounding
|
||||
dup v4.4s, w8
|
||||
.else
|
||||
movi v4.4s, #32, lsl 8 // FILTER_WEIGHT * 128, bias for SDOT
|
||||
movi v4.4s, #32, lsl #8 // FILTER_WEIGHT * 128, bias for SDOT
|
||||
.endif
|
||||
.endif
|
||||
ubfx w11, \my, #7, #7
|
||||
|
@ -677,18 +683,18 @@ L(\type\()_8tap_h_hv_\isa):
|
|||
madd \mx, \mx, w11, w9
|
||||
madd w14, \my, w11, w10 // for HV
|
||||
.ifc \isa, neon_dotprod
|
||||
mov w13, 0x2002 // FILTER_WEIGHT * 128 + rounding
|
||||
mov w13, #0x2002 // FILTER_WEIGHT * 128 + rounding
|
||||
dup v27.4s, w13 // put H overrides this
|
||||
.endif
|
||||
movrel x13, h_tbl_neon_dotprod
|
||||
sub \src, \src, #3 // src - 3
|
||||
ldr q28, [x13]
|
||||
ubfx w9, \mx, #7, #7
|
||||
ldr q28, [x13] // for 4-tap & 8-tap H filters
|
||||
ubfx w15, \mx, #7, #7
|
||||
and \mx, \mx, #0x7F
|
||||
ubfx w11, w14, #7, #7 // for HV
|
||||
and w14, w14, #0x7F // for HV
|
||||
cmp \w, #4
|
||||
csel \mx, \mx, w9, le
|
||||
csel \mx, \mx, w15, le
|
||||
add \xmx, x12, \xmx, lsl #3 // subpel H filter address
|
||||
.ifc \isa, neon_dotprod
|
||||
movi v24.16b, #128
|
||||
|
@ -706,7 +712,7 @@ L(\type\()_8tap_h_hv_\isa):
|
|||
ldr q25, [x13, #(OFFSET_CVT_32_8)] // LUT to help conversion
|
||||
.endif // of 32b values to 8b
|
||||
sxtl v7.8h, v7.8b
|
||||
cmp w10, SHARP1
|
||||
cmp w10, #SHARP1
|
||||
b.ne L(\type\()_6tap_hv_\isa) // vertical != SHARP1
|
||||
|
||||
// HV 8-tap cases
|
||||
|
@ -1005,11 +1011,92 @@ L(\type\()_6tap_hv_\isa):
|
|||
|
||||
// .align JUMP_ALIGN // fallthrough
|
||||
80: // HV6 - 8xN+
|
||||
ldp q29, q30, [x13, #16]
|
||||
ldr d26, [\xmx]
|
||||
.ifc \type, prep
|
||||
add \wd_strd, \w, \w
|
||||
.endif
|
||||
.ifc \isa, neon_i8mm
|
||||
cmp w9, #SHARP1
|
||||
b.eq 88f // horizontal == SHARP1
|
||||
|
||||
ldp q29, q30, [x13, #(OFFSET_USMMLA)]
|
||||
ext v0.8b, v26.8b, v26.8b, #7
|
||||
ins v26.d[1], v0.d[0]
|
||||
|
||||
.align LOOP_ALIGN
|
||||
81:
|
||||
mov \lsrc, \src
|
||||
mov \ldst, \dst
|
||||
mov w8, \h
|
||||
|
||||
bl L(\type\()_hv_filter6_neon_i8mm)
|
||||
srshr v16.8h, v22.8h, #2
|
||||
bl L(\type\()_hv_filter6_neon_i8mm)
|
||||
srshr v17.8h, v22.8h, #2
|
||||
bl L(\type\()_hv_filter6_neon_i8mm)
|
||||
srshr v18.8h, v22.8h, #2
|
||||
bl L(\type\()_hv_filter6_neon_i8mm)
|
||||
srshr v19.8h, v22.8h, #2
|
||||
bl L(\type\()_hv_filter6_neon_i8mm)
|
||||
srshr v20.8h, v22.8h, #2
|
||||
|
||||
.align LOOP_ALIGN
|
||||
8:
|
||||
ld1 {v23.16b}, [\lsrc], \s_strd
|
||||
|
||||
smull v0.4s, v16.4h, v7.h[1]
|
||||
smull2 v1.4s, v16.8h, v7.h[1]
|
||||
mov v16.16b, v17.16b
|
||||
movi v5.4s, #0
|
||||
movi v6.4s, #0
|
||||
tbl v2.16b, {v23.16b}, v29.16b
|
||||
tbl v3.16b, {v23.16b}, v30.16b
|
||||
|
||||
smlal v0.4s, v17.4h, v7.h[2]
|
||||
smlal2 v1.4s, v17.8h, v7.h[2]
|
||||
mov v17.16b, v18.16b
|
||||
|
||||
usmmla v5.4s, v2.16b, v26.16b
|
||||
usmmla v6.4s, v3.16b, v26.16b
|
||||
|
||||
smlal v0.4s, v18.4h, v7.h[3]
|
||||
smlal2 v1.4s, v18.8h, v7.h[3]
|
||||
mov v18.16b, v19.16b
|
||||
subs w8, w8, #1
|
||||
|
||||
smlal v0.4s, v19.4h, v7.h[4]
|
||||
smlal2 v1.4s, v19.8h, v7.h[4]
|
||||
uzp1 v23.8h, v5.8h, v6.8h
|
||||
mov v19.16b, v20.16b
|
||||
|
||||
smlal v0.4s, v20.4h, v7.h[5]
|
||||
smlal2 v1.4s, v20.8h, v7.h[5]
|
||||
srshr v20.8h, v23.8h, #2
|
||||
smlal v0.4s, v20.4h, v7.h[6]
|
||||
smlal2 v1.4s, v20.8h, v7.h[6]
|
||||
.ifc \type, prep
|
||||
rshrn v0.4h, v0.4s, #6
|
||||
rshrn2 v0.8h, v1.4s, #6
|
||||
st1 {v0.8h}, [\ldst], \d_strd
|
||||
b.gt 8b
|
||||
add \dst, \dst, #16
|
||||
.else
|
||||
tbl v0.16b, {v0.16b, v1.16b}, v25.16b
|
||||
sqrshrun v0.8b, v0.8h, #2
|
||||
st1 {v0.8b}, [\ldst], \d_strd
|
||||
b.gt 8b
|
||||
add \dst, \dst, #8
|
||||
.endif
|
||||
add \src, \src, #8
|
||||
subs \w, \w, #8
|
||||
b.gt 81b
|
||||
ret x15
|
||||
|
||||
.align JUMP_ALIGN
|
||||
88:
|
||||
.endif // neon_i8mm
|
||||
ldp q29, q30, [x13, #16]
|
||||
|
||||
.align LOOP_ALIGN
|
||||
81:
|
||||
mov \lsrc, \src
|
||||
|
@ -1040,8 +1127,8 @@ L(\type\()_6tap_hv_\isa):
|
|||
.endif
|
||||
.align LOOP_ALIGN
|
||||
8:
|
||||
ldr q23, [\xmy]
|
||||
add \xmy, \xmy, \s_strd
|
||||
ldr q23, [\lsrc]
|
||||
add \lsrc, \lsrc, \s_strd
|
||||
|
||||
smull v0.4s, v16.4h, v7.h[1]
|
||||
smull2 v1.4s, v16.8h, v7.h[1]
|
||||
|
@ -1128,6 +1215,20 @@ L(\type\()_hv_filter8_\isa):
|
|||
uzp1 v22.8h, v22.8h, v23.8h
|
||||
ret
|
||||
|
||||
.ifc \isa, neon_i8mm
|
||||
.align FUNC_ALIGN
|
||||
L(\type\()_hv_filter6_neon_i8mm):
|
||||
ld1 {v4.16b}, [\lsrc], \s_strd
|
||||
movi v22.4s, #0
|
||||
movi v23.4s, #0
|
||||
tbl v2.16b, {v4.16b}, v29.16b
|
||||
tbl v3.16b, {v4.16b}, v30.16b
|
||||
usmmla v22.4s, v2.16b, v26.16b
|
||||
usmmla v23.4s, v3.16b, v26.16b
|
||||
uzp1 v22.8h, v22.8h, v23.8h
|
||||
ret
|
||||
.endif
|
||||
|
||||
.align FUNC_ALIGN
|
||||
L(\type\()_hv_filter4_\isa):
|
||||
ld1 {v4.8b}, [\src], \s_strd
|
||||
|
@ -1264,8 +1365,8 @@ L(\type\()_hv_filter4_\isa):
|
|||
|
||||
.align JUMP_ALIGN
|
||||
L(\type\()_8tap_h_\isa):
|
||||
adr x9, L(\type\()_8tap_h_\isa\()_tbl)
|
||||
ldrh w8, [x9, x8, lsl #1]
|
||||
movrel x11, \type\()_8tap_h_\isa\()_tbl
|
||||
ldrsw x8, [x11, x8, lsl #2]
|
||||
.ifc \type, put
|
||||
.ifc \isa, neon_i8mm
|
||||
movi v27.4s, #34 // special rounding
|
||||
|
@ -1274,8 +1375,8 @@ L(\type\()_8tap_h_\isa):
|
|||
dup v27.4s, w10
|
||||
.endif
|
||||
.endif
|
||||
sub x9, x9, x8
|
||||
br x9
|
||||
add x11, x11, x8
|
||||
br x11
|
||||
|
||||
.ifc \type, put
|
||||
.align JUMP_ALIGN
|
||||
|
@ -1368,8 +1469,63 @@ L(\type\()_8tap_h_\isa):
|
|||
.align JUMP_ALIGN
|
||||
80: // H - 8xN
|
||||
AARCH64_VALID_JUMP_TARGET
|
||||
ldp q29, q30, [x13, #16]
|
||||
ldr d26, [\xmx]
|
||||
.ifc \isa, neon_i8mm
|
||||
cmp w9, #SHARP1
|
||||
b.eq 88f // horizontal == SHARP1
|
||||
|
||||
ldp q29, q30, [x13, #(OFFSET_USMMLA)]
|
||||
ext v0.8b, v26.8b, v26.8b, #7
|
||||
ins v26.d[1], v0.d[0]
|
||||
|
||||
.align LOOP_ALIGN
|
||||
8:
|
||||
ldr q0, [\src]
|
||||
ldr q16, [\src, \s_strd]
|
||||
add \src, \src, \s_strd, lsl #1
|
||||
.ifc \type, prep
|
||||
movi v4.4s, #0
|
||||
movi v5.4s, #0
|
||||
movi v20.4s, #0
|
||||
movi v21.4s, #0
|
||||
.else
|
||||
mov v4.16b, v27.16b
|
||||
mov v5.16b, v27.16b
|
||||
mov v20.16b, v27.16b
|
||||
mov v21.16b, v27.16b
|
||||
.endif
|
||||
tbl v1.16b, {v0.16b}, v29.16b
|
||||
tbl v2.16b, {v0.16b}, v30.16b
|
||||
tbl v17.16b, {v16.16b}, v29.16b
|
||||
tbl v18.16b, {v16.16b}, v30.16b
|
||||
|
||||
usmmla v4.4s, v1.16b, v26.16b
|
||||
usmmla v5.4s, v2.16b, v26.16b
|
||||
usmmla v20.4s, v17.16b, v26.16b
|
||||
usmmla v21.4s, v18.16b, v26.16b
|
||||
|
||||
uzp1 v4.8h, v4.8h, v5.8h
|
||||
uzp1 v20.8h, v20.8h, v21.8h
|
||||
.ifc \type, prep
|
||||
srshr v4.8h, v4.8h, #2
|
||||
srshr v20.8h, v20.8h, #2
|
||||
subs \h, \h, #2
|
||||
stp q4, q20, [\dst], #32
|
||||
.else // put
|
||||
sqshrun v4.8b, v4.8h, #6
|
||||
sqshrun v20.8b, v20.8h, #6
|
||||
subs \h, \h, #2
|
||||
str d4, [\dst]
|
||||
str d20, [\dst, \d_strd]
|
||||
add \dst, \dst, \d_strd, lsl #1
|
||||
.endif
|
||||
b.gt 8b
|
||||
ret
|
||||
|
||||
.align JUMP_ALIGN
|
||||
88:
|
||||
.endif // neon_i8mm
|
||||
ldp q29, q30, [x13, #16]
|
||||
|
||||
.align LOOP_ALIGN
|
||||
8:
|
||||
|
@ -1433,8 +1589,61 @@ L(\type\()_8tap_h_\isa):
|
|||
.align JUMP_ALIGN
|
||||
160: // H - 16xN
|
||||
AARCH64_VALID_JUMP_TARGET
|
||||
ldp q29, q30, [x13, #16]
|
||||
ldr d26, [\xmx]
|
||||
.ifc \isa, neon_i8mm
|
||||
cmp w9, #SHARP1
|
||||
b.eq 168f // horizontal == SHARP1
|
||||
|
||||
ldp q29, q30, [x13, #(OFFSET_USMMLA)]
|
||||
ext v0.8b, v26.8b, v26.8b, #7
|
||||
ins v26.d[1], v0.d[0]
|
||||
|
||||
.align LOOP_ALIGN
|
||||
16:
|
||||
ldr q16, [\src]
|
||||
ldur q17, [\src, #8] // avoid 2 register TBL for small cores
|
||||
add \src, \src, \s_strd
|
||||
.ifc \type, prep
|
||||
movi v6.4s, #0
|
||||
movi v7.4s, #0
|
||||
movi v22.4s, #0
|
||||
movi v23.4s, #0
|
||||
.else
|
||||
mov v6.16b, v27.16b
|
||||
mov v7.16b, v27.16b
|
||||
mov v22.16b, v27.16b
|
||||
mov v23.16b, v27.16b
|
||||
.endif
|
||||
tbl v0.16b, {v16.16b}, v29.16b
|
||||
tbl v1.16b, {v16.16b}, v30.16b
|
||||
tbl v2.16b, {v17.16b}, v29.16b
|
||||
tbl v3.16b, {v17.16b}, v30.16b
|
||||
|
||||
usmmla v6.4s, v0.16b, v26.16b
|
||||
usmmla v7.4s, v1.16b, v26.16b
|
||||
usmmla v22.4s, v2.16b, v26.16b
|
||||
usmmla v23.4s, v3.16b, v26.16b
|
||||
|
||||
uzp1 v6.8h, v6.8h, v7.8h
|
||||
uzp1 v22.8h, v22.8h, v23.8h
|
||||
.ifc \type, prep
|
||||
srshr v6.8h, v6.8h, #2
|
||||
srshr v22.8h, v22.8h, #2
|
||||
subs \h, \h, #1
|
||||
stp q6, q22, [\dst], #32
|
||||
.else // put
|
||||
sqshrun v6.8b, v6.8h, #6
|
||||
sqshrun2 v6.16b, v22.8h, #6
|
||||
subs \h, \h, #1
|
||||
st1 {v6.16b}, [\dst], \d_strd
|
||||
.endif
|
||||
b.gt 16b
|
||||
ret
|
||||
|
||||
.align JUMP_ALIGN
|
||||
168:
|
||||
.endif // neon_i8mm
|
||||
ldp q29, q30, [x13, #16]
|
||||
|
||||
.align LOOP_ALIGN
|
||||
16:
|
||||
|
@ -1497,7 +1706,6 @@ L(\type\()_8tap_h_\isa):
|
|||
640:
|
||||
1280:
|
||||
AARCH64_VALID_JUMP_TARGET
|
||||
ldp q29, q30, [x13, #16]
|
||||
ldr d26, [\xmx]
|
||||
.ifc \type, put
|
||||
sub \d_strd, \d_strd, \w, uxtw
|
||||
|
@ -1505,6 +1713,69 @@ L(\type\()_8tap_h_\isa):
|
|||
sub \s_strd, \s_strd, \w, uxtw
|
||||
mov w8, \w
|
||||
|
||||
.ifc \isa, neon_i8mm
|
||||
cmp w9, #SHARP1
|
||||
b.eq 328f // horizontal == SHARP1
|
||||
|
||||
ldp q29, q30, [x13, #(OFFSET_USMMLA)]
|
||||
ext v0.8b, v26.8b, v26.8b, #7
|
||||
ins v26.d[1], v0.d[0]
|
||||
|
||||
.align LOOP_ALIGN
|
||||
32:
|
||||
ldr q16, [\src]
|
||||
ldur q17, [\src, #8] // avoid 2 register TBL for small cores
|
||||
add \src, \src, #16
|
||||
.ifc \type, prep
|
||||
movi v6.4s, #0
|
||||
movi v7.4s, #0
|
||||
movi v22.4s, #0
|
||||
movi v23.4s, #0
|
||||
.else
|
||||
mov v6.16b, v27.16b
|
||||
mov v7.16b, v27.16b
|
||||
mov v22.16b, v27.16b
|
||||
mov v23.16b, v27.16b
|
||||
.endif
|
||||
tbl v0.16b, {v16.16b}, v29.16b
|
||||
tbl v1.16b, {v16.16b}, v30.16b
|
||||
tbl v2.16b, {v17.16b}, v29.16b
|
||||
tbl v3.16b, {v17.16b}, v30.16b
|
||||
|
||||
usmmla v6.4s, v0.16b, v26.16b
|
||||
usmmla v7.4s, v1.16b, v26.16b
|
||||
usmmla v22.4s, v2.16b, v26.16b
|
||||
usmmla v23.4s, v3.16b, v26.16b
|
||||
|
||||
uzp1 v6.8h, v6.8h, v7.8h
|
||||
uzp1 v22.8h, v22.8h, v23.8h
|
||||
.ifc \type, prep
|
||||
srshr v6.8h, v6.8h, #2
|
||||
srshr v22.8h, v22.8h, #2
|
||||
subs w8, w8, #16
|
||||
stp q6, q22, [\dst], #32
|
||||
.else // put
|
||||
sqshrun v6.8b, v6.8h, #6
|
||||
sqshrun2 v6.16b, v22.8h, #6
|
||||
subs w8, w8, #16
|
||||
str q6, [\dst], #16
|
||||
.endif
|
||||
b.gt 32b
|
||||
|
||||
add \src, \src, \s_strd
|
||||
.ifc \type, put
|
||||
add \dst, \dst, \d_strd
|
||||
.endif
|
||||
mov w8, \w
|
||||
subs \h, \h, #1
|
||||
b.gt 32b
|
||||
ret
|
||||
|
||||
.align JUMP_ALIGN
|
||||
328:
|
||||
.endif // neon_i8mm
|
||||
ldp q29, q30, [x13, #16]
|
||||
|
||||
.align LOOP_ALIGN
|
||||
32:
|
||||
ldr q16, [\src]
|
||||
|
@ -1568,19 +1839,19 @@ L(\type\()_8tap_h_\isa):
|
|||
subs \h, \h, #1
|
||||
b.gt 32b
|
||||
ret
|
||||
|
||||
L(\type\()_8tap_h_\isa\()_tbl):
|
||||
.hword (L(\type\()_8tap_h_\isa\()_tbl) - 1280b)
|
||||
.hword (L(\type\()_8tap_h_\isa\()_tbl) - 640b)
|
||||
.hword (L(\type\()_8tap_h_\isa\()_tbl) - 320b)
|
||||
.hword (L(\type\()_8tap_h_\isa\()_tbl) - 160b)
|
||||
.hword (L(\type\()_8tap_h_\isa\()_tbl) - 80b)
|
||||
.hword (L(\type\()_8tap_h_\isa\()_tbl) - 40b)
|
||||
.ifc \type, put
|
||||
.hword (L(\type\()_8tap_h_\isa\()_tbl) - 20b)
|
||||
.hword 0
|
||||
.endif
|
||||
endfunc
|
||||
|
||||
jumptable \type\()_8tap_h_\isa\()_tbl
|
||||
.word 1280b - \type\()_8tap_h_\isa\()_tbl
|
||||
.word 640b - \type\()_8tap_h_\isa\()_tbl
|
||||
.word 320b - \type\()_8tap_h_\isa\()_tbl
|
||||
.word 160b - \type\()_8tap_h_\isa\()_tbl
|
||||
.word 80b - \type\()_8tap_h_\isa\()_tbl
|
||||
.word 40b - \type\()_8tap_h_\isa\()_tbl
|
||||
.ifc \type, put
|
||||
.word 20b - \type\()_8tap_h_\isa\()_tbl
|
||||
.endif
|
||||
endjumptable
|
||||
.endm
|
||||
|
||||
// dst(x0), d_strd(x7), src(x1), s_strd(x2), w(w3), h(w4), mx(w5), my(w6)
|
||||
|
|
|
@ -34,13 +34,13 @@
|
|||
function splat_mv_neon, export=1
|
||||
ld1 {v3.16b}, [x1]
|
||||
clz w3, w3
|
||||
adr x5, L(splat_tbl)
|
||||
movrel x5, splat_tbl
|
||||
sub w3, w3, #26
|
||||
ext v2.16b, v3.16b, v3.16b, #12
|
||||
ldrh w3, [x5, w3, uxtw #1]
|
||||
ldrsw x3, [x5, w3, uxtw #2]
|
||||
add w2, w2, w2, lsl #1
|
||||
ext v0.16b, v2.16b, v3.16b, #4
|
||||
sub x3, x5, w3, uxtw
|
||||
add x3, x5, x3
|
||||
ext v1.16b, v2.16b, v3.16b, #8
|
||||
lsl w2, w2, #2
|
||||
ext v2.16b, v2.16b, v3.16b, #12
|
||||
|
@ -80,16 +80,17 @@ function splat_mv_neon, export=1
|
|||
st1 {v0.16b, v1.16b, v2.16b}, [x1]
|
||||
b.gt 1b
|
||||
ret
|
||||
|
||||
L(splat_tbl):
|
||||
.hword L(splat_tbl) - 320b
|
||||
.hword L(splat_tbl) - 160b
|
||||
.hword L(splat_tbl) - 80b
|
||||
.hword L(splat_tbl) - 40b
|
||||
.hword L(splat_tbl) - 20b
|
||||
.hword L(splat_tbl) - 10b
|
||||
endfunc
|
||||
|
||||
jumptable splat_tbl
|
||||
.word 320b - splat_tbl
|
||||
.word 160b - splat_tbl
|
||||
.word 80b - splat_tbl
|
||||
.word 40b - splat_tbl
|
||||
.word 20b - splat_tbl
|
||||
.word 10b - splat_tbl
|
||||
endjumptable
|
||||
|
||||
const mv_tbls, align=4
|
||||
.byte 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255
|
||||
.byte 0, 1, 2, 3, 8, 0, 1, 2, 3, 8, 0, 1, 2, 3, 8, 0
|
||||
|
@ -112,7 +113,7 @@ function save_tmvs_neon, export=1
|
|||
|
||||
movi v30.8b, #0
|
||||
ld1 {v31.8b}, [x3]
|
||||
adr x8, L(save_tmvs_tbl)
|
||||
movrel x8, save_tmvs_tbl
|
||||
movrel x16, mask_mult
|
||||
movrel x13, mv_tbls
|
||||
ld1 {v29.8b}, [x16]
|
||||
|
@ -137,9 +138,9 @@ function save_tmvs_neon, export=1
|
|||
2:
|
||||
ldrb w11, [x9, #10] // cand_b->bs
|
||||
ld1 {v0.16b}, [x9] // cand_b->mv
|
||||
add x11, x8, w11, uxtw #2
|
||||
add x11, x8, w11, uxtw #3
|
||||
ldr h1, [x9, #8] // cand_b->ref
|
||||
ldrh w12, [x11] // bw8
|
||||
ldr w12, [x11] // bw8
|
||||
mov x15, x8
|
||||
add x9, x9, w12, uxtw #1 // cand_b += bw8*2
|
||||
cmp x9, x10
|
||||
|
@ -149,9 +150,9 @@ function save_tmvs_neon, export=1
|
|||
ldrb w15, [x9, #10] // cand_b->bs
|
||||
add x16, x9, #8
|
||||
ld1 {v4.16b}, [x9] // cand_b->mv
|
||||
add x15, x8, w15, uxtw #2
|
||||
add x15, x8, w15, uxtw #3
|
||||
ld1 {v1.h}[1], [x16] // cand_b->ref
|
||||
ldrh w12, [x15] // bw8
|
||||
ldr w12, [x15] // bw8
|
||||
add x9, x9, w12, uxtw #1 // cand_b += bw8*2
|
||||
trn1 v2.2d, v0.2d, v4.2d
|
||||
|
||||
|
@ -166,12 +167,12 @@ function save_tmvs_neon, export=1
|
|||
addp v1.4h, v1.4h, v1.4h // Combine condition for [1] and [0]
|
||||
umov w16, v1.h[0] // Extract case for first block
|
||||
umov w17, v1.h[1]
|
||||
ldrh w11, [x11, #2] // Fetch jump table entry
|
||||
ldrh w15, [x15, #2]
|
||||
ldrsw x11, [x11, #4] // Fetch jump table entry
|
||||
ldrsw x15, [x15, #4]
|
||||
ldr q1, [x13, w16, uxtw #4] // Load permutation table base on case
|
||||
ldr q5, [x13, w17, uxtw #4]
|
||||
sub x11, x8, w11, uxtw // Find jump table target
|
||||
sub x15, x8, w15, uxtw
|
||||
add x11, x8, x11 // Find jump table target
|
||||
add x15, x8, x15
|
||||
tbl v0.16b, {v0.16b}, v1.16b // Permute cand_b to output refmvs_temporal_block
|
||||
tbl v4.16b, {v4.16b}, v5.16b
|
||||
|
||||
|
@ -243,50 +244,51 @@ function save_tmvs_neon, export=1
|
|||
str q2, [x3, #(16*5-16)]
|
||||
add x3, x3, #16*5
|
||||
ret
|
||||
|
||||
L(save_tmvs_tbl):
|
||||
.hword 16 * 12
|
||||
.hword L(save_tmvs_tbl) - 160b
|
||||
.hword 16 * 12
|
||||
.hword L(save_tmvs_tbl) - 160b
|
||||
.hword 8 * 12
|
||||
.hword L(save_tmvs_tbl) - 80b
|
||||
.hword 8 * 12
|
||||
.hword L(save_tmvs_tbl) - 80b
|
||||
.hword 8 * 12
|
||||
.hword L(save_tmvs_tbl) - 80b
|
||||
.hword 8 * 12
|
||||
.hword L(save_tmvs_tbl) - 80b
|
||||
.hword 4 * 12
|
||||
.hword L(save_tmvs_tbl) - 40b
|
||||
.hword 4 * 12
|
||||
.hword L(save_tmvs_tbl) - 40b
|
||||
.hword 4 * 12
|
||||
.hword L(save_tmvs_tbl) - 40b
|
||||
.hword 4 * 12
|
||||
.hword L(save_tmvs_tbl) - 40b
|
||||
.hword 2 * 12
|
||||
.hword L(save_tmvs_tbl) - 20b
|
||||
.hword 2 * 12
|
||||
.hword L(save_tmvs_tbl) - 20b
|
||||
.hword 2 * 12
|
||||
.hword L(save_tmvs_tbl) - 20b
|
||||
.hword 2 * 12
|
||||
.hword L(save_tmvs_tbl) - 20b
|
||||
.hword 2 * 12
|
||||
.hword L(save_tmvs_tbl) - 20b
|
||||
.hword 1 * 12
|
||||
.hword L(save_tmvs_tbl) - 10b
|
||||
.hword 1 * 12
|
||||
.hword L(save_tmvs_tbl) - 10b
|
||||
.hword 1 * 12
|
||||
.hword L(save_tmvs_tbl) - 10b
|
||||
.hword 1 * 12
|
||||
.hword L(save_tmvs_tbl) - 10b
|
||||
.hword 1 * 12
|
||||
.hword L(save_tmvs_tbl) - 10b
|
||||
.hword 1 * 12
|
||||
.hword L(save_tmvs_tbl) - 10b
|
||||
.hword 1 * 12
|
||||
.hword L(save_tmvs_tbl) - 10b
|
||||
endfunc
|
||||
|
||||
jumptable save_tmvs_tbl
|
||||
.word 16 * 12
|
||||
.word 160b - save_tmvs_tbl
|
||||
.word 16 * 12
|
||||
.word 160b - save_tmvs_tbl
|
||||
.word 8 * 12
|
||||
.word 80b - save_tmvs_tbl
|
||||
.word 8 * 12
|
||||
.word 80b - save_tmvs_tbl
|
||||
.word 8 * 12
|
||||
.word 80b - save_tmvs_tbl
|
||||
.word 8 * 12
|
||||
.word 80b - save_tmvs_tbl
|
||||
.word 4 * 12
|
||||
.word 40b - save_tmvs_tbl
|
||||
.word 4 * 12
|
||||
.word 40b - save_tmvs_tbl
|
||||
.word 4 * 12
|
||||
.word 40b - save_tmvs_tbl
|
||||
.word 4 * 12
|
||||
.word 40b - save_tmvs_tbl
|
||||
.word 2 * 12
|
||||
.word 20b - save_tmvs_tbl
|
||||
.word 2 * 12
|
||||
.word 20b - save_tmvs_tbl
|
||||
.word 2 * 12
|
||||
.word 20b - save_tmvs_tbl
|
||||
.word 2 * 12
|
||||
.word 20b - save_tmvs_tbl
|
||||
.word 2 * 12
|
||||
.word 20b - save_tmvs_tbl
|
||||
.word 1 * 12
|
||||
.word 10b - save_tmvs_tbl
|
||||
.word 1 * 12
|
||||
.word 10b - save_tmvs_tbl
|
||||
.word 1 * 12
|
||||
.word 10b - save_tmvs_tbl
|
||||
.word 1 * 12
|
||||
.word 10b - save_tmvs_tbl
|
||||
.word 1 * 12
|
||||
.word 10b - save_tmvs_tbl
|
||||
.word 1 * 12
|
||||
.word 10b - save_tmvs_tbl
|
||||
.word 1 * 12
|
||||
.word 10b - save_tmvs_tbl
|
||||
endjumptable
|
||||
|
|
|
@ -323,6 +323,32 @@ EXTERN\name:
|
|||
\name:
|
||||
.endm
|
||||
|
||||
.macro jumptable name
|
||||
#ifdef _WIN32
|
||||
// MS armasm64 doesn't seem to be able to create relocations for subtraction
|
||||
// of labels in different sections; for armasm64 (and all of Windows for
|
||||
// simplicity), write the jump table in the text section, to allow calculating
|
||||
// differences at assembly time. See
|
||||
// https://developercommunity.visualstudio.com/t/armasm64-unable-to-create-cross-section/10722340
|
||||
// for reference. (LLVM can create such relocations, but checking for _WIN32
|
||||
// for simplicity, as execute-only memory isn't relevant on Windows at the
|
||||
// moment.)
|
||||
function \name
|
||||
#else
|
||||
// For other platforms, write jump tables in a const data section, to allow
|
||||
// working in environments where executable memory isn't readable.
|
||||
const \name
|
||||
#endif
|
||||
.endm
|
||||
|
||||
.macro endjumptable
|
||||
#ifdef _WIN32
|
||||
endfunc
|
||||
#else
|
||||
endconst
|
||||
#endif
|
||||
.endm
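Conceptually, the emitted tables store each target's signed 32-bit offset from the table base and add it back at dispatch time. As an illustration only (using the GNU C labels-as-values extension, not anything from dav1d), the same idea in C looks like this:

#include <stdio.h>

/* GNU C computed-goto analogue of ".word target - table_base" followed by
 * the ldrsw/add/br dispatch sequence used by the callers of these tables. */
static int dispatch(const int idx) {
    static const int offsets[] = {
        &&case0 - &&table_base,
        &&case1 - &&table_base,
        &&case2 - &&table_base,
    };
    goto *(&&table_base + offsets[idx]);
table_base:
case0:
    return 0;
case1:
    return 1;
case2:
    return 2;
}

int main(void) {
    printf("%d\n", dispatch(2));   /* prints 2 */
    return 0;
}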
|
||||
|
||||
#ifdef __APPLE__
|
||||
#define L(x) L ## x
|
||||
#else
|
||||
|
|
|
@ -29,6 +29,7 @@

#include "common/attributes.h"

#include "src/cpu.h"
#include "src/arm/cpu.h"

#if defined(HAVE_GETAUXVAL) || defined(HAVE_ELF_AUX_INFO)

@ -52,7 +53,7 @@ COLD unsigned dav1d_get_cpu_flags_arm(void) {
elf_aux_info(AT_HWCAP2, &hw_cap2, sizeof(hw_cap2));
#endif

unsigned flags = DAV1D_ARM_CPU_FLAG_NEON;
unsigned flags = dav1d_get_default_cpu_flags();
flags |= (hw_cap & HWCAP_AARCH64_ASIMDDP) ? DAV1D_ARM_CPU_FLAG_DOTPROD : 0;
flags |= (hw_cap2 & HWCAP2_AARCH64_I8MM) ? DAV1D_ARM_CPU_FLAG_I8MM : 0;
flags |= (hw_cap & HWCAP_AARCH64_SVE) ? DAV1D_ARM_CPU_FLAG_SVE : 0;

@ -75,7 +76,8 @@ COLD unsigned dav1d_get_cpu_flags_arm(void) {
elf_aux_info(AT_HWCAP, &hw_cap, sizeof(hw_cap));
#endif

unsigned flags = (hw_cap & HWCAP_ARM_NEON) ? DAV1D_ARM_CPU_FLAG_NEON : 0;
unsigned flags = dav1d_get_default_cpu_flags();
flags |= (hw_cap & HWCAP_ARM_NEON) ? DAV1D_ARM_CPU_FLAG_NEON : 0;
flags |= (hw_cap & HWCAP_ARM_ASIMDDP) ? DAV1D_ARM_CPU_FLAG_DOTPROD : 0;
flags |= (hw_cap & HWCAP_ARM_I8MM) ? DAV1D_ARM_CPU_FLAG_I8MM : 0;
return flags;

@ -95,7 +97,7 @@ static int have_feature(const char *feature) {
}

COLD unsigned dav1d_get_cpu_flags_arm(void) {
unsigned flags = DAV1D_ARM_CPU_FLAG_NEON;
unsigned flags = dav1d_get_default_cpu_flags();
if (have_feature("hw.optional.arm.FEAT_DotProd"))
flags |= DAV1D_ARM_CPU_FLAG_DOTPROD;
if (have_feature("hw.optional.arm.FEAT_I8MM"))

@ -104,16 +106,14 @@ COLD unsigned dav1d_get_cpu_flags_arm(void) {
return flags;
}

#elif defined(__OpenBSD__)

#if ARCH_AARCH64
#elif defined(__OpenBSD__) && ARCH_AARCH64
#include <machine/armreg.h>
#include <machine/cpu.h>
#include <sys/types.h>
#include <sys/sysctl.h>

COLD unsigned dav1d_get_cpu_flags_arm(void) {
unsigned flags = DAV1D_ARM_CPU_FLAG_NEON;
unsigned flags = dav1d_get_default_cpu_flags();

#ifdef CPU_ID_AA64ISAR0
int mib[2];

@ -142,25 +142,31 @@ COLD unsigned dav1d_get_cpu_flags_arm(void) {

return flags;
}
#else /* !ARCH_AARCH64 */

COLD unsigned dav1d_get_cpu_flags_arm(void) {
unsigned flags = DAV1D_ARM_CPU_FLAG_NEON;
return flags;
}
#endif /* ARCH_AARCH64 */

#elif defined(_WIN32)
#include <windows.h>

COLD unsigned dav1d_get_cpu_flags_arm(void) {
unsigned flags = DAV1D_ARM_CPU_FLAG_NEON;
unsigned flags = dav1d_get_default_cpu_flags();
#ifdef PF_ARM_V82_DP_INSTRUCTIONS_AVAILABLE
if (IsProcessorFeaturePresent(PF_ARM_V82_DP_INSTRUCTIONS_AVAILABLE))
flags |= DAV1D_ARM_CPU_FLAG_DOTPROD;
#endif
/* No I8MM or SVE feature detection available on Windows at the time of
 * writing. */
#ifdef PF_ARM_SVE_INSTRUCTIONS_AVAILABLE
if (IsProcessorFeaturePresent(PF_ARM_SVE_INSTRUCTIONS_AVAILABLE))
flags |= DAV1D_ARM_CPU_FLAG_SVE;
#endif
#ifdef PF_ARM_SVE2_INSTRUCTIONS_AVAILABLE
if (IsProcessorFeaturePresent(PF_ARM_SVE2_INSTRUCTIONS_AVAILABLE))
flags |= DAV1D_ARM_CPU_FLAG_SVE2;
#endif
#ifdef PF_ARM_SVE_I8MM_INSTRUCTIONS_AVAILABLE
/* There's no PF_* flag that indicates whether plain I8MM is available
 * or not. But if SVE_I8MM is available, that also implies that
 * regular I8MM is available. */
if (IsProcessorFeaturePresent(PF_ARM_SVE_I8MM_INSTRUCTIONS_AVAILABLE))
flags |= DAV1D_ARM_CPU_FLAG_I8MM;
#endif
return flags;
}

@ -206,7 +212,8 @@ static unsigned parse_proc_cpuinfo(const char *flag) {
}

COLD unsigned dav1d_get_cpu_flags_arm(void) {
unsigned flags = parse_proc_cpuinfo("neon") ? DAV1D_ARM_CPU_FLAG_NEON : 0;
unsigned flags = dav1d_get_default_cpu_flags();
flags |= parse_proc_cpuinfo("neon") ? DAV1D_ARM_CPU_FLAG_NEON : 0;
flags |= parse_proc_cpuinfo("asimd") ? DAV1D_ARM_CPU_FLAG_NEON : 0;
flags |= parse_proc_cpuinfo("asimddp") ? DAV1D_ARM_CPU_FLAG_DOTPROD : 0;
flags |= parse_proc_cpuinfo("i8mm") ? DAV1D_ARM_CPU_FLAG_I8MM : 0;

@ -220,7 +227,7 @@ COLD unsigned dav1d_get_cpu_flags_arm(void) {
#else /* Unsupported OS */

COLD unsigned dav1d_get_cpu_flags_arm(void) {
return 0;
return dav1d_get_default_cpu_flags();
}

#endif
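
For illustration only, not part of this patch: every per-OS branch above now follows the same shape, seeding the flag set from dav1d_get_default_cpu_flags() (the compile-time baseline) and OR-ing in whatever the runtime reports. A self-contained sketch of that shape for a hypothetical new port is shown below; all HWCAP_FOO_*/CPU_FLAG_*/newos names are placeholders, not dav1d identifiers.

/* Placeholder bits standing in for DAV1D_ARM_CPU_FLAG_* and for whatever
 * hwcap-style bits the hypothetical OS exposes. */
#define CPU_FLAG_NEON     (1u << 0)
#define CPU_FLAG_DOTPROD  (1u << 1)
#define HWCAP_FOO_NEON    0x01u
#define HWCAP_FOO_DOTPROD 0x02u

static unsigned get_default_cpu_flags(void) {
    unsigned flags = 0;
#if defined(__ARM_NEON)
    flags |= CPU_FLAG_NEON;  /* guaranteed by the compile target, no runtime check needed */
#endif
    return flags;
}

/* Hypothetical OS query; a real port would call getauxval(), elf_aux_info(),
 * sysctl(), IsProcessorFeaturePresent(), or similar. */
static unsigned query_os_hwcaps(void) { return HWCAP_FOO_NEON; }

unsigned get_cpu_flags_newos(void) {
    unsigned flags = get_default_cpu_flags();   /* compile-time baseline */
    const unsigned hw_cap = query_os_hwcaps();  /* runtime detection */
    flags |= (hw_cap & HWCAP_FOO_NEON)    ? CPU_FLAG_NEON    : 0;
    flags |= (hw_cap & HWCAP_FOO_DOTPROD) ? CPU_FLAG_DOTPROD : 0;
    return flags;
}
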
@ -63,6 +63,7 @@
decl_8tap_fns(neon);
decl_8tap_fns(neon_dotprod);
decl_8tap_fns(neon_i8mm);
decl_8tap_fns(sve2);

decl_mc_fn(BF(dav1d_put_bilin, neon));
decl_mct_fn(BF(dav1d_prep_bilin, neon));

@ -110,17 +111,27 @@ static ALWAYS_INLINE void mc_dsp_init_arm(Dav1dMCDSPContext *const c) {
c->warp8x8t = BF(dav1d_warp_affine_8x8t, neon);
c->emu_edge = BF(dav1d_emu_edge, neon);

#if ARCH_AARCH64 && BITDEPTH == 8
#if ARCH_AARCH64
#if BITDEPTH == 8
#if HAVE_DOTPROD
if (!(flags & DAV1D_ARM_CPU_FLAG_DOTPROD)) return;

init_8tap_fns(neon_dotprod);
if (flags & DAV1D_ARM_CPU_FLAG_DOTPROD) {
init_8tap_fns(neon_dotprod);
}
#endif // HAVE_DOTPROD

#if HAVE_I8MM
if (!(flags & DAV1D_ARM_CPU_FLAG_I8MM)) return;

init_8tap_fns(neon_i8mm);
if (flags & DAV1D_ARM_CPU_FLAG_I8MM) {
init_8tap_fns(neon_i8mm);
}
#endif // HAVE_I8MM
#endif // ARCH_AARCH64 && BITDEPTH == 8
#endif // BITDEPTH == 8

#if BITDEPTH == 16
#if HAVE_SVE2
if (flags & DAV1D_ARM_CPU_FLAG_SVE2) {
init_8tap_fns(sve2);
}
#endif // HAVE_SVE2
#endif // BITDEPTH == 16
#endif // ARCH_AARCH64
}
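
For illustration only, not part of this patch: the hunk above replaces early returns with independently guarded blocks, so a CPU lacking one optional extension (say DOTPROD) can still have function pointers for an unrelated one (the 16-bit SVE2 paths) installed. The difference between the two styles, reduced to placeholder names, is sketched below.

#include <stdio.h>

#define FLAG_A (1u << 0)
#define FLAG_B (1u << 1)

static void init_a(void) { puts("init A"); }
static void init_b(void) { puts("init B"); }

/* Early-return style: once one feature is missing, everything after it is
 * skipped, even features that are detected independently. */
static void init_early_return(unsigned flags) {
    if (!(flags & FLAG_A)) return;
    init_a();
    if (!(flags & FLAG_B)) return;
    init_b();
}

/* Guarded-block style (what the hunk switches to): each feature is
 * considered on its own, so B still gets set up when only A is missing. */
static void init_guarded(unsigned flags) {
    if (flags & FLAG_A) init_a();
    if (flags & FLAG_B) init_b();
}

int main(void) {
    init_early_return(FLAG_B); /* prints nothing */
    init_guarded(FLAG_B);      /* prints "init B" */
    return 0;
}
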
@ -33,20 +33,24 @@

#ifdef _WIN32
#include <windows.h>
#elif defined(__APPLE__)
#endif
#ifdef __APPLE__
#include <sys/sysctl.h>
#include <sys/types.h>
#else
#include <pthread.h>
#endif
#ifdef HAVE_UNISTD_H
#include <unistd.h>
#endif

#ifdef HAVE_PTHREAD_GETAFFINITY_NP
#include <pthread.h>
#ifdef HAVE_PTHREAD_NP_H
#include <pthread_np.h>
#endif
#if defined(__FreeBSD__)
#define cpu_set_t cpuset_t
#endif
#endif

unsigned dav1d_cpu_flags = 0U;
unsigned dav1d_cpu_flags_mask = ~0U;
@ -54,12 +54,9 @@ void dav1d_init_cpu(void);
DAV1D_API void dav1d_set_cpu_flags_mask(unsigned mask);
int dav1d_num_logical_processors(Dav1dContext *c);

static ALWAYS_INLINE unsigned dav1d_get_cpu_flags(void) {
unsigned flags = dav1d_cpu_flags & dav1d_cpu_flags_mask;
static ALWAYS_INLINE unsigned dav1d_get_default_cpu_flags(void) {
unsigned flags = 0;

#if TRIM_DSP_FUNCTIONS
/* Since this function is inlined, unconditionally setting a flag here will
 * enable dead code elimination in the calling function. */
#if ARCH_AARCH64 || ARCH_ARM
#if defined(__ARM_NEON) || defined(__APPLE__) || defined(_WIN32) || ARCH_AARCH64
flags |= DAV1D_ARM_CPU_FLAG_NEON;

@ -119,6 +116,17 @@ static ALWAYS_INLINE unsigned dav1d_get_cpu_flags(void) {
flags |= DAV1D_X86_CPU_FLAG_SSE2;
#endif
#endif

return flags;
}

static ALWAYS_INLINE unsigned dav1d_get_cpu_flags(void) {
unsigned flags = dav1d_cpu_flags & dav1d_cpu_flags_mask;

#if TRIM_DSP_FUNCTIONS
/* Since this function is inlined, unconditionally setting a flag here will
 * enable dead code elimination in the calling function. */
flags |= dav1d_get_default_cpu_flags();
#endif

return flags;
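
For illustration only, not part of this patch: the TRIM_DSP_FUNCTIONS comment is about constant folding. Because dav1d_get_cpu_flags() is inlined, any flag that dav1d_get_default_cpu_flags() sets unconditionally becomes a compile-time constant in the caller, and the compiler can typically drop the other branches and the objects only they reference. A simplified stand-alone illustration is sketched below; the names are placeholders, not dav1d's.

/* Build with e.g. -DKNOWN_BASELINE -O2 and compare the generated code: when
 * the baseline flag is a compile-time constant, the fallback assignment is a
 * dead store and filter_c can be discarded entirely. */
#include <stdio.h>

#define FLAG_SIMD (1u << 0)

static inline unsigned get_cpu_flags(void) {
    unsigned flags = 0;
#ifdef KNOWN_BASELINE
    flags |= FLAG_SIMD;        /* constant-folded into every caller */
#endif
    return flags;
}

static void filter_c(void)    { puts("C fallback"); }
static void filter_simd(void) { puts("SIMD path"); }

void init_filters(void (**fn)(void)) {
    *fn = filter_c;                      /* dead store when FLAG_SIMD is constant */
    if (get_cpu_flags() & FLAG_SIMD)
        *fn = filter_simd;
}

int main(void) {
    void (*fn)(void);
    init_filters(&fn);
    fn();
    return 0;
}
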
@ -26,6 +26,8 @@

#include "config.h"
#include "common/attributes.h"

#include "src/cpu.h"
#include "src/loongarch/cpu.h"

#if defined(HAVE_GETAUXVAL)

@ -36,7 +38,7 @@
#endif

COLD unsigned dav1d_get_cpu_flags_loongarch(void) {
unsigned flags = 0;
unsigned flags = dav1d_get_default_cpu_flags();
#if defined(HAVE_GETAUXVAL)
unsigned long hw_cap = getauxval(AT_HWCAP);
flags |= (hw_cap & LA_HWCAP_LSX) ? DAV1D_LOONGARCH_CPU_FLAG_LSX : 0;
@ -109,16 +109,7 @@ void *dav1d_malloc(const enum AllocationType type, const size_t sz) {
void *dav1d_alloc_aligned(const enum AllocationType type,
const size_t sz, const size_t align)
{
assert(!(align & (align - 1)));
void *ptr;
#ifdef _WIN32
ptr = _aligned_malloc(sz + align, align);
#elif defined(HAVE_POSIX_MEMALIGN)
if (posix_memalign(&ptr, align, sz + align)) return NULL;
#else
ptr = memalign(align, sz + align);
#endif

void *const ptr = dav1d_alloc_aligned_internal(sz + align, align);
return track_alloc(type, ptr, sz, align);
}

@ -140,12 +131,7 @@ void dav1d_free(void *ptr) {

void dav1d_free_aligned(void *ptr) {
if (ptr) {
ptr = track_free(ptr);
#ifdef _WIN32
_aligned_free(ptr);
#else
free(ptr);
#endif
dav1d_free_aligned_internal(track_free(ptr));
}
}

@ -32,7 +32,7 @@

#include <stdlib.h>

#if defined(_WIN32) || !defined(HAVE_POSIX_MEMALIGN)
#if defined(_WIN32) || defined(HAVE_MEMALIGN)
#include <malloc.h>
#endif

@ -79,6 +79,39 @@ typedef struct Dav1dMemPool {
#endif
} Dav1dMemPool;

// TODO: Move this to a common location?
#define ROUND_UP(x,a) (((x)+((a)-1)) & ~((a)-1))

/*
 * Allocate align-byte aligned memory. The return value can be released
 * by calling the dav1d_free_aligned() function.
 */
static inline void *dav1d_alloc_aligned_internal(const size_t sz, const size_t align) {
assert(!(align & (align - 1)));
#ifdef _WIN32
return _aligned_malloc(sz, align);
#elif defined(HAVE_POSIX_MEMALIGN)
void *ptr;
if (posix_memalign(&ptr, align, sz)) return NULL;
return ptr;
#elif defined(HAVE_MEMALIGN)
return memalign(align, sz);
#elif defined(HAVE_ALIGNED_ALLOC)
// The C11 standard specifies that the size parameter
// must be an integral multiple of alignment.
return aligned_alloc(align, ROUND_UP(sz, align));
#else
#error No aligned allocation functions are available
#endif
}

static inline void dav1d_free_aligned_internal(void *ptr) {
#ifdef _WIN32
_aligned_free(ptr);
#else
free(ptr);
#endif
}

#if TRACK_HEAP_ALLOCATIONS
void *dav1d_malloc(enum AllocationType type, size_t sz);

@ -91,34 +124,9 @@ void dav1d_log_alloc_stats(Dav1dContext *c);
#define dav1d_mem_pool_init(type, pool) dav1d_mem_pool_init(pool)
#define dav1d_malloc(type, sz) malloc(sz)
#define dav1d_realloc(type, ptr, sz) realloc(ptr, sz)
#define dav1d_alloc_aligned(type, sz, align) dav1d_alloc_aligned_internal(sz, align)
#define dav1d_free(ptr) free(ptr)

/*
 * Allocate align-byte aligned memory. The return value can be released
 * by calling the dav1d_free_aligned() function.
 */
static inline void *dav1d_alloc_aligned(const size_t sz, const size_t align) {
assert(!(align & (align - 1)));
#ifdef _WIN32
return _aligned_malloc(sz, align);
#elif defined(HAVE_POSIX_MEMALIGN)
void *ptr;
if (posix_memalign(&ptr, align, sz)) return NULL;
return ptr;
#else
return memalign(align, sz);
#endif
}
#define dav1d_alloc_aligned(type, sz, align) dav1d_alloc_aligned(sz, align)

static inline void dav1d_free_aligned(void *ptr) {
#ifdef _WIN32
_aligned_free(ptr);
#else
free(ptr);
#endif
}

#define dav1d_free_aligned(ptr) dav1d_free_aligned_internal(ptr)
#endif /* TRACK_HEAP_ALLOCATIONS */

void dav1d_mem_pool_push(Dav1dMemPool *pool, Dav1dMemPoolBuffer *buf);
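
For illustration only, not part of this patch: whichever backend dav1d_alloc_aligned_internal() ends up using, the returned pointer must go back through the matching aligned free, since _aligned_malloc() memory cannot be handed to plain free() on Windows, and aligned_alloc() additionally requires the requested size to be a multiple of the alignment (hence the ROUND_UP above). A minimal stand-alone sketch of the same portability pattern, assuming a C11 libc outside Windows, is shown below.

#include <assert.h>
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#ifdef _WIN32
#include <malloc.h>
#endif

#define ROUND_UP(x,a) (((x)+((a)-1)) & ~((a)-1))

static void *alloc_aligned(const size_t sz, const size_t align) {
    assert(!(align & (align - 1)));   /* alignment must be a power of two */
#ifdef _WIN32
    return _aligned_malloc(sz, align);
#else
    /* C11: the size passed to aligned_alloc() must be a multiple of align. */
    return aligned_alloc(align, ROUND_UP(sz, align));
#endif
}

static void free_aligned(void *ptr) {
#ifdef _WIN32
    _aligned_free(ptr);               /* plain free() is not valid here */
#else
    free(ptr);
#endif
}

int main(void) {
    uint8_t *const buf = alloc_aligned(1000, 64);
    if (!buf) return 1;
    assert(((uintptr_t)buf & 63) == 0);
    memset(buf, 0, 1000);
    free_aligned(buf);
    return 0;
}
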
@ -119,6 +119,7 @@ if is_asm_enabled
'arm/64/loopfilter16.S',
'arm/64/looprestoration16.S',
'arm/64/mc16.S',
'arm/64/mc16_sve.S',
)
endif
elif host_machine.cpu_family().startswith('arm')

@ -370,7 +371,7 @@ libdav1d = library('dav1d',
)

dav1d_dep = declare_dependency(link_with: libdav1d,
include_directories : include_directories('../include/dav1d')
include_directories : include_directories('../include')
)

#
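
For illustration only, not part of this patch: moving dav1d_dep's include directory up one level matches the earlier switch of the public headers to dav1d/-prefixed includes, so downstream code reaches the API as <dav1d/dav1d.h>. A hedged consumer-side sketch using only documented public entry points is shown below (link with -ldav1d).

/* With the dependency exposing ../include rather than ../include/dav1d,
 * the public headers are included through their dav1d/ prefix, matching
 * how dav1d.h itself now pulls in dav1d/common.h and friends. */
#include <dav1d/dav1d.h>
#include <stdio.h>

int main(void) {
    Dav1dSettings settings;
    dav1d_default_settings(&settings);  /* public API, unchanged by this patch */
    printf("libdav1d version: %s\n", dav1d_version());
    return 0;
}
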
@ -201,16 +201,6 @@ int dav1d_thread_picture_alloc(Dav1dContext *const c, Dav1dFrameContext *const f
(void **) &p->progress);
if (res) return res;

dav1d_picture_copy_props(&p->p, c->content_light, c->content_light_ref,
c->mastering_display, c->mastering_display_ref,
c->itut_t35, c->itut_t35_ref, c->n_itut_t35,
&f->tile[0].data.m);

// Must be removed from the context after being attached to the frame
dav1d_ref_dec(&c->itut_t35_ref);
c->itut_t35 = NULL;
c->n_itut_t35 = 0;

// Don't clear these flags from c->frame_flags if the frame is not going to be output.
// This way they will be added to the next visible frame too.
const int flags_mask = ((f->frame_hdr->show_frame || c->output_invisible_frames) &&

@ -221,6 +211,22 @@ int dav1d_thread_picture_alloc(Dav1dContext *const c, Dav1dFrameContext *const f

p->visible = f->frame_hdr->show_frame;
p->showable = f->frame_hdr->showable_frame;

if (p->visible) {
// Only add HDR10+ and T35 metadata when show frame flag is enabled
dav1d_picture_copy_props(&p->p, c->content_light, c->content_light_ref,
c->mastering_display, c->mastering_display_ref,
c->itut_t35, c->itut_t35_ref, c->n_itut_t35,
&f->tile[0].data.m);

// Must be removed from the context after being attached to the frame
dav1d_ref_dec(&c->itut_t35_ref);
c->itut_t35 = NULL;
c->n_itut_t35 = 0;
} else {
dav1d_data_props_copy(&p->p.m, &f->tile[0].data.m);
}

if (c->n_fc > 1) {
atomic_init(&p->progress[0], 0);
atomic_init(&p->progress[1], 0);
@ -29,6 +29,7 @@

#include "common/attributes.h"

#include "src/cpu.h"
#include "src/ppc/cpu.h"

#if (defined(HAVE_GETAUXVAL) || defined(HAVE_ELF_AUX_INFO)) && ARCH_PPC64LE

@ -37,7 +38,7 @@
#endif

COLD unsigned dav1d_get_cpu_flags_ppc(void) {
unsigned flags = 0;
unsigned flags = dav1d_get_default_cpu_flags();
#if defined(HAVE_GETAUXVAL) && ARCH_PPC64LE
unsigned long hw_cap = getauxval(AT_HWCAP);
unsigned long hw_cap2 = getauxval(AT_HWCAP2);
@ -43,22 +43,26 @@ PACKED(typedef struct refmvs_temporal_block {
mv mv;
int8_t ref;
}) refmvs_temporal_block;
CHECK_SIZE(refmvs_temporal_block, 5);

typedef union refmvs_refpair {
PACKED(typedef union refmvs_refpair {
int8_t ref[2]; // [0] = 0: intra=1, [1] = -1: comp=0
uint16_t pair;
} refmvs_refpair;
}) ALIGN(refmvs_refpair, 2);
CHECK_SIZE(refmvs_refpair, 2);

typedef union refmvs_mvpair {
mv mv[2];
uint64_t n;
} refmvs_mvpair;
CHECK_SIZE(refmvs_mvpair, 8);

PACKED(typedef struct refmvs_block {
refmvs_mvpair mv;
refmvs_refpair ref;
uint8_t bs, mf; // 1 = globalmv+affine, 2 = newmv
}) ALIGN(refmvs_block, 4);
CHECK_SIZE(refmvs_block, 12);

typedef struct refmvs_frame {
const Dav1dFrameHeader *frm_hdr;
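
For illustration only, not part of this patch: these CHECK_SIZE() lines pin down the exact byte layout that the hand-written assembly elsewhere in this diff (for example the 12-byte stride used by the save_tmvs table) relies on. The same kind of guard can be written with plain C11 static_assert and GCC/Clang packing attributes, as in the stand-alone sketch below; the my_* types are illustrative only and use different sizes than dav1d's structs.

#include <assert.h>   /* static_assert (C11) */
#include <stdint.h>

/* Packed so no padding is inserted and sizeof() is exactly what any
 * hand-written assembly would expect. */
typedef struct __attribute__((packed)) my_mv {
    int8_t x, y;
} my_mv;

typedef struct __attribute__((packed, aligned(4))) my_block {
    my_mv   mv[2];   /* 4 bytes */
    int8_t  ref[2];  /* 2 bytes */
    uint8_t bs, mf;  /* 2 bytes */
} my_block;

/* If a field is added or the layout changes, the build fails here instead
 * of silently desyncing from code that hard-codes these sizes. */
static_assert(sizeof(my_mv) == 2, "my_mv layout changed");
static_assert(sizeof(my_block) == 8, "my_block layout changed");

int main(void) { return 0; }
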
@ -29,6 +29,7 @@

#include "common/attributes.h"

#include "src/cpu.h"
#include "src/riscv/cpu.h"

#if defined(HAVE_GETAUXVAL)

@ -41,7 +42,7 @@
int dav1d_has_compliant_rvv(void);

COLD unsigned dav1d_get_cpu_flags_riscv(void) {
unsigned flags = 0;
unsigned flags = dav1d_get_default_cpu_flags();
#if defined(HAVE_GETAUXVAL)
unsigned long hw_cap = getauxval(AT_HWCAP);
flags |= (hw_cap & HWCAP_RVV) && dav1d_has_compliant_rvv() ? DAV1D_RISCV_CPU_FLAG_V : 0;
@ -132,6 +132,14 @@ static inline int pthread_cond_broadcast(pthread_cond_t *const cond) {
#else

#include <pthread.h>
#if defined(__FreeBSD__)
/* ALIGN from <sys/param.h> conflicts with ALIGN from "common/attributes.h" */
#define _SYS_PARAM_H_
#include <sys/types.h>
#endif
#ifdef HAVE_PTHREAD_NP_H
#include <pthread_np.h>
#endif

#define dav1d_init_thread() do {} while (0)

@ -145,31 +153,30 @@ static inline void dav1d_set_thread_name(const char *const name) {
prctl(PR_SET_NAME, name);
}

#elif defined(__APPLE__)
#elif defined(HAVE_PTHREAD_SETNAME_NP) && defined(__APPLE__)

static inline void dav1d_set_thread_name(const char *const name) {
pthread_setname_np(name);
}

#elif defined(__DragonFly__) || defined(__FreeBSD__) || defined(__OpenBSD__)

#if defined(__FreeBSD__)
/* ALIGN from <sys/param.h> conflicts with ALIGN from "common/attributes.h" */
#define _SYS_PARAM_H_
#include <sys/types.h>
#endif
#include <pthread_np.h>

static inline void dav1d_set_thread_name(const char *const name) {
pthread_set_name_np(pthread_self(), name);
}

#elif defined(__NetBSD__)
#elif defined(HAVE_PTHREAD_SETNAME_NP) && defined(__NetBSD__)

static inline void dav1d_set_thread_name(const char *const name) {
pthread_setname_np(pthread_self(), "%s", (void*)name);
}

#elif defined(HAVE_PTHREAD_SETNAME_NP)

static inline void dav1d_set_thread_name(const char *const name) {
pthread_setname_np(pthread_self(), name);
}

#elif defined(HAVE_PTHREAD_SET_NAME_NP)

static inline void dav1d_set_thread_name(const char *const name) {
pthread_set_name_np(pthread_self(), name);
}

#elif defined(__HAIKU__)

#include <os/kernel/OS.h>
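
For illustration only, not part of this patch: dav1d_set_thread_name() is purely a debugging aid, and the hunk above only reshuffles which platform-specific variant gets picked based on configure-time HAVE_* checks. A minimal POSIX sketch of the underlying idea, assuming glibc's two-argument pthread_setname_np(), is shown below (build with -pthread).

#define _GNU_SOURCE        /* for pthread_setname_np on glibc */
#include <pthread.h>
#include <stdio.h>
#include <unistd.h>

static void *worker(void *arg) {
    (void)arg;
    /* Name the current thread so it shows up in debuggers and `top -H`.
     * glibc limits the name to 15 characters plus the terminator. */
    pthread_setname_np(pthread_self(), "dav1d-worker");
    sleep(1);
    return NULL;
}

int main(void) {
    pthread_t t;
    if (pthread_create(&t, NULL, worker, NULL)) {
        perror("pthread_create");
        return 1;
    }
    pthread_join(t, NULL);
    return 0;
}
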
@ -32,6 +32,7 @@

#include "common/attributes.h"

#include "src/cpu.h"
#include "src/x86/cpu.h"

typedef struct {

@ -52,7 +53,7 @@ COLD unsigned dav1d_get_cpu_flags_x86(void) {
};
} cpu;
dav1d_cpu_cpuid(&cpu.r, 0, 0);
unsigned flags = 0;
unsigned flags = dav1d_get_default_cpu_flags();

if (cpu.max_leaf >= 1) {
CpuidRegisters r;