Bug 1754070: Update libdav1d to 1f09a9119fb794ab41b1e527d848c2a210ca43d4 r=jbauman

Differential Revision: https://phabricator.services.mozilla.com/D138068
2022-02-13 02:58:26 +00:00 · 2022-02-13 02:58:26 +00:00 · 0c73045ce8
--- a/media/libdav1d/moz.yaml
+++ b/media/libdav1d/moz.yaml
@ -20,11 +20,11 @@ origin:

  # Human-readable identifier for this version/release
  # Generally "version NNN", "tag SSS", "bookmark SSS"
-  release: commit b562b7f648e26e64fae892495527b5b275d53183 (2022-01-10T14:49:11.000+00:00).
+  release: commit 1f09a9119fb794ab41b1e527d848c2a210ca43d4 (2022-02-04T23:02:17.000-03:00).

  # Revision to pull in
  # Must be a long or short commit SHA (long preferred)
-  revision: b562b7f648e26e64fae892495527b5b275d53183
+  revision: 1f09a9119fb794ab41b1e527d848c2a210ca43d4

  # The package's license, where possible using the mnemonic from
  # https://spdx.org/licenses/
@ -65,3 +65,4 @@ vendoring:
      file: '{yaml_dir}/vcs_version.h'


+
--- a/media/libdav1d/vcs_version.h
+++ b/media/libdav1d/vcs_version.h
@ -1,2 +1,2 @@
 /* auto-generated, do not edit */
-#define DAV1D_VERSION "b562b7f648e26e64fae892495527b5b275d53183"
+#define DAV1D_VERSION "1f09a9119fb794ab41b1e527d848c2a210ca43d4"
--- a/third_party/dav1d/include/dav1d/dav1d.h
+++ b/third_party/dav1d/include/dav1d/dav1d.h
@ -58,23 +58,35 @@ typedef struct Dav1dLogger {
    void (*callback)(void *cookie, const char *format, va_list ap);
 } Dav1dLogger;

+enum Dav1dInloopFilterType {
+    DAV1D_INLOOPFILTER_NONE        = 0,
+    DAV1D_INLOOPFILTER_DEBLOCK     = 1 << 0,
+    DAV1D_INLOOPFILTER_CDEF        = 1 << 1,
+    DAV1D_INLOOPFILTER_RESTORATION = 1 << 2,
+    DAV1D_INLOOPFILTER_ALL = DAV1D_INLOOPFILTER_DEBLOCK |
+                             DAV1D_INLOOPFILTER_CDEF |
+                             DAV1D_INLOOPFILTER_RESTORATION,
+};
+
 typedef struct Dav1dSettings {
-    int n_threads; ///< number of threads (0 = auto)
-    int max_frame_delay; ///< Set to 1 for low-latency decoding (0 = auto)
-    int apply_grain;
-    int operating_point; ///< select an operating point for scalable AV1 bitstreams (0 - 31)
-    int all_layers; ///< output all spatial layers of a scalable AV1 biststream
-    unsigned frame_size_limit; ///< maximum frame size, in pixels (0 = unlimited)
+    int n_threads; ///< number of threads (0 = number of logical cores in host system, default 0)
+    int max_frame_delay; ///< Set to 1 for low-latency decoding (0 = ceil(sqrt(n_threads)), default 0)
+    int apply_grain; ///< whether to apply film grain on output frames (default 1)
+    int operating_point; ///< select an operating point for scalable AV1 bitstreams (0 - 31, default 0)
+    int all_layers; ///< output all spatial layers of a scalable AV1 bitstream (default 1)
+    unsigned frame_size_limit; ///< maximum frame size, in pixels (0 = unlimited, default 0)
    Dav1dPicAllocator allocator; ///< Picture allocator callback.
    Dav1dLogger logger; ///< Logger callback.
    int strict_std_compliance; ///< whether to abort decoding on standard compliance violations
                               ///< that don't affect actual bitstream decoding (e.g. inconsistent
-                               ///< or invalid metadata)
+                               ///< or invalid metadata, default 0)
    int output_invisible_frames; ///< output invisibly coded frames (in coding order) in addition
                                 ///< to all visible frames. Because of show-existing-frame, this
                                 ///< means some frames may appear twice (once when coded,
-                                 ///< once when shown)
-    uint8_t reserved[24]; ///< reserved for future use
+                                 ///< once when shown, default 0)
+    enum Dav1dInloopFilterType inloop_filters; ///< postfilters to enable during decoding (default
+                                               ///< DAV1D_INLOOPFILTER_ALL)
+    uint8_t reserved[20]; ///< reserved for future use
 } Dav1dSettings;

 /**
--- a/third_party/dav1d/meson.build
+++ b/third_party/dav1d/meson.build
@ -30,7 +30,7 @@ project('dav1d', ['c'],
                      'b_ndebug=if-release'],
    meson_version: '>= 0.49.0')

-dav1d_soname_version       = '6.3.0'
+dav1d_soname_version       = '6.4.0'
 dav1d_api_version_array    = dav1d_soname_version.split('.')
 dav1d_api_version_major    = dav1d_api_version_array[0]
 dav1d_api_version_minor    = dav1d_api_version_array[1]
@ -109,10 +109,6 @@ if host_machine.system() == 'windows'
        cdata.set('ftello', '_ftelli64')
    endif

-    if (host_machine.cpu_family() == 'x86_64' and cc.get_id() == 'gcc')
-        optional_arguments += '-mcmodel=small'
-    endif
-
    # On Windows, we use a compatibility layer to emulate pthread
    thread_dependency = []
    thread_compat_dep = declare_dependency(sources : files('src/win32/thread.c'))
--- a/third_party/dav1d/src/arm/32/mc.S
+++ b/third_party/dav1d/src/arm/32/mc.S
@ -1146,6 +1146,16 @@ endfunc
        vmla.s16        \d,  \s2,  d0[2]
        vmla.s16        \d,  \s3,  d0[3]
 .endm
+.macro mul_mla_8_0 d0, s0, s1, s2, s3, s4, s5, s6, s7
+        vmul.s16        \d0, \s0,  d0[0]
+        vmla.s16        \d0, \s1,  d0[1]
+        vmla.s16        \d0, \s2,  d0[2]
+        vmla.s16        \d0, \s3,  d0[3]
+        vmla.s16        \d0, \s4,  d1[0]
+        vmla.s16        \d0, \s5,  d1[1]
+        vmla.s16        \d0, \s6,  d1[2]
+        vmla.s16        \d0, \s7,  d1[3]
+.endm
 .macro mul_mla_8_1 d0, d1, s0, s1, s2, s3, s4, s5, s6, s7, s8
        vmul.s16        \d0, \s0, d0[0]
        vmla.s16        \d0, \s1, d0[1]
@ -1182,24 +1192,6 @@ endfunc
        vmla.s16        \d1, \s8, d1[2]
        vmla.s16        \d1, \s9, d1[3]
 .endm
-.macro mul_mla_8_4 d0, d1, s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11
-        vmul.s16        \d0, \s0,  d0[0]
-        vmla.s16        \d0, \s1,  d0[1]
-        vmla.s16        \d0, \s2,  d0[2]
-        vmla.s16        \d0, \s3,  d0[3]
-        vmla.s16        \d0, \s4,  d1[0]
-        vmla.s16        \d0, \s5,  d1[1]
-        vmla.s16        \d0, \s6,  d1[2]
-        vmla.s16        \d0, \s7,  d1[3]
-        vmul.s16        \d1, \s4,  d0[0]
-        vmla.s16        \d1, \s5,  d0[1]
-        vmla.s16        \d1, \s6,  d0[2]
-        vmla.s16        \d1, \s7,  d0[3]
-        vmla.s16        \d1, \s8,  d1[0]
-        vmla.s16        \d1, \s9,  d1[1]
-        vmla.s16        \d1, \s10, d1[2]
-        vmla.s16        \d1, \s11, d1[3]
-.endm
 .macro vqrshrun_s16 shift, q0, d0, q1, d1, q2, d2, q3, d3
        vqrshrun.s16    \d0, \q0, #\shift
 .ifnb \q1
@ -1623,7 +1615,7 @@ L(\type\()_8tap_v_tbl):
        st_16           \d_strd, d6, 4
        pop             {r4-r11,pc}

-28:     // 2x8, 2x16 v
+28:     // 2x6, 2x8, 2x12, 2x16 v
        vpush           {q4-q7}
        vld1.8          {d0}, [\my, :64]
        sub             \sr2,  \src,  \s_strd, lsl #1
@ -1642,34 +1634,37 @@ L(\type\()_8tap_v_tbl):
        vmov            d7,  d10
        vmov            d9,  d12
 216:
-        subs            \h,  \h,  #8
+        subs            \h,  \h,  #4
        load_16         \sr2, \src, \s_strd, d16, d18, d20, d22
-        load_16         \sr2, \src, \s_strd, d24, d26, d28, d30
        interleave_1_16 d14, d16, d18, d20, d22
-        interleave_1_16 d22, d24, d26, d28, d30
        vmovl_u8        q7,  d14, q8,  d16, q9,  d18, q10, d20
-        vmovl_u8        q11, d22, q12, d24, q13, d26, q14, d28
        vmov            d11, d14
        vmov            d13, d16
        vmov            d15, d18
        vmov            d17, d20
-        vmov            d19, d22
-        vmov            d21, d24
-        vmov            d23, d26
-        vmov            d25, d28
-        mul_mla_8_4     q1,  q2,  q1,  q2,  q3,  q4,  q5,  q6,  q7,  q8,  q9,  q10, q11, q12
-        vqrshrun_s16    6,   q1,  d2,  q2,  d4
+        mul_mla_8_0     q1,  q1,  q2,  q3,  q4,  q5,  q6,  q7, q8
+        vqrshrun_s16    6,   q1,  d2
        st_16           \d_strd, d2, 4
-        st_16           \d_strd, d4, 4
        ble             0f
-        vmov            q1,  q9
-        vmov            q2,  q10
-        vmov            q3,  q11
-        vmov            q4,  q12
-        vmov            q5,  q13
-        vmov            q6,  q14
-        vmov            d14, d30
+        cmp             \h,  #2
+        vmov            q1,  q5
+        vmov            q2,  q6
+        vmov            q3,  q7
+        vmov            q4,  q8
+        vmov            q5,  q9
+        vmov            q6,  q10
+        vmov            d14, d22
+        beq             26f
        b               216b
+26:
+        load_16         \sr2, \src, \s_strd, d16, d18
+        interleave_1_16 d14, d16, d18
+        vmovl_u8        q7,  d14, q8,  d16
+        vmov            d11, d14
+        vmov            d13, d16
+        mul_mla_8_0     d2,  d2,  d4,  d6,  d8,  d10, d12, d14, d16
+        vqrshrun_s16    6,   q1,  d2
+        st_16           \d_strd, d2, 2
 0:
        vpop            {q4-q7}
        pop             {r4-r11,pc}
@ -1703,7 +1698,7 @@ L(\type\()_8tap_v_tbl):
 0:
        pop             {r4-r11,pc}

-480:    // 4x8, 4x16 v
+480:    // 4x6, 4x8, 4x12, 4x16 v
        vpush           {q4}
        vld1.8          {d0}, [\my, :64]
        sub             \sr2, \src, \s_strd, lsl #1
@ -1726,12 +1721,19 @@ L(\type\()_8tap_v_tbl):
        mul_mla_8_2     q1,  q2,  q1,  q2,  q3,  q4,  q8,  q9,  q10, q11, q12, q13
        shift_store_4   \type, \d_strd, q1,  d2,  d3,  q2,  d4,  d5
        ble             0f
-        subs            \h,  \h,  #4
-        load_32         \sr2,  \src, \s_strd, d30, d2,  d4,  d6
-        interleave_1_32 d28, d30, d2,  d4,  d6
-        vmovl_u8        q14, d28, q15, d30, q1,  d2,  q2,  d4
-        mul_mla_8_2     q8,  q9,  q8,  q9,  q10, q11, q12, q13, q14, q15, q1,  q2
-        shift_store_4   \type, \d_strd, q8,  d16, d17, q9,  d18, d19
+        load_32         \sr2,  \src, \s_strd, d30, d2
+        subs            \h,  \h,  #2
+        interleave_1_32 d28, d30, d2
+        vmovl_u8        q14, d28, q15, d30
+        mul_mla_8_0     q8,  q8,  q9,  q10, q11, q12, q13, q14, q15
+        shift_store_4   \type, \d_strd, q8,  d16, d17
+        ble             0f
+        load_32         \sr2,  \src, \s_strd, d4,  d6
+        subs            \h,  \h,  #2
+        interleave_1_32 d2,  d4,  d6
+        vmovl_u8        q1,  d2,  q2,  d4
+        mul_mla_8_0     q9,  q10, q11, q12, q13, q14, q15, q1,  q2
+        shift_store_4   \type, \d_strd, q9,  d18, d19
        ble             0f
        subs            \h,  \h,  #4
        load_32         \sr2, \src, \s_strd, d8,  d16, d18, d20
@ -2643,6 +2645,7 @@ L(\type\()_bilin_v_tbl):
        // 2x2 v
        vld1.16         {d16[]}, [\src], \s_strd
        bgt             24f
+22:
        vld1.16         {d17[]}, [\sr2], \s_strd
        vld1.16         {d18[]}, [\src], \s_strd
        vext.8          d16, d16, d17, #6
@ -2653,11 +2656,12 @@ L(\type\()_bilin_v_tbl):
        vst1.16         {d4[0]}, [\dst, :16]
        vst1.16         {d4[1]}, [\ds2, :16]
        pop             {r4-r11,pc}
-24:     // 2x4, 2x8, ... v
+24:     // 2x4, 2x6, 2x8, ... v
        vld1.16         {d17[]}, [\sr2], \s_strd
        vld1.16         {d18[]}, [\src], \s_strd
        vld1.16         {d19[]}, [\sr2], \s_strd
        vld1.16         {d20[]}, [\src], \s_strd
+        sub             \h,  \h,  #4
        vext.8          d16, d16, d17, #6
        vext.8          d17, d17, d18, #6
        vext.8          d18, d18, d19, #6
@ -2666,14 +2670,15 @@ L(\type\()_bilin_v_tbl):
        vtrn.32         d17, d19
        vmull.u8        q2,  d16, d2
        vmlal.u8        q2,  d17, d3
-        subs            \h,  \h,  #4
+        cmp             \h,  #2
        vqrshrn.u16     d4,  q2,  #4
        vst1.16         {d4[0]}, [\dst, :16], \d_strd
        vst1.16         {d4[1]}, [\ds2, :16], \d_strd
        vst1.16         {d4[2]}, [\dst, :16], \d_strd
        vst1.16         {d4[3]}, [\ds2, :16], \d_strd
-        ble             0f
+        blt             0f
        vmov            d16, d20
+        beq             22b
        b               24b
 0:
        pop             {r4-r11,pc}
--- a/third_party/dav1d/src/arm/32/mc16.S
+++ b/third_party/dav1d/src/arm/32/mc16.S
@ -1748,7 +1748,7 @@ L(\type\()_8tap_v_tbl):
        vst1_32         \d_strd,  d16, d17
        pop             {r4-r11,pc}

-28:     // 2x8, 2x16 v
+28:     // 2x6, 2x8, 2x12, 2x16 v
        vld1.8          {d0}, [\my, :64]
        sub             \sr2,  \src,  \s_strd, lsl #1
        add             \ds2,  \dst,  \d_strd
@ -1761,25 +1761,29 @@ L(\type\()_8tap_v_tbl):
        interleave_1_32 d2,  d3,  d4,  d5,  d6
        interleave_1_32 d6,  d7,  d16
 216:
-        subs            \h,  \h,  #8
+        subs            \h,  \h,  #4
        load_32         \sr2, \src, \s_strd, d17, d18, d19, d20
-        load_32         \sr2, \src, \s_strd, d21, d22, d23, d24
        interleave_1_32 d16, d17, d18, d19, d20
-        interleave_1_32 d20, d21, d22, d23, d24
        vmull_vmlal_8   q13, d2,  d3,  d4,  d5,  d6,  d7,  d16, d17
        vmull_vmlal_8   q1,  d4,  d5,  d6,  d7,  d16, d17, d18, d19
-        vmull_vmlal_8   q2,  d6,  d7,  d16, d17, d18, d19, d20, d21
-        vmull_vmlal_8   q3,  d16, d17, d18, d19, d20, d21, d22, d23
-        vqrshrun_s32    6,   q13, d26, q1,  d27, q2,  d2,  q3,  d3
-        vmin_u16        q15, q13, q1
+        vqrshrun_s32    6,   q13, d26, q1,  d27
+        vmin_u16        q15, q13
        vst1_32         \d_strd,  d26, d27
-        vst1_32         \d_strd,  d2,  d3
        ble             0f
-        vmov            q1,  q9
-        vmov            q2,  q10
-        vmov            q3,  q11
-        vmov            d16, d24
+        cmp             \h,  #2
+        vmov            q1,  q3
+        vmov            q2,  q8
+        vmov            q3,  q9
+        vmov            d16, d20
+        beq             26f
        b               216b
+26:
+        load_32         \sr2, \src, \s_strd, d17, d18
+        interleave_1_32 d16, d17, d18
+        vmull_vmlal_8   q13, d2,  d3,  d4,  d5,  d6,  d7,  d16, d17
+        vqrshrun_s32    6,   q13, d26
+        vmin_u16        d30, d26
+        vst1_32         \d_strd,  d26
 0:
        pop             {r4-r11,pc}
 .endif
@ -1810,7 +1814,7 @@ L(\type\()_8tap_v_tbl):
 0:
        pop             {r4-r11,pc}

-480:    // 4x8, 4x16 v
+480:    // 4x6, 4x8, 4x12, 4x16 v
        vld1.8          {d0}, [\my, :64]
        sub             \sr2, \src, \s_strd, lsl #1
        add             \ds2, \dst, \d_strd
@ -1830,11 +1834,18 @@ L(\type\()_8tap_v_tbl):
        vmull_vmlal_8   q8,  d19, d20, d21, d22, d23, d24, d25, d26
        shift_store_4   \type, \d_strd, q1, q2, d2, d3, q3, q8, d4, d5
        ble             0f
+        cmp             \h,  #2
        vmov            q8,  q10
        vmov            q9,  q11
        vmov            q10, q12
        vmov            d22, d26
+        beq             46f
        b               48b
+46:
+        load_reg        \sr2, \src, \s_strd, d23, d24
+        vmull_vmlal_8   q1,  d16, d17, d18, d19, d20, d21, d22, d23
+        vmull_vmlal_8   q2,  d17, d18, d19, d20, d21, d22, d23, d24
+        shift_store_4   \type, \d_strd, q1, q2, d2, d3
 0:
        pop             {r4-r11,pc}

@ -2801,6 +2812,7 @@ L(\type\()_bilin_v_tbl):
        // 2x2 v
        vld1.32         {d16[]}, [\src], \s_strd
        bgt             24f
+22:
        vld1.32         {d17[]}, [\sr2], \s_strd
        vld1.32         {d18[]}, [\src], \s_strd
        vext.8          d16, d16, d17, #4
@ -2811,11 +2823,12 @@ L(\type\()_bilin_v_tbl):
        vst1.32         {d16[0]}, [\dst, :32]
        vst1.32         {d16[1]}, [\ds2, :32]
        pop             {r4-r11,pc}
-24:     // 2x4, 2x8, ... v
+24:     // 2x4, 2x6, 2x8, ... v
        vld1.32         {d17[]}, [\sr2], \s_strd
        vld1.32         {d18[]}, [\src], \s_strd
        vld1.32         {d19[]}, [\sr2], \s_strd
        vld1.32         {d20[]}, [\src], \s_strd
+        subs            \h,  \h,  #4
        vext.8          d16, d16, d17, #4
        vext.8          d17, d17, d18, #4
        vext.8          d18, d18, d19, #4
@ -2823,14 +2836,15 @@ L(\type\()_bilin_v_tbl):
        vswp            d17, d18
        vmul.i16        q8,  q8,  q2
        vmla.i16        q8,  q9,  q3
-        subs            \h,  \h,  #4
+        cmp             \h,  #2
        vrshr.u16       q8,  q8,  #4
        vst1.32         {d16[0]}, [\dst, :32], \d_strd
        vst1.32         {d16[1]}, [\ds2, :32], \d_strd
        vst1.32         {d17[0]}, [\dst, :32], \d_strd
        vst1.32         {d17[1]}, [\ds2, :32], \d_strd
-        ble             0f
+        blt             0f
        vmov            d16, d20
+        beq             22b
        b               24b
 0:
        pop             {r4-r11,pc}
--- a/third_party/dav1d/src/arm/64/mc.S
+++ b/third_party/dav1d/src/arm/64/mc.S
@ -1163,6 +1163,26 @@ endfunc
 // Interleaving the mul/mla chains actually hurts performance
 // significantly on Cortex A53, thus keeping mul/mla tightly
 // chained like this.
+.macro mul_mla_8_0_4h d0, s0, s1, s2, s3, s4, s5, s6, s7
+        mul             \d0\().4h, \s0\().4h, v0.h[0]
+        mla             \d0\().4h, \s1\().4h, v0.h[1]
+        mla             \d0\().4h, \s2\().4h, v0.h[2]
+        mla             \d0\().4h, \s3\().4h, v0.h[3]
+        mla             \d0\().4h, \s4\().4h, v0.h[4]
+        mla             \d0\().4h, \s5\().4h, v0.h[5]
+        mla             \d0\().4h, \s6\().4h, v0.h[6]
+        mla             \d0\().4h, \s7\().4h, v0.h[7]
+.endm
+.macro mul_mla_8_0 d0, s0, s1, s2, s3, s4, s5, s6, s7
+        mul             \d0\().8h, \s0\().8h, v0.h[0]
+        mla             \d0\().8h, \s1\().8h, v0.h[1]
+        mla             \d0\().8h, \s2\().8h, v0.h[2]
+        mla             \d0\().8h, \s3\().8h, v0.h[3]
+        mla             \d0\().8h, \s4\().8h, v0.h[4]
+        mla             \d0\().8h, \s5\().8h, v0.h[5]
+        mla             \d0\().8h, \s6\().8h, v0.h[6]
+        mla             \d0\().8h, \s7\().8h, v0.h[7]
+.endm
 .macro mul_mla_8_1 d0, d1, s0, s1, s2, s3, s4, s5, s6, s7, s8
        mul             \d0\().8h, \s0\().8h, v0.h[0]
        mla             \d0\().8h, \s1\().8h, v0.h[1]
@ -1199,24 +1219,6 @@ endfunc
        mla             \d1\().8h, \s8\().8h, v0.h[6]
        mla             \d1\().8h, \s9\().8h, v0.h[7]
 .endm
-.macro mul_mla_8_4 d0, d1, s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11
-        mul             \d0\().8h, \s0\().8h,  v0.h[0]
-        mla             \d0\().8h, \s1\().8h,  v0.h[1]
-        mla             \d0\().8h, \s2\().8h,  v0.h[2]
-        mla             \d0\().8h, \s3\().8h,  v0.h[3]
-        mla             \d0\().8h, \s4\().8h,  v0.h[4]
-        mla             \d0\().8h, \s5\().8h,  v0.h[5]
-        mla             \d0\().8h, \s6\().8h,  v0.h[6]
-        mla             \d0\().8h, \s7\().8h,  v0.h[7]
-        mul             \d1\().8h, \s4\().8h,  v0.h[0]
-        mla             \d1\().8h, \s5\().8h,  v0.h[1]
-        mla             \d1\().8h, \s6\().8h,  v0.h[2]
-        mla             \d1\().8h, \s7\().8h,  v0.h[3]
-        mla             \d1\().8h, \s8\().8h,  v0.h[4]
-        mla             \d1\().8h, \s9\().8h,  v0.h[5]
-        mla             \d1\().8h, \s10\().8h, v0.h[6]
-        mla             \d1\().8h, \s11\().8h, v0.h[7]
-.endm
 .macro sqrshrun_b shift, r0, r1, r2, r3
        sqrshrun        \r0\().8b, \r0\().8h,  #\shift
 .ifnb \r1
@ -1633,7 +1635,7 @@ L(\type\()_8tap_v):
        st_h            \d_strd, v6, 4
        ret

-28:     // 2x8, 2x16 v
+28:     // 2x6, 2x8, 2x12, 2x16 v
        ld1             {v0.8b}, [\xmy]
        sub             \sr2,  \src,  \s_strd, lsl #1
        add             \ds2,  \dst,  \d_strd
@ -1648,28 +1650,32 @@ L(\type\()_8tap_v):
        interleave_2_s  v1,  v2,  v3,  v4,  v5,  v6
        uxtl_b          v1,  v2,  v3,  v4
 216:
-        subs            \h,  \h,  #8
+        subs            \h,  \h,  #4
        load_h          \sr2, \src, \s_strd, v16, v17, v18, v19
-        load_h          \sr2, \src, \s_strd, v20, v21, v22, v23
        interleave_1_h  v7,  v16, v17, v18, v19
-        interleave_1_h  v19, v20, v21, v22, v23
        interleave_2_s  v5,  v6,  v7,  v16, v17, v18
-        interleave_2_s  v17, v18, v19, v20, v21, v22
        uxtl_b          v5,  v6,  v7,  v16
-        uxtl_b          v17, v18, v19, v20
-        mul_mla_8_4     v30, v31, v1,  v2,  v3,  v4,  v5,  v6,  v7,  v16, v17, v18, v19, v20
-        sqrshrun_b      6,   v30, v31
+        mul_mla_8_0     v30, v1,  v2,  v3,  v4,  v5,  v6,  v7,  v16
+        sqrshrun_b      6,   v30
        st_h            \d_strd, v30, 4
-        st_h            \d_strd, v31, 4
        b.le            0f
-        mov             v1.16b,  v17.16b
-        mov             v2.16b,  v18.16b
-        mov             v3.16b,  v19.16b
-        mov             v4.16b,  v20.16b
-        mov             v5.16b,  v21.16b
-        mov             v6.16b,  v22.16b
-        mov             v7.16b,  v23.16b
+        cmp             \h,  #2
+        mov             v1.16b,  v5.16b
+        mov             v2.16b,  v6.16b
+        mov             v3.16b,  v7.16b
+        mov             v4.16b,  v16.16b
+        mov             v5.16b,  v17.16b
+        mov             v6.16b,  v18.16b
+        mov             v7.16b,  v19.16b
+        b.eq            26f
        b               216b
+26:
+        load_h          \sr2, \src, \s_strd, v16, v17
+        interleave_1_h  v7,  v16, v17
+        uxtl_b          v5,  v6,  v7,  v16
+        mul_mla_8_0_4h  v30, v1,  v2,  v3,  v4,  v5,  v6,  v7,  v16
+        sqrshrun_b      6,   v30
+        st_h            \d_strd, v30, 2
 0:
        ret
 .endif
@ -1703,7 +1709,7 @@ L(\type\()_8tap_v):
 0:
        ret

-480:    // 4x8, 4x16 v
+480:    // 4x6, 4x8, 4x12, 4x16 v
        ld1             {v0.8b}, [\xmy]
        sub             \sr2, \src, \s_strd, lsl #1
        add             \ds2, \dst, \d_strd
@ -1726,12 +1732,19 @@ L(\type\()_8tap_v):
        mul_mla_8_2     v1,  v2,  v16, v17, v18, v19, v20, v21, v22, v23, v24, v25
        shift_store_4   \type, \d_strd, v1, v2
        b.le            0f
-        subs            \h,  \h,  #4
-        load_s          \sr2,  \src, \s_strd, v27, v16, v17, v18
-        interleave_1_s  v26, v27, v16, v17, v18
-        uxtl_b          v26, v27, v16, v17
-        mul_mla_8_2     v1,  v2,  v20, v21, v22, v23, v24, v25, v26, v27, v16, v17
-        shift_store_4   \type, \d_strd, v1, v2
+        load_s          \sr2,  \src, \s_strd, v27, v16
+        subs            \h,  \h,  #2
+        interleave_1_s  v26, v27, v16
+        uxtl_b          v26, v27
+        mul_mla_8_0     v1,  v20, v21, v22, v23, v24, v25, v26, v27
+        shift_store_4   \type, \d_strd, v1
+        b.le            0f
+        load_s          \sr2,  \src, \s_strd, v17, v18
+        subs            \h,  \h,  #2
+        interleave_1_s  v16, v17, v18
+        uxtl_b          v16, v17
+        mul_mla_8_0     v2,  v22, v23, v24, v25, v26, v27, v16, v17
+        shift_store_4   \type, \d_strd, v2
        b.le            0f
        subs            \h,  \h,  #4
        load_s          \sr2, \src, \s_strd, v19, v20, v21, v22
@ -2641,6 +2654,7 @@ L(\type\()_bilin_v):
        // 2x2 v
        ld1             {v16.h}[0], [\src], \s_strd
        b.gt            24f
+22:
        ld1             {v17.h}[0], [\sr2], \s_strd
        ld1             {v18.h}[0], [\src], \s_strd
        trn1            v16.4h, v16.4h, v17.4h
@ -2651,11 +2665,12 @@ L(\type\()_bilin_v):
        st1             {v4.h}[0], [\dst]
        st1             {v4.h}[1], [\ds2]
        ret
-24:     // 2x4, 2x8, ... v
+24:     // 2x4, 2x6, 2x8, ... v
        ld1             {v17.h}[0], [\sr2], \s_strd
        ld1             {v18.h}[0], [\src], \s_strd
        ld1             {v19.h}[0], [\sr2], \s_strd
        ld1             {v20.h}[0], [\src], \s_strd
+        sub             \h,  \h,  #4
        trn1            v16.4h, v16.4h, v17.4h
        trn1            v17.4h, v17.4h, v18.4h
        trn1            v18.4h, v18.4h, v19.4h
@ -2664,14 +2679,15 @@ L(\type\()_bilin_v):
        trn1            v17.2s, v17.2s, v19.2s
        umull           v4.8h,  v16.8b,  v2.8b
        umlal           v4.8h,  v17.8b,  v3.8b
-        subs            \h,  \h,  #4
+        cmp             \h,  #2
        uqrshrn         v4.8b,  v4.8h,  #4
        st1             {v4.h}[0], [\dst], \d_strd
        st1             {v4.h}[1], [\ds2], \d_strd
        st1             {v4.h}[2], [\dst], \d_strd
        st1             {v4.h}[3], [\ds2], \d_strd
-        b.le            0f
+        b.lt            0f
        mov             v16.8b, v20.8b
+        b.eq            22b
        b               24b
 0:
        ret
--- a/third_party/dav1d/src/arm/64/mc16.S
+++ b/third_party/dav1d/src/arm/64/mc16.S
@ -1801,7 +1801,7 @@ L(\type\()_8tap_v):
        st_s            \d_strd, v16, 4
        ret

-28:     // 2x8, 2x16 v
+28:     // 2x6, 2x8, 2x12, 2x16 v
        ld1             {v0.8b}, [\xmy]
        sub             \sr2,  \src,  \s_strd, lsl #1
        add             \ds2,  \dst,  \d_strd
@ -1814,28 +1814,32 @@ L(\type\()_8tap_v):
        interleave_1_s  v1,  v2,  v3,  v4,  v5
        interleave_1_s  v5,  v6,  v7
 216:
-        subs            \h,  \h,  #8
+        subs            \h,  \h,  #4
        load_s          \sr2, \src, \s_strd, v16, v17, v18, v19
-        load_s          \sr2, \src, \s_strd, v20, v21, v22, v23
        interleave_1_s  v7,  v16, v17, v18, v19
-        interleave_1_s  v19, v20, v21, v22, v23
        smull_smlal_8   v24, v1,  v2,  v3,  v4,  v5,  v6,  v7,  v16
        smull_smlal_8   v25, v3,  v4,  v5,  v6,  v7,  v16, v17, v18
-        smull_smlal_8   v26, v5,  v6,  v7,  v16, v17, v18, v19, v20
-        smull_smlal_8   v27, v7,  v16, v17, v18, v19, v20, v21, v22
-        sqrshrun_h      6,   v24, v25, v26, v27
-        umin_h          v31, .8h, v24, v26
+        sqrshrun_h      6,   v24, v25
+        umin_h          v31, .8h, v24
        st_s            \d_strd, v24, 4
-        st_s            \d_strd, v26, 4
        b.le            0f
-        mov             v1.16b,  v17.16b
-        mov             v2.16b,  v18.16b
-        mov             v3.16b,  v19.16b
-        mov             v4.16b,  v20.16b
-        mov             v5.16b,  v21.16b
-        mov             v6.16b,  v22.16b
-        mov             v7.16b,  v23.16b
+        cmp             \h,  #2
+        mov             v1.16b,  v5.16b
+        mov             v2.16b,  v6.16b
+        mov             v3.16b,  v7.16b
+        mov             v4.16b,  v16.16b
+        mov             v5.16b,  v17.16b
+        mov             v6.16b,  v18.16b
+        mov             v7.16b,  v19.16b
+        b.eq            26f
        b               216b
+26:
+        load_s          \sr2, \src, \s_strd, v16, v17
+        interleave_1_s  v7,  v16, v17
+        smull_smlal_8   v24, v1,  v2,  v3,  v4,  v5,  v6,  v7,  v16
+        sqrshrun_h      6,   v24
+        umin_h          v31, .4h, v24
+        st_s            \d_strd, v24, 2
 0:
        ret
 .endif
@ -1867,7 +1871,7 @@ L(\type\()_8tap_v):
 0:
        ret

-480:    // 4x8, 4x16 v
+480:    // 4x6, 4x8, 4x12, 4x16 v
        ld1             {v0.8b}, [\xmy]
        sub             \sr2, \src, \s_strd, lsl #1
        add             \ds2, \dst, \d_strd
@ -1887,6 +1891,7 @@ L(\type\()_8tap_v):
        smull_smlal_8   v4,  v19, v20, v21, v22, v23, v24, v25, v26
        shift_store_4   \type, \d_strd, v1, v2, v3, v4
        b.le            0f
+        cmp             \h,  #2
        mov             v16.8b,  v20.8b
        mov             v17.8b,  v21.8b
        mov             v18.8b,  v22.8b
@ -1894,7 +1899,13 @@ L(\type\()_8tap_v):
        mov             v20.8b,  v24.8b
        mov             v21.8b,  v25.8b
        mov             v22.8b,  v26.8b
+        b.eq            46f
        b               48b
+46:
+        load_4h         \sr2, \src, \s_strd, v23, v24
+        smull_smlal_8   v1,  v16, v17, v18, v19, v20, v21, v22, v23
+        smull_smlal_8   v2,  v17, v18, v19, v20, v21, v22, v23, v24
+        shift_store_4   \type, \d_strd, v1, v2
 0:
        ret

@ -2858,6 +2869,7 @@ L(\type\()_bilin_v):
        // 2x2 v
        ld1             {v16.s}[0], [\src], \s_strd
        b.gt            24f
+22:
        ld1             {v17.s}[0], [\sr2], \s_strd
        ld1             {v18.s}[0], [\src], \s_strd
        trn1            v16.2s,  v16.2s,  v17.2s
@ -2868,11 +2880,12 @@ L(\type\()_bilin_v):
        st1             {v4.s}[0], [\dst]
        st1             {v4.s}[1], [\ds2]
        ret
-24:     // 2x4, 2x8, ... v
+24:     // 2x4, 2x6, 2x8, ... v
        ld1             {v17.s}[0], [\sr2], \s_strd
        ld1             {v18.s}[0], [\src], \s_strd
        ld1             {v19.s}[0], [\sr2], \s_strd
        ld1             {v20.s}[0], [\src], \s_strd
+        sub             \h,  \h,  #4
        trn1            v16.2s,  v16.2s,  v17.2s
        trn1            v17.2s,  v17.2s,  v18.2s
        trn1            v18.2s,  v18.2s,  v19.2s
@ -2881,14 +2894,15 @@ L(\type\()_bilin_v):
        trn1            v17.2d,  v17.2d,  v19.2d
        mul             v4.8h,   v16.8h,  v2.8h
        mla             v4.8h,   v17.8h,  v3.8h
-        subs            \h,  \h,  #4
+        cmp             \h,  #2
        urshr           v4.8h,   v4.8h,   #4
        st1             {v4.s}[0], [\dst], \d_strd
        st1             {v4.s}[1], [\ds2], \d_strd
        st1             {v4.s}[2], [\dst], \d_strd
        st1             {v4.s}[3], [\ds2], \d_strd
-        b.le            0f
+        b.lt            0f
        mov             v16.8b,  v20.8b
+        b.eq            22b
        b               24b
 0:
        ret
--- a/third_party/dav1d/src/cpu.c
+++ b/third_party/dav1d/src/cpu.c
@ -49,16 +49,11 @@
 #endif

 static unsigned flags = 0;
-
-#if __has_feature(memory_sanitizer)
-// memory sanitizer is inherently incompatible with asm
-static unsigned flags_mask = 0;
-#else
 static unsigned flags_mask = -1;
-#endif

 COLD void dav1d_init_cpu(void) {
-#if HAVE_ASM
+#if HAVE_ASM && !__has_feature(memory_sanitizer)
+// memory sanitizer is inherently incompatible with asm
 #if ARCH_AARCH64 || ARCH_ARM
    flags = dav1d_get_cpu_flags_arm();
 #elif ARCH_PPC64LE
--- a/third_party/dav1d/src/decode.c
+++ b/third_party/dav1d/src/decode.c
@ -3295,6 +3295,15 @@ int dav1d_decode_frame_init(Dav1dFrameContext *const f) {
    f->lf.sr_p[1] = f->sr_cur.p.data[has_chroma ? 1 : 0];
    f->lf.sr_p[2] = f->sr_cur.p.data[has_chroma ? 2 : 0];

+    retval = 0;
+error:
+    return retval;
+}
+
+int dav1d_decode_frame_init_cdf(Dav1dFrameContext *const f) {
+    const Dav1dContext *const c = f->c;
+    int retval = DAV1D_ERR(EINVAL);
+
    if (f->frame_hdr->refresh_context)
        dav1d_cdf_thread_copy(f->out_cdf.data.cdf, &f->in_cdf);

@ -3430,6 +3439,7 @@ int dav1d_decode_frame(Dav1dFrameContext *const f) {
    // if n_tc > 1 (but n_fc == 1), we could run init/exit in the task
    // threads also. Not sure it makes a measurable difference.
    int res = dav1d_decode_frame_init(f);
+    if (!res) res = dav1d_decode_frame_init_cdf(f);
    // wait until all threads have completed
    if (!res) {
        if (f->c->n_tc > 1) {
@ -3487,7 +3497,7 @@ int dav1d_submit_frame(Dav1dContext *const c) {
                atomic_fetch_add(&c->task_thread.first, 1U);
            else
                atomic_store(&c->task_thread.first, 0);
-            if (c->task_thread.cur < c->n_fc)
+            if (c->task_thread.cur && c->task_thread.cur < c->n_fc)
                c->task_thread.cur--;
        }
        if (out_delayed->p.data[0]) {
@ -3496,7 +3506,7 @@ int dav1d_submit_frame(Dav1dContext *const c) {
            if ((out_delayed->visible || c->output_invisible_frames) &&
                progress != FRAME_ERROR)
            {
-                dav1d_picture_ref(&c->out, &out_delayed->p);
+                dav1d_thread_picture_ref(&c->out, out_delayed);
                c->event_flags |= dav1d_picture_get_event_flags(out_delayed);
            }
            dav1d_thread_picture_unref(out_delayed);
@ -3670,7 +3680,7 @@ int dav1d_submit_frame(Dav1dContext *const c) {
    // move f->cur into output queue
    if (c->n_fc == 1) {
        if (f->frame_hdr->show_frame || c->output_invisible_frames) {
-            dav1d_picture_ref(&c->out, &f->sr_cur.p);
+            dav1d_thread_picture_ref(&c->out, &f->sr_cur);
            c->event_flags |= dav1d_picture_get_event_flags(&f->sr_cur);
        }
    } else {
@ -3822,7 +3832,7 @@ int dav1d_submit_frame(Dav1dContext *const c) {

    if (c->n_fc == 1) {
        if ((res = dav1d_decode_frame(f)) < 0) {
-            dav1d_picture_unref_internal(&c->out);
+            dav1d_thread_picture_unref(&c->out);
            for (int i = 0; i < 8; i++) {
                if (refresh_frame_flags & (1 << i)) {
                    if (c->refs[i].p.p.data[0])
@ -3851,7 +3861,7 @@ error:
        dav1d_ref_dec(&f->ref_mvs_ref[i]);
    }
    if (c->n_fc == 1)
-        dav1d_picture_unref_internal(&c->out);
+        dav1d_thread_picture_unref(&c->out);
    else
        dav1d_thread_picture_unref(out_delayed);
    dav1d_picture_unref_internal(&f->cur);
--- a/third_party/dav1d/src/internal.h
+++ b/third_party/dav1d/src/internal.h
@ -102,7 +102,7 @@ struct Dav1dContext {

    // decoded output picture queue
    Dav1dData in;
-    Dav1dPicture out;
+    Dav1dThreadPicture out, cache;
    // dummy is a pointer to prevent compiler errors about atomic_load()
    // not taking const arguments
    atomic_int flush_mem, *flush;
@ -158,6 +158,7 @@ struct Dav1dContext {
    unsigned frame_size_limit;
    int strict_std_compliance;
    int output_invisible_frames;
+    enum Dav1dInloopFilterType inloop_filters;
    int drain;
    enum PictureFlags frame_flags;
    enum Dav1dEventFlags event_flags;
@ -169,14 +170,15 @@ struct Dav1dContext {

 enum TaskType {
    DAV1D_TASK_TYPE_INIT,
+    DAV1D_TASK_TYPE_INIT_CDF,
    DAV1D_TASK_TYPE_TILE_ENTROPY,
+    DAV1D_TASK_TYPE_ENTROPY_PROGRESS,
    DAV1D_TASK_TYPE_TILE_RECONSTRUCTION,
    DAV1D_TASK_TYPE_DEBLOCK_COLS,
    DAV1D_TASK_TYPE_DEBLOCK_ROWS,
    DAV1D_TASK_TYPE_CDEF,
    DAV1D_TASK_TYPE_SUPER_RESOLUTION,
    DAV1D_TASK_TYPE_LOOP_RESTORATION,
-    DAV1D_TASK_TYPE_ENTROPY_PROGRESS,
    DAV1D_TASK_TYPE_RECONSTRUCTION_PROGRESS,
 };

@ -303,6 +305,7 @@ struct Dav1dFrameContext {
        struct TaskThreadData *ttd;
        struct Dav1dTask *tasks, *tile_tasks[2], init_task;
        int num_tasks, num_tile_tasks;
+        int init_done;
        int done[2];
        int retval;
        int update_set; // whether we need to update CDF reference
--- a/third_party/dav1d/src/lib.c
+++ b/third_party/dav1d/src/lib.c
@ -76,6 +76,7 @@ COLD void dav1d_default_settings(Dav1dSettings *const s) {
    s->frame_size_limit = 0;
    s->strict_std_compliance = 0;
    s->output_invisible_frames = 0;
+    s->inloop_filters = DAV1D_INLOOPFILTER_ALL;
 }

 static void close_internal(Dav1dContext **const c_out, int flush);
@ -131,6 +132,7 @@ COLD int dav1d_open(Dav1dContext **const c_out, const Dav1dSettings *const s) {
    c->frame_size_limit = s->frame_size_limit;
    c->strict_std_compliance = s->strict_std_compliance;
    c->output_invisible_frames = s->output_invisible_frames;
+    c->inloop_filters = s->inloop_filters;

    if (dav1d_mem_pool_init(&c->seq_hdr_pool) ||
        dav1d_mem_pool_init(&c->frame_hdr_pool) ||
@ -311,33 +313,46 @@ static int has_grain(const Dav1dPicture *const pic)
           fgdata->num_uv_points[1];
 }

-static int output_image(Dav1dContext *const c, Dav1dPicture *const out,
-                        Dav1dPicture *const in)
+static int output_image(Dav1dContext *const c, Dav1dPicture *const out) // returns 0, or dav1d_apply_grain()'s result
 {
-    if (!c->apply_grain || !has_grain(in)) {
-        dav1d_picture_move_ref(out, in);
-        return 0;
+    int res = 0;
+
+    Dav1dThreadPicture *const in = c->all_layers ? &c->out : &c->cache; // source: c->out when all_layers is set, else c->cache
+    if (!c->apply_grain || !has_grain(&in->p)) { // grain disabled or has_grain() is false: hand the picture over as-is
+        dav1d_picture_move_ref(out, &in->p);
+        dav1d_thread_picture_unref(in);
+        goto end;
     }
 
-    int res = dav1d_apply_grain(c, out, in);
-    dav1d_picture_unref_internal(in);
+    res = dav1d_apply_grain(c, out, &in->p); // res carries dav1d_apply_grain()'s return value to the caller
+    dav1d_thread_picture_unref(in);
+end:
+    if (!c->all_layers && c->out.p.data[0]) { // here in == &c->cache: refill it from a pending c->out
+        dav1d_thread_picture_move_ref(in, &c->out);
+    }
     return res;
 }

-static int output_picture_ready(Dav1dContext *const c) {
-
-    if (!c->out.data[0]) return 0;
-
-    // skip lower spatial layers
-    if (c->operating_point_idc && !c->all_layers) {
-        const int max_spatial_id = ulog2(c->operating_point_idc >> 8);
-        if (max_spatial_id > c->out.frame_hdr->spatial_id) {
-            dav1d_picture_unref_internal(&c->out);
+static int output_picture_ready(Dav1dContext *const c, const int drain) { // nonzero => callers go on to output_image()
+    if (!c->all_layers) { // single-layer mode: pictures pass through c->cache
+        if (c->out.p.data[0] && c->cache.p.data[0]) {
+            const unsigned spatial_mask = c->operating_point_idc >> 8;
+            const int max_spatial_id = spatial_mask ? ulog2(spatial_mask) : 0; // 0 when the mask is empty, so ulog2(0) is never evaluated
+            if (max_spatial_id == c->cache.p.frame_hdr->spatial_id ||
+                c->out.flags & PICTURE_FLAG_NEW_TEMPORAL_UNIT)
+                return 1; // cached picture has the max spatial id, or c->out carries the new-temporal-unit flag
+            dav1d_thread_picture_unref(&c->cache);
+            dav1d_thread_picture_move_ref(&c->cache, &c->out); // otherwise c->out replaces the cached picture
+            return 0;
+        } else if (c->cache.p.data[0] && drain) { // only the cached picture is left and the caller is draining
+            return 1;
+        } else if (c->out.p.data[0]) { // cache is empty on this path: park c->out in it
+            dav1d_thread_picture_move_ref(&c->cache, &c->out);
             return 0;
         }
     }
 
-    return 1;
+    return !!c->out.p.data[0]; // ready iff c->out holds picture data
 }

 static int drain_picture(Dav1dContext *const c, Dav1dPicture *const out) {
@ -369,15 +384,18 @@ static int drain_picture(Dav1dContext *const c, Dav1dPicture *const out) {
            if ((out_delayed->visible || c->output_invisible_frames) &&
                progress != FRAME_ERROR)
            {
-                dav1d_picture_ref(&c->out, &out_delayed->p);
+                dav1d_thread_picture_ref(&c->out, out_delayed);
                c->event_flags |= dav1d_picture_get_event_flags(out_delayed);
            }
            dav1d_thread_picture_unref(out_delayed);
-            if (output_picture_ready(c))
-                return output_image(c, out, &c->out);
+            if (output_picture_ready(c, 0))
+                return output_image(c, out);
        }
    } while (++drain_count < c->n_fc);

+    if (output_picture_ready(c, 1))
+        return output_image(c, out);
+
    return DAV1D_ERR(EAGAIN);
 }

@ -386,7 +404,7 @@ static int gen_picture(Dav1dContext *const c)
    int res;
    Dav1dData *const in = &c->in;

-    if (output_picture_ready(c))
+    if (output_picture_ready(c, 0))
        return 0;

    while (in->sz > 0) {
@ -399,7 +417,7 @@ static int gen_picture(Dav1dContext *const c)
            in->data += res;
            if (!in->sz) dav1d_data_unref_internal(in);
        }
-        if (output_picture_ready(c))
+        if (output_picture_ready(c, 0))
            break;
        if (res < 0)
            return res;
@ -439,8 +457,8 @@ int dav1d_get_picture(Dav1dContext *const c, Dav1dPicture *const out)
    if (res < 0)
        return res;

-    if (output_picture_ready(c))
-        return output_image(c, out, &c->out);
+    if (output_picture_ready(c, c->n_fc == 1))
+        return output_image(c, out);

    if (c->n_fc > 1 && drain)
        return drain_picture(c, out);
@ -592,6 +610,8 @@ static COLD void close_internal(Dav1dContext **const c_out, int flush) {
            freep(&f->frame_thread.tile_start_off);
            dav1d_freep_aligned(&f->frame_thread.pal);
            freep(&f->frame_thread.cbi);
+        }
+        if (c->n_tc > 1) {
            pthread_cond_destroy(&f->task_thread.cond);
        }
        freep(&f->frame_thread.frame_progress);
--- a/third_party/dav1d/src/meson.build
+++ b/third_party/dav1d/src/meson.build
@ -260,29 +260,38 @@ endif



+libdav1d_rc_obj = []
+libdav1d_flags = [stackalign_flag]
 api_export_flags = []

 #
 # Windows .rc file and API export flags
 #

-if host_machine.system() == 'windows' and get_option('default_library') != 'static'
-    rc_file = configure_file(
-        input : 'dav1d.rc.in',
-        output : 'dav1d.rc',
-        configuration : rc_data
-    )
+if host_machine.system() == 'windows'
+    if get_option('default_library') != 'static'
+        rc_file = configure_file(
+            input : 'dav1d.rc.in',
+            output : 'dav1d.rc',
+            configuration : rc_data
+        )

-    libdav1d_rc_obj = winmod.compile_resources(rc_file)
+        libdav1d_rc_obj = winmod.compile_resources(rc_file)

-    api_export_flags = ['-DDAV1D_BUILDING_DLL']
-else
-    libdav1d_rc_obj = []
+        api_export_flags = ['-DDAV1D_BUILDING_DLL']
+    endif
+
+    if (host_machine.cpu_family() == 'x86_64' and cc.get_id() == 'gcc')
+        # We don't expect to reference data members from other DLLs without
+        # dllimport attributes. Set the -mcmodel=small flag, which avoids
+        # generating indirection via .refptr.<symname> for all potentially
+        # dllimported variable references.
+        libdav1d_flags += '-mcmodel=small'
+    endif
 endif



-
 #
 # Library definitions
 #
@ -294,7 +303,7 @@ libdav1d_entrypoints_objs = static_library('dav1d_entrypoint',

    include_directories : dav1d_inc_dirs,
    dependencies: [stdatomic_dependencies],
-    c_args : [stackalign_flag, stackrealign_flag, api_export_flags],
+    c_args : [libdav1d_flags, stackrealign_flag, api_export_flags],
    install : false,
    build_by_default : false,
 ).extract_all_objects(recursive: true)
@ -307,7 +316,7 @@ foreach bitdepth : dav1d_bitdepths
        libdav1d_tmpl_sources, config_h_target,
        include_directories: dav1d_inc_dirs,
        dependencies : [stdatomic_dependencies],
-        c_args : ['-DBITDEPTH=@0@'.format(bitdepth)] + stackalign_flag,
+        c_args : ['-DBITDEPTH=@0@'.format(bitdepth)] + libdav1d_flags,
        install : false,
        build_by_default : false,
    ).extract_all_objects(recursive: true)
@ -320,7 +329,7 @@ foreach bitdepth : dav1d_bitdepths
        libdav1d_arch_tmpl_sources, config_h_target,
        include_directories: dav1d_inc_dirs,
        dependencies : [stdatomic_dependencies],
-        c_args : ['-DBITDEPTH=@0@'.format(bitdepth)] + stackalign_flag + arch_flags,
+        c_args : ['-DBITDEPTH=@0@'.format(bitdepth)] + libdav1d_flags + arch_flags,
        install : false,
        build_by_default : false,
    ).extract_all_objects(recursive: true)
@ -350,7 +359,7 @@ libdav1d = library('dav1d',
        thread_compat_dep,
        libdl_dependency,
        ],
-    c_args : [stackalign_flag, api_export_flags],
+    c_args : [libdav1d_flags, api_export_flags],
    version : dav1d_soname_version,
    soversion : dav1d_soversion,
    install : true,
--- a/third_party/dav1d/src/obu.c
+++ b/third_party/dav1d/src/obu.c
@ -1533,8 +1533,10 @@ int dav1d_parse_obus(Dav1dContext *const c, Dav1dData *const in, const int globa

        break;
    }
-    case DAV1D_OBU_PADDING:
    case DAV1D_OBU_TD:
+        c->frame_flags |= PICTURE_FLAG_NEW_TEMPORAL_UNIT;
+        break;
+    case DAV1D_OBU_PADDING:
        // ignore OBUs we don't care about
        break;
    default:
@ -1547,9 +1549,9 @@ int dav1d_parse_obus(Dav1dContext *const c, Dav1dData *const in, const int globa
        if (c->frame_hdr->show_existing_frame) {
            if (!c->refs[c->frame_hdr->existing_frame_idx].p.p.data[0]) return DAV1D_ERR(EINVAL);
            if (c->n_fc == 1) {
-                dav1d_picture_ref(&c->out,
-                                  &c->refs[c->frame_hdr->existing_frame_idx].p.p);
-                dav1d_data_props_copy(&c->out.m, &in->m);
+                dav1d_thread_picture_ref(&c->out,
+                                         &c->refs[c->frame_hdr->existing_frame_idx].p);
+                dav1d_data_props_copy(&c->out.p.m, &in->m);
                c->event_flags |= dav1d_picture_get_event_flags(&c->refs[c->frame_hdr->existing_frame_idx].p);
            } else {
                pthread_mutex_lock(&c->task_thread.lock);
@ -1569,7 +1571,7 @@ int dav1d_parse_obus(Dav1dContext *const c, Dav1dData *const in, const int globa
                        atomic_fetch_add(&c->task_thread.first, 1U);
                    else
                        atomic_store(&c->task_thread.first, 0);
-                    if (c->task_thread.cur < c->n_fc)
+                    if (c->task_thread.cur && c->task_thread.cur < c->n_fc)
                        c->task_thread.cur--;
                }
                if (out_delayed->p.data[0]) {
@ -1578,7 +1580,7 @@ int dav1d_parse_obus(Dav1dContext *const c, Dav1dData *const in, const int globa
                    if ((out_delayed->visible || c->output_invisible_frames) &&
                        progress != FRAME_ERROR)
                    {
-                        dav1d_picture_ref(&c->out, &out_delayed->p);
+                        dav1d_thread_picture_ref(&c->out, out_delayed);
                        c->event_flags |= dav1d_picture_get_event_flags(out_delayed);
                    }
                    dav1d_thread_picture_unref(out_delayed);
--- a/third_party/dav1d/src/picture.c
+++ b/third_party/dav1d/src/picture.c
@ -259,6 +259,16 @@ void dav1d_thread_picture_ref(Dav1dThreadPicture *const dst,
    dst->flags = src->flags;
 }

+void dav1d_thread_picture_move_ref(Dav1dThreadPicture *const dst,
+                                   Dav1dThreadPicture *const src)
+{ // transfers src into dst and leaves src zeroed
+    dav1d_picture_move_ref(&dst->p, &src->p); // the embedded picture goes through dav1d_picture_move_ref()
+    dst->visible = src->visible;
+    dst->progress = src->progress;
+    dst->flags = src->flags;
+    memset(src, 0, sizeof(*src)); // clears all of src, including the fields copied above
+}
+
 void dav1d_picture_unref_internal(Dav1dPicture *const p) {
    validate_input(p != NULL);

--- a/third_party/dav1d/src/picture.h
+++ b/third_party/dav1d/src/picture.h
@ -46,6 +46,7 @@ enum PlaneType {
 enum PictureFlags {
    PICTURE_FLAG_NEW_SEQUENCE =       1 << 0,
    PICTURE_FLAG_NEW_OP_PARAMS_INFO = 1 << 1,
+    PICTURE_FLAG_NEW_TEMPORAL_UNIT  = 1 << 2,
 };

 typedef struct Dav1dThreadPicture {
@ -83,6 +84,8 @@ int dav1d_picture_alloc_copy(Dav1dContext *c, Dav1dPicture *dst, const int w,
 void dav1d_picture_ref(Dav1dPicture *dst, const Dav1dPicture *src);
 void dav1d_thread_picture_ref(Dav1dThreadPicture *dst,
                              const Dav1dThreadPicture *src);
+void dav1d_thread_picture_move_ref(Dav1dThreadPicture *dst,
+                                   Dav1dThreadPicture *src);
 void dav1d_thread_picture_unref(Dav1dThreadPicture *p);

 /**
--- a/third_party/dav1d/src/recon_tmpl.c
+++ b/third_party/dav1d/src/recon_tmpl.c
@ -2046,6 +2046,11 @@ int bytefn(dav1d_recon_b_inter)(Dav1dTaskContext *const t, const enum BlockSize
 }

 void bytefn(dav1d_filter_sbrow_deblock_cols)(Dav1dFrameContext *const f, const int sby) {
+    if (!(f->c->inloop_filters & DAV1D_INLOOPFILTER_DEBLOCK) ||
+        (!f->frame_hdr->loopfilter.level_y[0] && !f->frame_hdr->loopfilter.level_y[1]))
+    {
+        return;
+    }
    const int y = sby * f->sb_step * 4;
    const int ss_ver = f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I420;
    pixel *const p[3] = {
@ -2054,9 +2059,8 @@ void bytefn(dav1d_filter_sbrow_deblock_cols)(Dav1dFrameContext *const f, const i
        f->lf.p[2] + (y * PXSTRIDE(f->cur.stride[1]) >> ss_ver)
    };
    Av1Filter *mask = f->lf.mask + (sby >> !f->seq_hdr->sb128) * f->sb128w;
-    if (f->frame_hdr->loopfilter.level_y[0] || f->frame_hdr->loopfilter.level_y[1])
-        bytefn(dav1d_loopfilter_sbrow_cols)(f, p, mask, sby,
-                                            f->lf.start_of_tile_row[sby]);
+    bytefn(dav1d_loopfilter_sbrow_cols)(f, p, mask, sby,
+                                        f->lf.start_of_tile_row[sby]);
 }

 void bytefn(dav1d_filter_sbrow_deblock_rows)(Dav1dFrameContext *const f, const int sby) {
@ -2068,7 +2072,9 @@ void bytefn(dav1d_filter_sbrow_deblock_rows)(Dav1dFrameContext *const f, const i
        f->lf.p[2] + (y * PXSTRIDE(f->cur.stride[1]) >> ss_ver)
    };
    Av1Filter *mask = f->lf.mask + (sby >> !f->seq_hdr->sb128) * f->sb128w;
-    if (f->frame_hdr->loopfilter.level_y[0] || f->frame_hdr->loopfilter.level_y[1]) {
+    if (f->c->inloop_filters & DAV1D_INLOOPFILTER_DEBLOCK &&
+        (f->frame_hdr->loopfilter.level_y[0] || f->frame_hdr->loopfilter.level_y[1]))
+    {
        bytefn(dav1d_loopfilter_sbrow_rows)(f, p, mask, sby);
    }
    if (f->seq_hdr->cdef || f->lf.restore_planes) {
@ -2079,6 +2085,7 @@ void bytefn(dav1d_filter_sbrow_deblock_rows)(Dav1dFrameContext *const f, const i

 void bytefn(dav1d_filter_sbrow_cdef)(Dav1dTaskContext *const tc, const int sby) {
    const Dav1dFrameContext *const f = tc->f;
+    if (!(f->c->inloop_filters & DAV1D_INLOOPFILTER_CDEF)) return;
    const int sbsz = f->sb_step;
    const int y = sby * sbsz * 4;
    const int ss_ver = f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I420;
@ -2140,6 +2147,7 @@ void bytefn(dav1d_filter_sbrow_resize)(Dav1dFrameContext *const f, const int sby
 }

 void bytefn(dav1d_filter_sbrow_lr)(Dav1dFrameContext *const f, const int sby) {
+    if (!(f->c->inloop_filters & DAV1D_INLOOPFILTER_RESTORATION)) return;
    const int y = sby * f->sb_step * 4;
    const int ss_ver = f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I420;
    pixel *const sr_p[3] = {
--- a/third_party/dav1d/src/thread_task.c
+++ b/third_party/dav1d/src/thread_task.c
@ -141,7 +141,8 @@ static void insert_tasks(Dav1dFrameContext *const f,
        }

        // sort by tile-id
-        assert(first->type <= DAV1D_TASK_TYPE_TILE_RECONSTRUCTION);
+        assert(first->type == DAV1D_TASK_TYPE_TILE_RECONSTRUCTION ||
+               first->type == DAV1D_TASK_TYPE_TILE_ENTROPY);
        assert(first->type == t_ptr->type);
        assert(t_ptr->sby == first->sby);
        const int p = first->type == DAV1D_TASK_TYPE_TILE_ENTROPY;
@ -270,6 +271,7 @@ int dav1d_task_create_tile_sbrow(Dav1dFrameContext *const f, const int pass,
 void dav1d_task_frame_init(Dav1dFrameContext *const f) {
    const Dav1dContext *const c = f->c;

+    f->task_thread.init_done = 0;
    // schedule init task, which will schedule the remaining tasks
    Dav1dTask *const t = &f->task_thread.init_task;
    t->type = DAV1D_TASK_TYPE_INIT;
@ -350,6 +352,18 @@ static inline int check_tile(Dav1dTask *const t, Dav1dFrameContext *const f,
    return 0;
 }

+static inline void abort_frame(Dav1dFrameContext *const f) { // marks frame f as failed and signals its condition variable
+    atomic_store(&f->task_thread.error, 1);
+    f->task_thread.task_counter = 0;
+    f->task_thread.done[0] = 1; // both done[] slots are set
+    f->task_thread.done[1] = 1;
+    atomic_store(&f->sr_cur.progress[0], FRAME_ERROR); // both progress slots are set to FRAME_ERROR
+    atomic_store(&f->sr_cur.progress[1], FRAME_ERROR);
+    dav1d_decode_frame_exit(f, -1); // -1 is passed as the retval argument
+    f->n_tile_data = 0;
+    pthread_cond_signal(&f->task_thread.cond); // NOTE(review): the visible INIT call site takes ttd->lock just before calling this
+}
+
 void *dav1d_worker_task(void *data) {
    Dav1dTaskContext *const tc = data;
    const Dav1dContext *const c = tc->c;
@ -360,23 +374,37 @@ void *dav1d_worker_task(void *data) {
    pthread_mutex_lock(&ttd->lock);
    for (;;) {
        Dav1dFrameContext *f;
-        Dav1dTask *t, *prev_t;
+        Dav1dTask *t, *prev_t = NULL;
        if (tc->task_thread.die) break;
        if (atomic_load(c->flush)) goto park;
-        while (ttd->cur < c->n_fc) {
-            const unsigned first = atomic_load(&ttd->first);
-            f = &c->fc[(first + ttd->cur) % c->n_fc];
-            prev_t = f->task_thread.task_cur_prev;
-            t = prev_t ? prev_t->next : f->task_thread.task_head;
-            while (t) {
-                if (t->type == DAV1D_TASK_TYPE_INIT) {
+        if (c->n_fc > 1) { // run init tasks first
+            for (unsigned i = 0; i < c->n_fc; i++) {
+                const unsigned first = atomic_load(&ttd->first);
+                f = &c->fc[(first + i) % c->n_fc];
+                if (f->task_thread.init_done) continue;
+                t = f->task_thread.task_head;
+                if (!t) continue;
+                if (t->type == DAV1D_TASK_TYPE_INIT) goto found;
+                if (t->type == DAV1D_TASK_TYPE_INIT_CDF) {
                    const int p1 = f->in_cdf.progress ?
                        atomic_load(f->in_cdf.progress) : 1;
                    if (p1) {
                        atomic_fetch_or(&f->task_thread.error, p1 == TILE_ERROR);
                        goto found;
                    }
-                } else if (t->type <= DAV1D_TASK_TYPE_TILE_RECONSTRUCTION) {
+                }
+            }
+        }
+        while (ttd->cur < c->n_fc) {
+            const unsigned first = atomic_load(&ttd->first);
+            f = &c->fc[(first + ttd->cur) % c->n_fc];
+            prev_t = f->task_thread.task_cur_prev;
+            t = prev_t ? prev_t->next : f->task_thread.task_head;
+            while (t) {
+                if (t->type == DAV1D_TASK_TYPE_INIT_CDF) goto next;
+                else if (t->type == DAV1D_TASK_TYPE_TILE_ENTROPY ||
+                         t->type == DAV1D_TASK_TYPE_TILE_RECONSTRUCTION)
+                {
                    // if not bottom sbrow of tile, this task will be re-added
                    // after it's finished
                    if (!check_tile(t, f, c->n_fc > 1))
@ -447,7 +475,8 @@ void *dav1d_worker_task(void *data) {
        if (prev_t) prev_t->next = t->next;
        else f->task_thread.task_head = t->next;
        if (!t->next) f->task_thread.task_tail = prev_t;
-        if (!f->task_thread.task_head) ttd->cur++;
+        if (t->type > DAV1D_TASK_TYPE_INIT_CDF && !f->task_thread.task_head)
+            ttd->cur++;
        // we don't need to check cond_signaled here, since we found a task
        // after the last signal so we want to re-signal the next waiting thread
        // and again won't need to signal after that
@ -463,10 +492,26 @@ void *dav1d_worker_task(void *data) {
        int sby = t->sby;
        switch (t->type) {
        case DAV1D_TASK_TYPE_INIT: {
+            assert(c->n_fc > 1);
+            int res = dav1d_decode_frame_init(f);
+            int p1 = f->in_cdf.progress ? atomic_load(f->in_cdf.progress) : 1;
+            if (res || p1 == TILE_ERROR) {
+                pthread_mutex_lock(&ttd->lock);
+                abort_frame(f);
+            } else if (!res) {
+                t->type = DAV1D_TASK_TYPE_INIT_CDF;
+                if (p1) goto found_unlocked;
+                pthread_mutex_lock(&ttd->lock);
+                insert_task(f, t, 0);
+            }
+            reset_task_cur(c, ttd, t->frame_idx);
+            continue;
+        }
+        case DAV1D_TASK_TYPE_INIT_CDF: {
            assert(c->n_fc > 1);
            int res = -1;
            if (!atomic_load(&f->task_thread.error))
-                res = dav1d_decode_frame_init(f);
+                res = dav1d_decode_frame_init_cdf(f);
            pthread_mutex_lock(&ttd->lock);
            if (f->frame_hdr->refresh_context && !f->task_thread.update_set) {
                atomic_store(f->out_cdf.progress, res < 0 ? TILE_ERROR : 1);
@ -490,19 +535,9 @@ void *dav1d_worker_task(void *data) {
                        }
                    }
                }
-            } else {
-                // init failed, signal completion
-                atomic_store(&f->task_thread.error, 1);
-                f->task_thread.task_counter = 0;
-                f->task_thread.done[0] = 1;
-                f->task_thread.done[1] = 1;
-                atomic_store(&f->sr_cur.progress[0], FRAME_ERROR);
-                atomic_store(&f->sr_cur.progress[1], FRAME_ERROR);
-                dav1d_decode_frame_exit(f, -1);
-                f->n_tile_data = 0;
-                pthread_cond_signal(&f->task_thread.cond);
-            }
+            } else abort_frame(f);
            reset_task_cur(c, ttd, t->frame_idx);
+            f->task_thread.init_done = 1;
            continue;
        }
        case DAV1D_TASK_TYPE_TILE_ENTROPY:
--- a/third_party/dav1d/src/thread_task.h
+++ b/third_party/dav1d/src/thread_task.h
@ -42,6 +42,7 @@ void dav1d_task_frame_init(Dav1dFrameContext *f);
 void *dav1d_worker_task(void *data);

 int dav1d_decode_frame_init(Dav1dFrameContext *f);
+int dav1d_decode_frame_init_cdf(Dav1dFrameContext *f);
 int dav1d_decode_frame_main(Dav1dFrameContext *f);
 void dav1d_decode_frame_exit(Dav1dFrameContext *f, int retval);
 int dav1d_decode_frame(Dav1dFrameContext *f);
--- a/third_party/dav1d/src/x86/cpu.c
+++ b/third_party/dav1d/src/x86/cpu.c
@ -28,13 +28,14 @@
 #include "config.h"

 #include <stdint.h>
+#include <string.h>

 #include "common/attributes.h"

 #include "src/x86/cpu.h"

 typedef struct {
-    uint32_t eax, ebx, ecx, edx;
+    uint32_t eax, ebx, edx, ecx;
 } CpuidRegisters;

 void dav1d_cpu_cpuid(CpuidRegisters *regs, unsigned leaf, unsigned subleaf);
@ -43,13 +44,22 @@ uint64_t dav1d_cpu_xgetbv(unsigned xcr);
 #define X(reg, mask) (((reg) & (mask)) == (mask))

 COLD unsigned dav1d_get_cpu_flags_x86(void) {
-    CpuidRegisters r = { 0 };
-    dav1d_cpu_cpuid(&r, 0, 0);
-    const unsigned max_leaf = r.eax;
+    union {
+        CpuidRegisters r;
+        struct {
+            uint32_t max_leaf;
+            char vendor[12];
+        };
+    } cpu;
+    dav1d_cpu_cpuid(&cpu.r, 0, 0);
    unsigned flags = 0;

-    if (max_leaf >= 1) {
+    if (cpu.max_leaf >= 1) {
+        CpuidRegisters r;
        dav1d_cpu_cpuid(&r, 1, 0);
+        const unsigned model  = ((r.eax >> 4) & 0x0f) + ((r.eax >> 12) & 0xf0);
+        const unsigned family = ((r.eax >> 8) & 0x0f) + ((r.eax >> 20) & 0xff);
+
        if (X(r.edx, 0x06008000)) /* CMOV/SSE/SSE2 */ {
            flags |= DAV1D_X86_CPU_FLAG_SSE2;
            if (X(r.ecx, 0x00000201)) /* SSE3/SSSE3 */ {
@ -63,7 +73,7 @@ COLD unsigned dav1d_get_cpu_flags_x86(void) {
        if (X(r.ecx, 0x18000000)) /* OSXSAVE/AVX */ {
            const uint64_t xcr0 = dav1d_cpu_xgetbv(0);
            if (X(xcr0, 0x00000006)) /* XMM/YMM */ {
-                if (max_leaf >= 7) {
+                if (cpu.max_leaf >= 7) {
                    dav1d_cpu_cpuid(&r, 7, 0);
                    if (X(r.ebx, 0x00000128)) /* BMI1/BMI2/AVX2 */ {
                        flags |= DAV1D_X86_CPU_FLAG_AVX2;
@ -76,6 +86,14 @@ COLD unsigned dav1d_get_cpu_flags_x86(void) {
            }
        }
 #endif
+        if (!memcmp(cpu.vendor, "AuthenticAMD", sizeof(cpu.vendor))) {
+            if ((flags & DAV1D_X86_CPU_FLAG_AVX2) && (family < 0x19 ||
+                (family == 0x19 && (model < 0x10 || (model >= 0x20 && model < 0x60)))))
+            {
+                /* Excavator, Zen, Zen+, Zen 2, Zen 3, Zen 3+ */
+                flags |= DAV1D_X86_CPU_FLAG_SLOW_GATHER;
+            }
+        }
    }

    return flags;
--- a/third_party/dav1d/src/x86/cpu.h
+++ b/third_party/dav1d/src/x86/cpu.h
@ -29,12 +29,14 @@
 #define DAV1D_SRC_X86_CPU_H

 enum CpuFlags {
-    DAV1D_X86_CPU_FLAG_SSE2      = 1 << 0,
-    DAV1D_X86_CPU_FLAG_SSSE3     = 1 << 1,
-    DAV1D_X86_CPU_FLAG_SSE41     = 1 << 2,
-    DAV1D_X86_CPU_FLAG_AVX2      = 1 << 3,
-    DAV1D_X86_CPU_FLAG_AVX512ICL = 1 << 4, /* F/CD/BW/DQ/VL/VNNI/IFMA/VBMI/VBMI2/
-                                            * VPOPCNTDQ/BITALG/GFNI/VAES/VPCLMULQDQ */
+    DAV1D_X86_CPU_FLAG_SSE2        = 1 << 0,
+    DAV1D_X86_CPU_FLAG_SSSE3       = 1 << 1,
+    DAV1D_X86_CPU_FLAG_SSE41       = 1 << 2,
+    DAV1D_X86_CPU_FLAG_AVX2        = 1 << 3,
+    DAV1D_X86_CPU_FLAG_AVX512ICL   = 1 << 4, /* F/CD/BW/DQ/VL/VNNI/IFMA/VBMI/VBMI2/
+                                              * VPOPCNTDQ/BITALG/GFNI/VAES/VPCLMULQDQ */
+    DAV1D_X86_CPU_FLAG_SLOW_GATHER = 1 << 5, /* Flag CPUs where gather instructions are slow enough
+                                              * to cause performance regressions. */
 };

 unsigned dav1d_get_cpu_flags_x86(void);
--- a/third_party/dav1d/src/x86/cpuid.asm
+++ b/third_party/dav1d/src/x86/cpuid.asm
@ -38,8 +38,8 @@ cglobal cpu_cpuid, 0, 5, 0, regs, leaf, subleaf
    cpuid
    mov  [r4+4*0], eax
    mov  [r4+4*1], ebx
-    mov  [r4+4*2], ecx
-    mov  [r4+4*3], edx
+    mov  [r4+4*2], edx
+    mov  [r4+4*3], ecx
 %if ARCH_X86_64
    mov       rbx, r5
 %endif
--- a/third_party/dav1d/src/x86/film_grain16_avx2.asm
+++ b/third_party/dav1d/src/x86/film_grain16_avx2.asm
--- a/third_party/dav1d/src/x86/film_grain_avx2.asm
+++ b/third_party/dav1d/src/x86/film_grain_avx2.asm
--- a/third_party/dav1d/src/x86/film_grain_init_tmpl.c
+++ b/third_party/dav1d/src/x86/film_grain_init_tmpl.c
@ -65,10 +65,13 @@ COLD void bitfn(dav1d_film_grain_dsp_init_x86)(Dav1dFilmGrainDSPContext *const c

    c->generate_grain_y = BF(dav1d_generate_grain_y, avx2);
    c->generate_grain_uv[DAV1D_PIXEL_LAYOUT_I420 - 1] = BF(dav1d_generate_grain_uv_420, avx2);
-    c->fgy_32x32xn = BF(dav1d_fgy_32x32xn, avx2);
-    c->fguv_32x32xn[DAV1D_PIXEL_LAYOUT_I420 - 1] = BF(dav1d_fguv_32x32xn_i420, avx2);
    c->generate_grain_uv[DAV1D_PIXEL_LAYOUT_I422 - 1] = BF(dav1d_generate_grain_uv_422, avx2);
    c->generate_grain_uv[DAV1D_PIXEL_LAYOUT_I444 - 1] = BF(dav1d_generate_grain_uv_444, avx2);
+
+    if (flags & DAV1D_X86_CPU_FLAG_SLOW_GATHER) return;
+
+    c->fgy_32x32xn = BF(dav1d_fgy_32x32xn, avx2);
+    c->fguv_32x32xn[DAV1D_PIXEL_LAYOUT_I420 - 1] = BF(dav1d_fguv_32x32xn_i420, avx2);
    c->fguv_32x32xn[DAV1D_PIXEL_LAYOUT_I422 - 1] = BF(dav1d_fguv_32x32xn_i422, avx2);
    c->fguv_32x32xn[DAV1D_PIXEL_LAYOUT_I444 - 1] = BF(dav1d_fguv_32x32xn_i444, avx2);
 #endif
--- a/third_party/dav1d/src/x86/itx16_avx2.asm
+++ b/third_party/dav1d/src/x86/itx16_avx2.asm
@ -313,7 +313,7 @@ ALIGN function_align
 %endmacro

 %macro INV_TXFM_4X4_12BPC_FN 2 ; type1, type2
-    INV_TXFM_FN    %1, %2, 0, 4x4, 12
+    INV_TXFM_FN          %1, %2, 0, 4x4, 12
 %ifidn %1_%2, dct_dct
    imul                r6d, [cq], 181
    mov                [cq], eobd ; 0
@ -340,21 +340,20 @@ ALIGN function_align
 %macro IDCT4_1D_PACKED_WORD 6 ; dst/src[1-2], tmp[1-3], rnd
    vpbroadcastd        m%5, [pw_m3784_1567]
    punpckhwd           m%3, m%2, m%1
-    psubw               m%4, m%1, m%2
-    paddw               m%1, m%2
-    vpbroadcastd        m%2, [pw_1567_3784]
-    punpcklqdq          m%1, m%4
-    vpbroadcastd        m%4, [pw_2896x8]
+    vpbroadcastd        m%4, [pw_1567_3784]
+    punpcklwd           m%2, m%1
+    vpbroadcastd        m%1, [pw_m2896_2896]
    pmaddwd             m%5, m%3
-    pmaddwd             m%3, m%2
-    pmulhrsw            m%1, m%4      ; t0 t1
-    paddd               m%5, m%6
-    paddd               m%3, m%6
-    psrad               m%5, 12
-    psrad               m%3, 12
+    pmaddwd             m%3, m%4
+    vpbroadcastd        m%4, [pw_2896_2896]
+    pmaddwd             m%1, m%2
+    pmaddwd             m%2, m%4
+    REPX     {paddd x, m%6}, m%5, m%3, m%1, m%2
+    REPX     {psrad x, 12 }, m%5, m%3, m%1, m%2
    packssdw            m%3, m%5      ; t3 t2
-    psubsw              m%2, m%1, m%3 ; out3 out2
-    paddsw              m%1, m%3      ; out0 out1
+    packssdw            m%2, m%1      ; t0 t1
+    paddsw              m%1, m%2, m%3 ; out0 out1
+    psubsw              m%2, m%3      ; out3 out2
 %endmacro

 INV_TXFM_4X4_FN dct, dct
@ -2581,6 +2580,33 @@ cglobal iidentity_8x8_internal_10bpc, 0, 7, 14, dst, stride, c, eob, tx2
    lea                dstq, [dstq+strideq*2]
    ret

+%macro TRANSPOSE_8X8_DWORD 12 ; src/dst[1-8], tmp[1-4]
+    punpckldq            m%9,  m%1,  m%2 ; aibj emfn
+    punpckhdq            m%1,  m%2       ; ckdl gohp
+    punpckldq           m%10,  m%3,  m%4 ; qyrz uCvD
+    punpckhdq            m%3,  m%4       ; sAtB wExF
+    punpckldq           m%11,  m%5,  m%6 ; GOHP KSLT
+    punpckhdq            m%5,  m%6       ; IQJR MUNV
+    punpckldq           m%12,  m%7,  m%8 ; WeXf aibj
+    punpckhdq            m%7,  m%8       ; YgZh ckdl
+    punpcklqdq           m%2,  m%9, m%10 ; aiqy emuC
+    punpckhqdq           m%9, m%10       ; bjrz fnvD
+    punpcklqdq           m%4,  m%1,  m%3 ; cksA gowE
+    punpckhqdq          m%10,  m%1,  m%3 ; dltB hpxF
+    punpcklqdq           m%6, m%11, m%12 ; GOWe KSai
+    punpckhqdq          m%11, m%12       ; HPXf LTbj
+    punpcklqdq           m%8,  m%5,  m%7 ; IQYg MUck
+    punpckhqdq          m%12,  m%5,  m%7 ; JRZh NVdl
+    vperm2i128           m%1,  m%2,  m%6, 0x20   ; out0
+    vperm2i128           m%5,  m%2,  m%6, 0x31   ; out4
+    vperm2i128           m%2,  m%9, m%11, 0x20   ; out1
+    vperm2i128           m%6,  m%9, m%11, 0x31   ; out5
+    vperm2i128           m%3,  m%4,  m%8, 0x20   ; out2
+    vperm2i128           m%7,  m%4,  m%8, 0x31   ; out6
+    vperm2i128           m%4, m%10, m%12, 0x20   ; out3
+    vperm2i128           m%8, m%10, m%12, 0x31   ; out7
+%endmacro
+
 INV_TXFM_8X8_FN dct, dct,      12
 INV_TXFM_8X8_FN dct, identity, 12
 INV_TXFM_8X8_FN dct, adst,     12
@ -2608,30 +2634,7 @@ ALIGN function_align
    ret
 ALIGN function_align
 .transpose_8x8:
-    punpckldq            m8,  m0,  m1 ; aibj emfn
-    punpckhdq            m0,  m0,  m1 ; ckdl gohp
-    punpckldq            m9,  m2,  m3 ; qyrz uCvD
-    punpckhdq            m2,  m2,  m3 ; sAtB wExF
-    punpckldq           m10,  m4,  m5 ; GOHP KSLT
-    punpckhdq            m4,  m4,  m5 ; IQJR MUNV
-    punpckldq           m11,  m6,  m7 ; WeXf aibj
-    punpckhdq            m6,  m6,  m7 ; YgZh ckdl
-    punpcklqdq           m1,  m8,  m9 ; aiqy emuC
-    punpckhqdq           m8,  m8,  m9 ; bjrz fnvD
-    punpcklqdq           m3,  m0,  m2 ; cksA gowE
-    punpckhqdq           m9,  m0,  m2 ; dltB hpxF
-    punpcklqdq           m5, m10, m11 ; GOWe KSai
-    punpckhqdq          m10, m10, m11 ; HPXf LTbj
-    punpcklqdq           m7,  m4,  m6 ; IQYg MUck
-    punpckhqdq          m11,  m4,  m6 ; JRZh NVdl
-    vperm2i128           m0,  m1,  m5, 0x20   ; out0
-    vperm2i128           m4,  m1,  m5, 0x31   ; out4
-    vperm2i128           m1,  m8, m10, 0x20   ; out1
-    vperm2i128           m5,  m8, m10, 0x31   ; out5
-    vperm2i128           m2,  m3,  m7, 0x20   ; out2
-    vperm2i128           m6,  m3,  m7, 0x31   ; out6
-    vperm2i128           m3,  m9, m11, 0x20   ; out3
-    vperm2i128           m7,  m9, m11, 0x31   ; out7
+    TRANSPOSE_8X8_DWORD 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11
    ret
 ALIGN function_align
 .round_shift4:
@ -3336,6 +3339,21 @@ INV_TXFM_8X16_FN identity, identity, 0, 12
 cglobal iidentity_8x16_internal_12bpc, 0, 7, 16, 32*8, dst, stride, c, eob, tx2
    jmp m(iidentity_8x16_internal_10bpc).pass1
 .pass2:
+    call .pass2_main
+    packssdw             m0, m8
+    packssdw             m1, m9
+    packssdw             m2, m10
+    packssdw             m3, m11
+    packssdw             m4, m12
+    packssdw             m5, m13
+    packssdw             m6, m14
+    packssdw            m13, m7, m15
+    vpbroadcastd         m7, [pixel_12bpc_max]
+    vpbroadcastd        m12, [pw_16384]
+    call m(iidentity_8x16_internal_10bpc).pass2_end
+    RET
+ALIGN function_align
+.pass2_main:
    mova               [cq], m7
    vpbroadcastd         m7, [clip_18b_min]
    REPX     {pmaxsd x, m7}, m0,  m1,  m2,  m3,  m4,  m5,  m6, \
@ -3358,18 +3376,7 @@ cglobal iidentity_8x16_internal_12bpc, 0, 7, 16, 32*8, dst, stride, c, eob, tx2
    paddd               m15, [cq]
    REPX     {psrad  x, 15}, m0,  m1,  m2,  m3,  m4,  m5,  m6,  m7, \
                             m8,  m9,  m10, m11, m12, m13, m14, m15
-    packssdw             m0, m8
-    packssdw             m1, m9
-    packssdw             m2, m10
-    packssdw             m3, m11
-    packssdw             m4, m12
-    packssdw             m5, m13
-    packssdw             m6, m14
-    packssdw            m13, m7, m15
-    vpbroadcastd         m7, [pixel_12bpc_max]
-    vpbroadcastd        m12, [pw_16384]
-    call m(iidentity_8x16_internal_10bpc).pass2_end
-    RET
+    ret

 %macro INV_TXFM_16X4_FN 2-3 10 ; type1, type2, bitdepth
    INV_TXFM_FN          %1, %2, 0, 16x4, %3
@ -4481,15 +4488,15 @@ cglobal iidentity_16x8_internal_12bpc, 0, 7, 16, 32*8, dst, stride, c, eob, tx2
    call m(idct_16x8_internal_10bpc).write_16x4_zero
    jmp m(idct_16x8_internal_10bpc).end2

-%macro INV_TXFM_16X16_FN 2-3 0 ; type1, type2, eob_offset
-    INV_TXFM_FN          %1, %2, %3, 16x16
+%macro INV_TXFM_16X16_FN 2-4 0,10 ; type1, type2, eob_offset, bitdepth (defaults: eob_offset=0, bitdepth=10)
+    INV_TXFM_FN          %1, %2, %3, 16x16, %4
 %ifidn %1_%2, dct_dct
    imul                r6d, [cq], 2896
    mov                [cq], eobd ; 0
    mov                 r3d, 16
    add                 r6d, 10240
    sar                 r6d, 14
-    jmp m(inv_txfm_add_dct_dct_16x4_10bpc).dconly2
+    jmp m(inv_txfm_add_dct_dct_16x4_%4bpc).dconly2 ; dct_dct shortcut now tail-jumps to the 16x4 .dconly2 of the selected bitdepth
 %endif
 %endmacro

@ -4499,9 +4506,10 @@ INV_TXFM_16X16_FN dct, adst
 INV_TXFM_16X16_FN dct, flipadst

 cglobal idct_16x16_internal_10bpc, 0, 7, 16, 32*24, dst, stride, c, eob, tx2
-    vpbroadcastd        m11, [pd_2048]
    vpbroadcastd        m12, [clip_18b_min]
    vpbroadcastd        m13, [clip_18b_max]
+.pass1:
+    vpbroadcastd        m11, [pd_2048]
    vpbroadcastd        m14, [pd_2896]
    lea                  r6, [rsp+32*4]
    sub                eobd, 36
@ -4605,6 +4613,7 @@ ALIGN function_align
    pmulhrsw             m2, m12
    pmulhrsw             m3, m12
    call m(idct_16x8_internal_10bpc).write_16x4_start
+.write_16x16_2:
    pmulhrsw             m0, m12, m4
    pmulhrsw             m1, m12, m5
    pmulhrsw             m2, m12, m6
@ -4747,6 +4756,7 @@ INV_TXFM_16X16_FN adst, flipadst
 cglobal iadst_16x16_internal_10bpc, 0, 7, 16, 32*24, dst, stride, c, eob, tx2
    vpbroadcastd        m13, [clip_18b_min]
    vpbroadcastd        m14, [clip_18b_max]
+.pass1:
    vpbroadcastd        m15, [pd_2896]
    lea                  r6, [rsp+32*4]
    sub                eobd, 36
@ -4882,6 +4892,7 @@ INV_TXFM_16X16_FN flipadst, flipadst
 cglobal iflipadst_16x16_internal_10bpc, 0, 7, 16, 32*24, dst, stride, c, eob, tx2
    vpbroadcastd        m13, [clip_18b_min]
    vpbroadcastd        m14, [clip_18b_max]
+.pass1:
    vpbroadcastd        m15, [pd_2896]
    lea                  r6, [rsp+32*4]
    sub                eobd, 36
@ -4993,6 +5004,7 @@ INV_TXFM_16X16_FN identity, dct, -92
 INV_TXFM_16X16_FN identity, identity

 cglobal iidentity_16x16_internal_10bpc, 0, 7, 16, 32*24, dst, stride, c, eob, tx2
+.pass1:
    vpbroadcastd        m15, [pd_11586]
    vpbroadcastd         m7, [pd_10240]
    lea                  r6, [rsp+32*4]
@ -5056,6 +5068,375 @@ cglobal iidentity_16x16_internal_10bpc, 0, 7, 16, 32*24, dst, stride, c, eob, tx
    mova                 m1, [cq+32*1]
    jmp m(idct_16x16_internal_10bpc).end

+INV_TXFM_16X16_FN dct, dct,       0, 12
+INV_TXFM_16X16_FN dct, identity, 28, 12
+INV_TXFM_16X16_FN dct, adst,      0, 12
+INV_TXFM_16X16_FN dct, flipadst,  0, 12
+
+cglobal idct_16x16_internal_12bpc, 0, 7, 16, 32*24, dst, stride, c, eob, tx2
+    vpbroadcastd        m12, [clip_20b_min] ; 12bpc entry: only the clip bounds in m12/m13 differ (20-bit range)...
+    vpbroadcastd        m13, [clip_20b_max]
+    jmp m(idct_16x16_internal_10bpc).pass1 ; ...then the 10bpc first pass is reused as-is
+.pass2:
+    mova         [cq+32* 8], m8 ; park m8-m15 in the coefficient buffer while m0-m7 go through .pass2_main
+    mova         [cq+32* 9], m9
+    mova         [cq+32*10], m10
+    mova         [cq+32*11], m11
+    mova         [cq+32*12], m12
+    mova         [cq+32*13], m13
+    mova         [cq+32*14], m14
+    mova         [cq+32*15], m15
+    call .pass2_main
+    packssdw             m0,  m1 ; narrow the 16 dword result registers to 8 word registers
+    packssdw             m1,  m2,  m3
+    packssdw             m2,  m4,  m5
+    packssdw             m3,  m6,  m7
+    packssdw             m4,  m8,  m9
+    packssdw             m5, m10, m11
+    packssdw             m6, m12, m13
+    packssdw             m7, m14, m15
+    mova          [r6-32*4], m0 ; stash the packed first-round results around r6
+    mova          [r6-32*3], m1
+    mova          [r6-32*2], m2
+    mova          [r6-32*1], m3
+    mova          [r6+32*0], m4
+    mova          [r6+32*1], m5
+    mova          [r6+32*2], m6
+    mova          [r6+32*3], m7
+    mova                 m0, [cq+32* 8] ; bring back the registers parked at the top of .pass2
+    mova                 m1, [cq+32* 9]
+    mova                 m2, [cq+32*10]
+    mova                 m3, [cq+32*11]
+    mova                 m4, [cq+32*12]
+    mova                 m5, [cq+32*13]
+    mova                 m6, [cq+32*14]
+    mova                 m7, [cq+32*15]
+    mov                  r5, r6 ; r5 keeps the address of the stashed first-round results for the shared .end tail
+    add                  r6, 32*16
+    call .pass2_main
+    jmp m(iadst_16x16_internal_12bpc).end ; shared pack/blend/write tail
+ALIGN function_align
+.write_16x16: ; helper, called from iidentity_16x16_internal_12bpc
+    mova [rsp+gprsize+32*0], m8 ; spill m8, m9 and m12 to the stack (m12 is overwritten just below)
+    mova [rsp+gprsize+32*1], m9
+    mova [rsp+gprsize+32*2], m12
+    vpbroadcastd        m12, [pw_16384]
+    pmulhrsw             m0, m12
+    pmulhrsw             m1, m12
+    pmulhrsw             m2, m12
+    pmulhrsw             m3, m12
+    call m(idct_16x8_internal_12bpc).write_16x4_start
+    call m(idct_16x8_internal_10bpc).write_16x4_zero
+    jmp m(idct_16x16_internal_10bpc).write_16x16_2 ; continue in the 10bpc code; its return goes back to our caller
+ALIGN function_align
+.pass2_main:
+    call m(idct_8x8_internal_12bpc).transpose_8x8
+    mova         [cq+32* 0], m0 ; save the even-numbered registers; they are reloaded (clipped) after main_oddhalf
+    mova         [cq+32* 1], m2
+    mova         [cq+32* 2], m4
+    mova         [cq+32* 3], m6
+    vpbroadcastd        m12, [clip_18b_min]
+    vpbroadcastd        m13, [clip_18b_max]
+    pmaxsd               m0, m12, m1 ; clip the odd-numbered registers into m0-m3 (presumably the layout main_oddhalf expects - confirm)
+    pmaxsd               m1, m12, m3
+    pmaxsd               m2, m12, m5
+    pmaxsd               m3, m12, m7
+    REPX    {pminsd x, m13}, m0, m1, m2, m3
+    test               eobd, eobd ; eobd < 0 selects the fast path
+    jge .pass2_slow
+    pxor                 m4, m4 ; fast path: the other four inputs are zero
+    REPX       {mova x, m4}, m5, m6, m7
+    jmp .pass2_fast
+.pass2_slow:
+    sub                  r6, 32*8 ; slow path: fetch eight more registers from 32*8 bytes below r6
+    mova                 m8, [r6-32*4]
+    mova                 m4, [r6-32*3]
+    mova                m10, [r6-32*2]
+    mova                 m5, [r6-32*1]
+    mova                m12, [r6+32*0]
+    mova                 m6, [r6+32*1]
+    mova                m14, [r6+32*2]
+    mova                 m7, [r6+32*3]
+    TRANSPOSE_8X8_DWORD 8, 4, 10, 5, 12, 6, 14, 7, 9, 11, 13, 15
+    mova         [cq+32* 4], m8 ; save m8/m10/m12/m14 next to the registers saved earlier
+    mova         [cq+32* 5], m10
+    mova         [cq+32* 6], m12
+    mova         [cq+32* 7], m14
+    vpbroadcastd        m12, [clip_18b_min] ; reload the clip bounds: m12/m13 were overwritten by the transpose above
+    vpbroadcastd        m13, [clip_18b_max]
+    REPX    {pmaxsd x, m12}, m4, m5, m6, m7
+    REPX    {pminsd x, m13}, m4, m5, m6, m7
+.pass2_fast:
+    vpbroadcastd        m11, [pd_2048] ; same m11/m14 constants the 10bpc pass1 loads before its main code
+    vpbroadcastd        m14, [pd_2896]
+    call m(idct_8x16_internal_10bpc).main_oddhalf
+    pmaxsd               m0, m12, [cq+32* 0] ; reload the saved registers with clipping; NOTE(review): relies on m12/m13 still holding the clip bounds after main_oddhalf - confirm
+    pmaxsd               m1, m12, [cq+32* 1]
+    pmaxsd               m2, m12, [cq+32* 2]
+    pmaxsd               m3, m12, [cq+32* 3]
+    REPX    {pminsd x, m13}, m0, m1, m2, m3
+    test               eobd, eobd ; same fast/slow split as above
+    jge .pass2_slow2
+    pxor                 m4, m4
+    REPX       {mova x, m4}, m5, m6, m7
+    jmp .pass2_fast2
+.pass2_slow2:
+    pmaxsd               m4, m12, [cq+32* 4]
+    pmaxsd               m5, m12, [cq+32* 5]
+    pmaxsd               m6, m12, [cq+32* 6]
+    pmaxsd               m7, m12, [cq+32* 7]
+    REPX    {pminsd x, m13}, m4, m5, m6, m7
+.pass2_fast2:
+    call m(idct_8x8_internal_10bpc).main
+    call m(idct_8x16_internal_10bpc).main_evenhalf
+    psrad               m11, 8  ; pd_8 (2048 >> 8), the rounding bias for the >>4 below
+    REPX    {paddd  x, m11}, m0, m1, m2, m3, m4, m5, m6, m7
+    call m(idct_16x8_internal_10bpc).pass1_rotations
+    REPX       {psrad x, 4}, m0, m1, m2,  m3,  m4,  m5,  m6,  m7, \
+                             m8, m9, m10, m11, m12, m13, m14, m15
+    ret
+
+INV_TXFM_16X16_FN adst, dct,      0, 12
+INV_TXFM_16X16_FN adst, adst,     0, 12
+INV_TXFM_16X16_FN adst, flipadst, 0, 12
+
+cglobal iadst_16x16_internal_12bpc, 0, 7, 16, 32*24, dst, stride, c, eob, tx2
+    vpbroadcastd        m13, [clip_20b_min] ; 12bpc entry: load the 20-bit clip bounds into m13/m14...
+    vpbroadcastd        m14, [clip_20b_max]
+    jmp m(iadst_16x16_internal_10bpc).pass1 ; ...then reuse the 10bpc first pass
+.pass2:
+    call .pass2_part1 ; first round: park m8-m15, then transpose and transform m0-m7
+    call m(iadst_16x8_internal_10bpc).pass1_rotations
+    call .pass2_part2 ; pack and stash the first-round results, then run the second round on the parked registers
+    call m(iadst_16x8_internal_10bpc).pass1_rotations
+.pass2_part3: ; also jumped to from iflipadst_16x16_internal_12bpc
+    REPX      {psrad x, 4 }, m0, m1, m2, m3, m12, m13, m14, m15
+    REPX      {psrad x, 16}, m4, m5, m6, m7, m8,  m9,  m10, m11
+.end: ; shared tail, also jumped to from idct_16x16_internal_12bpc
+    packssdw            m15, m14 ; narrow dwords to words, working downwards so no source is overwritten before it is read
+    packssdw            m14, m13, m12
+    packssdw            m13, m11, m10
+    packssdw            m12,  m9,  m8
+    packssdw            m11,  m7,  m6
+    packssdw            m10,  m5,  m4
+    packssdw             m7,  m3,  m2
+    packssdw             m6,  m1,  m0
+    vpblendd             m0, m6, [r5-32*4], 0x33 ; 0x33: low qword of each 128-bit lane comes from the results stashed at r5, the rest from the register
+    vpblendd             m1, m6, [r5-32*4], 0xcc ; 0xcc: high qword of each 128-bit lane comes from the stashed results
+    vpblendd             m2, m7, [r5-32*3], 0x33
+    vpblendd             m3, m7, [r5-32*3], 0xcc
+    vpermq               m0, m0, q3120
+    vpermq               m1, m1, q2031
+    vpermq               m2, m2, q3120
+    vpermq               m3, m3, q2031
+    call m(idct_16x8_internal_12bpc).write_16x4_start
+    call m(idct_16x8_internal_10bpc).write_16x4_zero
+    vpblendd             m0, m10, [r5-32*2], 0x33 ; same blend/permute/write sequence for the next register pair
+    vpblendd             m1, m10, [r5-32*2], 0xcc
+    vpblendd             m2, m11, [r5-32*1], 0x33
+    vpblendd             m3, m11, [r5-32*1], 0xcc
+    vpermq               m0, m0, q3120
+    vpermq               m1, m1, q2031
+    vpermq               m2, m2, q3120
+    vpermq               m3, m3, q2031
+    call m(idct_16x8_internal_10bpc).write_16x4_zero
+    vpblendd             m0, m12, [r5+32*0], 0x33
+    vpblendd             m1, m12, [r5+32*0], 0xcc
+    vpblendd             m2, m13, [r5+32*1], 0x33
+    vpblendd             m3, m13, [r5+32*1], 0xcc
+    vpermq               m0, m0, q3120
+    vpermq               m1, m1, q2031
+    vpermq               m2, m2, q3120
+    vpermq               m3, m3, q2031
+    call m(idct_16x8_internal_10bpc).write_16x4_zero
+    vpblendd             m0, m14, [r5+32*2], 0x33
+    vpblendd             m1, m14, [r5+32*2], 0xcc
+    vpblendd             m2, m15, [r5+32*3], 0x33
+    vpblendd             m3, m15, [r5+32*3], 0xcc
+    vpermq               m0, m0, q3120
+    vpermq               m1, m1, q2031
+    vpermq               m2, m2, q3120
+    vpermq               m3, m3, q2031
+    call m(idct_16x8_internal_10bpc).write_16x4_zero
+    RET
+ALIGN function_align
+.pass2_part1:
+    mova         [cq+32* 8], m8 ; park m8-m15 in the coefficient buffer; .pass2_part2 reloads them into m0-m7
+    mova         [cq+32* 9], m9
+    mova         [cq+32*10], m10
+    mova         [cq+32*11], m11
+    mova         [cq+32*12], m12
+    mova         [cq+32*13], m13
+    mova         [cq+32*14], m14
+    mova         [cq+32*15], m15
+.pass2_main: ; reached by fall-through from .pass2_part1 and by the jump at the end of .pass2_part2
+    call m(idct_8x8_internal_12bpc).transpose_8x8
+    mova         [cq+32* 0], m0 ; save registers 0/3/4/7; they are reloaded (clipped) after main_part1
+    mova         [cq+32* 1], m3
+    mova         [cq+32* 2], m4
+    mova         [cq+32* 3], m7
+    vpbroadcastd        m13, [clip_18b_min]
+    vpbroadcastd        m14, [clip_18b_max]
+    pmaxsd               m0, m13, m2 ; clip registers 2/6/5/1 into m0/m2/m5/m7 (presumably the operand layout main_part1 expects - confirm)
+    pmaxsd               m2, m13, m6
+    pmaxsd               m5, m13, m5
+    pmaxsd               m7, m13, m1
+    REPX    {pminsd x, m14}, m0, m2, m5, m7
+    test               eobd, eobd ; eobd < 0 selects the fast path
+    jge .pass2_slow
+    pxor                 m1, m1 ; fast path: the other four inputs are zero
+    REPX       {mova x, m1}, m3, m4, m6
+    jmp .pass2_fast
+.pass2_slow:
+    sub                  r6, 32*8 ; slow path: fetch eight more registers from 32*8 bytes below r6
+    mova                 m8, [r6-32*4]
+    mova                 m3, [r6-32*3]
+    mova                 m4, [r6-32*2]
+    mova                m11, [r6-32*1]
+    mova                m12, [r6+32*0]
+    mova                 m1, [r6+32*1]
+    mova                 m6, [r6+32*2]
+    mova                m15, [r6+32*3]
+    TRANSPOSE_8X8_DWORD 8, 3, 4, 11, 12, 1, 6, 15, 13, 9, 10, 14
+    mova         [cq+32* 4], m8 ; save m8/m11/m12/m15 next to the registers saved earlier
+    mova         [cq+32* 5], m11
+    mova         [cq+32* 6], m12
+    mova         [cq+32* 7], m15
+    vpbroadcastd        m13, [clip_18b_min] ; reload the clip bounds: m13/m14 served as transpose temporaries
+    vpbroadcastd        m14, [clip_18b_max]
+    REPX    {pmaxsd x, m13}, m1, m3, m4, m6
+    REPX    {pminsd x, m14}, m1, m3, m4, m6
+.pass2_fast:
+    vpbroadcastd        m12, [pd_2048] ; constants for the shared 10bpc iadst helpers
+    vpbroadcastd        m15, [pd_2896]
+    call m(iadst_16x8_internal_10bpc).main_part1
+    pmaxsd               m0, m13, [cq+32* 0] ;  0
+    pmaxsd               m7, m13, [cq+32* 1] ;  3
+    pmaxsd               m2, m13, [cq+32* 2] ;  4
+    pmaxsd               m5, m13, [cq+32* 3] ;  7
+    REPX    {pminsd x, m14}, m0, m2, m5, m7
+    test               eobd, eobd ; same fast/slow split as above
+    jge .pass2_slow2
+    pxor                 m1, m1
+    REPX       {mova x, m1}, m3, m4, m6
+    jmp .pass2_fast2
+.pass2_slow2:
+    pmaxsd               m4, m13, [cq+32* 4] ;  8
+    pmaxsd               m3, m13, [cq+32* 5] ; 11
+    pmaxsd               m6, m13, [cq+32* 6] ; 12
+    pmaxsd               m1, m13, [cq+32* 7] ; 15
+    REPX    {pminsd x, m14}, m1, m3, m4, m6
+.pass2_fast2:
+    call m(iadst_16x8_internal_10bpc).main_part2
+    vpbroadcastd        m14, [pd_34816] ; leave constants in m13/m14/m15 for the caller (presumably consumed by the pass1_rotations call that follows - confirm)
+    psrld               m15, 11              ; pd_1
+    psubd               m13, m14, m15        ; pd_34815
+    pslld               m15, 3               ; pd_8
+    ret
+ALIGN function_align
+.pass2_part2:
+    REPX      {psrad x, 4 }, m0, m1, m2, m3, m12, m13, m14, m15
+    REPX      {psrad x, 16}, m4, m5, m6, m7, m8,  m9,  m10, m11
+    packssdw             m0,  m1 ; narrow the first-round results to words
+    packssdw             m1,  m2,  m3
+    packssdw             m2,  m4,  m5
+    packssdw             m3,  m6,  m7
+    packssdw             m4,  m8,  m9
+    packssdw             m5, m10, m11
+    packssdw             m6, m12, m13
+    packssdw             m7, m14, m15
+    mova          [r6-32*4], m0 ; stash them around r6; .end reads them back through r5
+    mova          [r6-32*3], m1
+    mova          [r6-32*2], m2
+    mova          [r6-32*1], m3
+    mova          [r6+32*0], m4
+    mova          [r6+32*1], m5
+    mova          [r6+32*2], m6
+    mova          [r6+32*3], m7
+    mova                 m0, [cq+32* 8] ; reload the registers parked by .pass2_part1
+    mova                 m1, [cq+32* 9]
+    mova                 m2, [cq+32*10]
+    mova                 m3, [cq+32*11]
+    mova                 m4, [cq+32*12]
+    mova                 m5, [cq+32*13]
+    mova                 m6, [cq+32*14]
+    mova                 m7, [cq+32*15]
+    mov                  r5, r6 ; remember where the first-round results were stashed
+    add                  r6, 32*16
+    jmp .pass2_main ; tail-jump: the ret in .pass2_main returns to the caller of .pass2_part2
+
+INV_TXFM_16X16_FN flipadst, dct,      0, 12
+INV_TXFM_16X16_FN flipadst, adst,     0, 12
+INV_TXFM_16X16_FN flipadst, flipadst, 0, 12
+
+cglobal iflipadst_16x16_internal_12bpc, 0, 7, 16, 32*24, dst, stride, c, eob, tx2
+    vpbroadcastd        m13, [clip_20b_min] ; 12bpc entry: load the 20-bit clip bounds into m13/m14...
+    vpbroadcastd        m14, [clip_20b_max]
+    jmp m(iflipadst_16x16_internal_10bpc).pass1 ; ...then reuse the 10bpc first pass
+.pass2:
+    call m(iadst_16x16_internal_12bpc).pass2_part1 ; same helpers as the iadst pass 2; only the rotation routine differs
+    call m(iflipadst_16x8_internal_10bpc).pass1_rotations
+    call m(iadst_16x16_internal_12bpc).pass2_part2
+    call m(iflipadst_16x8_internal_10bpc).pass1_rotations
+    jmp m(iadst_16x16_internal_12bpc).pass2_part3 ; shared final shift, pack and write tail
+
+INV_TXFM_16X16_FN identity, dct,    -92, 12
+INV_TXFM_16X16_FN identity, identity, 0, 12
+
+cglobal iidentity_16x16_internal_12bpc, 0, 7, 16, 32*24, dst, stride, c, eob, tx2
+    jmp m(iidentity_16x16_internal_10bpc).pass1 ; the first pass is shared with the 10bpc version unchanged
+.pass2:
+    call m(iidentity_8x16_internal_12bpc).pass2_main ; first round on the registers handed over by pass 1
+    call m(idct_16x16_internal_10bpc).transpose_fast
+    test               eobd, eobd
+    jl .pass2_fast ; eobd < 0: skip the second round
+    mova         [cq+32* 8], m0 ; keep the first-round results in the coefficient buffer
+    mova         [cq+32* 9], m1
+    mova         [cq+32*10], m2
+    mova         [cq+32*11], m3
+    mova         [cq+32*12], m4
+    mova         [cq+32*13], m5
+    mova         [cq+32*14], m6
+    mova         [cq+32*15], m7
+    mova                 m8, [r6-32*4] ; load the second round's 16 input registers: m8-m15 from around r6...
+    mova                 m9, [r6-32*3]
+    mova                m10, [r6-32*2]
+    mova                m11, [r6-32*1]
+    mova                m12, [r6+32*0]
+    mova                m13, [r6+32*1]
+    mova                m14, [r6+32*2]
+    mova                m15, [r6+32*3]
+    sub                  r6, 32*8
+    mova                 m0, [r6-32*4] ; ...and m0-m7 from 32*8 bytes lower
+    mova                 m1, [r6-32*3]
+    mova                 m2, [r6-32*2]
+    mova                 m3, [r6-32*1]
+    mova                 m4, [r6+32*0]
+    mova                 m5, [r6+32*1]
+    mova                 m6, [r6+32*2]
+    mova                 m7, [r6+32*3]
+    call m(iidentity_8x16_internal_12bpc).pass2_main
+    call m(idct_16x8_internal_10bpc).transpose2
+    mova                 m8, m0 ; second-round results move up to m8-m15
+    mova                 m9, m1
+    mova                m10, m2
+    mova                m11, m3
+    mova                m12, m4
+    mova                m13, m5
+    mova                m14, m6
+    mova                m15, m7
+    mova                 m0, [cq+32* 8] ; first-round results return to m0-m7
+    mova                 m1, [cq+32* 9]
+    mova                 m2, [cq+32*10]
+    mova                 m3, [cq+32*11]
+    mova                 m4, [cq+32*12]
+    mova                 m5, [cq+32*13]
+    mova                 m6, [cq+32*14]
+    mova                 m7, [cq+32*15]
+.pass2_fast:
+    call m(idct_16x16_internal_12bpc).write_16x16 ; shared 12bpc output helper
+    RET
+
 %macro IDCT32_END 6 ; in/out1, out2, tmp[1-3], shift
    mova                m%4, [r6+32*(%1-4)]
    mova                m%2, [r5+32*(3-%1)]
--- a/third_party/dav1d/src/x86/itx16_sse.asm
+++ b/third_party/dav1d/src/x86/itx16_sse.asm
@ -101,6 +101,8 @@ pixel_10bpc_max: times 8 dw  0x03ff

 pw_1567_3784:    times 4 dw  1567,  3784
 pw_m3784_1567:   times 4 dw -3784,  1567
+pw_2896_2896:    times 4 dw  2896,  2896
+pw_m2896_2896:   times 4 dw -2896,  2896

 clip_18b_min: times 4 dd -0x20000
 clip_18b_max: times 4 dd  0x1ffff
@ -429,22 +431,19 @@ cglobal idct_4x4_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2
    ; m0 = in0 in1
    ; m1 = in2 in3
    ; m5 = pd_2048
-    mova                 m4, [o(pw_m3784_1567)]
    punpckhwd            m2, m1, m0
-    psubw                m3, m0, m1
-    paddw                m0, m1
-    punpcklqdq           m0, m3
-    pmaddwd              m4, m2
+    punpcklwd            m1, m0
+    pmaddwd              m4, m2, [o(pw_m3784_1567)]
    pmaddwd              m2, [o(pw_1567_3784)]
-    pmulhrsw             m0, [o(pw_2896x8)]     ; t0 t1
-    paddd                m4, m5
-    paddd                m2, m5
-    psrad                m4, 12
-    psrad                m2, 12
-    packssdw             m2, m4     ; t3 t2
-    psubsw               m1, m0, m2 ; tmp3 tmp2
-    paddsw               m0, m2     ; tmp0 tmp1
+    pmaddwd              m0, m1, [o(pw_m2896_2896)]
+    pmaddwd              m1, [o(pw_2896_2896)]
+    REPX      {paddd x, m5}, m4, m2, m0, m1
    packssdw             m5, m5     ; pw_2048
+    REPX      {psrad x, 12}, m4, m2, m0, m1
+    packssdw             m2, m4     ; t3 t2
+    packssdw             m1, m0     ; t0 t1
+    paddsw               m0, m1, m2 ; out0 out1
+    psubsw               m1, m2     ; out3 out2
    pmulhrsw             m0, m5
    pmulhrsw             m1, m5
    movq                 m2, [dstq+strideq*0]
--- a/third_party/dav1d/src/x86/itx_init_tmpl.c
+++ b/third_party/dav1d/src/x86/itx_init_tmpl.c
@ -273,6 +273,7 @@ COLD void bitfn(dav1d_itx_dsp_init_x86)(Dav1dInvTxfmDSPContext *const c,
    assign_itx16_bpc_fn(R,  8, 16, 12, avx2);
    assign_itx16_bpc_fn(R, 16,  4, 12, avx2);
    assign_itx16_bpc_fn(R, 16,  8, 12, avx2);
+    assign_itx12_bpc_fn( , 16, 16, 12, avx2);
 #endif

    if (bpc > 10) return;
--- a/third_party/dav1d/src/x86/mc16_avx2.asm
+++ b/third_party/dav1d/src/x86/mc16_avx2.asm
@ -3017,11 +3017,11 @@ cglobal prep_8tap_scaled_16bpc, 4, 14, 16, 0xe0, tmp, src, ss, w, h, mx, my, dx,
 %endif
    dec                  hd
    jz .ret
-    mova                xm8, [rsp+0x00]
-    movd                xm9, [rsp+0x30]
    add                 myd, dyd
    test                myd, ~0x3ff
    jz .w4_loop
+    mova                xm8, [rsp+0x00]
+    movd                xm9, [rsp+0x30]
    movu                xm4, [srcq]
    movu                xm5, [srcq+r4]
    test                myd, 0x400
@ -5789,7 +5789,7 @@ cglobal resize_16bpc, 6, 12, 16, dst, dst_stride, src, src_stride, \
    vpbroadcastd         m5, dxm
    vpbroadcastd         m8, mx0m
    vpbroadcastd         m6, src_wm
- DEFINE_ARGS dst, dst_stride, src, src_stride, dst_w, h, x, picptr, _, pxmax
+ DEFINE_ARGS dst, dst_stride, src, src_stride, dst_w, h, x, _, _, pxmax
    LEA                  r7, $$
 %define base r7-$$
    vpbroadcastd         m3, [base+pd_64]
--- a/third_party/dav1d/src/x86/mc16_avx512.asm
+++ b/third_party/dav1d/src/x86/mc16_avx512.asm
@ -131,6 +131,16 @@ warp8x8_permC: db -1,  0, -1,  1, -1,  8, -1,  9, -1,  4, -1,  5, -1, 12, -1, 13
 warp8x8_permD: db -1,  2, -1,  3, -1, 10, -1, 11, -1,  6, -1,  7, -1, 14, -1, 15
 warp_shift_h:  db 11, 19, 11, 19, 43, 51, 43, 51, 13, 21, 13, 21, 45, 53, 45, 53
 blend_shuf:    db  0,  1,  0,  1,  0,  1,  0,  1,  2,  3,  2,  3,  2,  3,  2,  3
+resize_permA:  dd  0,  4,  8, 12,  1,  5,  9, 13, 16, 20, 24, 28, 17, 21, 25, 29
+resize_permB:  dd  2,  6, 10, 14,  3,  7, 11, 15, 18, 22, 26, 30, 19, 23, 27, 31
+resize_permC:  dq  0,  1,  4,  5,  8,  9, 12, 13
+resize_permD:  dq  2,  3,  6,  7, 10, 11, 14, 15
+resize_permE:  dq  0,  2,  4,  6
+resize_shufA:  db -1,  0, -1,  1, -1,  4, -1,  5, -1,  8, -1,  9, -1, 12, -1, 13
+resize_shufB:  db -1,  2, -1,  3, -1,  6, -1,  7, -1, 10, -1, 11, -1, 14, -1, 15
+rescale_mul:   dd  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15
+resize_shuf:   db  0,  1,  0,  1,  0,  1,  0,  1,  0,  1,  2,  3,  4,  5,  6,  7
+               db  8,  9, 10, 11, 12, 13, 14, 15, 14, 15, 14, 15, 14, 15, 14, 15

 prep_hv_shift:    dq  6,  4
 put_bilin_h_rnd:  dw  8,  8, 10, 10
@ -151,9 +161,12 @@ pw_m512:  times 2 dw -512
 pw_2:     times 2 dw 2
 pw_64:    times 2 dw 64
 pd_32:    dd 32
+pd_63:    dd 63
 pd_128:   dd 128
 pd_640:   dd 640
 pd_2176:  dd 2176
+pd_16384: dd 16384
+pd_0_4:   dd 0, 4

 %define pw_16 prep_mul
 %define pd_512 warp_8x8_rnd_h
@ -237,6 +250,7 @@ cextern mc_subpel_filters

 cextern mc_warp_filter
 cextern obmc_masks_avx2
+cextern resize_filter

 SECTION .text

@ -4708,4 +4722,145 @@ cglobal blend_h_16bpc, 3, 7, 9, dst, ds, tmp, w, h, mask
    jl .w128
    RET

+cglobal resize_16bpc, 6, 12, 32, dst, dst_stride, src, src_stride, \
+                                 dst_w, h, src_w, dx, mx0, pxmax
+    sub          dword mx0m, 4<<14 ; AVX-512 resize: bias mx0 by 4 source pixels (mx carries 14 fractional bits, see the >>14 below)
+    sub        dword src_wm, 8 ; upper clip bound for the source x position becomes src_w-8
+    mov                  r6, ~0
+    vpbroadcastd         m5, dxm
+    vpbroadcastd         m8, mx0m
+    vpbroadcastd         m6, src_wm
+    kmovq                k6, r6 ; k6 = all-ones mask template, copied into k1-k4 before every gather because gathers clear their mask register
+ DEFINE_ARGS dst, dst_stride, src, src_stride, dst_w, h, x, _, _, pxmax
+    LEA                  r7, $$
+%define base r7-$$
+    vpbroadcastd         m3, [base+pd_16384]
+    vpbroadcastd         m7, [base+pd_63]
+    mova                m24, [base+resize_permA]
+    mova                m25, [base+resize_permB]
+    mova                m26, [base+resize_permC]
+    mova                m27, [base+resize_permD]
+    vbroadcasti32x4     m28, [base+resize_shufA]
+    vbroadcasti32x4     m29, [base+resize_shufB]
+    mova                m30, [base+resize_permE]
+    vpbroadcastw       ym31, pxmaxm
+    vpdpwssd             m8, m5, [base+rescale_mul] ; mx+dx*[0-15]
+    pslld                m5, 4                      ; dx*16
+    pslld                m6, 14
+    pxor                 m2, m2 ; m2 = 0: lower clip bound and accumulator seed
+.loop_y:
+    xor                  xd, xd
+    mova                 m4, m8     ; per-line working version of mx
+.loop_x:
+    pmaxsd               m0, m4, m2
+    psrad                m9, m4, 8  ; filter offset (unmasked)
+    pminsd               m0, m6     ; iclip(mx, 0, src_w-8)
+    psubd                m1, m4, m0 ; pshufb offset
+    psrad                m0, 14     ; clipped src_x offset
+    psrad                m1, 14     ; pshufb edge_emu offset
+    vptestmd             k5, m1, m1 ; k5 = lanes whose edge_emu offset is non-zero
+    pand                 m9, m7     ; filter offset (masked)
+    ktestw               k5, k5
+    jz .load ; no lane needs edge emulation: take the plain gather path
+    vpbroadcastq        m14, [base+pd_0_4]
+    vpermq              m10, m0, q1100
+    vpermq              m11, m0, q3322
+    vpermq              m20, m1, q1100
+    vpermq              m21, m1, q3322
+    punpckldq           m10, m10
+    punpckldq           m11, m11
+    punpckldq           m20, m20
+    punpckldq           m21, m21
+    paddd               m10, m14
+    paddd               m11, m14
+    paddd               m20, m14
+    paddd               m21, m14
+    vextracti32x8      ym12, m10, 1
+    vextracti32x8      ym13, m11, 1
+    vextracti32x8      ym22, m20, 1
+    vextracti32x8      ym23, m21, 1
+    kmovq                k1, k6
+    kmovq                k2, k6
+    kmovq                k3, k6
+    kmovq                k4, k6
+    vpgatherdq      m16{k1}, [srcq+ym10*2] ; 0 1 2 3
+    vpgatherdq      m17{k2}, [srcq+ym11*2] ; 4 5 6 7
+    vpgatherdq      m18{k3}, [srcq+ym12*2] ; 8 9 A B
+    vpgatherdq      m19{k4}, [srcq+ym13*2] ; C D E F
+    kmovq                k1, k6
+    kmovq                k2, k6
+    kmovq                k3, k6
+    kmovq                k4, k6
+    vpgatherdq       m0{k1}, [base+resize_shuf+8+ym20*2] ; fetch pshufb controls from resize_shuf, indexed by the edge_emu offset
+    vpgatherdq       m1{k2}, [base+resize_shuf+8+ym21*2]
+    vpgatherdq      m14{k3}, [base+resize_shuf+8+ym22*2]
+    vpgatherdq      m15{k4}, [base+resize_shuf+8+ym23*2]
+    pshufb              m16, m0
+    pshufb              m17, m1
+    pshufb              m18, m14
+    pshufb              m19, m15
+    mova                m20, m24
+    mova                m22, m24
+    mova                m21, m25
+    mova                m23, m25
+    vpermi2d            m20, m16, m17 ; 0-3a 0-3b 4-7a 4-7b
+    vpermi2d            m21, m16, m17 ; 0-3c 0-3d 4-7c 4-7d
+    vpermi2d            m22, m18, m19 ; 8-Ba 8-Bb C-Fa C-Fb
+    vpermi2d            m23, m18, m19 ; 8-Bc 8-Bd C-Fc C-Fd
+    mova                m15, m26
+    mova                m17, m26
+    mova                m16, m27
+    mova                m18, m27
+    vpermi2q            m15, m20, m22 ; 0-3a 4-7a 8-Ba C-Fa
+    vpermi2q            m16, m20, m22 ; 0-3b 4-7b 8-Bb C-Fb
+    vpermi2q            m17, m21, m23 ; 0-3c 4-7c 8-Bc C-Fc
+    vpermi2q            m18, m21, m23 ; 0-3d 4-7d 8-Bd C-Fd
+    kmovq                k1, k6
+    kmovq                k2, k6
+    vpgatherdd      m11{k1}, [base+resize_filter+m9*8+0] ; gather the filter taps: two dwords per lane from an 8-byte-strided table
+    vpgatherdd      m13{k2}, [base+resize_filter+m9*8+4]
+    pshufb              m10, m11, m28 ; move tap bytes into the high byte of each word (the -1 shuffle indices zero the low byte)
+    pshufb              m11, m11, m29
+    pshufb              m12, m13, m28
+    pshufb              m13, m13, m29
+    jmp .filter
+.load:
+    kmovq                k1, k6
+    kmovq                k2, k6
+    kmovq                k3, k6
+    kmovq                k4, k6
+    vpgatherdd      m11{k1}, [base+resize_filter+m9*8+0] ; same tap gather and expansion as on the edge-emulation path
+    vpgatherdd      m13{k2}, [base+resize_filter+m9*8+4]
+    pshufb              m10, m11, m28
+    pshufb              m11, m11, m29
+    pshufb              m12, m13, m28
+    pshufb              m13, m13, m29
+    vpgatherdd      m15{k3}, [srcq+m0*2+ 0] ; gather the source pixels directly, one dword (two pixels) per lane at a time
+    vpgatherdd      m16{k4}, [srcq+m0*2+ 4]
+    kmovq                k1, k6
+    kmovq                k2, k6
+    vpgatherdd      m17{k1}, [srcq+m0*2+ 8]
+    vpgatherdd      m18{k2}, [srcq+m0*2+12]
+.filter:
+    mova                m14, m2 ; accumulator = 0
+    vpdpwssd            m14, m15, m10
+    vpdpwssd            m14, m16, m11
+    vpdpwssd            m14, m17, m12
+    vpdpwssd            m14, m18, m13
+    psubd               m14, m3, m14 ; m14 = 16384 - sum
+    psrad               m14, 15
+    packusdw            m14, m14 ; saturate to unsigned words
+    vpermq              m14, m30, m14
+    pminsw             ym14, ym31 ; clamp to pxmax
+    mova        [dstq+xq*2], ym14 ; store 16 output pixels
+    paddd                m4, m5 ; advance mx by dx*16
+    add                  xd, 16
+    cmp                  xd, dst_wd
+    jl .loop_x
+    add                dstq, dst_strideq
+    add                srcq, src_strideq
+    dec                  hd
+    jg .loop_y
+    RET
+
 %endif ; ARCH_X86_64
--- a/third_party/dav1d/src/x86/mc16_sse.asm
+++ b/third_party/dav1d/src/x86/mc16_sse.asm
--- a/third_party/dav1d/src/x86/mc_avx2.asm
+++ b/third_party/dav1d/src/x86/mc_avx2.asm
@ -5046,11 +5046,11 @@ cglobal resize_8bpc, 6, 12, 16, dst, dst_stride, src, src_stride, \
    vpbroadcastd         m8, mx0m
    vpbroadcastd         m6, src_wm

-    DEFINE_ARGS dst, dst_stride, src, src_stride, dst_w, h, x, picptr
+    DEFINE_ARGS dst, dst_stride, src, src_stride, dst_w, h, x
    LEA                  r7, $$
 %define base r7-$$

-    vpbroadcastd         m3, [base+pw_m256]
+    vpbroadcastd        xm3, [base+pw_m256]
    vpbroadcastd         m7, [base+pd_63]
    vbroadcasti128      m15, [base+pb_8x0_8x8]
    pmaddwd              m2, m5, [base+rescale_mul] ; dx*[0,1,2,3,4,5,6,7]
--- a/third_party/dav1d/src/x86/mc_avx512.asm
+++ b/third_party/dav1d/src/x86/mc_avx512.asm
@ -193,29 +193,39 @@ bilin_h_shuf4:  db  1,  0,  2,  1,  3,  2,  4,  3,  9,  8, 10,  9, 11, 10, 12, 1
 bilin_h_shuf8:  db  1,  0,  2,  1,  3,  2,  4,  3,  5,  4,  6,  5,  7,  6,  8,  7
 bilin_v_shuf4:  db  4,  0,  5,  1,  6,  2,  7,  3,  8,  4,  9,  5, 10,  6, 11,  7
 blend_shuf:     db  0,  1,  0,  1,  0,  1,  0,  1,  2,  3,  2,  3,  2,  3,  2,  3
+rescale_mul:    dd  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15
+resize_shuf:    db  0,  0,  0,  0,  0,  1,  2,  3,  4,  5,  6,  7,  7,  7,  7,  7
+resize_permA:   dd  0,  2,  4,  6,  8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30
+resize_permB:   dd  1,  3,  5,  7,  9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31
+resize_permC:   dd  0,  4,  8, 12
 pb_02461357:    db  0,  2,  4,  6,  1,  3,  5,  7

 wm_420_perm64:  dq 0xfedcba9876543210
 wm_sign:        dd 0x40804080, 0xc0c0c0c0, 0x40404040

-pb_127:   times 4 db 127
-pw_m128   times 2 dw -128
-pw_1024:  times 2 dw 1024
-pw_2048:  times 2 dw 2048
-pw_6903:  times 2 dw 6903
-pw_8192:  times 2 dw 8192
-pd_32:            dd 32
-pd_34:            dd 34
-pd_512:           dd 512
-pd_32768:         dd 32768
+pb_8x0_8x8: times 8 db 0
+            times 8 db 8
+pb_127:     times 4 db 127
+pw_m128     times 2 dw -128
+pw_m256:    times 2 dw -256
+pw_1024:    times 2 dw 1024
+pw_2048:    times 2 dw 2048
+pw_6903:    times 2 dw 6903
+pw_8192:    times 2 dw 8192
+pd_32:              dd 32
+pd_34:              dd 34
+pd_63:              dd 63
+pd_512:             dd 512
+pd_32768:           dd 32768

 %define pb_m64 (wm_sign+4)
 %define pb_64  (wm_sign+8)
 %define pd_2   (pd_0to7+8)

 cextern mc_subpel_filters
-cextern mc_warp_filter
 %define subpel_filters (mangle(private_prefix %+ _mc_subpel_filters)-8)
+cextern mc_warp_filter
+cextern resize_filter

 %macro BASE_JMP_TABLE 3-*
    %xdefine %1_%2_table (%%table - %3)
@ -4450,4 +4460,87 @@ cglobal blend_h_8bpc, 3, 7, 6, dst, ds, tmp, w, h, mask
    jl .w128
    RET

+cglobal resize_8bpc, 6, 12, 19, dst, dst_stride, src, src_stride, \
+                                dst_w, h, src_w, dx, mx0
+    sub          dword mx0m, 4<<14 ; horizontal resize, 16 output pixels per iteration; bias mx0 by -4 (scaled by 1<<14)
+    sub        dword src_wm, 8     ; src_w-8 is the upper bound of the iclip() below
+    mov                  r6, ~0    ; all-ones, source for the gather write-masks
+    vpbroadcastd         m5, dxm   ; dx in every dword lane
+    vpbroadcastd         m8, mx0m  ; biased mx0 in every dword lane
+    vpbroadcastd         m6, src_wm ; src_w-8 in every dword lane
+    kmovq                k3, r6    ; master mask; gathers clear their mask, so k1/k2 are reloaded from k3 each time
+ DEFINE_ARGS dst, dst_stride, src, src_stride, dst_w, h, x ; re-declare names: x replaces src_w as the 7th name
+    LEA                  r7, $$    ; r7 anchors the "base" addressing defined on the next line
+%define base r7-$$
+    vpbroadcastd         m3, [base+pw_m256] ; -256 in every word, multiplier for the pmulhrsw below
+    vpbroadcastd         m7, [base+pd_63]   ; mask applied to the filter offset
+    vbroadcasti32x4     m15, [base+pb_8x0_8x8] ; per 128-bit lane: 8 bytes of 0 then 8 bytes of 8
+    vpdpwssd             m8, m5, [base+rescale_mul] ; mx+dx*[0-15]
+    pslld                m5, 4                      ; dx*16
+    pslld                m6, 14     ; (src_w-8)<<14, same scale as mx
+    pxor                 m2, m2     ; zero: lower clip bound and accumulator seed
+    mova                m16, [base+resize_permA] ; dword indices 0,2,...,30 (even)
+    mova                m17, [base+resize_permB] ; dword indices 1,3,...,31 (odd)
+    mova               xm18, [base+resize_permC] ; dword indices 0,4,8,12
+.loop_y:
+    xor                  xd, xd     ; x = 0 at the start of each row
+    mova                 m4, m8     ; per-line working version of mx
+.loop_x:
+    pmaxsd               m0, m4, m2 ; max(mx, 0)
+    psrad                m9, m4, 8  ; filter offset (unmasked)
+    pminsd               m0, m6     ; iclip(mx, 0, src_w-8)
+    psubd                m1, m4, m0 ; pshufb offset
+    psrad                m0, 14     ; clipped src_x offset
+    psrad                m1, 14     ; pshufb edge_emu offset
+    vptestmd             k4, m1, m1 ; k4 = lanes with a non-zero edge offset (position was clipped)
+    pand                 m9, m7     ; filter offset (masked)
+    ktestw               k4, k4
+    jz .load                        ; no lane was clipped: take the direct-load path
+    vextracti32x8      ym12, m0, 1  ; src_x offsets of pixels 8-15
+    vextracti32x8      ym13, m1, 1  ; edge offsets of pixels 8-15
+    kmovq                k1, k3
+    kmovq                k2, k3
+    vpgatherdq      m10{k1}, [srcq+ym0]  ; 8 source bytes for each of pixels 0-7
+    vpgatherdq      m11{k2}, [srcq+ym12] ; 8 source bytes for each of pixels 8-15
+    kmovq                k1, k3
+    kmovq                k2, k3
+    vpgatherdq      m14{k1}, [base+resize_shuf+4+ym1]  ; 8 shuffle-control bytes per pixel (0-7), picked by edge offset
+    vpgatherdq       m0{k2}, [base+resize_shuf+4+ym13] ; same for pixels 8-15
+    mova                m12, m16    ; vpermi2d overwrites its index operand, so work on copies
+    mova                m13, m17
+    paddb               m14, m15    ; +8 on the upper qword of each lane so pshufb indexes that pixel's own 8 bytes
+    paddb                m0, m15
+    pshufb              m10, m14    ; rearrange source bytes (resize_shuf repeats its first/last index)
+    pshufb              m11, m0
+    vpermi2d            m12, m10, m11 ; even dwords: bytes 0-3 of every pixel, same layout as .load's +0 gather
+    vpermi2d            m13, m10, m11 ; odd dwords: bytes 4-7 of every pixel, same layout as .load's +4 gather
+    jmp .filter
+.load:
+    kmovq                k1, k3
+    kmovq                k2, k3
+    vpgatherdd      m12{k1}, [srcq+m0+0] ; source bytes 0-3 for each of the 16 pixels
+    vpgatherdd      m13{k2}, [srcq+m0+4] ; source bytes 4-7 for each of the 16 pixels
+.filter:
+    kmovq                k1, k3
+    kmovq                k2, k3
+    vpgatherdd      m10{k1}, [base+resize_filter+m9*8+0] ; coefficient bytes 0-3 of the 8-byte filter entry
+    vpgatherdd      m11{k2}, [base+resize_filter+m9*8+4] ; coefficient bytes 4-7 of the 8-byte filter entry
+    mova                m14, m2     ; clear the accumulator
+    vpdpbusd            m14, m12, m10 ; += unsigned source bytes * signed coefficient bytes (4 per dword)
+    vpdpbusd            m14, m13, m11
+    packssdw            m14, m14    ; dwords -> signed-saturated words
+    pmulhrsw            m14, m3     ; multiply by -256 with rounding: (64 - sum) >> 7
+    packuswb            m14, m14    ; words -> unsigned-saturated bytes
+    vpermd              m14, m18, m14 ; collect dword 0 of each 128-bit lane into the low 16 bytes
+    mova          [dstq+xq], xm14   ; store 16 output pixels
+    paddd                m4, m5     ; mx += 16*dx
+    add                  xd, 16
+    cmp                  xd, dst_wd
+    jl .loop_x
+    add                dstq, dst_strideq ; advance to the next output row
+    add                srcq, src_strideq ; advance to the next source row
+    dec                  hd
+    jg .loop_y
+    RET
+
 %endif ; ARCH_X86_64
--- a/third_party/dav1d/src/x86/mc_init_tmpl.c
+++ b/third_party/dav1d/src/x86/mc_init_tmpl.c
@ -152,7 +152,6 @@ COLD void bitfn(dav1d_mc_dsp_init_x86)(Dav1dMCDSPContext *const c) {
    init_mct_fn(FILTER_2D_8TAP_SHARP,          8tap_sharp,          ssse3);
    init_mct_fn(FILTER_2D_BILINEAR,            bilin,               ssse3);

-#if BITDEPTH == 8
    init_mc_scaled_fn(FILTER_2D_8TAP_REGULAR,        8tap_scaled_regular,        ssse3);
    init_mc_scaled_fn(FILTER_2D_8TAP_REGULAR_SMOOTH, 8tap_scaled_regular_smooth, ssse3);
    init_mc_scaled_fn(FILTER_2D_8TAP_REGULAR_SHARP,  8tap_scaled_regular_sharp,  ssse3);
@ -174,7 +173,6 @@ COLD void bitfn(dav1d_mc_dsp_init_x86)(Dav1dMCDSPContext *const c) {
    init_mct_scaled_fn(FILTER_2D_8TAP_SHARP_SMOOTH,   8tap_scaled_sharp_smooth,   ssse3);
    init_mct_scaled_fn(FILTER_2D_8TAP_SHARP,          8tap_scaled_sharp,          ssse3);
    init_mct_scaled_fn(FILTER_2D_BILINEAR,            bilin_scaled,               ssse3);
-#endif

    c->avg = BF(dav1d_avg, ssse3);
    c->w_avg = BF(dav1d_w_avg, ssse3);
@ -296,5 +294,6 @@ COLD void bitfn(dav1d_mc_dsp_init_x86)(Dav1dMCDSPContext *const c) {
    c->blend_h = BF(dav1d_blend_h, avx512icl);
    c->warp8x8  = BF(dav1d_warp_affine_8x8, avx512icl);
    c->warp8x8t = BF(dav1d_warp_affine_8x8t, avx512icl);
+    c->resize = BF(dav1d_resize, avx512icl);
 #endif
 }
--- a/third_party/dav1d/src/x86/mc_sse.asm
+++ b/third_party/dav1d/src/x86/mc_sse.asm
@ -5170,9 +5170,9 @@ INIT_XMM ssse3
    mova         [esp+0x40], m2
    mova         [esp+0x50], m3
    MC_8TAP_SCALED_H   0x20, 0x140, 0 ; 0-1
-    MC_8TAP_SCALED_H   0x20, 0x160   ; 2-3
-    MC_8TAP_SCALED_H   0x20, 0x180   ; 4-5
-    MC_8TAP_SCALED_H   0x20, 0x1a0   ; 6-7
+    MC_8TAP_SCALED_H   0x20, 0x160    ; 2-3
+    MC_8TAP_SCALED_H   0x20, 0x180    ; 4-5
+    MC_8TAP_SCALED_H   0x20, 0x1a0    ; 6-7
    mova                 m5, [esp+0x180]
    mova                 m6, [esp+0x190]
    mova                 m7, [esp+0x1a0]
@ -5201,9 +5201,6 @@ INIT_XMM ssse3
 .vloop:
    mov                  r0, r0m
    mov                  r5, [esp+0x1f4]
- %ifidn %1, put
-    mov                 dsd, dsm
- %endif
    and                 myd, 0x3ff
    mov                 mym, myd
    xor                  r3, r3
@ -5244,13 +5241,10 @@ INIT_XMM ssse3
 %ifidn %1, put
    packuswb             m4, m4
    movq             [dstq], m4
-    add                dstq, dsq
+    add                dstq, dsm
 %else
    mova             [tmpq], m4
    add                tmpq, tmp_stridem
-%endif
-%if ARCH_X86_32
-    mov                 r0m, r0
 %endif
    dec                  hd
    jz .hloop_prep
@ -5329,6 +5323,7 @@ INIT_XMM ssse3
    mova         [rsp+0x70], m7
    mova         [rsp+0x80], m4
 %else
+    mov                 r0m, r0
    mov                 myd, mym
    mov                  r3, r3m
    add                 myd, dym
@ -5745,7 +5740,7 @@ INIT_XMM ssse3
    movu                 m1, [srcq+ssq*0]
    movu                 m2, [srcq+ssq*1]
    movu                 m3, [srcq+ssq*2]
-    lea                srcq, [srcq+ss3q ]
+    add                srcq, ss3q
    punpcklqdq           m6, m6
    SWAP                 m4, m7
    pand                 m7, m11, m8
@ -6723,7 +6718,7 @@ INIT_XMM ssse3
    movu                 m1, [srcq+ssq*0]
    movu                 m2, [srcq+ssq*2]
    movu                 m3, [srcq+ssq*1]
-    lea                srcq, [srcq+ss3q ]
+    add                srcq, ss3q
    punpcklqdq           m6, m6
    SWAP                 m4, m7
    pand                 m7, m11, m8
@ -6734,7 +6729,7 @@ INIT_XMM ssse3
    movu                 m0, [srcq+ssq*0]
    movu                 m7, [srcq+ssq*1]
    movu                 m6, [srcq+ssq*2]
-    lea                srcq, [srcq+ss3q ]
+    add                srcq, ss3q
    pshufb               m1, m14
    pshufb               m2, m14
    pshufb               m3, m14
@ -9409,7 +9404,7 @@ cglobal resize_8bpc, 0, 6, 8, 3 * 16, dst, dst_stride, src, src_stride, \
    pshufd               m5, m5, q0000

 %if ARCH_X86_64
-    DEFINE_ARGS dst, dst_stride, src, src_stride, dst_w, h, x, picptr
+    DEFINE_ARGS dst, dst_stride, src, src_stride, dst_w, h, x
    LEA                  r7, $$
 %define base r7-$$
 %else
--- a/third_party/dav1d/tests/checkasm/checkasm.h
+++ b/third_party/dav1d/tests/checkasm/checkasm.h
@ -311,11 +311,12 @@ void checkasm_stack_clobber(uint64_t clobber, ...);
 #endif


+#define ROUND_UP(x,a) (((x)+((a)-1)) & ~((a)-1)) /* round x up to a multiple of a; the mask only works for power-of-two a, and a is evaluated twice */
 #define PIXEL_RECT(name, w, h) \
-    ALIGN_STK_64(pixel, name##_buf, ((h)+32)*((w)+64) + 64,); \
-    ptrdiff_t name##_stride = sizeof(pixel)*((w)+64); \
+    ALIGN_STK_64(pixel, name##_buf, ((h)+32)*(ROUND_UP(w,64)+64) + 64,); \
+    ptrdiff_t name##_stride = sizeof(pixel)*(ROUND_UP(w,64)+64); \
    (void)name##_stride; \
-    pixel *name = name##_buf + ((w)+64)*16 + 64
+    pixel *name = name##_buf + (ROUND_UP(w,64)+64)*16 + 64

 #define CLEAR_PIXEL_RECT(name) \
    memset(name##_buf, 0x99, sizeof(name##_buf)) \
--- a/third_party/dav1d/tests/checkasm/ipred.c
+++ b/third_party/dav1d/tests/checkasm/ipred.c
@ -192,8 +192,8 @@ static void check_cfl_ac(Dav1dIntraPredDSPContext *const c) {
 }

 static void check_cfl_pred(Dav1dIntraPredDSPContext *const c) {
-    ALIGN_STK_64(pixel, c_dst, 32 * 32,);
-    ALIGN_STK_64(pixel, a_dst, 32 * 32,);
+    PIXEL_RECT(c_dst, 32, 32);
+    PIXEL_RECT(a_dst, 32, 32);
    ALIGN_STK_64(int16_t, ac, 32 * 32,);
    ALIGN_STK_64(pixel, topleft_buf, 257,);
    pixel *const topleft = topleft_buf + 128;
@ -215,8 +215,6 @@ static void check_cfl_pred(Dav1dIntraPredDSPContext *const c) {
                    const int bitdepth_max = 0xff;
 #endif

-                    const ptrdiff_t stride = w * sizeof(pixel);
-
                    int alpha = ((rnd() & 15) + 1) * (1 - (rnd() & 2));

                    for (int i = -h * 2; i <= w * 2; i++)
@ -229,14 +227,17 @@ static void check_cfl_pred(Dav1dIntraPredDSPContext *const c) {
                    for (int i = 0; i < w * h; i++)
                        ac[i] -= luma_avg;

-                    call_ref(c_dst, stride, topleft, w, h, ac, alpha
-                             HIGHBD_TAIL_SUFFIX);
-                    call_new(a_dst, stride, topleft, w, h, ac, alpha
-                             HIGHBD_TAIL_SUFFIX);
-                    checkasm_check_pixel(c_dst, stride, a_dst, stride,
-                                         w, h, "dst");
+                    CLEAR_PIXEL_RECT(c_dst);
+                    CLEAR_PIXEL_RECT(a_dst);

-                    bench_new(a_dst, stride, topleft, w, h, ac, alpha
+                    call_ref(c_dst, c_dst_stride, topleft, w, h, ac, alpha
+                             HIGHBD_TAIL_SUFFIX);
+                    call_new(a_dst, a_dst_stride, topleft, w, h, ac, alpha
+                             HIGHBD_TAIL_SUFFIX);
+                    checkasm_check_pixel_padded(c_dst, c_dst_stride, a_dst, a_dst_stride,
+                                                w, h, "dst");
+
+                    bench_new(a_dst, a_dst_stride, topleft, w, h, ac, alpha
                              HIGHBD_TAIL_SUFFIX);
                }
            }
@ -244,8 +245,8 @@ static void check_cfl_pred(Dav1dIntraPredDSPContext *const c) {
 }

 static void check_pal_pred(Dav1dIntraPredDSPContext *const c) {
-    ALIGN_STK_64(pixel, c_dst, 64 * 64,);
-    ALIGN_STK_64(pixel, a_dst, 64 * 64,);
+    PIXEL_RECT(c_dst, 64, 64);
+    PIXEL_RECT(a_dst, 64, 64);
    ALIGN_STK_64(uint8_t, idx, 64 * 64,);
    ALIGN_STK_16(uint16_t, pal, 8,);

@ -261,7 +262,6 @@ static void check_pal_pred(Dav1dIntraPredDSPContext *const c) {
 #else
                const int bitdepth_max = 0xff;
 #endif
-                const ptrdiff_t stride = w * sizeof(pixel);

                for (int i = 0; i < 8; i++)
                    pal[i] = rnd() & bitdepth_max;
@ -269,11 +269,15 @@ static void check_pal_pred(Dav1dIntraPredDSPContext *const c) {
                for (int i = 0; i < w * h; i++)
                    idx[i] = rnd() & 7;

-                call_ref(c_dst, stride, pal, idx, w, h);
-                call_new(a_dst, stride, pal, idx, w, h);
-                checkasm_check_pixel(c_dst, stride, a_dst, stride, w, h, "dst");
+                CLEAR_PIXEL_RECT(c_dst);
+                CLEAR_PIXEL_RECT(a_dst);

-                bench_new(a_dst, stride, pal, idx, w, h);
+                call_ref(c_dst, c_dst_stride, pal, idx, w, h);
+                call_new(a_dst, a_dst_stride, pal, idx, w, h);
+                checkasm_check_pixel_padded(c_dst, c_dst_stride,
+                                            a_dst, a_dst_stride, w, h, "dst");
+
+                bench_new(a_dst, a_dst_stride, pal, idx, w, h);
            }
    report("pal_pred");
 }
--- a/third_party/dav1d/tests/checkasm/itx.c
+++ b/third_party/dav1d/tests/checkasm/itx.c
@ -243,8 +243,8 @@ static void check_itxfm_add(Dav1dInvTxfmDSPContext *const c,
                            const enum RectTxfmSize tx)
 {
    ALIGN_STK_64(coef, coeff, 2, [32 * 32]);
-    ALIGN_STK_64(pixel, c_dst, 64 * 64,);
-    ALIGN_STK_64(pixel, a_dst, 64 * 64,);
+    PIXEL_RECT(c_dst, 64, 64);
+    PIXEL_RECT(a_dst, 64, 64);

    static const uint8_t subsh_iters[5] = { 2, 2, 3, 5, 5 };

@ -275,21 +275,26 @@ static void check_itxfm_add(Dav1dInvTxfmDSPContext *const c,
                    const int eob = ftx(coeff[0], tx, txtp, w, h, subsh, bitdepth_max);
                    memcpy(coeff[1], coeff[0], sizeof(*coeff));

-                    for (int j = 0; j < w * h; j++)
-                        c_dst[j] = a_dst[j] = rnd() & bitdepth_max;
+                    CLEAR_PIXEL_RECT(c_dst);
+                    CLEAR_PIXEL_RECT(a_dst);

-                    call_ref(c_dst, w * sizeof(*c_dst), coeff[0], eob
+                    for (int y = 0; y < h; y++)
+                        for (int x = 0; x < w; x++)
+                            c_dst[y*PXSTRIDE(c_dst_stride) + x] =
+                            a_dst[y*PXSTRIDE(a_dst_stride) + x] = rnd() & bitdepth_max;
+
+                    call_ref(c_dst, c_dst_stride, coeff[0], eob
                             HIGHBD_TAIL_SUFFIX);
-                    call_new(a_dst, w * sizeof(*c_dst), coeff[1], eob
+                    call_new(a_dst, a_dst_stride, coeff[1], eob
                             HIGHBD_TAIL_SUFFIX);

-                    checkasm_check_pixel(c_dst, w * sizeof(*c_dst),
-                                         a_dst, w * sizeof(*a_dst),
-                                         w, h, "dst");
+                    checkasm_check_pixel_padded(c_dst, c_dst_stride,
+                                                a_dst, a_dst_stride,
+                                                w, h, "dst");
                    if (memcmp(coeff[0], coeff[1], sizeof(*coeff)))
                        fail();

-                    bench_new(a_dst, w * sizeof(*c_dst), coeff[0], eob
+                    bench_new(a_dst, a_dst_stride, coeff[0], eob
                              HIGHBD_TAIL_SUFFIX);
                }
    }
--- a/third_party/dav1d/tests/checkasm/mc.c
+++ b/third_party/dav1d/tests/checkasm/mc.c
@ -57,8 +57,8 @@ static int mc_h_next(const int h) {

 static void check_mc(Dav1dMCDSPContext *const c) {
    ALIGN_STK_64(pixel, src_buf, 135 * 135,);
-    ALIGN_STK_64(pixel, c_dst,   128 * 128,);
-    ALIGN_STK_64(pixel, a_dst,   128 * 128,);
+    PIXEL_RECT(c_dst, 128, 128);
+    PIXEL_RECT(a_dst, 128, 128);
    const pixel *src = src_buf + 135 * 3 + 3;
    const ptrdiff_t src_stride = 135 * sizeof(pixel);

@ -68,7 +68,6 @@ static void check_mc(Dav1dMCDSPContext *const c) {

    for (int filter = 0; filter < N_2D_FILTERS; filter++)
        for (int w = 2; w <= 128; w <<= 1) {
-            const ptrdiff_t dst_stride = w * sizeof(pixel);
            for (int mxy = 0; mxy < 4; mxy++)
                if (check_func(c->mc[filter], "mc_%s_w%d_%s_%dbpc",
                    filter_names[filter], w, mxy_names[mxy], BITDEPTH))
@ -87,18 +86,21 @@ static void check_mc(Dav1dMCDSPContext *const c) {
                        for (int i = 0; i < 135 * 135; i++)
                            src_buf[i] = rnd() & bitdepth_max;

-                        call_ref(c_dst, dst_stride, src, src_stride, w, h,
+                        CLEAR_PIXEL_RECT(c_dst);
+                        CLEAR_PIXEL_RECT(a_dst);
+
+                        call_ref(c_dst, c_dst_stride, src, src_stride, w, h,
                                 mx, my HIGHBD_TAIL_SUFFIX);
-                        call_new(a_dst, dst_stride, src, src_stride, w, h,
+                        call_new(a_dst, a_dst_stride, src, src_stride, w, h,
                                 mx, my HIGHBD_TAIL_SUFFIX);
-                        checkasm_check_pixel(c_dst, dst_stride,
-                                             a_dst, dst_stride,
-                                             w, h, "dst");
+                        checkasm_check_pixel_padded(c_dst, c_dst_stride,
+                                                    a_dst, a_dst_stride,
+                                                    w, h, "dst");

                        if (filter == FILTER_2D_8TAP_REGULAR ||
                            filter == FILTER_2D_BILINEAR)
                        {
-                            bench_new(a_dst, dst_stride, src, src_stride, w, h,
+                            bench_new(a_dst, a_dst_stride, src, src_stride, w, h,
                                      mx, my HIGHBD_TAIL_SUFFIX);
                        }
                    }
@ -164,8 +166,8 @@ static void check_mct(Dav1dMCDSPContext *const c) {

 static void check_mc_scaled(Dav1dMCDSPContext *const c) {
    ALIGN_STK_64(pixel, src_buf, 263 * 263,);
-    ALIGN_STK_64(pixel, c_dst,   128 * 128,);
-    ALIGN_STK_64(pixel, a_dst,   128 * 128,);
+    PIXEL_RECT(c_dst, 128, 128);
+    PIXEL_RECT(a_dst, 128, 128);
    const pixel *src = src_buf + 263 * 3 + 3;
    const ptrdiff_t src_stride = 263 * sizeof(pixel);
 #if BITDEPTH == 16
@ -180,7 +182,6 @@ static void check_mc_scaled(Dav1dMCDSPContext *const c) {

    for (int filter = 0; filter < N_2D_FILTERS; filter++)
        for (int w = 2; w <= 128; w <<= 1) {
-            const ptrdiff_t dst_stride = w * sizeof(pixel);
            for (int p = 0; p < 3; ++p) {
                if (check_func(c->mc_scaled[filter], "mc_scaled_%s_w%d%s_%dbpc",
                               filter_names[filter], w, scaled_paths[p], BITDEPTH))
@ -198,16 +199,20 @@ static void check_mc_scaled(Dav1dMCDSPContext *const c) {
                        for (int k = 0; k < 263 * 263; k++)
                            src_buf[k] = rnd() & bitdepth_max;

-                        call_ref(c_dst, dst_stride, src, src_stride,
+                        CLEAR_PIXEL_RECT(c_dst);
+                        CLEAR_PIXEL_RECT(a_dst);
+
+                        call_ref(c_dst, c_dst_stride, src, src_stride,
                                 w, h, mx, my, dx, dy HIGHBD_TAIL_SUFFIX);
-                        call_new(a_dst, dst_stride, src, src_stride,
+                        call_new(a_dst, a_dst_stride, src, src_stride,
                                 w, h, mx, my, dx, dy HIGHBD_TAIL_SUFFIX);
-                        checkasm_check_pixel(c_dst, dst_stride,
-                                             a_dst, dst_stride, w, h, "dst");
+                        checkasm_check_pixel_padded(c_dst, c_dst_stride,
+                                                    a_dst, a_dst_stride,
+                                                    w, h, "dst");

                        if (filter == FILTER_2D_8TAP_REGULAR ||
                            filter == FILTER_2D_BILINEAR)
-                            bench_new(a_dst, dst_stride, src, src_stride,
+                            bench_new(a_dst, a_dst_stride, src, src_stride,
                                      w, h, mx, my, dx, dy HIGHBD_TAIL_SUFFIX);
                    }
                }
@ -281,15 +286,14 @@ static void init_tmp(Dav1dMCDSPContext *const c, pixel *const buf,

 static void check_avg(Dav1dMCDSPContext *const c) {
    ALIGN_STK_64(int16_t, tmp, 2, [128 * 128]);
-    ALIGN_STK_64(pixel, c_dst, 135 * 135,);
-    ALIGN_STK_64(pixel, a_dst, 128 * 128,);
+    PIXEL_RECT(c_dst, 135, 135);
+    PIXEL_RECT(a_dst, 128, 128);

    declare_func(void, pixel *dst, ptrdiff_t dst_stride, const int16_t *tmp1,
                 const int16_t *tmp2, int w, int h HIGHBD_DECL_SUFFIX);

    for (int w = 4; w <= 128; w <<= 1)
        if (check_func(c->avg, "avg_w%d_%dbpc", w, BITDEPTH)) {
-            ptrdiff_t dst_stride = w * sizeof(pixel);
            for (int h = imax(w / 4, 4); h <= imin(w * 4, 128); h <<= 1)
            {
 #if BITDEPTH == 16
@ -299,12 +303,16 @@ static void check_avg(Dav1dMCDSPContext *const c) {
 #endif

                init_tmp(c, c_dst, tmp, bitdepth_max);
-                call_ref(c_dst, dst_stride, tmp[0], tmp[1], w, h HIGHBD_TAIL_SUFFIX);
-                call_new(a_dst, dst_stride, tmp[0], tmp[1], w, h HIGHBD_TAIL_SUFFIX);
-                checkasm_check_pixel(c_dst, dst_stride, a_dst, dst_stride,
-                                     w, h, "dst");

-                bench_new(a_dst, dst_stride, tmp[0], tmp[1], w, h HIGHBD_TAIL_SUFFIX);
+                CLEAR_PIXEL_RECT(c_dst);
+                CLEAR_PIXEL_RECT(a_dst);
+
+                call_ref(c_dst, c_dst_stride, tmp[0], tmp[1], w, h HIGHBD_TAIL_SUFFIX);
+                call_new(a_dst, a_dst_stride, tmp[0], tmp[1], w, h HIGHBD_TAIL_SUFFIX);
+                checkasm_check_pixel_padded(c_dst, c_dst_stride, a_dst, a_dst_stride,
+                                            w, h, "dst");
+
+                bench_new(a_dst, a_dst_stride, tmp[0], tmp[1], w, h HIGHBD_TAIL_SUFFIX);
            }
        }
    report("avg");
@ -312,15 +320,14 @@ static void check_avg(Dav1dMCDSPContext *const c) {

 static void check_w_avg(Dav1dMCDSPContext *const c) {
    ALIGN_STK_64(int16_t, tmp, 2, [128 * 128]);
-    ALIGN_STK_64(pixel, c_dst, 135 * 135,);
-    ALIGN_STK_64(pixel, a_dst, 128 * 128,);
+    PIXEL_RECT(c_dst, 135, 135);
+    PIXEL_RECT(a_dst, 128, 128);

    declare_func(void, pixel *dst, ptrdiff_t dst_stride, const int16_t *tmp1,
                 const int16_t *tmp2, int w, int h, int weight HIGHBD_DECL_SUFFIX);

    for (int w = 4; w <= 128; w <<= 1)
        if (check_func(c->w_avg, "w_avg_w%d_%dbpc", w, BITDEPTH)) {
-            ptrdiff_t dst_stride = w * sizeof(pixel);
            for (int h = imax(w / 4, 4); h <= imin(w * 4, 128); h <<= 1)
            {
                int weight = rnd() % 15 + 1;
@ -331,12 +338,15 @@ static void check_w_avg(Dav1dMCDSPContext *const c) {
 #endif
                init_tmp(c, c_dst, tmp, bitdepth_max);

-                call_ref(c_dst, dst_stride, tmp[0], tmp[1], w, h, weight HIGHBD_TAIL_SUFFIX);
-                call_new(a_dst, dst_stride, tmp[0], tmp[1], w, h, weight HIGHBD_TAIL_SUFFIX);
-                checkasm_check_pixel(c_dst, dst_stride, a_dst, dst_stride,
-                                     w, h, "dst");
+                CLEAR_PIXEL_RECT(c_dst);
+                CLEAR_PIXEL_RECT(a_dst);

-                bench_new(a_dst, dst_stride, tmp[0], tmp[1], w, h, weight HIGHBD_TAIL_SUFFIX);
+                call_ref(c_dst, c_dst_stride, tmp[0], tmp[1], w, h, weight HIGHBD_TAIL_SUFFIX);
+                call_new(a_dst, a_dst_stride, tmp[0], tmp[1], w, h, weight HIGHBD_TAIL_SUFFIX);
+                checkasm_check_pixel_padded(c_dst, c_dst_stride,a_dst, a_dst_stride,
+                                            w, h, "dst");
+
+                bench_new(a_dst, a_dst_stride, tmp[0], tmp[1], w, h, weight HIGHBD_TAIL_SUFFIX);
            }
        }
    report("w_avg");
@ -344,8 +354,8 @@ static void check_w_avg(Dav1dMCDSPContext *const c) {

 static void check_mask(Dav1dMCDSPContext *const c) {
    ALIGN_STK_64(int16_t, tmp, 2, [128 * 128]);
-    ALIGN_STK_64(pixel,   c_dst, 135 * 135,);
-    ALIGN_STK_64(pixel,   a_dst, 128 * 128,);
+    PIXEL_RECT(c_dst, 135, 135);
+    PIXEL_RECT(a_dst, 128, 128);
    ALIGN_STK_64(uint8_t, mask,  128 * 128,);

    for (int i = 0; i < 128 * 128; i++)
@ -357,7 +367,6 @@ static void check_mask(Dav1dMCDSPContext *const c) {

    for (int w = 4; w <= 128; w <<= 1)
        if (check_func(c->mask, "mask_w%d_%dbpc", w, BITDEPTH)) {
-            ptrdiff_t dst_stride = w * sizeof(pixel);
            for (int h = imax(w / 4, 4); h <= imin(w * 4, 128); h <<= 1)
            {
 #if BITDEPTH == 16
@ -366,12 +375,16 @@ static void check_mask(Dav1dMCDSPContext *const c) {
                const int bitdepth_max = 0xff;
 #endif
                init_tmp(c, c_dst, tmp, bitdepth_max);
-                call_ref(c_dst, dst_stride, tmp[0], tmp[1], w, h, mask HIGHBD_TAIL_SUFFIX);
-                call_new(a_dst, dst_stride, tmp[0], tmp[1], w, h, mask HIGHBD_TAIL_SUFFIX);
-                checkasm_check_pixel(c_dst, dst_stride, a_dst, dst_stride,
-                                     w, h, "dst");

-                bench_new(a_dst, dst_stride, tmp[0], tmp[1], w, h, mask HIGHBD_TAIL_SUFFIX);
+                CLEAR_PIXEL_RECT(c_dst);
+                CLEAR_PIXEL_RECT(a_dst);
+
+                call_ref(c_dst, c_dst_stride, tmp[0], tmp[1], w, h, mask HIGHBD_TAIL_SUFFIX);
+                call_new(a_dst, a_dst_stride, tmp[0], tmp[1], w, h, mask HIGHBD_TAIL_SUFFIX);
+                checkasm_check_pixel_padded(c_dst, c_dst_stride, a_dst, a_dst_stride,
+                                            w, h, "dst");
+
+                bench_new(a_dst, a_dst_stride, tmp[0], tmp[1], w, h, mask HIGHBD_TAIL_SUFFIX);
            }
        }
    report("mask");
@ -379,8 +392,8 @@ static void check_mask(Dav1dMCDSPContext *const c) {

 static void check_w_mask(Dav1dMCDSPContext *const c) {
    ALIGN_STK_64(int16_t, tmp, 2, [128 * 128]);
-    ALIGN_STK_64(pixel,   c_dst,  135 * 135,);
-    ALIGN_STK_64(pixel,   a_dst,  128 * 128,);
+    PIXEL_RECT(c_dst, 135, 135);
+    PIXEL_RECT(a_dst, 128, 128);
    ALIGN_STK_64(uint8_t, c_mask, 128 * 128,);
    ALIGN_STK_64(uint8_t, a_mask, 128 * 128,);

@ -397,7 +410,6 @@ static void check_w_mask(Dav1dMCDSPContext *const c) {
            if (check_func(c->w_mask[i], "w_mask_%d_w%d_%dbpc", ss[i], w,
                           BITDEPTH))
            {
-                ptrdiff_t dst_stride = w * sizeof(pixel);
                for (int h = imax(w / 4, 4); h <= imin(w * 4, 128); h <<= 1)
                {
                    int sign = rnd() & 1;
@ -408,19 +420,22 @@ static void check_w_mask(Dav1dMCDSPContext *const c) {
 #endif
                    init_tmp(c, c_dst, tmp, bitdepth_max);

-                    call_ref(c_dst, dst_stride, tmp[0], tmp[1], w, h,
+                    CLEAR_PIXEL_RECT(c_dst);
+                    CLEAR_PIXEL_RECT(a_dst);
+
+                    call_ref(c_dst, c_dst_stride, tmp[0], tmp[1], w, h,
                             c_mask, sign HIGHBD_TAIL_SUFFIX);
-                    call_new(a_dst, dst_stride, tmp[0], tmp[1], w, h,
+                    call_new(a_dst, a_dst_stride, tmp[0], tmp[1], w, h,
                             a_mask, sign HIGHBD_TAIL_SUFFIX);
-                    checkasm_check_pixel(c_dst, dst_stride,
-                                         a_dst, dst_stride,
-                                         w, h, "dst");
+                    checkasm_check_pixel_padded(c_dst, c_dst_stride,
+                                                a_dst, a_dst_stride,
+                                                w, h, "dst");
                    checkasm_check(uint8_t, c_mask, w >> ss_hor[i],
                                            a_mask, w >> ss_hor[i],
                                            w >> ss_hor[i], h >> ss_ver[i],
                                            "mask");

-                    bench_new(a_dst, dst_stride, tmp[0], tmp[1], w, h,
+                    bench_new(a_dst, a_dst_stride, tmp[0], tmp[1], w, h,
                              a_mask, sign HIGHBD_TAIL_SUFFIX);
                }
            }
@ -429,15 +444,14 @@ static void check_w_mask(Dav1dMCDSPContext *const c) {

 static void check_blend(Dav1dMCDSPContext *const c) {
    ALIGN_STK_64(pixel, tmp, 32 * 32,);
-    ALIGN_STK_64(pixel, c_dst, 32 * 32,);
-    ALIGN_STK_64(pixel, a_dst, 32 * 32,);
+    PIXEL_RECT(c_dst, 32, 32);
+    PIXEL_RECT(a_dst, 32, 32);
    ALIGN_STK_64(uint8_t, mask, 32 * 32,);

    declare_func(void, pixel *dst, ptrdiff_t dst_stride, const pixel *tmp,
                 int w, int h, const uint8_t *mask);

    for (int w = 4; w <= 32; w <<= 1) {
-        const ptrdiff_t dst_stride = w * sizeof(pixel);
        if (check_func(c->blend, "blend_w%d_%dbpc", w, BITDEPTH))
            for (int h = imax(w / 2, 4); h <= imin(w * 2, 32); h <<= 1) {
 #if BITDEPTH == 16
@ -449,15 +463,21 @@ static void check_blend(Dav1dMCDSPContext *const c) {
                    tmp[i] = rnd() & bitdepth_max;
                    mask[i] = rnd() % 65;
                }
-                for (int i = 0; i < w * h; i++)
-                    c_dst[i] = a_dst[i] = rnd() & bitdepth_max;

-                call_ref(c_dst, dst_stride, tmp, w, h, mask);
-                call_new(a_dst, dst_stride, tmp, w, h, mask);
-                checkasm_check_pixel(c_dst, dst_stride, a_dst, dst_stride,
-                                     w, h, "dst");
+                CLEAR_PIXEL_RECT(c_dst);
+                CLEAR_PIXEL_RECT(a_dst);

-                bench_new(a_dst, dst_stride, tmp, w, h, mask);
+                for (int y = 0; y < h; y++)
+                    for (int x = 0; x < w; x++)
+                        c_dst[y*PXSTRIDE(c_dst_stride) + x] =
+                        a_dst[y*PXSTRIDE(a_dst_stride) + x] = rnd() & bitdepth_max;
+
+                call_ref(c_dst, c_dst_stride, tmp, w, h, mask);
+                call_new(a_dst, a_dst_stride, tmp, w, h, mask);
+                checkasm_check_pixel_padded(c_dst, c_dst_stride, a_dst, a_dst_stride,
+                                            w, h, "dst");
+
+                bench_new(a_dst, a_dst_stride, tmp, w, h, mask);
            }
    }
    report("blend");
@ -465,14 +485,13 @@ static void check_blend(Dav1dMCDSPContext *const c) {

 static void check_blend_v(Dav1dMCDSPContext *const c) {
    ALIGN_STK_64(pixel, tmp,   32 * 128,);
-    ALIGN_STK_64(pixel, c_dst, 32 * 128,);
-    ALIGN_STK_64(pixel, a_dst, 32 * 128,);
+    PIXEL_RECT(c_dst, 32, 128);
+    PIXEL_RECT(a_dst, 32, 128);

    declare_func(void, pixel *dst, ptrdiff_t dst_stride, const pixel *tmp,
                 int w, int h);

    for (int w = 2; w <= 32; w <<= 1) {
-        const ptrdiff_t dst_stride = w * sizeof(pixel);
        if (check_func(c->blend_v, "blend_v_w%d_%dbpc", w, BITDEPTH))
            for (int h = 2; h <= (w == 2 ? 64 : 128); h <<= 1) {
 #if BITDEPTH == 16
@ -481,17 +500,23 @@ static void check_blend_v(Dav1dMCDSPContext *const c) {
                const int bitdepth_max = 0xff;
 #endif

-                for (int i = 0; i < w * h; i++)
-                    c_dst[i] = a_dst[i] = rnd() & bitdepth_max;
+                CLEAR_PIXEL_RECT(c_dst);
+                CLEAR_PIXEL_RECT(a_dst);
+
+                for (int y = 0; y < h; y++)
+                    for (int x = 0; x < w; x++)
+                        c_dst[y*PXSTRIDE(c_dst_stride) + x] =
+                        a_dst[y*PXSTRIDE(a_dst_stride) + x] = rnd() & bitdepth_max;
+
                for (int i = 0; i < 32 * 128; i++)
                    tmp[i] = rnd() & bitdepth_max;

-                call_ref(c_dst, dst_stride, tmp, w, h);
-                call_new(a_dst, dst_stride, tmp, w, h);
-                checkasm_check_pixel(c_dst, dst_stride, a_dst, dst_stride,
-                                     w, h, "dst");
+                call_ref(c_dst, c_dst_stride, tmp, w, h);
+                call_new(a_dst, a_dst_stride, tmp, w, h);
+                checkasm_check_pixel_padded(c_dst, c_dst_stride, a_dst, a_dst_stride,
+                                            w, h, "dst");

-                bench_new(a_dst, dst_stride, tmp, w, h);
+                bench_new(a_dst, a_dst_stride, tmp, w, h);
            }
    }
    report("blend_v");
@ -499,14 +524,13 @@ static void check_blend_v(Dav1dMCDSPContext *const c) {

 static void check_blend_h(Dav1dMCDSPContext *const c) {
    ALIGN_STK_64(pixel, tmp,   128 * 32,);
-    ALIGN_STK_64(pixel, c_dst, 128 * 32,);
-    ALIGN_STK_64(pixel, a_dst, 128 * 32,);
+    PIXEL_RECT(c_dst, 128, 32);
+    PIXEL_RECT(a_dst, 128, 32);

    declare_func(void, pixel *dst, ptrdiff_t dst_stride, const pixel *tmp,
                 int w, int h);

    for (int w = 2; w <= 128; w <<= 1) {
-        const ptrdiff_t dst_stride = w * sizeof(pixel);
        if (check_func(c->blend_h, "blend_h_w%d_%dbpc", w, BITDEPTH))
            for (int h = (w == 128 ? 4 : 2); h <= 32; h <<= 1) {
 #if BITDEPTH == 16
@ -514,17 +538,23 @@ static void check_blend_h(Dav1dMCDSPContext *const c) {
 #else
                const int bitdepth_max = 0xff;
 #endif
-                for (int i = 0; i < w * h; i++)
-                    c_dst[i] = a_dst[i] = rnd() & bitdepth_max;
+                CLEAR_PIXEL_RECT(c_dst);
+                CLEAR_PIXEL_RECT(a_dst);
+
+                for (int y = 0; y < h; y++)
+                    for (int x = 0; x < w; x++)
+                        c_dst[y*PXSTRIDE(c_dst_stride) + x] =
+                        a_dst[y*PXSTRIDE(a_dst_stride) + x] = rnd() & bitdepth_max;
+
                for (int i = 0; i < 128 * 32; i++)
                    tmp[i] = rnd() & bitdepth_max;

-                call_ref(c_dst, dst_stride, tmp, w, h);
-                call_new(a_dst, dst_stride, tmp, w, h);
-                checkasm_check_pixel(c_dst, dst_stride, a_dst, dst_stride,
-                                     w, h, "dst");
+                call_ref(c_dst, c_dst_stride, tmp, w, h);
+                call_new(a_dst, a_dst_stride, tmp, w, h);
+                checkasm_check_pixel_padded(c_dst, c_dst_stride, a_dst, a_dst_stride,
+                                            w, h, "dst");

-                bench_new(a_dst, dst_stride, tmp, w, h);
+                bench_new(a_dst, a_dst_stride, tmp, w, h);
            }
    }
    report("blend_h");
@ -532,11 +562,10 @@ static void check_blend_h(Dav1dMCDSPContext *const c) {

 static void check_warp8x8(Dav1dMCDSPContext *const c) {
    ALIGN_STK_64(pixel, src_buf, 15 * 15,);
-    ALIGN_STK_64(pixel, c_dst,    8 *  8,);
-    ALIGN_STK_64(pixel, a_dst,    8 *  8,);
+    PIXEL_RECT(c_dst, 8, 8);
+    PIXEL_RECT(a_dst, 8, 8);
    int16_t abcd[4];
    const pixel *src = src_buf + 15 * 3 + 3;
-    const ptrdiff_t dst_stride =  8 * sizeof(pixel);
    const ptrdiff_t src_stride = 15 * sizeof(pixel);

    declare_func(void, pixel *dst, ptrdiff_t dst_stride, const pixel *src,
@ -558,12 +587,15 @@ static void check_warp8x8(Dav1dMCDSPContext *const c) {
        for (int i = 0; i < 15 * 15; i++)
            src_buf[i] = rnd() & bitdepth_max;

-        call_ref(c_dst, dst_stride, src, src_stride, abcd, mx, my HIGHBD_TAIL_SUFFIX);
-        call_new(a_dst, dst_stride, src, src_stride, abcd, mx, my HIGHBD_TAIL_SUFFIX);
-        checkasm_check_pixel(c_dst, dst_stride, a_dst, dst_stride,
-                             8, 8, "dst");
+        CLEAR_PIXEL_RECT(c_dst);
+        CLEAR_PIXEL_RECT(a_dst);

-        bench_new(a_dst, dst_stride, src, src_stride, abcd, mx, my HIGHBD_TAIL_SUFFIX);
+        call_ref(c_dst, c_dst_stride, src, src_stride, abcd, mx, my HIGHBD_TAIL_SUFFIX);
+        call_new(a_dst, a_dst_stride, src, src_stride, abcd, mx, my HIGHBD_TAIL_SUFFIX);
+        checkasm_check_pixel_padded(c_dst, c_dst_stride, a_dst, a_dst_stride,
+                                    8, 8, "dst");
+
+        bench_new(a_dst, a_dst_stride, src, src_stride, abcd, mx, my HIGHBD_TAIL_SUFFIX);
    }
    report("warp8x8");
 }
@ -687,13 +719,12 @@ static int get_upscale_x0(const int in_w, const int out_w, const int step) {
 }

 static void check_resize(Dav1dMCDSPContext *const c) {
-    ALIGN_STK_64(pixel, c_dst, 1024 * 64,);
-    ALIGN_STK_64(pixel, a_dst, 1024 * 64,);
-    ALIGN_STK_64(pixel, src,   512 * 64,);
+    PIXEL_RECT(c_dst, 1024, 64);
+    PIXEL_RECT(a_dst, 1024, 64);
+    ALIGN_STK_64(pixel, src, 512 * 64,);

    const int height = 64;
    const int max_src_width = 512;
-    const ptrdiff_t dst_stride = 1024 * sizeof(pixel);
    const ptrdiff_t src_stride = 512 * sizeof(pixel);

    declare_func(void, pixel *dst, ptrdiff_t dst_stride,
@ -720,14 +751,17 @@ static void check_resize(Dav1dMCDSPContext *const c) {
 #undef scale_fac
        const int mx0 = get_upscale_x0(src_w, dst_w, dx);

-        call_ref(c_dst, dst_stride, src, src_stride,
-                 dst_w, height, src_w, dx, mx0 HIGHBD_TAIL_SUFFIX);
-        call_new(a_dst, dst_stride, src, src_stride,
-                 dst_w, height, src_w, dx, mx0 HIGHBD_TAIL_SUFFIX);
-        checkasm_check_pixel(c_dst, dst_stride, a_dst, dst_stride,
-                             dst_w, height, "dst");
+        CLEAR_PIXEL_RECT(c_dst);
+        CLEAR_PIXEL_RECT(a_dst);

-        bench_new(a_dst, dst_stride, src, src_stride,
+        call_ref(c_dst, c_dst_stride, src, src_stride,
+                 dst_w, height, src_w, dx, mx0 HIGHBD_TAIL_SUFFIX);
+        call_new(a_dst, a_dst_stride, src, src_stride,
+                 dst_w, height, src_w, dx, mx0 HIGHBD_TAIL_SUFFIX);
+        checkasm_check_pixel_padded_align(c_dst, c_dst_stride, a_dst, a_dst_stride,
+                                          dst_w, height, "dst", 16, 1);
+
+        bench_new(a_dst, a_dst_stride, src, src_stride,
                  512, height, 512 * 8 / w_den, dx, mx0 HIGHBD_TAIL_SUFFIX);
    }