Bug 1906715 - Update dav1d to 2355eeb8f254a1c34dbb0241be5c70cdf6ed46d1 r=media-playback-reviewers,alwu

Differential Revision: https://phabricator.services.mozilla.com/D216426
Chun-Min Chang 2024-07-15 16:14:09 +00:00
Parent 47f16e40a1
Commit a327f20ad9
12 changed files with 1961 additions and 370 deletions


@@ -20,11 +20,11 @@ origin:
# Human-readable identifier for this version/release
# Generally "version NNN", "tag SSS", "bookmark SSS"
release: 92f592ed104ba92ad35c781ee93f354525eef503 (2024-06-05T23:22:36.000+02:00).
release: 2355eeb8f254a1c34dbb0241be5c70cdf6ed46d1 (2024-06-26T11:20:43.000+02:00).
# Revision to pull in
# Must be a long or short commit SHA (long preferred)
revision: 92f592ed104ba92ad35c781ee93f354525eef503
revision: 2355eeb8f254a1c34dbb0241be5c70cdf6ed46d1
# The package's license, where possible using the mnemonic from
# https://spdx.org/licenses/


@@ -1,2 +1,2 @@
/* auto-generated, do not edit */
#define DAV1D_VERSION "92f592ed104ba92ad35c781ee93f354525eef503"
#define DAV1D_VERSION "2355eeb8f254a1c34dbb0241be5c70cdf6ed46d1"

83  third_party/dav1d/src/arm/64/mc_dotprod.S (vendored)

@@ -45,32 +45,33 @@ ENABLE_DOTPROD
#define LOOP_ALIGN 2
// Lookup table used to help conversion of shifted 32-bit values to 8-bit.
.align 4
L(hv_tbl_neon_dotprod):
.byte 1, 2, 5, 6, 9, 10, 13, 14, 17, 18, 21, 22, 25, 26, 29, 30
// Shuffle indices to permute horizontal samples in preparation for input to
// SDOT instructions. The 8-tap horizontal convolution uses sample indices in the
// interval of [-3, 4] relative to the current sample position.
.align 4
L(h_tbl_neon_dotprod):
const h_tbl_neon_dotprod, align=4
// Shuffle indices to permute horizontal samples in preparation for
// input to SDOT instructions. The 8-tap horizontal convolution uses
// sample indices in the interval of [-3, 4] relative to the current
// sample position.
.byte 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6
.byte 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10
.byte 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14
// Vertical convolutions are also using SDOT instructions, where a 128-bit
// register contains a transposed 4x4 matrix of values. Subsequent iterations of
// the vertical convolution can reuse the 3x4 sub-matrix from the previous loop
// iteration. These shuffle indices shift and merge this 4x4 matrix with the
// values of a new line.
.align 4
L(v_tbl_neon_dotprod):
// Lookup table used to help conversion of shifted 32-bit values to 8-bit.
#define OFFSET_CVT_32_8 48
.byte 1, 2, 5, 6, 9, 10, 13, 14, 17, 18, 21, 22, 25, 26, 29, 30
endconst
const v_tbl_neon_dotprod, align=4
// Vertical convolutions are also using SDOT instructions, where a
// 128-bit register contains a transposed 4x4 matrix of values.
// Subsequent iterations of the vertical convolution can reuse the
// 3x4 sub-matrix from the previous loop iteration. These shuffle
// indices shift and merge this 4x4 matrix with the values of a new
// line.
.byte 1, 2, 3, 16, 5, 6, 7, 20, 9, 10, 11, 24, 13, 14, 15, 28
.byte 1, 2, 3, 16, 5, 6, 7, 17, 9, 10, 11, 18, 13, 14, 15, 19
.byte 1, 2, 3, 20, 5, 6, 7, 21, 9, 10, 11, 22, 13, 14, 15, 23
.byte 1, 2, 3, 24, 5, 6, 7, 25, 9, 10, 11, 26, 13, 14, 15, 27
.byte 1, 2, 3, 28, 5, 6, 7, 29, 9, 10, 11, 30, 13, 14, 15, 31
endconst
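
For reference, the tables above serve the dot-product path as follows: each SDOT lane accumulates four byte products, so the 8-tap horizontal filter over the sample interval [-3, 4] is evaluated as two 4-sample dot products per output, and h_tbl_neon_dotprod gathers the overlapping 4-sample windows for adjacent outputs out of a single 16-byte load; v_tbl_neon_dotprod likewise merges a newly loaded line into the transposed 4x4 matrix kept from the previous iteration. A minimal scalar sketch of one horizontal output (illustrative only; it ignores the signed-range bias and rounding that the neon_dotprod path applies):

#include <stdint.h>

/* Scalar sketch, not the asm: an 8-tap filter at position x reads
 * src[x-3 .. x+4]; splitting it into two 4-tap dot products mirrors how
 * one SDOT lane accumulates four products at a time. */
static int32_t h_filter_8tap(const uint8_t *src, int x, const int8_t taps[8]) {
    int32_t lo = 0, hi = 0;
    for (int k = 0; k < 4; k++) {
        lo += taps[k]     * src[x - 3 + k];   /* first dot product: x-3 .. x    */
        hi += taps[k + 4] * src[x + 1 + k];   /* second dot product: x+1 .. x+4 */
    }
    return lo + hi;
}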
.macro make_8tap_fn op, type, type_h, type_v, isa, jump=1
@@ -109,7 +110,7 @@ function \type\()_8tap_\isa, align=FUNC_ALIGN
.align JUMP_ALIGN
L(\type\()_8tap_v_\isa):
madd \my, \my, w11, w10
ldr q6, L(v_tbl_neon_dotprod)
movrel x13, v_tbl_neon_dotprod
sub \src, \src, \s_strd
.ifc \isa, neon_dotprod
.ifc \type, prep
@@ -121,12 +122,12 @@ L(\type\()_8tap_v_\isa):
.endif
ubfx w11, \my, #7, #7
and \my, \my, #0x7F
ldr q28, L(v_tbl_neon_dotprod) + 16
ldp q6, q28, [x13]
cmp \h, #4
csel \my, \my, w11, le
sub \src, \src, \s_strd, lsl #1 // src - s_strd * 3
add \xmy, x12, \xmy, lsl #3 // subpel V filter address
ldr q29, L(v_tbl_neon_dotprod) + 32
ldr q29, [x13, #32]
.ifc \isa, neon_dotprod
movi v5.16b, #128
.endif
@@ -137,8 +138,7 @@ L(\type\()_8tap_v_\isa):
// .align JUMP_ALIGN // fallthrough
160: // V - 16xN+
ldr q30, L(v_tbl_neon_dotprod) + 48
ldr q31, L(v_tbl_neon_dotprod) + 64
ldp q30, q31, [x13, #48]
.ifc \type, prep
add \wd_strd, \w, \w
.endif
@@ -676,12 +676,13 @@ L(\type\()_8tap_v_\isa):
L(\type\()_8tap_h_hv_\isa):
madd \mx, \mx, w11, w9
madd w14, \my, w11, w10 // for HV
ldr q28, L(h_tbl_neon_dotprod)
.ifc \isa, neon_dotprod
mov w13, 0x2002 // FILTER_WEIGHT * 128 + rounding
dup v27.4s, w13 // put H overrides this
.endif
movrel x13, h_tbl_neon_dotprod
sub \src, \src, #3 // src - 3
ldr q28, [x13]
ubfx w9, \mx, #7, #7
and \mx, \mx, #0x7F
ubfx w11, w14, #7, #7 // for HV
@@ -702,8 +703,8 @@ L(\type\()_8tap_h_hv_\isa):
mov x15, x30
ldr d7, [\xmy]
.ifc \type, put
ldr q25, L(hv_tbl_neon_dotprod)
.endif
ldr q25, [x13, #(OFFSET_CVT_32_8)] // LUT to help conversion
.endif // of 32b values to 8b
sxtl v7.8h, v7.8b
cmp w10, SHARP1
b.ne L(\type\()_6tap_hv_\isa) // vertical != SHARP1
@@ -718,8 +719,7 @@ L(\type\()_8tap_h_hv_\isa):
// .align JUMP_ALIGN // fallthrough
80: // HV8 - 8xN+
ldr q29, L(h_tbl_neon_dotprod) + 16
ldr q30, L(h_tbl_neon_dotprod) + 32
ldp q29, q30, [x13, #16]
ldr d26, [\xmx]
.ifc \type, prep
add \wd_strd, \w, \w
@@ -860,7 +860,7 @@ L(\type\()_8tap_h_hv_\isa):
.align JUMP_ALIGN
40: // HV8 - 4xN
ldr s26, [\xmx, #2]
ldur s26, [\xmx, #2]
add \src, \src, #2
bl L(\type\()_hv_filter4_\isa)
@@ -930,7 +930,7 @@ L(\type\()_8tap_h_hv_\isa):
.ifc \type, put
.align JUMP_ALIGN
20: // HV8 - 2xN
ldr s26, [\xmx, #2]
ldur s26, [\xmx, #2]
add \src, \src, #2
bl L(\type\()_hv_filter4_\isa)
@@ -1005,13 +1005,11 @@ L(\type\()_6tap_hv_\isa):
// .align JUMP_ALIGN // fallthrough
80: // HV6 - 8xN+
ldr q29, L(h_tbl_neon_dotprod) + 16
ldr q30, L(h_tbl_neon_dotprod) + 32
ldp q29, q30, [x13, #16]
ldr d26, [\xmx]
.ifc \type, prep
add \wd_strd, \w, \w
.endif
.align LOOP_ALIGN
81:
mov \lsrc, \src
@@ -1145,7 +1143,7 @@ L(\type\()_hv_filter4_\isa):
.align JUMP_ALIGN
40: // HV6 - 4xN
ldr s26, [\xmx, #2]
ldur s26, [\xmx, #2]
add \src, \src, #2
bl L(\type\()_hv_filter4_\isa)
@@ -1206,7 +1204,7 @@ L(\type\()_hv_filter4_\isa):
.ifc \type, put
.align JUMP_ALIGN
20: // HV6 - 2xN
ldr s26, [\xmx, #2]
ldur s26, [\xmx, #2]
add \src, \src, #2
bl L(\type\()_hv_filter4_\isa)
@@ -1284,7 +1282,7 @@ L(\type\()_8tap_h_\isa):
20: // H - 2xN
AARCH64_VALID_JUMP_TARGET
add \src, \src, #2
ldr s26, [\xmx, #2]
ldur s26, [\xmx, #2]
.align LOOP_ALIGN
2:
@@ -1321,7 +1319,7 @@ L(\type\()_8tap_h_\isa):
40: // H - 4xN
AARCH64_VALID_JUMP_TARGET
add \src, \src, #2
ldr s26, [\xmx, #2]
ldur s26, [\xmx, #2]
.align LOOP_ALIGN
4:
@@ -1370,8 +1368,7 @@ L(\type\()_8tap_h_\isa):
.align JUMP_ALIGN
80: // H - 8xN
AARCH64_VALID_JUMP_TARGET
ldr q29, L(h_tbl_neon_dotprod) + 16
ldr q30, L(h_tbl_neon_dotprod) + 32
ldp q29, q30, [x13, #16]
ldr d26, [\xmx]
.align LOOP_ALIGN
@@ -1436,14 +1433,13 @@ L(\type\()_8tap_h_\isa):
.align JUMP_ALIGN
160: // H - 16xN
AARCH64_VALID_JUMP_TARGET
ldr q29, L(h_tbl_neon_dotprod) + 16
ldr q30, L(h_tbl_neon_dotprod) + 32
ldp q29, q30, [x13, #16]
ldr d26, [\xmx]
.align LOOP_ALIGN
16:
ldr q16, [\src]
ldr q17, [\src, #12] // avoid 2 register TBL for small cores
ldur q17, [\src, #12] // avoid 2 register TBL for small cores
add \src, \src, \s_strd
.ifc \type\()_\isa, prep_neon_i8mm
movi v6.4s, #0
@@ -1501,8 +1497,7 @@ L(\type\()_8tap_h_\isa):
640:
1280:
AARCH64_VALID_JUMP_TARGET
ldr q29, L(h_tbl_neon_dotprod) + 16
ldr q30, L(h_tbl_neon_dotprod) + 32
ldp q29, q30, [x13, #16]
ldr d26, [\xmx]
.ifc \type, put
sub \d_strd, \d_strd, \w, uxtw
@@ -1513,7 +1508,7 @@ L(\type\()_8tap_h_\isa):
.align LOOP_ALIGN
32:
ldr q16, [\src]
ldr q17, [\src, #12] // avoid 2 register TBL for small cores
ldur q17, [\src, #12] // avoid 2 register TBL for small cores
add \src, \src, #16
.ifc \type\()_\isa, prep_neon_i8mm
movi v6.4s, #0

46  third_party/dav1d/src/arm/cpu.c (vendored)

@@ -104,6 +104,52 @@ COLD unsigned dav1d_get_cpu_flags_arm(void) {
return flags;
}
#elif defined(__OpenBSD__)
#if ARCH_AARCH64
#include <machine/armreg.h>
#include <machine/cpu.h>
#include <sys/types.h>
#include <sys/sysctl.h>
COLD unsigned dav1d_get_cpu_flags_arm(void) {
unsigned flags = DAV1D_ARM_CPU_FLAG_NEON;
#ifdef CPU_ID_AA64ISAR0
int mib[2];
uint64_t isar0;
uint64_t isar1;
size_t len;
mib[0] = CTL_MACHDEP;
mib[1] = CPU_ID_AA64ISAR0;
len = sizeof(isar0);
if (sysctl(mib, 2, &isar0, &len, NULL, 0) != -1) {
if (ID_AA64ISAR0_DP(isar0) >= ID_AA64ISAR0_DP_IMPL)
flags |= DAV1D_ARM_CPU_FLAG_DOTPROD;
}
mib[0] = CTL_MACHDEP;
mib[1] = CPU_ID_AA64ISAR1;
len = sizeof(isar1);
if (sysctl(mib, 2, &isar1, &len, NULL, 0) != -1) {
#ifdef ID_AA64ISAR1_I8MM_IMPL
if (ID_AA64ISAR1_I8MM(isar1) >= ID_AA64ISAR1_I8MM_IMPL)
flags |= DAV1D_ARM_CPU_FLAG_I8MM;
#endif
}
#endif
return flags;
}
#else /* !ARCH_AARCH64 */
COLD unsigned dav1d_get_cpu_flags_arm(void) {
unsigned flags = DAV1D_ARM_CPU_FLAG_NEON;
return flags;
}
#endif /* ARCH_AARCH64 */
#elif defined(_WIN32)
#include <windows.h>

5  third_party/dav1d/src/arm/itx.h (vendored)

@@ -49,7 +49,9 @@ decl_itx_fn(BF(dav1d_inv_txfm_add_dct_dct_64x16, neon));
decl_itx_fn(BF(dav1d_inv_txfm_add_dct_dct_64x32, neon));
decl_itx_fn(BF(dav1d_inv_txfm_add_dct_dct_64x64, neon));
static ALWAYS_INLINE void itx_dsp_init_arm(Dav1dInvTxfmDSPContext *const c, int bpc) {
static ALWAYS_INLINE void itx_dsp_init_arm(Dav1dInvTxfmDSPContext *const c, int bpc,
int *const all_simd)
{
const unsigned flags = dav1d_get_cpu_flags();
if (!(flags & DAV1D_ARM_CPU_FLAG_NEON)) return;
@@ -77,4 +79,5 @@ static ALWAYS_INLINE void itx_dsp_init_arm(Dav1dInvTxfmDSPContext *const c, int
assign_itx1_fn (R, 64, 16, neon);
assign_itx1_fn (R, 64, 32, neon);
assign_itx1_fn ( , 64, 64, neon);
*all_simd = 1;
}

87  third_party/dav1d/src/itx_1d.c (vendored)

@@ -89,8 +89,8 @@ inv_dct4_1d_internal_c(int32_t *const c, const ptrdiff_t stride,
c[3 * stride] = CLIP(t0 - t3);
}
void dav1d_inv_dct4_1d_c(int32_t *const c, const ptrdiff_t stride,
const int min, const int max)
static void inv_dct4_1d_c(int32_t *const c, const ptrdiff_t stride,
const int min, const int max)
{
inv_dct4_1d_internal_c(c, stride, min, max, 0);
}
@@ -142,8 +142,8 @@ inv_dct8_1d_internal_c(int32_t *const c, const ptrdiff_t stride,
c[7 * stride] = CLIP(t0 - t7);
}
void dav1d_inv_dct8_1d_c(int32_t *const c, const ptrdiff_t stride,
const int min, const int max)
static void inv_dct8_1d_c(int32_t *const c, const ptrdiff_t stride,
const int min, const int max)
{
inv_dct8_1d_internal_c(c, stride, min, max, 0);
}
@@ -237,8 +237,8 @@ inv_dct16_1d_internal_c(int32_t *const c, const ptrdiff_t stride,
c[15 * stride] = CLIP(t0 - t15a);
}
void dav1d_inv_dct16_1d_c(int32_t *const c, const ptrdiff_t stride,
const int min, const int max)
static void inv_dct16_1d_c(int32_t *const c, const ptrdiff_t stride,
const int min, const int max)
{
inv_dct16_1d_internal_c(c, stride, min, max, 0);
}
@@ -427,14 +427,14 @@ inv_dct32_1d_internal_c(int32_t *const c, const ptrdiff_t stride,
c[31 * stride] = CLIP(t0 - t31);
}
void dav1d_inv_dct32_1d_c(int32_t *const c, const ptrdiff_t stride,
const int min, const int max)
static void inv_dct32_1d_c(int32_t *const c, const ptrdiff_t stride,
const int min, const int max)
{
inv_dct32_1d_internal_c(c, stride, min, max, 0);
}
void dav1d_inv_dct64_1d_c(int32_t *const c, const ptrdiff_t stride,
const int min, const int max)
static void inv_dct64_1d_c(int32_t *const c, const ptrdiff_t stride,
const int min, const int max)
{
assert(stride > 0);
inv_dct32_1d_internal_c(c, stride << 1, min, max, 1);
@@ -962,13 +962,13 @@ inv_adst16_1d_internal_c(const int32_t *const in, const ptrdiff_t in_s,
}
#define inv_adst_1d(sz) \
void dav1d_inv_adst##sz##_1d_c(int32_t *const c, const ptrdiff_t stride, \
const int min, const int max) \
static void inv_adst##sz##_1d_c(int32_t *const c, const ptrdiff_t stride, \
const int min, const int max) \
{ \
inv_adst##sz##_1d_internal_c(c, stride, min, max, c, stride); \
} \
void dav1d_inv_flipadst##sz##_1d_c(int32_t *const c, const ptrdiff_t stride, \
const int min, const int max) \
static void inv_flipadst##sz##_1d_c(int32_t *const c, const ptrdiff_t stride, \
const int min, const int max) \
{ \
inv_adst##sz##_1d_internal_c(c, stride, min, max, \
&c[(sz - 1) * stride], -stride); \
@@ -980,8 +980,8 @@ inv_adst_1d(16)
#undef inv_adst_1d
void dav1d_inv_identity4_1d_c(int32_t *const c, const ptrdiff_t stride,
const int min, const int max)
static void inv_identity4_1d_c(int32_t *const c, const ptrdiff_t stride,
const int min, const int max)
{
assert(stride > 0);
for (int i = 0; i < 4; i++) {
@@ -990,16 +990,16 @@ void dav1d_inv_identity4_1d_c(int32_t *const c, const ptrdiff_t stride,
}
}
void dav1d_inv_identity8_1d_c(int32_t *const c, const ptrdiff_t stride,
const int min, const int max)
static void inv_identity8_1d_c(int32_t *const c, const ptrdiff_t stride,
const int min, const int max)
{
assert(stride > 0);
for (int i = 0; i < 8; i++)
c[stride * i] *= 2;
}
void dav1d_inv_identity16_1d_c(int32_t *const c, const ptrdiff_t stride,
const int min, const int max)
static void inv_identity16_1d_c(int32_t *const c, const ptrdiff_t stride,
const int min, const int max)
{
assert(stride > 0);
for (int i = 0; i < 16; i++) {
@@ -1008,14 +1008,57 @@ void dav1d_inv_identity16_1d_c(int32_t *const c, const ptrdiff_t stride,
}
}
void dav1d_inv_identity32_1d_c(int32_t *const c, const ptrdiff_t stride,
const int min, const int max)
static void inv_identity32_1d_c(int32_t *const c, const ptrdiff_t stride,
const int min, const int max)
{
assert(stride > 0);
for (int i = 0; i < 32; i++)
c[stride * i] *= 4;
}
const itx_1d_fn dav1d_tx1d_fns[N_TX_SIZES][N_TX_1D_TYPES] = {
[TX_4X4] = {
[DCT] = inv_dct4_1d_c,
[ADST] = inv_adst4_1d_c,
[FLIPADST] = inv_flipadst4_1d_c,
[IDENTITY] = inv_identity4_1d_c,
}, [TX_8X8] = {
[DCT] = inv_dct8_1d_c,
[ADST] = inv_adst8_1d_c,
[FLIPADST] = inv_flipadst8_1d_c,
[IDENTITY] = inv_identity8_1d_c,
}, [TX_16X16] = {
[DCT] = inv_dct16_1d_c,
[ADST] = inv_adst16_1d_c,
[FLIPADST] = inv_flipadst16_1d_c,
[IDENTITY] = inv_identity16_1d_c,
}, [TX_32X32] = {
[DCT] = inv_dct32_1d_c,
[IDENTITY] = inv_identity32_1d_c,
}, [TX_64X64] = {
[DCT] = inv_dct64_1d_c,
},
};
const uint8_t /* enum Tx1dType */ dav1d_tx1d_types[N_TX_TYPES][2] = {
[DCT_DCT] = { DCT, DCT },
[ADST_DCT] = { ADST, DCT },
[DCT_ADST] = { DCT, ADST },
[ADST_ADST] = { ADST, ADST },
[FLIPADST_DCT] = { FLIPADST, DCT },
[DCT_FLIPADST] = { DCT, FLIPADST },
[FLIPADST_FLIPADST] = { FLIPADST, FLIPADST },
[ADST_FLIPADST] = { ADST, FLIPADST },
[FLIPADST_ADST] = { FLIPADST, ADST },
[IDTX] = { IDENTITY, IDENTITY },
[V_DCT] = { DCT, IDENTITY },
[H_DCT] = { IDENTITY, DCT },
[V_ADST] = { ADST, IDENTITY },
[H_ADST] = { IDENTITY, ADST },
[V_FLIPADST] = { FLIPADST, IDENTITY },
[H_FLIPADST] = { IDENTITY, FLIPADST },
};
#if !(HAVE_ASM && TRIM_DSP_FUNCTIONS && ( \
ARCH_AARCH64 || \
(ARCH_ARM && (defined(__ARM_NEON) || defined(__APPLE__) || defined(_WIN32))) \

30  third_party/dav1d/src/itx_1d.h (vendored)

@@ -28,31 +28,25 @@
#include <stddef.h>
#include <stdint.h>
#include "src/levels.h"
#ifndef DAV1D_SRC_ITX_1D_H
#define DAV1D_SRC_ITX_1D_H
enum Tx1dType {
DCT,
ADST,
IDENTITY,
FLIPADST,
N_TX_1D_TYPES,
};
#define decl_itx_1d_fn(name) \
void (name)(int32_t *c, ptrdiff_t stride, int min, int max)
typedef decl_itx_1d_fn(*itx_1d_fn);
decl_itx_1d_fn(dav1d_inv_dct4_1d_c);
decl_itx_1d_fn(dav1d_inv_dct8_1d_c);
decl_itx_1d_fn(dav1d_inv_dct16_1d_c);
decl_itx_1d_fn(dav1d_inv_dct32_1d_c);
decl_itx_1d_fn(dav1d_inv_dct64_1d_c);
decl_itx_1d_fn(dav1d_inv_adst4_1d_c);
decl_itx_1d_fn(dav1d_inv_adst8_1d_c);
decl_itx_1d_fn(dav1d_inv_adst16_1d_c);
decl_itx_1d_fn(dav1d_inv_flipadst4_1d_c);
decl_itx_1d_fn(dav1d_inv_flipadst8_1d_c);
decl_itx_1d_fn(dav1d_inv_flipadst16_1d_c);
decl_itx_1d_fn(dav1d_inv_identity4_1d_c);
decl_itx_1d_fn(dav1d_inv_identity8_1d_c);
decl_itx_1d_fn(dav1d_inv_identity16_1d_c);
decl_itx_1d_fn(dav1d_inv_identity32_1d_c);
EXTERN const itx_1d_fn dav1d_tx1d_fns[N_TX_SIZES][N_TX_1D_TYPES];
EXTERN const uint8_t /* enum Tx1dType */ dav1d_tx1d_types[N_TX_TYPES][2];
void dav1d_inv_wht4_1d_c(int32_t *c, ptrdiff_t stride);

126  third_party/dav1d/src/itx_tmpl.c (vendored)

@@ -29,6 +29,7 @@
#include <stddef.h>
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include "common/attributes.h"
@@ -36,13 +37,17 @@
#include "src/itx.h"
#include "src/itx_1d.h"
#include "src/scan.h"
#include "src/tables.h"
static NOINLINE void
inv_txfm_add_c(pixel *dst, const ptrdiff_t stride, coef *const coeff,
const int eob, const int w, const int h, const int shift,
const itx_1d_fn first_1d_fn, const itx_1d_fn second_1d_fn,
const int has_dconly HIGHBD_DECL_SUFFIX)
const int eob, const /*enum RectTxfmSize*/ int tx, const int shift,
const enum TxfmType txtp HIGHBD_DECL_SUFFIX)
{
const TxfmInfo *const t_dim = &dav1d_txfm_dimensions[tx];
const int w = 4 * t_dim->w, h = 4 * t_dim->h;
const int has_dconly = txtp == DCT_DCT;
assert(w >= 4 && w <= 64);
assert(h >= 4 && h <= 64);
assert(eob >= 0);
@@ -64,6 +69,9 @@ inv_txfm_add_c(pixel *dst, const ptrdiff_t stride, coef *const coeff,
return;
}
const uint8_t *const txtps = dav1d_tx1d_types[txtp];
const itx_1d_fn first_1d_fn = dav1d_tx1d_fns[t_dim->lw][txtps[0]];
const itx_1d_fn second_1d_fn = dav1d_tx1d_fns[t_dim->lh][txtps[1]];
const int sh = imin(h, 32), sw = imin(w, 32);
#if BITDEPTH == 8
const int row_clip_min = INT16_MIN;
@@ -76,7 +84,16 @@ inv_txfm_add_c(pixel *dst, const ptrdiff_t stride, coef *const coeff,
const int col_clip_max = ~col_clip_min;
int32_t tmp[64 * 64], *c = tmp;
for (int y = 0; y < sh; y++, c += w) {
int last_nonzero_col; // in first 1d itx
if (txtps[1] == IDENTITY && txtps[0] != IDENTITY) {
last_nonzero_col = imin(sh - 1, eob);
} else if (txtps[0] == IDENTITY && txtps[1] != IDENTITY) {
last_nonzero_col = eob >> (t_dim->lw + 2);
} else {
last_nonzero_col = dav1d_last_nonzero_col_from_eob[tx][eob];
}
assert(last_nonzero_col < sh);
for (int y = 0; y <= last_nonzero_col; y++, c += w) {
if (is_rect2)
for (int x = 0; x < sw; x++)
c[x] = (coeff[y + x * sh] * 181 + 128) >> 8;
@@ -85,6 +102,8 @@ inv_txfm_add_c(pixel *dst, const ptrdiff_t stride, coef *const coeff,
c[x] = coeff[y + x * sh];
first_1d_fn(c, 1, row_clip_min, row_clip_max);
}
if (last_nonzero_col + 1 < sh)
memset(c, 0, sizeof(*c) * (sh - last_nonzero_col - 1) * w);
memset(coeff, 0, sizeof(*coeff) * sw * sh);
for (int i = 0; i < w * sh; i++)
@@ -99,7 +118,7 @@ inv_txfm_add_c(pixel *dst, const ptrdiff_t stride, coef *const coeff,
dst[x] = iclip_pixel(dst[x] + ((*c++ + 8) >> 4));
}
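
The new last_nonzero_col bound lets the first 1-D pass stop at the highest position that can still hold a nonzero coefficient for the given eob; everything past it is cleared by the added memset instead of being transformed. A rough standalone sketch of what the precomputed dav1d_last_nonzero_col_from_eob tables encode (illustrative only, using a hypothetical scan order; the real table construction is in scan.c further down):

#include <stdint.h>
#include <stdio.h>

/* Sketch, not dav1d code: given a block's scan order, its height h, and the
 * index of the last nonzero coefficient (eob), return the highest first-pass
 * index that can be nonzero. */
static int last_nonzero_from_eob(const uint16_t *scan, int h, int eob) {
    int max_col = 0;
    for (int n = 0; n <= eob; n++) {
        const int col = scan[n] & (h - 1);   /* same masking as init_tbl() in scan.c */
        if (col > max_col) max_col = col;
    }
    return max_col;
}

int main(void) {
    static const uint16_t scan[16] = {       /* hypothetical 4x4 scan, illustration only */
        0, 4, 1, 2, 5, 8, 12, 9, 6, 3, 7, 10, 13, 14, 11, 15
    };
    printf("eob=2 -> stop after index %d\n", last_nonzero_from_eob(scan, 4, 2));
    return 0;
}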
#define inv_txfm_fn(type1, type2, w, h, shift, has_dconly) \
#define inv_txfm_fn(type1, type2, type, pfx, w, h, shift) \
static void \
inv_txfm_add_##type1##_##type2##_##w##x##h##_c(pixel *dst, \
const ptrdiff_t stride, \
@@ -107,57 +126,56 @@ inv_txfm_add_##type1##_##type2##_##w##x##h##_c(pixel *dst, \
const int eob \
HIGHBD_DECL_SUFFIX) \
{ \
inv_txfm_add_c(dst, stride, coeff, eob, w, h, shift, \
dav1d_inv_##type1##w##_1d_c, dav1d_inv_##type2##h##_1d_c, \
has_dconly HIGHBD_TAIL_SUFFIX); \
inv_txfm_add_c(dst, stride, coeff, eob, pfx##TX_##w##X##h, shift, type \
HIGHBD_TAIL_SUFFIX); \
}
#define inv_txfm_fn64(w, h, shift) \
inv_txfm_fn(dct, dct, w, h, shift, 1)
#define inv_txfm_fn64(pfx, w, h, shift) \
inv_txfm_fn(dct, dct, DCT_DCT, pfx, w, h, shift)
#define inv_txfm_fn32(w, h, shift) \
inv_txfm_fn64(w, h, shift) \
inv_txfm_fn(identity, identity, w, h, shift, 0)
#define inv_txfm_fn32(pfx, w, h, shift) \
inv_txfm_fn64(pfx, w, h, shift) \
inv_txfm_fn(identity, identity, IDTX, pfx, w, h, shift)
#define inv_txfm_fn16(w, h, shift) \
inv_txfm_fn32(w, h, shift) \
inv_txfm_fn(adst, dct, w, h, shift, 0) \
inv_txfm_fn(dct, adst, w, h, shift, 0) \
inv_txfm_fn(adst, adst, w, h, shift, 0) \
inv_txfm_fn(dct, flipadst, w, h, shift, 0) \
inv_txfm_fn(flipadst, dct, w, h, shift, 0) \
inv_txfm_fn(adst, flipadst, w, h, shift, 0) \
inv_txfm_fn(flipadst, adst, w, h, shift, 0) \
inv_txfm_fn(flipadst, flipadst, w, h, shift, 0) \
inv_txfm_fn(identity, dct, w, h, shift, 0) \
inv_txfm_fn(dct, identity, w, h, shift, 0) \
#define inv_txfm_fn16(pfx, w, h, shift) \
inv_txfm_fn32(pfx, w, h, shift) \
inv_txfm_fn(adst, dct, ADST_DCT, pfx, w, h, shift) \
inv_txfm_fn(dct, adst, DCT_ADST, pfx, w, h, shift) \
inv_txfm_fn(adst, adst, ADST_ADST, pfx, w, h, shift) \
inv_txfm_fn(dct, flipadst, DCT_FLIPADST, pfx, w, h, shift) \
inv_txfm_fn(flipadst, dct, FLIPADST_DCT, pfx, w, h, shift) \
inv_txfm_fn(adst, flipadst, ADST_FLIPADST, pfx, w, h, shift) \
inv_txfm_fn(flipadst, adst, FLIPADST_ADST, pfx, w, h, shift) \
inv_txfm_fn(flipadst, flipadst, FLIPADST_FLIPADST, pfx, w, h, shift) \
inv_txfm_fn(identity, dct, H_DCT, pfx, w, h, shift) \
inv_txfm_fn(dct, identity, V_DCT, pfx, w, h, shift) \
#define inv_txfm_fn84(w, h, shift) \
inv_txfm_fn16(w, h, shift) \
inv_txfm_fn(identity, flipadst, w, h, shift, 0) \
inv_txfm_fn(flipadst, identity, w, h, shift, 0) \
inv_txfm_fn(identity, adst, w, h, shift, 0) \
inv_txfm_fn(adst, identity, w, h, shift, 0) \
#define inv_txfm_fn84(pfx, w, h, shift) \
inv_txfm_fn16(pfx, w, h, shift) \
inv_txfm_fn(identity, flipadst, H_FLIPADST, pfx, w, h, shift) \
inv_txfm_fn(flipadst, identity, V_FLIPADST, pfx, w, h, shift) \
inv_txfm_fn(identity, adst, H_ADST, pfx, w, h, shift) \
inv_txfm_fn(adst, identity, V_ADST, pfx, w, h, shift) \
inv_txfm_fn84( 4, 4, 0)
inv_txfm_fn84( 4, 8, 0)
inv_txfm_fn84( 4, 16, 1)
inv_txfm_fn84( 8, 4, 0)
inv_txfm_fn84( 8, 8, 1)
inv_txfm_fn84( 8, 16, 1)
inv_txfm_fn32( 8, 32, 2)
inv_txfm_fn84(16, 4, 1)
inv_txfm_fn84(16, 8, 1)
inv_txfm_fn16(16, 16, 2)
inv_txfm_fn32(16, 32, 1)
inv_txfm_fn64(16, 64, 2)
inv_txfm_fn32(32, 8, 2)
inv_txfm_fn32(32, 16, 1)
inv_txfm_fn32(32, 32, 2)
inv_txfm_fn64(32, 64, 1)
inv_txfm_fn64(64, 16, 2)
inv_txfm_fn64(64, 32, 1)
inv_txfm_fn64(64, 64, 2)
inv_txfm_fn84( , 4, 4, 0)
inv_txfm_fn84(R, 4, 8, 0)
inv_txfm_fn84(R, 4, 16, 1)
inv_txfm_fn84(R, 8, 4, 0)
inv_txfm_fn84( , 8, 8, 1)
inv_txfm_fn84(R, 8, 16, 1)
inv_txfm_fn32(R, 8, 32, 2)
inv_txfm_fn84(R, 16, 4, 1)
inv_txfm_fn84(R, 16, 8, 1)
inv_txfm_fn16( , 16, 16, 2)
inv_txfm_fn32(R, 16, 32, 1)
inv_txfm_fn64(R, 16, 64, 2)
inv_txfm_fn32(R, 32, 8, 2)
inv_txfm_fn32(R, 32, 16, 1)
inv_txfm_fn32( , 32, 32, 2)
inv_txfm_fn64(R, 32, 64, 1)
inv_txfm_fn64(R, 64, 16, 2)
inv_txfm_fn64(R, 64, 32, 1)
inv_txfm_fn64( , 64, 64, 2)
#if !(HAVE_ASM && TRIM_DSP_FUNCTIONS && ( \
ARCH_AARCH64 || \
@@ -267,9 +285,10 @@ COLD void bitfn(dav1d_itx_dsp_init)(Dav1dInvTxfmDSPContext *const c, int bpc) {
assign_itx_all_fn64(64, 32, R);
assign_itx_all_fn64(64, 64, );
int all_simd = 0;
#if HAVE_ASM
#if ARCH_AARCH64 || ARCH_ARM
itx_dsp_init_arm(c, bpc);
itx_dsp_init_arm(c, bpc, &all_simd);
#endif
#if ARCH_LOONGARCH64
itx_dsp_init_loongarch(c, bpc);
@@ -278,7 +297,10 @@ COLD void bitfn(dav1d_itx_dsp_init)(Dav1dInvTxfmDSPContext *const c, int bpc) {
itx_dsp_init_riscv(c, bpc);
#endif
#if ARCH_X86
itx_dsp_init_x86(c, bpc);
itx_dsp_init_x86(c, bpc, &all_simd);
#endif
#endif
if (!all_simd)
dav1d_init_last_nonzero_col_from_eob_tables();
}

76  third_party/dav1d/src/scan.c (vendored)

@@ -28,7 +28,10 @@
#include "config.h"
#include "common/attributes.h"
#include "common/intops.h"
#include "src/scan.h"
#include "src/thread.h"
static const uint16_t ALIGN(scan_4x4[], 32) = {
0, 4, 1, 2,
@@ -297,3 +300,76 @@ const uint16_t *const dav1d_scans[N_RECT_TX_SIZES] = {
[RTX_16X64] = scan_16x32,
[RTX_64X16] = scan_32x16,
};
static uint8_t last_nonzero_col_from_eob_4x4[16];
static uint8_t last_nonzero_col_from_eob_8x8[64];
static uint8_t last_nonzero_col_from_eob_16x16[256];
static uint8_t last_nonzero_col_from_eob_32x32[1024];
static uint8_t last_nonzero_col_from_eob_4x8[32];
static uint8_t last_nonzero_col_from_eob_8x4[32];
static uint8_t last_nonzero_col_from_eob_8x16[128];
static uint8_t last_nonzero_col_from_eob_16x8[128];
static uint8_t last_nonzero_col_from_eob_16x32[512];
static uint8_t last_nonzero_col_from_eob_32x16[512];
static uint8_t last_nonzero_col_from_eob_4x16[64];
static uint8_t last_nonzero_col_from_eob_16x4[64];
static uint8_t last_nonzero_col_from_eob_8x32[256];
static uint8_t last_nonzero_col_from_eob_32x8[256];
static COLD void init_tbl(uint8_t *const last_nonzero_col_from_eob,
const uint16_t *const scan, const int w, const int h)
{
int max_col = 0;
for (int y = 0, n = 0; y < h; y++) {
for (int x = 0; x < w; x++, n++) {
const int rc = scan[n];
const int rcx = rc & (h - 1);
max_col = imax(max_col, rcx);
last_nonzero_col_from_eob[n] = max_col;
}
}
}
static COLD void init_internal(void) {
init_tbl(last_nonzero_col_from_eob_4x4, scan_4x4, 4, 4);
init_tbl(last_nonzero_col_from_eob_8x8, scan_8x8, 8, 8);
init_tbl(last_nonzero_col_from_eob_16x16, scan_16x16, 16, 16);
init_tbl(last_nonzero_col_from_eob_32x32, scan_32x32, 32, 32);
init_tbl(last_nonzero_col_from_eob_4x8, scan_4x8, 4, 8);
init_tbl(last_nonzero_col_from_eob_8x4, scan_8x4, 8, 4);
init_tbl(last_nonzero_col_from_eob_8x16, scan_8x16, 8, 16);
init_tbl(last_nonzero_col_from_eob_16x8, scan_16x8, 16, 8);
init_tbl(last_nonzero_col_from_eob_16x32, scan_16x32, 16, 32);
init_tbl(last_nonzero_col_from_eob_32x16, scan_32x16, 32, 16);
init_tbl(last_nonzero_col_from_eob_4x16, scan_4x16, 4, 16);
init_tbl(last_nonzero_col_from_eob_16x4, scan_16x4, 16, 4);
init_tbl(last_nonzero_col_from_eob_8x32, scan_8x32, 8, 32);
init_tbl(last_nonzero_col_from_eob_32x8, scan_32x8, 32, 8);
}
COLD void dav1d_init_last_nonzero_col_from_eob_tables(void) {
static pthread_once_t initted = PTHREAD_ONCE_INIT;
pthread_once(&initted, init_internal);
}
const uint8_t *const dav1d_last_nonzero_col_from_eob[N_RECT_TX_SIZES] = {
[ TX_4X4 ] = last_nonzero_col_from_eob_4x4,
[ TX_8X8 ] = last_nonzero_col_from_eob_8x8,
[ TX_16X16] = last_nonzero_col_from_eob_16x16,
[ TX_32X32] = last_nonzero_col_from_eob_32x32,
[ TX_64X64] = last_nonzero_col_from_eob_32x32,
[RTX_4X8 ] = last_nonzero_col_from_eob_4x8,
[RTX_8X4 ] = last_nonzero_col_from_eob_8x4,
[RTX_8X16 ] = last_nonzero_col_from_eob_8x16,
[RTX_16X8 ] = last_nonzero_col_from_eob_16x8,
[RTX_16X32] = last_nonzero_col_from_eob_16x32,
[RTX_32X16] = last_nonzero_col_from_eob_32x16,
[RTX_32X64] = last_nonzero_col_from_eob_32x32,
[RTX_64X32] = last_nonzero_col_from_eob_32x32,
[RTX_4X16 ] = last_nonzero_col_from_eob_4x16,
[RTX_16X4 ] = last_nonzero_col_from_eob_16x4,
[RTX_8X32 ] = last_nonzero_col_from_eob_8x32,
[RTX_32X8 ] = last_nonzero_col_from_eob_32x8,
[RTX_16X64] = last_nonzero_col_from_eob_16x32,
[RTX_64X16] = last_nonzero_col_from_eob_32x16,
};
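
These tables are only needed by the C fallback path, so dav1d_itx_dsp_init now builds them on demand (only when all_simd stays 0), and dav1d_init_last_nonzero_col_from_eob_tables guards the work with pthread_once so that concurrent decoder initialization stays safe. A minimal sketch of that once-only pattern, assuming a generic stand-in table rather than the real scan-derived tables:

#include <pthread.h>
#include <stdio.h>

static int table[16];

static void build_table(void) {            /* stand-in for init_internal() */
    for (int i = 0; i < 16; i++)
        table[i] = i * i;
}

static void ensure_tables(void) {          /* same shape as the dav1d entry point */
    static pthread_once_t once = PTHREAD_ONCE_INIT;
    pthread_once(&once, build_table);
}

int main(void) {
    ensure_tables();                        /* first call runs build_table() */
    ensure_tables();                        /* later calls are no-ops        */
    printf("%d\n", table[3]);
    return 0;
}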

3  third_party/dav1d/src/scan.h (vendored)

@@ -33,5 +33,8 @@
#include "src/levels.h"
EXTERN const uint16_t *const dav1d_scans[N_RECT_TX_SIZES];
EXTERN const uint8_t *const dav1d_last_nonzero_col_from_eob[N_RECT_TX_SIZES];
void dav1d_init_last_nonzero_col_from_eob_tables(void);
#endif /* DAV1D_SRC_SCAN_H */

6  third_party/dav1d/src/x86/itx.h (vendored)

@@ -107,7 +107,9 @@ decl_itx_fns(ssse3);
decl_itx_fn(dav1d_inv_txfm_add_wht_wht_4x4_16bpc_avx2);
decl_itx_fn(BF(dav1d_inv_txfm_add_wht_wht_4x4, sse2));
static ALWAYS_INLINE void itx_dsp_init_x86(Dav1dInvTxfmDSPContext *const c, const int bpc) {
static ALWAYS_INLINE void itx_dsp_init_x86(Dav1dInvTxfmDSPContext *const c,
const int bpc, int *const all_simd)
{
#define assign_itx_bpc_fn(pfx, w, h, type, type_enum, bpc, ext) \
c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
BF_BPC(dav1d_inv_txfm_add_##type##_##w##x##h, bpc, ext)
@@ -167,6 +169,7 @@ static ALWAYS_INLINE void itx_dsp_init_x86(Dav1dInvTxfmDSPContext *const c, cons
assign_itx1_fn (R, 64, 16, ssse3);
assign_itx1_fn (R, 64, 32, ssse3);
assign_itx1_fn ( , 64, 64, ssse3);
*all_simd = 1;
#endif
if (!(flags & DAV1D_X86_CPU_FLAG_SSE41)) return;
@@ -192,6 +195,7 @@ static ALWAYS_INLINE void itx_dsp_init_x86(Dav1dInvTxfmDSPContext *const c, cons
assign_itx1_fn (R, 64, 16, sse4);
assign_itx1_fn (R, 64, 32, sse4);
assign_itx1_fn (, 64, 64, sse4);
*all_simd = 1;
}
#endif

1863  third_party/dav1d/src/x86/mc16_sse.asm (vendored)

The diff for this file is not shown because of its size.