Mirror of https://github.com/mozilla/gecko-dev.git
Bug 1875883 - Update dav1d to a6878be7e07114f5a2915ad46300700f0db55197 r=media-playback-reviewers,padenot
Differential Revision: https://phabricator.services.mozilla.com/D200241
This commit is contained in:
Parent
80236c4ec3
Commit
8966466027
@@ -20,11 +20,11 @@ origin:
 
   # Human-readable identifier for this version/release
   # Generally "version NNN", "tag SSS", "bookmark SSS"
-  release: 746ab8b4f3021d7263c64d6b5d6f1e2c281c7acc (2023-12-19T13:15:43.000+01:00).
+  release: a6878be7e07114f5a2915ad46300700f0db55197 (2024-01-31T06:04:21.000-05:00).
 
   # Revision to pull in
   # Must be a long or short commit SHA (long preferred)
-  revision: 746ab8b4f3021d7263c64d6b5d6f1e2c281c7acc
+  revision: a6878be7e07114f5a2915ad46300700f0db55197
 
   # The package's license, where possible using the mnemonic from
   # https://spdx.org/licenses/

@@ -1,2 +1,2 @@
 /* auto-generated, do not edit */
-#define DAV1D_VERSION "746ab8b4f3021d7263c64d6b5d6f1e2c281c7acc"
+#define DAV1D_VERSION "a6878be7e07114f5a2915ad46300700f0db55197"

@@ -60,7 +60,7 @@
 #define ALIGN_64_VAL 64
 #define ALIGN_32_VAL 32
 #define ALIGN_16_VAL 16
-#elif ARCH_X86_32 || ARCH_ARM || ARCH_AARCH64 || ARCH_PPC64LE
+#elif ARCH_AARCH64 || ARCH_ARM || ARCH_LOONGARCH || ARCH_PPC64LE || ARCH_X86_32
 /* ARM doesn't benefit from anything more than 16-byte alignment. */
 #define ALIGN_64_VAL 16
 #define ALIGN_32_VAL 16

@@ -62,11 +62,13 @@ endforeach
 
 # ASM option
 is_asm_enabled = (get_option('enable_asm') == true and
-    (host_machine.cpu_family() == 'x86' or
-     (host_machine.cpu_family() == 'x86_64' and cc.get_define('__ILP32__').strip() == '') or
-     host_machine.cpu_family() == 'aarch64' or
+    (host_machine.cpu_family() == 'aarch64' or
      host_machine.cpu_family().startswith('arm') or
-     host_machine.cpu() == 'ppc64le'))
+     host_machine.cpu() == 'ppc64le' or
+     host_machine.cpu_family().startswith('riscv') or
+     host_machine.cpu_family().startswith('loongarch') or
+     host_machine.cpu_family() == 'x86' or
+     (host_machine.cpu_family() == 'x86_64' and cc.get_define('__ILP32__').strip() == '')))
 cdata.set10('HAVE_ASM', is_asm_enabled)
 
 if is_asm_enabled and get_option('b_sanitize') == 'memory'

@@ -232,7 +234,9 @@ endif
 
 if (host_machine.cpu_family() == 'aarch64' or
     host_machine.cpu_family().startswith('arm') or
-    host_machine.cpu() == 'ppc64le')
+    host_machine.cpu_family().startswith('loongarch') or
+    host_machine.cpu() == 'ppc64le' or
+    host_machine.cpu_family().startswith('riscv'))
     if cc.has_function('getauxval', prefix : '#include <sys/auxv.h>', args : test_args)
         cdata.set('HAVE_GETAUXVAL', 1)
     endif

@@ -379,6 +383,14 @@ endif
 
 cdata.set10('ARCH_PPC64LE', host_machine.cpu() == 'ppc64le')
 
+cdata.set10('ARCH_RISCV', host_machine.cpu_family().startswith('riscv'))
+cdata.set10('ARCH_RV32', host_machine.cpu_family() == 'riscv32')
+cdata.set10('ARCH_RV64', host_machine.cpu_family() == 'riscv64')
+
+cdata.set10('ARCH_LOONGARCH', host_machine.cpu_family().startswith('loongarch'))
+cdata.set10('ARCH_LOONGARCH32', host_machine.cpu_family() == 'loongarch32')
+cdata.set10('ARCH_LOONGARCH64', host_machine.cpu_family() == 'loongarch64')
+
 # meson's cc.symbols_have_underscore_prefix() is unfortunately unreliable
 # when additional flags like '-fprofile-instr-generate' are passed via CFLAGS
 # see following meson issue https://github.com/mesonbuild/meson/issues/5482

@@ -25,6 +25,11 @@ option('enable_tests',
     value: true,
     description: 'Build dav1d tests')
 
+option('enable_seek_stress',
+    type: 'boolean',
+    value: false,
+    description: 'Build seek_stress test tool')
+
 option('enable_docs',
     type: 'boolean',
     value: false,

@@ -56,8 +56,12 @@ COLD void dav1d_init_cpu(void) {
     // memory sanitizer is inherently incompatible with asm
 #if ARCH_AARCH64 || ARCH_ARM
     dav1d_cpu_flags = dav1d_get_cpu_flags_arm();
+#elif ARCH_LOONGARCH
+    dav1d_cpu_flags = dav1d_get_cpu_flags_loongarch();
 #elif ARCH_PPC64LE
     dav1d_cpu_flags = dav1d_get_cpu_flags_ppc();
+#elif ARCH_RISCV
+    dav1d_cpu_flags = dav1d_get_cpu_flags_riscv();
 #elif ARCH_X86
     dav1d_cpu_flags = dav1d_get_cpu_flags_x86();
 #endif

@@ -37,8 +37,12 @@
 
 #if ARCH_AARCH64 || ARCH_ARM
 #include "src/arm/cpu.h"
+#elif ARCH_LOONGARCH
+#include "src/loongarch/cpu.h"
 #elif ARCH_PPC64LE
 #include "src/ppc/cpu.h"
+#elif ARCH_RISCV
+#include "src/riscv/cpu.h"
 #elif ARCH_X86
 #include "src/x86/cpu.h"
 #endif

@@ -64,6 +68,10 @@ static ALWAYS_INLINE unsigned dav1d_get_cpu_flags(void) {
 #if defined(__VSX__)
     flags |= DAV1D_PPC_CPU_FLAG_VSX;
 #endif
+#elif ARCH_RISCV
+#if defined(__riscv_v)
+    flags |= DAV1D_RISCV_CPU_FLAG_V;
+#endif
 #elif ARCH_X86
 #if defined(__AVX512F__) && defined(__AVX512CD__) && \
     defined(__AVX512BW__) && defined(__AVX512DQ__) && \

@@ -183,6 +183,10 @@ static void inv_txfm_add_wht_wht_4x4_c(pixel *dst, const ptrdiff_t stride,
 #if HAVE_ASM
 #if ARCH_AARCH64 || ARCH_ARM
 #include "src/arm/itx.h"
+#elif ARCH_LOONGARCH64
+#include "src/loongarch/itx.h"
+#elif ARCH_RISCV
+#include "src/riscv/itx.h"
 #elif ARCH_X86
 #include "src/x86/itx.h"
 #endif

@@ -257,6 +261,12 @@ COLD void bitfn(dav1d_itx_dsp_init)(Dav1dInvTxfmDSPContext *const c, int bpc) {
 #if ARCH_AARCH64 || ARCH_ARM
     itx_dsp_init_arm(c, bpc);
 #endif
+#if ARCH_LOONGARCH64
+    itx_dsp_init_loongarch(c, bpc);
+#endif
+#if ARCH_RISCV
+    itx_dsp_init_riscv(c, bpc);
+#endif
 #if ARCH_X86
     itx_dsp_init_x86(c, bpc);
 #endif

@@ -0,0 +1,47 @@
/*
 * Copyright © 2023, VideoLAN and dav1d authors
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice, this
 *    list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 *    this list of conditions and the following disclaimer in the documentation
 *    and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include "config.h"
#include "common/attributes.h"
#include "src/loongarch/cpu.h"

#if defined(HAVE_GETAUXVAL)
#include <sys/auxv.h>

#define LA_HWCAP_LSX  ( 1 << 4 )
#define LA_HWCAP_LASX ( 1 << 5 )
#endif

COLD unsigned dav1d_get_cpu_flags_loongarch(void) {
    unsigned flags = 0;
#if defined(HAVE_GETAUXVAL)
    unsigned long hw_cap = getauxval(AT_HWCAP);
    flags |= (hw_cap & LA_HWCAP_LSX) ? DAV1D_LOONGARCH_CPU_FLAG_LSX : 0;
    flags |= (hw_cap & LA_HWCAP_LASX) ? DAV1D_LOONGARCH_CPU_FLAG_LASX : 0;
#endif

    return flags;
}
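
For context, a minimal self-contained sketch (not part of this commit) of how the flags returned above are consumed. The enum values and the query function are mirrored from the new LoongArch cpu files in this commit; the main() harness is hypothetical:

    #include <stdio.h>

    /* Mirrored from the new src/loongarch/cpu.h below; assumes linking
     * against src/loongarch/cpu.c on a LoongArch build. */
    enum CpuFlags {
        DAV1D_LOONGARCH_CPU_FLAG_LSX  = 1 << 0,
        DAV1D_LOONGARCH_CPU_FLAG_LASX = 1 << 1,
    };
    unsigned dav1d_get_cpu_flags_loongarch(void);

    int main(void) {
        const unsigned flags = dav1d_get_cpu_flags_loongarch();
        /* dav1d only installs SIMD entry points when the matching bit is set. */
        printf("LSX:  %s\n", (flags & DAV1D_LOONGARCH_CPU_FLAG_LSX)  ? "yes" : "no");
        printf("LASX: %s\n", (flags & DAV1D_LOONGARCH_CPU_FLAG_LASX) ? "yes" : "no");
        return 0;
    }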

@@ -0,0 +1,37 @@
/*
 * Copyright © 2023, VideoLAN and dav1d authors
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice, this
 *    list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 *    this list of conditions and the following disclaimer in the documentation
 *    and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#ifndef DAV1D_SRC_LOONGARCH_CPU_H
#define DAV1D_SRC_LOONGARCH_CPU_H

enum CpuFlags {
    DAV1D_LOONGARCH_CPU_FLAG_LSX  = 1 << 0,
    DAV1D_LOONGARCH_CPU_FLAG_LASX = 1 << 1,
};

unsigned dav1d_get_cpu_flags_loongarch(void);

#endif /* DAV1D_SRC_LOONGARCH_CPU_H */

(File diff suppressed because it is too large.)

@@ -0,0 +1,195 @@
/*
 * Copyright © 2023, VideoLAN and dav1d authors
 * Copyright © 2023, Loongson Technology Corporation Limited
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice, this
 *    list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 *    this list of conditions and the following disclaimer in the documentation
 *    and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#ifndef DAV1D_SRC_LOONGARCH_ITX_H
#define DAV1D_SRC_LOONGARCH_ITX_H

#include "src/cpu.h"
#include "src/itx.h"

decl_itx_fn(BF(dav1d_inv_txfm_add_wht_wht_4x4, lsx));
decl_itx_fn(BF(dav1d_inv_txfm_add_dct_dct_4x4, lsx));
decl_itx_fn(BF(dav1d_inv_txfm_add_identity_identity_4x4, lsx));
decl_itx_fn(BF(dav1d_inv_txfm_add_adst_dct_4x4, lsx));
decl_itx_fn(BF(dav1d_inv_txfm_add_dct_adst_4x4, lsx));
decl_itx_fn(BF(dav1d_inv_txfm_add_adst_adst_4x4, lsx));
decl_itx_fn(BF(dav1d_inv_txfm_add_dct_flipadst_4x4, lsx));
decl_itx_fn(BF(dav1d_inv_txfm_add_flipadst_adst_4x4, lsx));
decl_itx_fn(BF(dav1d_inv_txfm_add_adst_flipadst_4x4, lsx));
decl_itx_fn(BF(dav1d_inv_txfm_add_flipadst_dct_4x4, lsx));
decl_itx_fn(BF(dav1d_inv_txfm_add_flipadst_flipadst_4x4, lsx));
decl_itx_fn(BF(dav1d_inv_txfm_add_dct_identity_4x4, lsx));
decl_itx_fn(BF(dav1d_inv_txfm_add_identity_dct_4x4, lsx));
decl_itx_fn(BF(dav1d_inv_txfm_add_flipadst_identity_4x4, lsx));
decl_itx_fn(BF(dav1d_inv_txfm_add_identity_flipadst_4x4, lsx));
decl_itx_fn(BF(dav1d_inv_txfm_add_identity_adst_4x4, lsx));
decl_itx_fn(BF(dav1d_inv_txfm_add_adst_identity_4x4, lsx));

decl_itx_fn(BF(dav1d_inv_txfm_add_dct_dct_4x8, lsx));

decl_itx_fn(BF(dav1d_inv_txfm_add_dct_dct_8x4, lsx));
decl_itx_fn(BF(dav1d_inv_txfm_add_identity_identity_8x4, lsx));
decl_itx_fn(BF(dav1d_inv_txfm_add_adst_dct_8x4, lsx));
decl_itx_fn(BF(dav1d_inv_txfm_add_dct_adst_8x4, lsx));
decl_itx_fn(BF(dav1d_inv_txfm_add_adst_adst_8x4, lsx));
decl_itx_fn(BF(dav1d_inv_txfm_add_flipadst_adst_8x4, lsx));
decl_itx_fn(BF(dav1d_inv_txfm_add_adst_flipadst_8x4, lsx));
decl_itx_fn(BF(dav1d_inv_txfm_add_flipadst_dct_8x4, lsx));
decl_itx_fn(BF(dav1d_inv_txfm_add_dct_flipadst_8x4, lsx));
decl_itx_fn(BF(dav1d_inv_txfm_add_flipadst_flipadst_8x4, lsx));
decl_itx_fn(BF(dav1d_inv_txfm_add_dct_identity_8x4, lsx));
decl_itx_fn(BF(dav1d_inv_txfm_add_identity_dct_8x4, lsx));
decl_itx_fn(BF(dav1d_inv_txfm_add_flipadst_identity_8x4, lsx));
decl_itx_fn(BF(dav1d_inv_txfm_add_identity_flipadst_8x4, lsx));
decl_itx_fn(BF(dav1d_inv_txfm_add_adst_identity_8x4, lsx));
decl_itx_fn(BF(dav1d_inv_txfm_add_identity_adst_8x4, lsx));

decl_itx_fn(BF(dav1d_inv_txfm_add_dct_dct_8x8, lsx));
decl_itx_fn(BF(dav1d_inv_txfm_add_identity_identity_8x8, lsx));
decl_itx_fn(BF(dav1d_inv_txfm_add_adst_dct_8x8, lsx));
decl_itx_fn(BF(dav1d_inv_txfm_add_dct_adst_8x8, lsx));
decl_itx_fn(BF(dav1d_inv_txfm_add_adst_adst_8x8, lsx));
decl_itx_fn(BF(dav1d_inv_txfm_add_flipadst_adst_8x8, lsx));
decl_itx_fn(BF(dav1d_inv_txfm_add_adst_flipadst_8x8, lsx));
decl_itx_fn(BF(dav1d_inv_txfm_add_flipadst_dct_8x8, lsx));
decl_itx_fn(BF(dav1d_inv_txfm_add_dct_flipadst_8x8, lsx));
decl_itx_fn(BF(dav1d_inv_txfm_add_identity_adst_8x8, lsx));
decl_itx_fn(BF(dav1d_inv_txfm_add_flipadst_identity_8x8, lsx));
decl_itx_fn(BF(dav1d_inv_txfm_add_dct_identity_8x8, lsx));
decl_itx_fn(BF(dav1d_inv_txfm_add_identity_dct_8x8, lsx));
decl_itx_fn(BF(dav1d_inv_txfm_add_identity_flipadst_8x8, lsx));
decl_itx_fn(BF(dav1d_inv_txfm_add_adst_identity_8x8, lsx));
decl_itx_fn(BF(dav1d_inv_txfm_add_flipadst_flipadst_8x8, lsx));

decl_itx_fn(BF(dav1d_inv_txfm_add_dct_dct_8x16, lsx));
decl_itx_fn(BF(dav1d_inv_txfm_add_identity_identity_8x16, lsx));
decl_itx_fn(BF(dav1d_inv_txfm_add_adst_dct_8x16, lsx));
decl_itx_fn(BF(dav1d_inv_txfm_add_dct_adst_8x16, lsx));

decl_itx_fn(BF(dav1d_inv_txfm_add_dct_dct_16x8, lsx));
decl_itx_fn(BF(dav1d_inv_txfm_add_adst_dct_16x8, lsx));

decl_itx_fn(BF(dav1d_inv_txfm_add_dct_dct_16x16, lsx));
decl_itx_fn(BF(dav1d_inv_txfm_add_adst_adst_16x16, lsx));
decl_itx_fn(BF(dav1d_inv_txfm_add_adst_dct_16x16, lsx));
decl_itx_fn(BF(dav1d_inv_txfm_add_dct_adst_16x16, lsx));
decl_itx_fn(BF(dav1d_inv_txfm_add_flipadst_dct_16x16, lsx));
decl_itx_fn(BF(dav1d_inv_txfm_add_dct_flipadst_16x16, lsx));

decl_itx_fn(BF(dav1d_inv_txfm_add_dct_dct_8x32, lsx));
decl_itx_fn(BF(dav1d_inv_txfm_add_dct_dct_32x32, lsx));

decl_itx_fn(BF(dav1d_inv_txfm_add_dct_dct_32x32, lsx));

decl_itx_fn(BF(dav1d_inv_txfm_add_dct_dct_64x64, lsx));

static ALWAYS_INLINE void itx_dsp_init_loongarch(Dav1dInvTxfmDSPContext *const c, int bpc) {
#if BITDEPTH == 8
    const unsigned flags = dav1d_get_cpu_flags();

    if (!(flags & DAV1D_LOONGARCH_CPU_FLAG_LSX)) return;

    if (BITDEPTH != 8) return;

    c->itxfm_add[TX_4X4][WHT_WHT] = dav1d_inv_txfm_add_wht_wht_4x4_8bpc_lsx;
    c->itxfm_add[TX_4X4][DCT_DCT] = dav1d_inv_txfm_add_dct_dct_4x4_8bpc_lsx;
    c->itxfm_add[TX_4X4][IDTX] = dav1d_inv_txfm_add_identity_identity_4x4_8bpc_lsx;
    c->itxfm_add[TX_4X4][DCT_ADST] = dav1d_inv_txfm_add_adst_dct_4x4_8bpc_lsx;
    c->itxfm_add[TX_4X4][ADST_DCT] = dav1d_inv_txfm_add_dct_adst_4x4_8bpc_lsx;
    c->itxfm_add[TX_4X4][ADST_ADST] = dav1d_inv_txfm_add_adst_adst_4x4_8bpc_lsx;
    c->itxfm_add[TX_4X4][FLIPADST_DCT] = dav1d_inv_txfm_add_dct_flipadst_4x4_8bpc_lsx;
    c->itxfm_add[TX_4X4][ADST_FLIPADST] = dav1d_inv_txfm_add_flipadst_adst_4x4_8bpc_lsx;
    c->itxfm_add[TX_4X4][FLIPADST_ADST] = dav1d_inv_txfm_add_adst_flipadst_4x4_8bpc_lsx;
    c->itxfm_add[TX_4X4][DCT_FLIPADST] = dav1d_inv_txfm_add_flipadst_dct_4x4_8bpc_lsx;
    c->itxfm_add[TX_4X4][FLIPADST_FLIPADST] = dav1d_inv_txfm_add_flipadst_flipadst_4x4_8bpc_lsx;
    c->itxfm_add[TX_4X4][H_DCT] = dav1d_inv_txfm_add_dct_identity_4x4_8bpc_lsx;
    c->itxfm_add[TX_4X4][V_DCT] = dav1d_inv_txfm_add_identity_dct_4x4_8bpc_lsx;
    c->itxfm_add[TX_4X4][H_FLIPADST] = dav1d_inv_txfm_add_flipadst_identity_4x4_8bpc_lsx;
    c->itxfm_add[TX_4X4][V_FLIPADST] = dav1d_inv_txfm_add_identity_flipadst_4x4_8bpc_lsx;
    c->itxfm_add[TX_4X4][V_ADST] = dav1d_inv_txfm_add_identity_adst_4x4_8bpc_lsx;
    c->itxfm_add[TX_4X4][H_ADST] = dav1d_inv_txfm_add_adst_identity_4x4_8bpc_lsx;

    c->itxfm_add[RTX_4X8][DCT_DCT] = dav1d_inv_txfm_add_dct_dct_4x8_8bpc_lsx;

    c->itxfm_add[RTX_8X4][DCT_DCT] = dav1d_inv_txfm_add_dct_dct_8x4_8bpc_lsx;
    c->itxfm_add[RTX_8X4][IDTX] = dav1d_inv_txfm_add_identity_identity_8x4_8bpc_lsx;
    c->itxfm_add[RTX_8X4][DCT_ADST] = dav1d_inv_txfm_add_adst_dct_8x4_8bpc_lsx;
    c->itxfm_add[RTX_8X4][ADST_DCT] = dav1d_inv_txfm_add_dct_adst_8x4_8bpc_lsx;
    c->itxfm_add[RTX_8X4][ADST_ADST] = dav1d_inv_txfm_add_adst_adst_8x4_8bpc_lsx;
    c->itxfm_add[RTX_8X4][ADST_FLIPADST] = dav1d_inv_txfm_add_flipadst_adst_8x4_8bpc_lsx;
    c->itxfm_add[RTX_8X4][FLIPADST_ADST] = dav1d_inv_txfm_add_adst_flipadst_8x4_8bpc_lsx;
    c->itxfm_add[RTX_8X4][DCT_FLIPADST] = dav1d_inv_txfm_add_flipadst_dct_8x4_8bpc_lsx;
    c->itxfm_add[RTX_8X4][FLIPADST_DCT] = dav1d_inv_txfm_add_dct_flipadst_8x4_8bpc_lsx;
    c->itxfm_add[RTX_8X4][FLIPADST_FLIPADST] = dav1d_inv_txfm_add_flipadst_flipadst_8x4_8bpc_lsx;
    c->itxfm_add[RTX_8X4][H_DCT] = dav1d_inv_txfm_add_dct_identity_8x4_8bpc_lsx;
    c->itxfm_add[RTX_8X4][V_DCT] = dav1d_inv_txfm_add_identity_dct_8x4_8bpc_lsx;
    c->itxfm_add[RTX_8X4][H_FLIPADST] = dav1d_inv_txfm_add_flipadst_identity_8x4_8bpc_lsx;
    c->itxfm_add[RTX_8X4][V_FLIPADST] = dav1d_inv_txfm_add_identity_flipadst_8x4_8bpc_lsx;
    c->itxfm_add[RTX_8X4][H_ADST] = dav1d_inv_txfm_add_adst_identity_8x4_8bpc_lsx;
    c->itxfm_add[RTX_8X4][V_ADST] = dav1d_inv_txfm_add_identity_adst_8x4_8bpc_lsx;

    c->itxfm_add[TX_8X8][DCT_DCT] = dav1d_inv_txfm_add_dct_dct_8x8_8bpc_lsx;
    c->itxfm_add[TX_8X8][IDTX] = dav1d_inv_txfm_add_identity_identity_8x8_8bpc_lsx;
    c->itxfm_add[TX_8X8][DCT_ADST] = dav1d_inv_txfm_add_adst_dct_8x8_8bpc_lsx;
    c->itxfm_add[TX_8X8][ADST_DCT] = dav1d_inv_txfm_add_dct_adst_8x8_8bpc_lsx;
    c->itxfm_add[TX_8X8][ADST_ADST] = dav1d_inv_txfm_add_adst_adst_8x8_8bpc_lsx;
    c->itxfm_add[TX_8X8][ADST_FLIPADST] = dav1d_inv_txfm_add_flipadst_adst_8x8_8bpc_lsx;
    c->itxfm_add[TX_8X8][FLIPADST_ADST] = dav1d_inv_txfm_add_adst_flipadst_8x8_8bpc_lsx;
    c->itxfm_add[TX_8X8][DCT_FLIPADST] = dav1d_inv_txfm_add_flipadst_dct_8x8_8bpc_lsx;
    c->itxfm_add[TX_8X8][FLIPADST_DCT] = dav1d_inv_txfm_add_dct_flipadst_8x8_8bpc_lsx;
    c->itxfm_add[TX_8X8][FLIPADST_FLIPADST] = dav1d_inv_txfm_add_flipadst_flipadst_8x8_8bpc_lsx;
    c->itxfm_add[TX_8X8][H_DCT] = dav1d_inv_txfm_add_dct_identity_8x8_8bpc_lsx;
    c->itxfm_add[TX_8X8][V_DCT] = dav1d_inv_txfm_add_identity_dct_8x8_8bpc_lsx;
    c->itxfm_add[TX_8X8][H_FLIPADST] = dav1d_inv_txfm_add_flipadst_identity_8x8_8bpc_lsx;
    c->itxfm_add[TX_8X8][V_FLIPADST] = dav1d_inv_txfm_add_identity_flipadst_8x8_8bpc_lsx;
    c->itxfm_add[TX_8X8][H_ADST] = dav1d_inv_txfm_add_adst_identity_8x8_8bpc_lsx;
    c->itxfm_add[TX_8X8][V_ADST] = dav1d_inv_txfm_add_identity_adst_8x8_8bpc_lsx;

    c->itxfm_add[RTX_8X16][DCT_DCT] = dav1d_inv_txfm_add_dct_dct_8x16_8bpc_lsx;
    c->itxfm_add[RTX_8X16][IDTX] = dav1d_inv_txfm_add_identity_identity_8x16_8bpc_lsx;
    c->itxfm_add[RTX_8X16][DCT_ADST] = dav1d_inv_txfm_add_adst_dct_8x16_8bpc_lsx;
    c->itxfm_add[RTX_8X16][ADST_DCT] = dav1d_inv_txfm_add_dct_adst_8x16_8bpc_lsx;

    c->itxfm_add[RTX_16X8][DCT_DCT] = dav1d_inv_txfm_add_dct_dct_16x8_8bpc_lsx;
    c->itxfm_add[RTX_16X8][DCT_ADST] = dav1d_inv_txfm_add_adst_dct_16x8_8bpc_lsx;

    c->itxfm_add[TX_16X16][DCT_DCT] = dav1d_inv_txfm_add_dct_dct_16x16_8bpc_lsx;
    c->itxfm_add[TX_16X16][ADST_ADST] = dav1d_inv_txfm_add_adst_adst_16x16_8bpc_lsx;
    c->itxfm_add[TX_16X16][DCT_ADST] = dav1d_inv_txfm_add_adst_dct_16x16_8bpc_lsx;
    c->itxfm_add[TX_16X16][ADST_DCT] = dav1d_inv_txfm_add_dct_adst_16x16_8bpc_lsx;
    c->itxfm_add[TX_16X16][DCT_FLIPADST] = dav1d_inv_txfm_add_flipadst_dct_16x16_8bpc_lsx;
    c->itxfm_add[TX_16X16][FLIPADST_DCT] = dav1d_inv_txfm_add_dct_flipadst_16x16_8bpc_lsx;

    c->itxfm_add[RTX_8X32][DCT_DCT] = dav1d_inv_txfm_add_dct_dct_8x32_8bpc_lsx;

    c->itxfm_add[TX_32X32][DCT_DCT] = dav1d_inv_txfm_add_dct_dct_32x32_8bpc_lsx;

    c->itxfm_add[TX_64X64][DCT_DCT] = dav1d_inv_txfm_add_dct_dct_64x64_8bpc_lsx;
#endif
}

#endif /* DAV1D_SRC_LOONGARCH_ITX_H */
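
The initializer above follows dav1d's usual DSP-dispatch pattern: a table of function pointers indexed by transform size and type, pre-filled with C fallbacks, then selectively overwritten when the CPU flag is present. A reduced, self-contained C sketch of that pattern (all names here are illustrative, not dav1d's):

    #include <stddef.h>
    #include <stdint.h>
    #include <stdio.h>

    typedef void (*itx_fn)(uint8_t *dst, ptrdiff_t stride);

    static void dct_dct_4x4_c(uint8_t *dst, ptrdiff_t stride)   { (void)dst; (void)stride; puts("C 4x4"); }
    static void dct_dct_4x4_lsx(uint8_t *dst, ptrdiff_t stride) { (void)dst; (void)stride; puts("LSX 4x4"); }

    enum { TX_4X4, N_TX_SIZES };
    enum { DCT_DCT, N_TX_TYPES };

    int main(void) {
        itx_fn itxfm_add[N_TX_SIZES][N_TX_TYPES];
        const int have_lsx = 1;                           /* from the CPU-flag query */

        itxfm_add[TX_4X4][DCT_DCT] = dct_dct_4x4_c;       /* C fallback first */
        if (have_lsx)
            itxfm_add[TX_4X4][DCT_DCT] = dct_dct_4x4_lsx; /* SIMD override */

        itxfm_add[TX_4X4][DCT_DCT](NULL, 0);              /* dispatch through the table */
        return 0;
    }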

@@ -0,0 +1,776 @@
/*********************************************************************
 * Copyright (c) 2022 Loongson Technology Corporation Limited
 * Contributed by Gu Xiwei(guxiwei-hf@loongson.cn)
 *                Shiyou Yin(yinshiyou-hf@loongson.cn)
 *
 * Permission to use, copy, modify, and/or distribute this software for any
 * purpose with or without fee is hereby granted, provided that the above
 * copyright notice and this permission notice appear in all copies.
 *
 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
 *********************************************************************/

/*
 * This file is a LoongArch assembly helper file, available under the ISC
 * license. It provides a large number of macros and aliases to simplify
 * writing assembly code, especially for LSX and LASX optimizations.
 *
 * Anyone can modify it or add new features for his/her own purposes.
 * Contributing a patch will be appreciated as it might be useful for
 * others as well. Send patches to the Loongson contributors mentioned above.
 *
 * MAJOR version: Usage changes, incompatible with previous version.
 * MINOR version: Add new macros/functions, or bug fixes.
 * MICRO version: Comment changes or implementation changes.
 */

#define LML_VERSION_MAJOR 0
#define LML_VERSION_MINOR 4
#define LML_VERSION_MICRO 0

#define DEFAULT_ALIGN 5

/* Set prefix as needed. */
#ifndef PRIVATE_PREFIX
#define PRIVATE_PREFIX dav1d_
#endif

#define PASTE(a,b) a ## b
#define CONCAT(a,b) PASTE(a,b)

#ifdef PREFIX
#define ASM_PREF CONCAT(_,PRIVATE_PREFIX)
#else
#define ASM_PREF PRIVATE_PREFIX
#endif

.macro function name, align=DEFAULT_ALIGN
.macro endfunc
    jirl $r0, $r1, 0x0
    .size ASM_PREF\name, . - ASM_PREF\name
.purgem endfunc
.endm
.text ;
.align \align ;
.globl ASM_PREF\name ;
.type ASM_PREF\name, @function ;
ASM_PREF\name: ;
.endm

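For orientation, a hypothetical `function foo` ... `endfunc` pair expands to `.text`, `.align 5`, `.globl dav1d_foo`, `.type dav1d_foo, @function` and the `dav1d_foo:` label, with `endfunc` emitting the `jirl $r0, $r1, 0x0` return and the matching `.size` directive (the symbol becomes `_dav1d_foo` when `PREFIX` is defined).
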
.macro const name, align=DEFAULT_ALIGN
.macro endconst
    .size \name, . - \name
.purgem endconst
.endm
.section .rodata
.align \align
\name:
.endm

/*
 *============================================================================
 * LoongArch register alias
 *============================================================================
 */

#define a0 $a0
#define a1 $a1
#define a2 $a2
#define a3 $a3
#define a4 $a4
#define a5 $a5
#define a6 $a6
#define a7 $a7

#define t0 $t0
#define t1 $t1
#define t2 $t2
#define t3 $t3
#define t4 $t4
#define t5 $t5
#define t6 $t6
#define t7 $t7
#define t8 $t8

#define s0 $s0
#define s1 $s1
#define s2 $s2
#define s3 $s3
#define s4 $s4
#define s5 $s5
#define s6 $s6
#define s7 $s7
#define s8 $s8

#define zero $zero
#define sp $sp
#define ra $ra

#define fa0 $fa0
#define fa1 $fa1
#define fa2 $fa2
#define fa3 $fa3
#define fa4 $fa4
#define fa5 $fa5
#define fa6 $fa6
#define fa7 $fa7
#define ft0 $ft0
#define ft1 $ft1
#define ft2 $ft2
#define ft3 $ft3
#define ft4 $ft4
#define ft5 $ft5
#define ft6 $ft6
#define ft7 $ft7
#define ft8 $ft8
#define ft9 $ft9
#define ft10 $ft10
#define ft11 $ft11
#define ft12 $ft12
#define ft13 $ft13
#define ft14 $ft14
#define ft15 $ft15
#define fs0 $fs0
#define fs1 $fs1
#define fs2 $fs2
#define fs3 $fs3
#define fs4 $fs4
#define fs5 $fs5
#define fs6 $fs6
#define fs7 $fs7

#define f0 $f0
#define f1 $f1
#define f2 $f2
#define f3 $f3
#define f4 $f4
#define f5 $f5
#define f6 $f6
#define f7 $f7
#define f8 $f8
#define f9 $f9
#define f10 $f10
#define f11 $f11
#define f12 $f12
#define f13 $f13
#define f14 $f14
#define f15 $f15
#define f16 $f16
#define f17 $f17
#define f18 $f18
#define f19 $f19
#define f20 $f20
#define f21 $f21
#define f22 $f22
#define f23 $f23
#define f24 $f24
#define f25 $f25
#define f26 $f26
#define f27 $f27
#define f28 $f28
#define f29 $f29
#define f30 $f30
#define f31 $f31

#define vr0 $vr0
#define vr1 $vr1
#define vr2 $vr2
#define vr3 $vr3
#define vr4 $vr4
#define vr5 $vr5
#define vr6 $vr6
#define vr7 $vr7
#define vr8 $vr8
#define vr9 $vr9
#define vr10 $vr10
#define vr11 $vr11
#define vr12 $vr12
#define vr13 $vr13
#define vr14 $vr14
#define vr15 $vr15
#define vr16 $vr16
#define vr17 $vr17
#define vr18 $vr18
#define vr19 $vr19
#define vr20 $vr20
#define vr21 $vr21
#define vr22 $vr22
#define vr23 $vr23
#define vr24 $vr24
#define vr25 $vr25
#define vr26 $vr26
#define vr27 $vr27
#define vr28 $vr28
#define vr29 $vr29
#define vr30 $vr30
#define vr31 $vr31

#define xr0 $xr0
#define xr1 $xr1
#define xr2 $xr2
#define xr3 $xr3
#define xr4 $xr4
#define xr5 $xr5
#define xr6 $xr6
#define xr7 $xr7
#define xr8 $xr8
#define xr9 $xr9
#define xr10 $xr10
#define xr11 $xr11
#define xr12 $xr12
#define xr13 $xr13
#define xr14 $xr14
#define xr15 $xr15
#define xr16 $xr16
#define xr17 $xr17
#define xr18 $xr18
#define xr19 $xr19
#define xr20 $xr20
#define xr21 $xr21
#define xr22 $xr22
#define xr23 $xr23
#define xr24 $xr24
#define xr25 $xr25
#define xr26 $xr26
#define xr27 $xr27
#define xr28 $xr28
#define xr29 $xr29
#define xr30 $xr30
#define xr31 $xr31

/*
 *============================================================================
 * LSX/LASX synthesize instructions
 *============================================================================
 */

/*
 * Description : Dot product of byte vector elements
 * Arguments   : Inputs  - vj, vk
 *               Outputs - vd
 *               Return Type - halfword
 */
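
In scalar terms, each halfword lane of the destination receives the sum of one even/odd pair of unsigned-byte products, wrapping modulo 2^16 like the underlying vmulwev/vmaddwod pair. A C model of the vdp2.h.bu macro below (illustrative, not part of the file):

    #include <stdint.h>

    /* vd.h[i] = vj.bu[2i] * vk.bu[2i] + vj.bu[2i+1] * vk.bu[2i+1]  (mod 2^16) */
    static void vdp2_h_bu_model(uint16_t vd[8], const uint8_t vj[16],
                                const uint8_t vk[16])
    {
        for (int i = 0; i < 8; i++)
            vd[i] = (uint16_t)(vj[2 * i] * vk[2 * i] +
                               vj[2 * i + 1] * vk[2 * i + 1]);
    }
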
.macro vdp2.h.bu vd, vj, vk
    vmulwev.h.bu \vd, \vj, \vk
    vmaddwod.h.bu \vd, \vj, \vk
.endm

.macro vdp2.h.bu.b vd, vj, vk
    vmulwev.h.bu.b \vd, \vj, \vk
    vmaddwod.h.bu.b \vd, \vj, \vk
.endm

.macro vdp2.w.h vd, vj, vk
    vmulwev.w.h \vd, \vj, \vk
    vmaddwod.w.h \vd, \vj, \vk
.endm

.macro xvdp2.h.bu xd, xj, xk
    xvmulwev.h.bu \xd, \xj, \xk
    xvmaddwod.h.bu \xd, \xj, \xk
.endm

.macro xvdp2.h.bu.b xd, xj, xk
    xvmulwev.h.bu.b \xd, \xj, \xk
    xvmaddwod.h.bu.b \xd, \xj, \xk
.endm

.macro xvdp2.w.h xd, xj, xk
    xvmulwev.w.h \xd, \xj, \xk
    xvmaddwod.w.h \xd, \xj, \xk
.endm

/*
 * Description : Dot product & addition of halfword vector elements
 * Arguments   : Inputs  - vj, vk
 *               Outputs - vd
 *               Return Type - twice size of input
 */
.macro vdp2add.h.bu vd, vj, vk
    vmaddwev.h.bu \vd, \vj, \vk
    vmaddwod.h.bu \vd, \vj, \vk
.endm

.macro vdp2add.h.bu.b vd, vj, vk
    vmaddwev.h.bu.b \vd, \vj, \vk
    vmaddwod.h.bu.b \vd, \vj, \vk
.endm

.macro vdp2add.w.h vd, vj, vk
    vmaddwev.w.h \vd, \vj, \vk
    vmaddwod.w.h \vd, \vj, \vk
.endm

.macro xvdp2add.h.bu.b xd, xj, xk
    xvmaddwev.h.bu.b \xd, \xj, \xk
    xvmaddwod.h.bu.b \xd, \xj, \xk
.endm

.macro xvdp2add.w.h xd, xj, xk
    xvmaddwev.w.h \xd, \xj, \xk
    xvmaddwod.w.h \xd, \xj, \xk
.endm

/*
 * Description : Clamp element vj[i] to the range vk[i] ~ va[i]
 *               clip: vj > vk ? vj : vk && vj < va ? vj : va
 */
.macro vclip.h vd, vj, vk, va
    vmax.h \vd, \vj, \vk
    vmin.h \vd, \vd, \va
.endm

.macro vclip.w vd, vj, vk, va
    vmax.w \vd, \vj, \vk
    vmin.w \vd, \vd, \va
.endm

.macro xvclip.h xd, xj, xk, xa
    xvmax.h \xd, \xj, \xk
    xvmin.h \xd, \xd, \xa
.endm

.macro xvclip.w xd, xj, xk, xa
    xvmax.w \xd, \xj, \xk
    xvmin.w \xd, \xd, \xa
.endm

/*
 * Description : Clamp element vj[i] to the range 0 ~ 255
 *               clip255: vj < 255 ? vj : 255 && vj > 0 ? vj : 0
 */
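
Equivalently in C: vmaxi.* zeroes negative lanes and vsat.*u with immediate 7 saturates to (1 << (7 + 1)) - 1 = 255. A scalar model of the vclip255.h macro below (illustrative):

    #include <stdint.h>

    static uint16_t clip255_model(int16_t v)
    {
        if (v < 0)   return 0;    /* vmaxi.h \vd, \vj, 0 */
        if (v > 255) return 255;  /* vsat.hu \vd, \vd, 7 */
        return (uint16_t)v;
    }
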
.macro vclip255.h vd, vj
    vmaxi.h \vd, \vj, 0
    vsat.hu \vd, \vd, 7
.endm

.macro vclip255.w vd, vj
    vmaxi.w \vd, \vj, 0
    vsat.wu \vd, \vd, 7
.endm

.macro xvclip255.h xd, xj
    xvmaxi.h \xd, \xj, 0
    xvsat.hu \xd, \xd, 7
.endm

.macro xvclip255.w xd, xj
    xvmaxi.w \xd, \xj, 0
    xvsat.wu \xd, \xd, 7
.endm

/*
 * Description : Store elements of vector
 *               vd : Data vector to be stored
 *               rk : Address of data storage
 *               ra : Offset of address
 *               si : Index of data in vd
 */
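
Note that each vstelmx.* macro below first advances the base register (`add.d \rk, \rk, \ra`) and only then stores element `\si` at offset 0, so the address register is updated in place, which is convenient for walking rows separated by a stride.
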
.macro vstelmx.b vd, rk, ra, si
    add.d \rk, \rk, \ra
    vstelm.b \vd, \rk, 0, \si
.endm

.macro vstelmx.h vd, rk, ra, si
    add.d \rk, \rk, \ra
    vstelm.h \vd, \rk, 0, \si
.endm

.macro vstelmx.w vd, rk, ra, si
    add.d \rk, \rk, \ra
    vstelm.w \vd, \rk, 0, \si
.endm

.macro vstelmx.d vd, rk, ra, si
    add.d \rk, \rk, \ra
    vstelm.d \vd, \rk, 0, \si
.endm

.macro vmov xd, xj
    vor.v \xd, \xj, \xj
.endm

.macro xmov xd, xj
    xvor.v \xd, \xj, \xj
.endm

.macro xvstelmx.d xd, rk, ra, si
    add.d \rk, \rk, \ra
    xvstelm.d \xd, \rk, 0, \si
.endm

/*
 *============================================================================
 * LSX/LASX custom macros
 *============================================================================
 */

/*
 * Load 4 float, double, V128, V256 elements with stride.
 */
.macro FLDS_LOADX_4 src, stride, stride2, stride3, out0, out1, out2, out3
    fld.s \out0, \src, 0
    fldx.s \out1, \src, \stride
    fldx.s \out2, \src, \stride2
    fldx.s \out3, \src, \stride3
.endm

.macro FLDD_LOADX_4 src, stride, stride2, stride3, out0, out1, out2, out3
    fld.d \out0, \src, 0
    fldx.d \out1, \src, \stride
    fldx.d \out2, \src, \stride2
    fldx.d \out3, \src, \stride3
.endm

.macro LSX_LOADX_4 src, stride, stride2, stride3, out0, out1, out2, out3
    vld \out0, \src, 0
    vldx \out1, \src, \stride
    vldx \out2, \src, \stride2
    vldx \out3, \src, \stride3
.endm

.macro LASX_LOADX_4 src, stride, stride2, stride3, out0, out1, out2, out3
    xvld \out0, \src, 0
    xvldx \out1, \src, \stride
    xvldx \out2, \src, \stride2
    xvldx \out3, \src, \stride3
.endm

/*
 * Description : Transpose 4x4 block with half-word elements in vectors
 * Arguments   : Inputs  - in0, in1, in2, in3
 *               Outputs - out0, out1, out2, out3
 */
.macro LSX_TRANSPOSE4x4_H in0, in1, in2, in3, out0, out1, out2, out3, \
                          tmp0, tmp1
    vilvl.h \tmp0, \in1, \in0
    vilvl.h \tmp1, \in3, \in2
    vilvl.w \out0, \tmp1, \tmp0
    vilvh.w \out2, \tmp1, \tmp0
    vilvh.d \out1, \out0, \out0
    vilvh.d \out3, \out0, \out2
.endm

/*
 * Description : Transpose 4x4 block with word elements in vectors
 * Arguments   : Inputs  - in0, in1, in2, in3
 *               Outputs - out0, out1, out2, out3
 * Details     :
 * Example     :
 *               1, 2, 3, 4            1, 5, 9,13
 *               5, 6, 7, 8    to      2, 6,10,14
 *               9,10,11,12   =====>   3, 7,11,15
 *              13,14,15,16            4, 8,12,16
 */
.macro LSX_TRANSPOSE4x4_W in0, in1, in2, in3, out0, out1, out2, out3, \
                          tmp0, tmp1

    vilvl.w \tmp0, \in1, \in0
    vilvh.w \out1, \in1, \in0
    vilvl.w \tmp1, \in3, \in2
    vilvh.w \out3, \in3, \in2

    vilvl.d \out0, \tmp1, \tmp0
    vilvl.d \out2, \out3, \out1
    vilvh.d \out3, \out3, \out1
    vilvh.d \out1, \tmp1, \tmp0
.endm

/*
 * Description : Transpose 8x8 block with half-word elements in vectors
 * Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7
 *               Outputs - out0, out1, out2, out3, out4, out5, out6, out7
 */
.macro LSX_TRANSPOSE8x8_H in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \
                          out2, out3, out4, out5, out6, out7, tmp0, tmp1, tmp2, \
                          tmp3, tmp4, tmp5, tmp6, tmp7
    vilvl.h \tmp0, \in6, \in4
    vilvl.h \tmp1, \in7, \in5
    vilvl.h \tmp2, \in2, \in0
    vilvl.h \tmp3, \in3, \in1

    vilvl.h \tmp4, \tmp1, \tmp0
    vilvh.h \tmp5, \tmp1, \tmp0
    vilvl.h \tmp6, \tmp3, \tmp2
    vilvh.h \tmp7, \tmp3, \tmp2

    vilvh.h \tmp0, \in6, \in4
    vilvh.h \tmp1, \in7, \in5
    vilvh.h \tmp2, \in2, \in0
    vilvh.h \tmp3, \in3, \in1

    vpickev.d \out0, \tmp4, \tmp6
    vpickod.d \out1, \tmp4, \tmp6
    vpickev.d \out2, \tmp5, \tmp7
    vpickod.d \out3, \tmp5, \tmp7

    vilvl.h \tmp4, \tmp1, \tmp0
    vilvh.h \tmp5, \tmp1, \tmp0
    vilvl.h \tmp6, \tmp3, \tmp2
    vilvh.h \tmp7, \tmp3, \tmp2

    vpickev.d \out4, \tmp4, \tmp6
    vpickod.d \out5, \tmp4, \tmp6
    vpickev.d \out6, \tmp5, \tmp7
    vpickod.d \out7, \tmp5, \tmp7
.endm

/*
 * Description : Transpose 16x8 block with byte elements in vectors
 * Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7
 *               Outputs - out0, out1, out2, out3, out4, out5, out6, out7
 */
.macro LASX_TRANSPOSE16X8_B in0, in1, in2, in3, in4, in5, in6, in7, \
                            in8, in9, in10, in11, in12, in13, in14, in15, \
                            out0, out1, out2, out3, out4, out5, out6, out7,\
                            tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7
    xvilvl.b \tmp0, \in2, \in0
    xvilvl.b \tmp1, \in3, \in1
    xvilvl.b \tmp2, \in6, \in4
    xvilvl.b \tmp3, \in7, \in5
    xvilvl.b \tmp4, \in10, \in8
    xvilvl.b \tmp5, \in11, \in9
    xvilvl.b \tmp6, \in14, \in12
    xvilvl.b \tmp7, \in15, \in13
    xvilvl.b \out0, \tmp1, \tmp0
    xvilvh.b \out1, \tmp1, \tmp0
    xvilvl.b \out2, \tmp3, \tmp2
    xvilvh.b \out3, \tmp3, \tmp2
    xvilvl.b \out4, \tmp5, \tmp4
    xvilvh.b \out5, \tmp5, \tmp4
    xvilvl.b \out6, \tmp7, \tmp6
    xvilvh.b \out7, \tmp7, \tmp6
    xvilvl.w \tmp0, \out2, \out0
    xvilvh.w \tmp2, \out2, \out0
    xvilvl.w \tmp4, \out3, \out1
    xvilvh.w \tmp6, \out3, \out1
    xvilvl.w \tmp1, \out6, \out4
    xvilvh.w \tmp3, \out6, \out4
    xvilvl.w \tmp5, \out7, \out5
    xvilvh.w \tmp7, \out7, \out5
    xvilvl.d \out0, \tmp1, \tmp0
    xvilvh.d \out1, \tmp1, \tmp0
    xvilvl.d \out2, \tmp3, \tmp2
    xvilvh.d \out3, \tmp3, \tmp2
    xvilvl.d \out4, \tmp5, \tmp4
    xvilvh.d \out5, \tmp5, \tmp4
    xvilvl.d \out6, \tmp7, \tmp6
    xvilvh.d \out7, \tmp7, \tmp6
.endm

/*
 * Description : Transpose 4x4 block with half-word elements in vectors
 * Arguments   : Inputs  - in0, in1, in2, in3
 *               Outputs - out0, out1, out2, out3
 */
.macro LASX_TRANSPOSE4x4_H in0, in1, in2, in3, out0, out1, out2, out3, \
                           tmp0, tmp1
    xvilvl.h \tmp0, \in1, \in0
    xvilvl.h \tmp1, \in3, \in2
    xvilvl.w \out0, \tmp1, \tmp0
    xvilvh.w \out2, \tmp1, \tmp0
    xvilvh.d \out1, \out0, \out0
    xvilvh.d \out3, \out0, \out2
.endm

/*
 * Description : Transpose 4x8 block with half-word elements in vectors
 * Arguments   : Inputs  - in0, in1, in2, in3
 *               Outputs - out0, out1, out2, out3
 */
.macro LASX_TRANSPOSE4x8_H in0, in1, in2, in3, out0, out1, out2, out3, \
                           tmp0, tmp1
    xvilvl.h \tmp0, \in2, \in0
    xvilvl.h \tmp1, \in3, \in1
    xvilvl.h \out2, \tmp1, \tmp0
    xvilvh.h \out3, \tmp1, \tmp0

    xvilvl.d \out0, \out2, \out2
    xvilvh.d \out1, \out2, \out2
    xvilvl.d \out2, \out3, \out3
    xvilvh.d \out3, \out3, \out3
.endm

/*
 * Description : Transpose 8x8 block with half-word elements in vectors
 * Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7
 *               Outputs - out0, out1, out2, out3, out4, out5, out6, out7
 */
.macro LASX_TRANSPOSE8x8_H in0, in1, in2, in3, in4, in5, in6, in7, \
                           out0, out1, out2, out3, out4, out5, out6, out7, \
                           tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7
    xvilvl.h \tmp0, \in6, \in4
    xvilvl.h \tmp1, \in7, \in5
    xvilvl.h \tmp2, \in2, \in0
    xvilvl.h \tmp3, \in3, \in1

    xvilvl.h \tmp4, \tmp1, \tmp0
    xvilvh.h \tmp5, \tmp1, \tmp0
    xvilvl.h \tmp6, \tmp3, \tmp2
    xvilvh.h \tmp7, \tmp3, \tmp2

    xvilvh.h \tmp0, \in6, \in4
    xvilvh.h \tmp1, \in7, \in5
    xvilvh.h \tmp2, \in2, \in0
    xvilvh.h \tmp3, \in3, \in1

    xvpickev.d \out0, \tmp4, \tmp6
    xvpickod.d \out1, \tmp4, \tmp6
    xvpickev.d \out2, \tmp5, \tmp7
    xvpickod.d \out3, \tmp5, \tmp7

    xvilvl.h \tmp4, \tmp1, \tmp0
    xvilvh.h \tmp5, \tmp1, \tmp0
    xvilvl.h \tmp6, \tmp3, \tmp2
    xvilvh.h \tmp7, \tmp3, \tmp2

    xvpickev.d \out4, \tmp4, \tmp6
    xvpickod.d \out5, \tmp4, \tmp6
    xvpickev.d \out6, \tmp5, \tmp7
    xvpickod.d \out7, \tmp5, \tmp7
.endm

/*
 * Description : Transpose 2x4x4 block with half-word elements in vectors
 * Arguments   : Inputs  - in0, in1, in2, in3
 *               Outputs - out0, out1, out2, out3
 */
.macro LASX_TRANSPOSE2x4x4_H in0, in1, in2, in3, out0, out1, out2, out3, \
                             tmp0, tmp1, tmp2
    xvilvh.h \tmp1, \in0, \in1
    xvilvl.h \out1, \in0, \in1
    xvilvh.h \tmp0, \in2, \in3
    xvilvl.h \out3, \in2, \in3

    xvilvh.w \tmp2, \out3, \out1
    xvilvl.w \out3, \out3, \out1

    xvilvl.w \out2, \tmp0, \tmp1
    xvilvh.w \tmp1, \tmp0, \tmp1

    xvilvh.d \out0, \out2, \out3
    xvilvl.d \out2, \out2, \out3
    xvilvh.d \out1, \tmp1, \tmp2
    xvilvl.d \out3, \tmp1, \tmp2
.endm

/*
 * Description : Transpose 4x4 block with word elements in vectors
 * Arguments   : Inputs  - in0, in1, in2, in3
 *               Outputs - out0, out1, out2, out3
 * Details     :
 * Example     :
 *               1, 2, 3, 4,  1, 2, 3, 4           1,5, 9,13, 1,5, 9,13
 *               5, 6, 7, 8,  5, 6, 7, 8    to     2,6,10,14, 2,6,10,14
 *               9,10,11,12,  9,10,11,12  =====>   3,7,11,15, 3,7,11,15
 *              13,14,15,16, 13,14,15,16           4,8,12,16, 4,8,12,16
 */
.macro LASX_TRANSPOSE4x4_W in0, in1, in2, in3, out0, out1, out2, out3, \
                           tmp0, tmp1

    xvilvl.w \tmp0, \in1, \in0
    xvilvh.w \out1, \in1, \in0
    xvilvl.w \tmp1, \in3, \in2
    xvilvh.w \out3, \in3, \in2

    xvilvl.d \out0, \tmp1, \tmp0
    xvilvl.d \out2, \out3, \out1
    xvilvh.d \out3, \out3, \out1
    xvilvh.d \out1, \tmp1, \tmp0
.endm

/*
 * Description : Transpose 8x8 block with word elements in vectors
 * Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7
 *               Outputs - out0, out1, out2, out3, out4, out5, out6, out7
 * Example     : LASX_TRANSPOSE8x8_W
 *               in0 : 1,2,3,4,5,6,7,8
 *               in1 : 2,2,3,4,5,6,7,8
 *               in2 : 3,2,3,4,5,6,7,8
 *               in3 : 4,2,3,4,5,6,7,8
 *               in4 : 5,2,3,4,5,6,7,8
 *               in5 : 6,2,3,4,5,6,7,8
 *               in6 : 7,2,3,4,5,6,7,8
 *               in7 : 8,2,3,4,5,6,7,8
 *
 *               out0 : 1,2,3,4,5,6,7,8
 *               out1 : 2,2,2,2,2,2,2,2
 *               out2 : 3,3,3,3,3,3,3,3
 *               out3 : 4,4,4,4,4,4,4,4
 *               out4 : 5,5,5,5,5,5,5,5
 *               out5 : 6,6,6,6,6,6,6,6
 *               out6 : 7,7,7,7,7,7,7,7
 *               out7 : 8,8,8,8,8,8,8,8
 */
.macro LASX_TRANSPOSE8x8_W in0, in1, in2, in3, in4, in5, in6, in7,\
                           out0, out1, out2, out3, out4, out5, out6, out7,\
                           tmp0, tmp1, tmp2, tmp3
    xvilvl.w \tmp0, \in2, \in0
    xvilvl.w \tmp1, \in3, \in1
    xvilvh.w \tmp2, \in2, \in0
    xvilvh.w \tmp3, \in3, \in1
    xvilvl.w \out0, \tmp1, \tmp0
    xvilvh.w \out1, \tmp1, \tmp0
    xvilvl.w \out2, \tmp3, \tmp2
    xvilvh.w \out3, \tmp3, \tmp2

    xvilvl.w \tmp0, \in6, \in4
    xvilvl.w \tmp1, \in7, \in5
    xvilvh.w \tmp2, \in6, \in4
    xvilvh.w \tmp3, \in7, \in5
    xvilvl.w \out4, \tmp1, \tmp0
    xvilvh.w \out5, \tmp1, \tmp0
    xvilvl.w \out6, \tmp3, \tmp2
    xvilvh.w \out7, \tmp3, \tmp2

    xmov \tmp0, \out0
    xmov \tmp1, \out1
    xmov \tmp2, \out2
    xmov \tmp3, \out3
    xvpermi.q \out0, \out4, 0x02
    xvpermi.q \out1, \out5, 0x02
    xvpermi.q \out2, \out6, 0x02
    xvpermi.q \out3, \out7, 0x02
    xvpermi.q \out4, \tmp0, 0x31
    xvpermi.q \out5, \tmp1, 0x31
    xvpermi.q \out6, \tmp2, 0x31
    xvpermi.q \out7, \tmp3, 0x31
.endm

/*
 * Description : Transpose 4x4 block with double-word elements in vectors
 * Arguments   : Inputs  - in0, in1, in2, in3
 *               Outputs - out0, out1, out2, out3
 * Example     : LASX_TRANSPOSE4x4_D
 *               in0 : 1,2,3,4
 *               in1 : 1,2,3,4
 *               in2 : 1,2,3,4
 *               in3 : 1,2,3,4
 *
 *               out0 : 1,1,1,1
 *               out1 : 2,2,2,2
 *               out2 : 3,3,3,3
 *               out3 : 4,4,4,4
 */
.macro LASX_TRANSPOSE4x4_D in0, in1, in2, in3, out0, out1, out2, out3, \
                           tmp0, tmp1
    xvilvl.d \tmp0, \in1, \in0
    xvilvh.d \out1, \in1, \in0
    xvilvh.d \tmp1, \in3, \in2
    xvilvl.d \out2, \in3, \in2

    xvor.v \out0, \tmp0, \tmp0
    xvor.v \out3, \tmp1, \tmp1

    xvpermi.q \out0, \out2, 0x02
    xvpermi.q \out2, \tmp0, 0x31
    xvpermi.q \out3, \out1, 0x31
    xvpermi.q \out1, \tmp1, 0x02
.endm

(File diff suppressed because it is too large.)

@@ -0,0 +1,52 @@
/*
 * Copyright © 2023, VideoLAN and dav1d authors
 * Copyright © 2023, Loongson Technology Corporation Limited
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice, this
 *    list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 *    this list of conditions and the following disclaimer in the documentation
 *    and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#ifndef DAV1D_SRC_LOONGARCH_LOOPFILTER_H
#define DAV1D_SRC_LOONGARCH_LOOPFILTER_H

#include "src/cpu.h"
#include "src/loopfilter.h"

decl_loopfilter_sb_fn(BF(dav1d_lpf_h_sb_y, lsx));
decl_loopfilter_sb_fn(BF(dav1d_lpf_v_sb_y, lsx));
decl_loopfilter_sb_fn(BF(dav1d_lpf_h_sb_uv, lsx));
decl_loopfilter_sb_fn(BF(dav1d_lpf_v_sb_uv, lsx));

static ALWAYS_INLINE void loop_filter_dsp_init_loongarch(Dav1dLoopFilterDSPContext *const c) {
    const unsigned flags = dav1d_get_cpu_flags();

    if (!(flags & DAV1D_LOONGARCH_CPU_FLAG_LSX)) return;

#if BITDEPTH == 8
    c->loop_filter_sb[0][0] = BF(dav1d_lpf_h_sb_y, lsx);
    c->loop_filter_sb[0][1] = BF(dav1d_lpf_v_sb_y, lsx);
    c->loop_filter_sb[1][0] = BF(dav1d_lpf_h_sb_uv, lsx);
    c->loop_filter_sb[1][1] = BF(dav1d_lpf_v_sb_uv, lsx);
#endif
}

#endif /* DAV1D_SRC_LOONGARCH_LOOPFILTER_H */

(File diff suppressed because it is too large.)

@@ -0,0 +1,78 @@
/*
 * Copyright © 2023, VideoLAN and dav1d authors
 * Copyright © 2023, Loongson Technology Corporation Limited
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice, this
 *    list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 *    this list of conditions and the following disclaimer in the documentation
 *    and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#ifndef DAV1D_SRC_LOONGARCH_LOOPRESTORATION_H
#define DAV1D_SRC_LOONGARCH_LOOPRESTORATION_H

#include "common/intops.h"
#include "src/cpu.h"
#include "src/looprestoration.h"

void dav1d_wiener_filter_lsx(uint8_t *p, const ptrdiff_t stride,
                             const uint8_t (*const left)[4],
                             const uint8_t *lpf,
                             const int w, const int h,
                             const LooprestorationParams *const params,
                             const enum LrEdgeFlags edges HIGHBD_DECL_SUFFIX);

void dav1d_sgr_filter_3x3_lsx(pixel *p, const ptrdiff_t p_stride,
                              const pixel (*const left)[4],
                              const pixel *lpf,
                              const int w, const int h,
                              const LooprestorationParams *const params,
                              const enum LrEdgeFlags edges HIGHBD_DECL_SUFFIX);

void dav1d_sgr_filter_5x5_lsx(pixel *p, const ptrdiff_t p_stride,
                              const pixel (*const left)[4],
                              const pixel *lpf,
                              const int w, const int h,
                              const LooprestorationParams *const params,
                              const enum LrEdgeFlags edges HIGHBD_DECL_SUFFIX);

void dav1d_sgr_filter_mix_lsx(pixel *p, const ptrdiff_t p_stride,
                              const pixel (*const left)[4],
                              const pixel *lpf,
                              const int w, const int h,
                              const LooprestorationParams *const params,
                              const enum LrEdgeFlags edges HIGHBD_DECL_SUFFIX);

static ALWAYS_INLINE void loop_restoration_dsp_init_loongarch(Dav1dLoopRestorationDSPContext *const c, int bpc)
{
    const unsigned flags = dav1d_get_cpu_flags();

    if (!(flags & DAV1D_LOONGARCH_CPU_FLAG_LSX)) return;

#if BITDEPTH == 8
    c->wiener[0] = c->wiener[1] = dav1d_wiener_filter_lsx;

    c->sgr[0] = dav1d_sgr_filter_5x5_lsx;
    c->sgr[1] = dav1d_sgr_filter_3x3_lsx;
    c->sgr[2] = dav1d_sgr_filter_mix_lsx;
#endif
}

#endif /* DAV1D_SRC_LOONGARCH_LOOPRESTORATION_H */

@@ -0,0 +1,274 @@
/*
 * Copyright © 2023, VideoLAN and dav1d authors
 * Copyright © 2023, Loongson Technology Corporation Limited
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice, this
 *    list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 *    this list of conditions and the following disclaimer in the documentation
 *    and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include "src/loongarch/looprestoration.h"

#if BITDEPTH == 8

#define REST_UNIT_STRIDE (400)

void BF(dav1d_wiener_filter_h, lsx)(int32_t *hor_ptr,
                                    uint8_t *tmp_ptr,
                                    const int16_t filterh[8],
                                    const int w, const int h);

void BF(dav1d_wiener_filter_v, lsx)(uint8_t *p,
                                    const ptrdiff_t p_stride,
                                    const int32_t *hor,
                                    const int16_t filterv[8],
                                    const int w, const int h);

// This function refers to the function in the ppc/looprestoration_init_tmpl.c.
static inline void padding(uint8_t *dst, const uint8_t *p,
                           const ptrdiff_t stride, const uint8_t (*left)[4],
                           const uint8_t *lpf, int unit_w, const int stripe_h,
                           const enum LrEdgeFlags edges)
{
    const int have_left = !!(edges & LR_HAVE_LEFT);
    const int have_right = !!(edges & LR_HAVE_RIGHT);

    // Copy more pixels if we don't have to pad them
    unit_w += 3 * have_left + 3 * have_right;
    uint8_t *dst_l = dst + 3 * !have_left;
    p -= 3 * have_left;
    lpf -= 3 * have_left;

    if (edges & LR_HAVE_TOP) {
        // Copy previous loop filtered rows
        const uint8_t *const above_1 = lpf;
        const uint8_t *const above_2 = above_1 + PXSTRIDE(stride);
        pixel_copy(dst_l, above_1, unit_w);
        pixel_copy(dst_l + REST_UNIT_STRIDE, above_1, unit_w);
        pixel_copy(dst_l + 2 * REST_UNIT_STRIDE, above_2, unit_w);
    } else {
        // Pad with first row
        pixel_copy(dst_l, p, unit_w);
        pixel_copy(dst_l + REST_UNIT_STRIDE, p, unit_w);
        pixel_copy(dst_l + 2 * REST_UNIT_STRIDE, p, unit_w);
        if (have_left) {
            pixel_copy(dst_l, &left[0][1], 3);
            pixel_copy(dst_l + REST_UNIT_STRIDE, &left[0][1], 3);
            pixel_copy(dst_l + 2 * REST_UNIT_STRIDE, &left[0][1], 3);
        }
    }

    uint8_t *dst_tl = dst_l + 3 * REST_UNIT_STRIDE;
    if (edges & LR_HAVE_BOTTOM) {
        // Copy next loop filtered rows
        const uint8_t *const below_1 = lpf + 6 * PXSTRIDE(stride);
        const uint8_t *const below_2 = below_1 + PXSTRIDE(stride);
        pixel_copy(dst_tl + stripe_h * REST_UNIT_STRIDE, below_1, unit_w);
        pixel_copy(dst_tl + (stripe_h + 1) * REST_UNIT_STRIDE, below_2, unit_w);
        pixel_copy(dst_tl + (stripe_h + 2) * REST_UNIT_STRIDE, below_2, unit_w);
    } else {
        // Pad with last row
        const uint8_t *const src = p + (stripe_h - 1) * PXSTRIDE(stride);
        pixel_copy(dst_tl + stripe_h * REST_UNIT_STRIDE, src, unit_w);
        pixel_copy(dst_tl + (stripe_h + 1) * REST_UNIT_STRIDE, src, unit_w);
        pixel_copy(dst_tl + (stripe_h + 2) * REST_UNIT_STRIDE, src, unit_w);
        if (have_left) {
            pixel_copy(dst_tl + stripe_h * REST_UNIT_STRIDE, &left[stripe_h - 1][1], 3);
            pixel_copy(dst_tl + (stripe_h + 1) * REST_UNIT_STRIDE, &left[stripe_h - 1][1], 3);
            pixel_copy(dst_tl + (stripe_h + 2) * REST_UNIT_STRIDE, &left[stripe_h - 1][1], 3);
        }
    }

    // Inner UNIT_WxSTRIPE_H
    for (int j = 0; j < stripe_h; j++) {
        pixel_copy(dst_tl + 3 * have_left, p + 3 * have_left, unit_w - 3 * have_left);
        dst_tl += REST_UNIT_STRIDE;
        p += PXSTRIDE(stride);
    }

    if (!have_right) {
        uint8_t *pad = dst_l + unit_w;
        uint8_t *row_last = &dst_l[unit_w - 1];
        // Pad 3x(STRIPE_H+6) with last column
        for (int j = 0; j < stripe_h + 6; j++) {
            pixel_set(pad, *row_last, 3);
            pad += REST_UNIT_STRIDE;
            row_last += REST_UNIT_STRIDE;
        }
    }

    if (!have_left) {
        // Pad 3x(STRIPE_H+6) with first column
        for (int j = 0; j < stripe_h + 6; j++) {
            pixel_set(dst, *dst_l, 3);
            dst += REST_UNIT_STRIDE;
            dst_l += REST_UNIT_STRIDE;
        }
    } else {
        dst += 3 * REST_UNIT_STRIDE;
        for (int j = 0; j < stripe_h; j++) {
            pixel_copy(dst, &left[j][1], 3);
            dst += REST_UNIT_STRIDE;
        }
    }
}

// This function refers to the function in the ppc/looprestoration_init_tmpl.c.
|
||||
|
||||
// FIXME Could split into luma and chroma specific functions,
|
||||
// (since first and last tops are always 0 for chroma)
|
||||
// FIXME Could implement a version that requires less temporary memory
|
||||
// (should be possible to implement with only 6 rows of temp storage)
|
||||
void dav1d_wiener_filter_lsx(uint8_t *p, const ptrdiff_t p_stride,
|
||||
const uint8_t (*const left)[4],
|
||||
const uint8_t *lpf,
|
||||
const int w, const int h,
|
||||
const LooprestorationParams *const params,
|
||||
const enum LrEdgeFlags edges HIGHBD_DECL_SUFFIX)
|
||||
{
|
||||
const int16_t (*const filter)[8] = params->filter;
|
||||
|
||||
// Wiener filtering is applied to a maximum stripe height of 64 + 3 pixels
|
||||
// of padding above and below
|
||||
ALIGN_STK_16(uint8_t, tmp, 70 /*(64 + 3 + 3)*/ * REST_UNIT_STRIDE,);
|
||||
padding(tmp, p, p_stride, left, lpf, w, h, edges);
|
||||
ALIGN_STK_16(int32_t, hor, 70 /*(64 + 3 + 3)*/ * REST_UNIT_STRIDE + 64,);
|
||||
|
||||
BF(dav1d_wiener_filter_h, lsx)(hor, tmp, filter[0], w, h + 6);
|
||||
BF(dav1d_wiener_filter_v, lsx)(p, p_stride, hor, filter[1], w, h);
|
||||
}
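A minimal sketch of the temporary-buffer arithmetic used above, under the assumptions stated in the source itself (70 rows = 64 max stripe height + 3 rows of padding above and below, row stride fixed at REST_UNIT_STRIDE = 400); the names here are illustrative, not dav1d API:

/* buffer_sizing_sketch.c */
#include <stdint.h>
#include <stdio.h>

#define REST_UNIT_STRIDE 400   /* padded row stride used by the LSX code */

int main(void) {
    const int max_stripe_h = 64, pad = 3;       /* per the source comment */
    const int rows = max_stripe_h + 2 * pad;    /* = 70, the figure in ALIGN_STK_16 */
    const size_t tmp_bytes = (size_t)rows * REST_UNIT_STRIDE * sizeof(uint8_t);
    const size_t hor_elems = (size_t)rows * REST_UNIT_STRIDE + 64; /* int32 slots */
    printf("tmp: %zu bytes, hor: %zu int32 elements\n", tmp_bytes, hor_elems);
    return 0;
}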

void BF(dav1d_boxsum3_h, lsx)(int32_t *sumsq, int16_t *sum, pixel *src,
                              const int w, const int h);
void BF(dav1d_boxsum3_v, lsx)(int32_t *sumsq, int16_t *sum,
                              const int w, const int h);

void BF(dav1d_boxsum3_sgf_h, lsx)(int32_t *sumsq, int16_t *sum,
                                  const int w, const int h, const int w1);
void BF(dav1d_boxsum3_sgf_v, lsx)(int16_t *dst, uint8_t *tmp,
                                  int32_t *sumsq, int16_t *sum,
                                  const int w, const int h);
void BF(dav1d_sgr_3x3_finish, lsx)(pixel *p, const ptrdiff_t p_stride,
                                   int16_t *dst, int w1,
                                   const int w, const int h);


static inline void boxsum3_lsx(int32_t *sumsq, coef *sum, pixel *src,
                               const int w, const int h)
{
    BF(dav1d_boxsum3_h, lsx)(sumsq, sum, src, w + 6, h + 6);
    BF(dav1d_boxsum3_v, lsx)(sumsq, sum, w + 6, h + 6);
}

void dav1d_sgr_filter_3x3_lsx(pixel *p, const ptrdiff_t p_stride,
                              const pixel (*const left)[4],
                              const pixel *lpf,
                              const int w, const int h,
                              const LooprestorationParams *const params,
                              const enum LrEdgeFlags edges HIGHBD_DECL_SUFFIX)
{
    ALIGN_STK_16(uint8_t, tmp, 70 /*(64 + 3 + 3)*/ * REST_UNIT_STRIDE,);
    padding(tmp, p, p_stride, left, lpf, w, h, edges);
    coef dst[64 * 384];

    ALIGN_STK_16(int32_t, sumsq, 68 * REST_UNIT_STRIDE + 8, );
    ALIGN_STK_16(int16_t, sum, 68 * REST_UNIT_STRIDE + 16, );

    boxsum3_lsx(sumsq, sum, tmp, w, h);
    BF(dav1d_boxsum3_sgf_h, lsx)(sumsq, sum, w, h, params->sgr.s1);
    BF(dav1d_boxsum3_sgf_v, lsx)(dst, tmp, sumsq, sum, w, h);
    BF(dav1d_sgr_3x3_finish, lsx)(p, p_stride, dst, params->sgr.w1, w, h);
}

void BF(dav1d_boxsum5_h, lsx)(int32_t *sumsq, int16_t *sum,
                              const uint8_t *const src,
                              const int w, const int h);

void BF(dav1d_boxsum5_v, lsx)(int32_t *sumsq, int16_t *sum,
                              const int w, const int h);

void BF(dav1d_boxsum5_sgf_h, lsx)(int32_t *sumsq, int16_t *sum,
                                  const int w, const int h,
                                  const unsigned s);

void BF(dav1d_boxsum5_sgf_v, lsx)(int16_t *dst, uint8_t *src,
                                  int32_t *sumsq, int16_t *sum,
                                  const int w, const int h);

void BF(dav1d_sgr_mix_finish, lsx)(uint8_t *p, const ptrdiff_t stride,
                                   const int16_t *dst0, const int16_t *dst1,
                                   const int w0, const int w1,
                                   const int w, const int h);

static inline void boxsum5_lsx(int32_t *sumsq, coef *sum, pixel *src,
                               const int w, const int h)
{
    BF(dav1d_boxsum5_h, lsx)(sumsq, sum, src, w + 6, h + 6);
    BF(dav1d_boxsum5_v, lsx)(sumsq, sum, w + 6, h + 6);
}

void dav1d_sgr_filter_5x5_lsx(pixel *p, const ptrdiff_t p_stride,
                              const pixel (*const left)[4],
                              const pixel *lpf,
                              const int w, const int h,
                              const LooprestorationParams *const params,
                              const enum LrEdgeFlags edges HIGHBD_DECL_SUFFIX)
{
    ALIGN_STK_16(uint8_t, tmp, 70 /*(64 + 3 + 3)*/ * REST_UNIT_STRIDE,);
    padding(tmp, p, p_stride, left, lpf, w, h, edges);
    coef dst[64 * 384];

    ALIGN_STK_16(int32_t, sumsq, 68 * REST_UNIT_STRIDE + 8, );
    ALIGN_STK_16(int16_t, sum, 68 * REST_UNIT_STRIDE + 16, );

    boxsum5_lsx(sumsq, sum, tmp, w, h);
    BF(dav1d_boxsum5_sgf_h, lsx)(sumsq, sum, w, h, params->sgr.s0);
    BF(dav1d_boxsum5_sgf_v, lsx)(dst, tmp, sumsq, sum, w, h);
    BF(dav1d_sgr_3x3_finish, lsx)(p, p_stride, dst, params->sgr.w0, w, h);
}

void dav1d_sgr_filter_mix_lsx(pixel *p, const ptrdiff_t p_stride,
                              const pixel (*const left)[4],
                              const pixel *lpf,
                              const int w, const int h,
                              const LooprestorationParams *const params,
                              const enum LrEdgeFlags edges HIGHBD_DECL_SUFFIX)
{
    ALIGN_STK_16(uint8_t, tmp, 70 /*(64 + 3 + 3)*/ * REST_UNIT_STRIDE,);
    padding(tmp, p, p_stride, left, lpf, w, h, edges);
    coef dst0[64 * 384];
    coef dst1[64 * 384];

    ALIGN_STK_16(int32_t, sumsq0, 68 * REST_UNIT_STRIDE + 8, );
    ALIGN_STK_16(int16_t, sum0, 68 * REST_UNIT_STRIDE + 16, );

    boxsum5_lsx(sumsq0, sum0, tmp, w, h);
    BF(dav1d_boxsum5_sgf_h, lsx)(sumsq0, sum0, w, h, params->sgr.s0);
    BF(dav1d_boxsum5_sgf_v, lsx)(dst0, tmp, sumsq0, sum0, w, h);

    boxsum3_lsx(sumsq0, sum0, tmp, w, h);
    BF(dav1d_boxsum3_sgf_h, lsx)(sumsq0, sum0, w, h, params->sgr.s1);
    BF(dav1d_boxsum3_sgf_v, lsx)(dst1, tmp, sumsq0, sum0, w, h);

    BF(dav1d_sgr_mix_finish, lsx)(p, p_stride, dst0, dst1, params->sgr.w0,
                                  params->sgr.w1, w, h);
}
#endif
(The diff for this file is not shown because of its large size.)
@ -0,0 +1,118 @@
/*
 * Copyright © 2023, VideoLAN and dav1d authors
 * Copyright © 2023, Loongson Technology Corporation Limited
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice, this
 *    list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 *    this list of conditions and the following disclaimer in the documentation
 *    and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#ifndef DAV1D_SRC_LOONGARCH_MC_H
#define DAV1D_SRC_LOONGARCH_MC_H

#include "config.h"
#include "src/mc.h"
#include "src/cpu.h"

#define init_mc_fn(type, name, suffix) \
    c->mc[type] = BF(dav1d_put_##name, suffix)
#define init_mct_fn(type, name, suffix) \
    c->mct[type] = BF(dav1d_prep_##name, suffix)

decl_avg_fn(BF(dav1d_avg, lsx));
decl_w_avg_fn(BF(dav1d_w_avg, lsx));
decl_mask_fn(BF(dav1d_mask, lsx));
decl_warp8x8_fn(BF(dav1d_warp_affine_8x8, lsx));
decl_warp8x8t_fn(BF(dav1d_warp_affine_8x8t, lsx));
decl_w_mask_fn(BF(dav1d_w_mask_420, lsx));

decl_mc_fn(BF(dav1d_put_8tap_regular, lsx));
decl_mc_fn(BF(dav1d_put_8tap_regular_smooth, lsx));
decl_mc_fn(BF(dav1d_put_8tap_regular_sharp, lsx));
decl_mc_fn(BF(dav1d_put_8tap_smooth, lsx));
decl_mc_fn(BF(dav1d_put_8tap_smooth_regular, lsx));
decl_mc_fn(BF(dav1d_put_8tap_smooth_sharp, lsx));
decl_mc_fn(BF(dav1d_put_8tap_sharp, lsx));
decl_mc_fn(BF(dav1d_put_8tap_sharp_regular, lsx));
decl_mc_fn(BF(dav1d_put_8tap_sharp_smooth, lsx));

decl_avg_fn(BF(dav1d_avg, lasx));
decl_w_avg_fn(BF(dav1d_w_avg, lasx));
decl_mask_fn(BF(dav1d_mask, lasx));
decl_warp8x8_fn(BF(dav1d_warp_affine_8x8, lasx));
decl_warp8x8t_fn(BF(dav1d_warp_affine_8x8t, lasx));
decl_w_mask_fn(BF(dav1d_w_mask_420, lasx));

decl_mct_fn(BF(dav1d_prep_8tap_regular, lasx));
decl_mct_fn(BF(dav1d_prep_8tap_regular_smooth, lasx));
decl_mct_fn(BF(dav1d_prep_8tap_regular_sharp, lasx));
decl_mct_fn(BF(dav1d_prep_8tap_smooth, lasx));
decl_mct_fn(BF(dav1d_prep_8tap_smooth_regular, lasx));
decl_mct_fn(BF(dav1d_prep_8tap_smooth_sharp, lasx));
decl_mct_fn(BF(dav1d_prep_8tap_sharp, lasx));
decl_mct_fn(BF(dav1d_prep_8tap_sharp_regular, lasx));
decl_mct_fn(BF(dav1d_prep_8tap_sharp_smooth, lasx));

static ALWAYS_INLINE void mc_dsp_init_loongarch(Dav1dMCDSPContext *const c) {
#if BITDEPTH == 8
    const unsigned flags = dav1d_get_cpu_flags();

    if (!(flags & DAV1D_LOONGARCH_CPU_FLAG_LSX)) return;

    c->avg = BF(dav1d_avg, lsx);
    c->w_avg = BF(dav1d_w_avg, lsx);
    c->mask = BF(dav1d_mask, lsx);
    c->warp8x8 = BF(dav1d_warp_affine_8x8, lsx);
    c->warp8x8t = BF(dav1d_warp_affine_8x8t, lsx);
    c->w_mask[2] = BF(dav1d_w_mask_420, lsx);

    init_mc_fn(FILTER_2D_8TAP_REGULAR, 8tap_regular, lsx);
    init_mc_fn(FILTER_2D_8TAP_REGULAR_SMOOTH, 8tap_regular_smooth, lsx);
    init_mc_fn(FILTER_2D_8TAP_REGULAR_SHARP, 8tap_regular_sharp, lsx);
    init_mc_fn(FILTER_2D_8TAP_SMOOTH_REGULAR, 8tap_smooth_regular, lsx);
    init_mc_fn(FILTER_2D_8TAP_SMOOTH, 8tap_smooth, lsx);
    init_mc_fn(FILTER_2D_8TAP_SMOOTH_SHARP, 8tap_smooth_sharp, lsx);
    init_mc_fn(FILTER_2D_8TAP_SHARP_REGULAR, 8tap_sharp_regular, lsx);
    init_mc_fn(FILTER_2D_8TAP_SHARP_SMOOTH, 8tap_sharp_smooth, lsx);
    init_mc_fn(FILTER_2D_8TAP_SHARP, 8tap_sharp, lsx);

    if (!(flags & DAV1D_LOONGARCH_CPU_FLAG_LASX)) return;

    c->avg = BF(dav1d_avg, lasx);
    c->w_avg = BF(dav1d_w_avg, lasx);
    c->mask = BF(dav1d_mask, lasx);
    c->warp8x8 = BF(dav1d_warp_affine_8x8, lasx);
    c->warp8x8t = BF(dav1d_warp_affine_8x8t, lasx);
    c->w_mask[2] = BF(dav1d_w_mask_420, lasx);

    init_mct_fn(FILTER_2D_8TAP_REGULAR, 8tap_regular, lasx);
    init_mct_fn(FILTER_2D_8TAP_REGULAR_SMOOTH, 8tap_regular_smooth, lasx);
    init_mct_fn(FILTER_2D_8TAP_REGULAR_SHARP, 8tap_regular_sharp, lasx);
    init_mct_fn(FILTER_2D_8TAP_SMOOTH_REGULAR, 8tap_smooth_regular, lasx);
    init_mct_fn(FILTER_2D_8TAP_SMOOTH, 8tap_smooth, lasx);
    init_mct_fn(FILTER_2D_8TAP_SMOOTH_SHARP, 8tap_smooth_sharp, lasx);
    init_mct_fn(FILTER_2D_8TAP_SHARP_REGULAR, 8tap_sharp_regular, lasx);
    init_mct_fn(FILTER_2D_8TAP_SHARP_SMOOTH, 8tap_sharp_smooth, lasx);
    init_mct_fn(FILTER_2D_8TAP_SHARP, 8tap_sharp, lasx);
#endif
}

#endif /* DAV1D_SRC_LOONGARCH_MC_H */
@ -0,0 +1,368 @@
/*
 * Copyright © 2023, VideoLAN and dav1d authors
 * Copyright © 2023, Loongson Technology Corporation Limited
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice, this
 *    list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 *    this list of conditions and the following disclaimer in the documentation
 *    and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include "loongson_asm.S"

const min_prob
    .short 60, 56, 52, 48, 44, 40, 36, 32, 28, 24, 20, 16, 12, 8, 4, 0
endconst

.macro decode_symbol_adapt w
    addi.d          sp, sp, -48
    addi.d          a4, a0, 24
    vldrepl.h       vr0, a4, 0          // rng
    fst.s           f0, sp, 0           // val == 0
    vld             vr1, a1, 0          // cdf
.if \w == 16
    li.w            t4, 16
    vldx            vr11, a1, t4
.endif
    addi.d          a6, a0, 16
    vldrepl.d       vr2, a6, 0          // dif
    addi.d          t0, a0, 32
    ld.w            t1, t0, 0           // allow_update_cdf
    la.local        t2, min_prob
    addi.d          t2, t2, 32
    addi.w          t3, a2, 1
    slli.w          t3, t3, 1
    sub.d           t2, t2, t3
    vld             vr3, t2, 0          // min_prob
.if \w == 16
    vldx            vr13, t2, t4
.endif
    vsrli.h         vr4, vr0, 8         // r = s->rng >> 8
    vslli.h         vr4, vr4, 8         // r << 8
    vsrli.h         vr5, vr1, 6
    vslli.h         vr5, vr5, 7
.if \w == 16
    vsrli.h         vr15, vr11, 6
    vslli.h         vr15, vr15, 7
.endif
    vmuh.hu         vr5, vr4, vr5
    vadd.h          vr5, vr5, vr3       // v
.if \w == 16
    vmuh.hu         vr15, vr4, vr15
    vadd.h          vr15, vr15, vr13
.endif
    addi.d          t8, sp, 4
    vst             vr5, t8, 0          // store v
.if \w == 16
    vstx            vr15, t8, t4
.endif
    vreplvei.h      vr20, vr2, 3        // c
    vssub.hu        vr6, vr5, vr20      // c >= v
    vseqi.h         vr6, vr6, 0
.if \w == 16
    vssub.hu        vr16, vr15, vr20    // c >= v
    vseqi.h         vr16, vr16, 0
    vpickev.b       vr21, vr16, vr6
.endif
.if \w <= 8
    vmskltz.h       vr10, vr6
.else
    vmskltz.b       vr10, vr21
.endif
    beqz            t1, .renorm\()\w

    // update_cdf
    alsl.d          t1, a2, a1, 1
    ld.h            t2, t1, 0           // count
    srli.w          t3, t2, 4           // count >> 4
    addi.w          t3, t3, 4
    li.w            t5, 2
    sltu            t5, t5, a2
    add.w           t3, t3, t5          // rate
    sltui           t5, t2, 32
    add.w           t2, t2, t5          // count + (count < 32)
    vreplgr2vr.h    vr9, t3
    vseq.h          vr7, vr7, vr7
    vavgr.hu        vr5, vr6, vr7       // i >= val ? -1 : 32768
    vsub.h          vr5, vr5, vr1
    vsub.h          vr8, vr1, vr6
.if \w == 16
    vavgr.hu        vr15, vr16, vr7
    vsub.h          vr15, vr15, vr11
    vsub.h          vr18, vr11, vr16
.endif
    vsra.h          vr5, vr5, vr9
    vadd.h          vr8, vr8, vr5
.if \w == 4
    fst.d           f8, a1, 0
.else
    vst             vr8, a1, 0
.endif
.if \w == 16
    vsra.h          vr15, vr15, vr9
    vadd.h          vr18, vr18, vr15
    vstx            vr18, a1, t4
.endif
    st.h            t2, t1, 0

.renorm\()\w:
    vpickve2gr.h    t3, vr10, 0
    ctz.w           a7, t3              // ret
    alsl.d          t3, a7, t8, 1
    ld.hu           t4, t3, 0           // v
    addi.d          t3, t3, -2
    ld.hu           t5, t3, 0           // u
    sub.w           t5, t5, t4          // rng
    slli.d          t4, t4, 48
    vpickve2gr.d    t6, vr2, 0
    sub.d           t6, t6, t4          // dif
    addi.d          t6, t6, 1
    clz.w           t4, t5              // d
    xori            t4, t4, 16          // d
    sll.d           t6, t6, t4
    addi.d          t6, t6, -1          // dif
    addi.d          a5, a0, 28          // cnt
    ld.w            t7, a5, 0
    sub.w           t7, t7, t4          // cnt - d
    sll.w           t5, t5, t4
    st.w            t5, a4, 0           // store rng
    bge             t7, zero, 9f

    // refill
    ld.d            t0, a0, 0           // buf_pos
    addi.d          t1, a0, 8
    ld.d            t1, t1, 0           // buf_end
    addi.d          t2, t0, 8
    blt             t1, t2, 1f

    ld.d            t0, t0, 0           // next_bits
    addi.w          t3, t7, 23          // shift_bits = cnt + 23
    addi.w          t7, t7, 16          // cnt += 16
    revb.d          t0, t0              // next_bits = bswap(next_bits)
    srli.w          t4, t3, 3
    sub.d           t2, t2, t4          // buf_pos -= shift_bits >> 3
    st.d            t2, a0, 0
    andi            t3, t3, 24          // shift_bits &= 24
    srl.d           t0, t0, t3          // next_bits >>= shift_bits
    sub.w           t3, t3, t7          // shift_bits -= 16 + cnt
    sll.d           t0, t0, t3          // next_bits <<= shift_bits
    li.w            t5, 48
    sub.w           t7, t5, t3          // cnt = cnt + 64 - shift_bits
    xor             t6, t6, t0          // dif ^= next_bits
    b               9f
1:
    li.w            t4, 40
    sub.w           t5, t4, t7          // c = 40 - cnt
2:
    bge             t0, t1, 3f
    ld.bu           t2, t0, 0
    addi.d          t0, t0, 1
    sll.d           t2, t2, t5
    xor             t6, t6, t2
    addi.w          t5, t5, -8
    bge             t5, zero, 2b
    // refill_eob_end
3:
    st.d            t0, a0, 0           // s->buf_pos = buf_pos
    sub.w           t7, t4, t5          // cnt = 40 - c
9:
    st.w            t7, a5, 0           // store cnt
    st.d            t6, a6, 0           // store dif
    move            a0, a7
    addi.d          sp, sp, 48
.endm
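For reference, here is a hedged C rendering of the byte-at-a-time refill fallback above (the 1:/2:/3: labels); it follows the asm comments line by line, but the struct and function names are illustrative stand-ins, not dav1d's MsacContext API:

/* refill_bytewise_sketch.c */
#include <stdint.h>

typedef struct {
    const uint8_t *buf_pos, *buf_end;  /* the fields the asm loads at a0+0/a0+8 */
    uint64_t dif;
    int cnt;
} RangeDecoderSketch;

static void refill_bytewise(RangeDecoderSketch *s) {
    int c = 40 - s->cnt;                            /* c = 40 - cnt */
    while (s->buf_pos < s->buf_end) {               /* bge t0, t1, 3f */
        s->dif ^= (uint64_t)*s->buf_pos++ << c;     /* merge next byte into dif */
        c -= 8;
        if (c < 0) break;                           /* bge t5, zero, 2b */
    }
    s->cnt = 40 - c;                                /* cnt = 40 - c */
}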

function msac_decode_symbol_adapt4_lsx
    decode_symbol_adapt 4
endfunc

function msac_decode_symbol_adapt8_lsx
    decode_symbol_adapt 8
endfunc

function msac_decode_symbol_adapt16_lsx
    decode_symbol_adapt 16
endfunc

function msac_decode_bool_lsx
    ld.w            t0, a0, 24          // rng
    srli.w          a1, a1, 6
    ld.d            t1, a0, 16          // dif
    srli.w          t2, t0, 8           // r >> 8
    mul.w           t2, t2, a1
    ld.w            a5, a0, 28          // cnt
    addi.d          t1, t1, 1           // dif + 1
    srli.w          t2, t2, 1
    addi.w          t2, t2, 4           // v
    slli.d          t3, t2, 48          // vw
    sltu            t4, t1, t3
    move            t8, t4              // ret
    xori            t4, t4, 1
    maskeqz         t6, t3, t4          // if (ret) vw
    sub.d           t6, t1, t6          // dif
    slli.w          t5, t2, 1
    sub.w           t5, t0, t5          // r - 2v
    maskeqz         t7, t5, t4          // if (ret) r - 2v
    add.w           t5, t2, t7          // v(rng)

    // renorm
    clz.w           t4, t5              // d
    xori            t4, t4, 16          // d
    sll.d           t6, t6, t4
    addi.d          t6, t6, -1          // dif
    sub.w           t7, a5, t4          // cnt - d
    sll.w           t5, t5, t4
    st.w            t5, a0, 24          // store rng
    bge             t7, zero, 9f

    // refill
    ld.d            t0, a0, 0           // buf_pos
    addi.d          t1, a0, 8
    ld.d            t1, t1, 0           // buf_end
    addi.d          t2, t0, 8
    blt             t1, t2, 1f

    ld.d            t0, t0, 0           // next_bits
    addi.w          t3, t7, 23          // shift_bits = cnt + 23
    addi.w          t7, t7, 16          // cnt += 16
    revb.d          t0, t0              // next_bits = bswap(next_bits)
    srli.w          t4, t3, 3
    sub.d           t2, t2, t4          // buf_pos -= shift_bits >> 3
    st.d            t2, a0, 0
    andi            t3, t3, 24          // shift_bits &= 24
    srl.d           t0, t0, t3          // next_bits >>= shift_bits
    sub.w           t3, t3, t7          // shift_bits -= 16 + cnt
    sll.d           t0, t0, t3          // next_bits <<= shift_bits
    li.w            t5, 48
    sub.w           t7, t5, t3          // cnt = cnt + 64 - shift_bits
    xor             t6, t6, t0          // dif ^= next_bits
    b               9f
1:
    li.w            t4, 40
    sub.w           t5, t4, t7          // c = 40 - cnt
2:
    bge             t0, t1, 3f
    ld.bu           t2, t0, 0
    addi.d          t0, t0, 1
    sll.d           t2, t2, t5
    xor             t6, t6, t2
    addi.w          t5, t5, -8
    bge             t5, zero, 2b
    // refill_eob_end
3:
    st.d            t0, a0, 0           // s->buf_pos = buf_pos
    sub.w           t7, t4, t5          // cnt = 40 - c
9:
    st.w            t7, a0, 28          // store cnt
    st.d            t6, a0, 16          // store dif
    move            a0, t8
endfunc

function msac_decode_bool_adapt_lsx
    ld.hu           a3, a1, 0           // cdf[0]
    ld.w            t0, a0, 24          // rng
    ld.d            t1, a0, 16          // dif
    srli.w          t2, t0, 8           // r >> 8
    srli.w          a7, a3, 6
    mul.w           t2, t2, a7
    ld.w            a4, a0, 32          // allow_update_cdf
    ld.w            a5, a0, 28          // cnt
    srli.w          t2, t2, 1
    addi.w          t2, t2, 4           // v
    slli.d          t3, t2, 48          // vw
    sltu            t4, t1, t3
    move            t8, t4              // bit
    xori            t4, t4, 1
    maskeqz         t6, t3, t4          // if (ret) vw
    sub.d           t6, t1, t6          // dif
    slli.w          t5, t2, 1
    sub.w           t5, t0, t5          // r - 2v
    maskeqz         t7, t5, t4          // if (ret) r - 2v
    add.w           t5, t2, t7          // v(rng)
    beqz            a4, .renorm

    // update_cdf
    ld.hu           t0, a1, 2           // cdf[1]
    srli.w          t1, t0, 4
    addi.w          t1, t1, 4           // rate
    sltui           t2, t0, 32          // count < 32
    add.w           t0, t0, t2          // count + (count < 32)
    sub.w           a3, a3, t8          // cdf[0] -= bit
    slli.w          t4, t8, 15
    sub.w           t7, a3, t4          // cdf[0] - bit - 32768
    sra.w           t7, t7, t1          // (cdf[0] - bit - 32768) >> rate
    sub.w           t7, a3, t7          // cdf[0]
    st.h            t7, a1, 0
    st.h            t0, a1, 2

.renorm:
    // renorm
    addi.d          t6, t6, 1
    clz.w           t4, t5              // d
    xori            t4, t4, 16          // d
    sll.d           t6, t6, t4
    addi.d          t6, t6, -1          // dif
    sub.w           t7, a5, t4          // cnt - d
    sll.w           t5, t5, t4
    st.w            t5, a0, 24          // store rng
    bge             t7, zero, 9f

    // refill
    ld.d            t0, a0, 0           // buf_pos
    addi.d          t1, a0, 8
    ld.d            t1, t1, 0           // buf_end
    addi.d          t2, t0, 8
    blt             t1, t2, 1f

    ld.d            t0, t0, 0           // next_bits
    addi.w          t3, t7, 23          // shift_bits = cnt + 23
    addi.w          t7, t7, 16          // cnt += 16
    revb.d          t0, t0              // next_bits = bswap(next_bits)
    srli.w          t4, t3, 3
    sub.d           t2, t2, t4          // buf_pos -= shift_bits >> 3
    st.d            t2, a0, 0
    andi            t3, t3, 24          // shift_bits &= 24
    srl.d           t0, t0, t3          // next_bits >>= shift_bits
    sub.w           t3, t3, t7          // shift_bits -= 16 + cnt
    sll.d           t0, t0, t3          // next_bits <<= shift_bits
    li.w            t5, 48
    sub.w           t7, t5, t3          // cnt = cnt + 64 - shift_bits
    xor             t6, t6, t0          // dif ^= next_bits
    b               9f
1:
    li.w            t4, 40
    sub.w           t5, t4, t7          // c = 40 - cnt
2:
    bge             t0, t1, 3f
    ld.bu           t2, t0, 0
    addi.d          t0, t0, 1
    sll.d           t2, t2, t5
    xor             t6, t6, t2
    addi.w          t5, t5, -8
    bge             t5, zero, 2b
    // refill_eob_end
3:
    st.d            t0, a0, 0           // s->buf_pos = buf_pos
    sub.w           t7, t4, t5          // cnt = 40 - c
9:
    st.w            t7, a0, 28          // store cnt
    st.d            t6, a0, 16          // store dif
    move            a0, t8
endfunc
@ -0,0 +1,46 @@
/*
 * Copyright © 2023, VideoLAN and dav1d authors
 * Copyright © 2023, Loongson Technology Corporation Limited
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice, this
 *    list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 *    this list of conditions and the following disclaimer in the documentation
 *    and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#ifndef DAV1D_SRC_LOONGARCH_MSAC_H
#define DAV1D_SRC_LOONGARCH_MSAC_H

unsigned dav1d_msac_decode_symbol_adapt4_lsx(MsacContext *s, uint16_t *cdf,
                                             size_t n_symbols);
unsigned dav1d_msac_decode_symbol_adapt8_lsx(MsacContext *s, uint16_t *cdf,
                                             size_t n_symbols);
unsigned dav1d_msac_decode_symbol_adapt16_lsx(MsacContext *s, uint16_t *cdf,
                                              size_t n_symbols);
unsigned dav1d_msac_decode_bool_adapt_lsx(MsacContext *s, uint16_t *cdf);
unsigned dav1d_msac_decode_bool_lsx(MsacContext *s, unsigned f);

#define dav1d_msac_decode_symbol_adapt4 dav1d_msac_decode_symbol_adapt4_lsx
#define dav1d_msac_decode_symbol_adapt8 dav1d_msac_decode_symbol_adapt8_lsx
#define dav1d_msac_decode_symbol_adapt16 dav1d_msac_decode_symbol_adapt16_lsx
#define dav1d_msac_decode_bool_adapt dav1d_msac_decode_bool_adapt_lsx
#define dav1d_msac_decode_bool dav1d_msac_decode_bool_lsx

#endif /* DAV1D_SRC_LOONGARCH_MSAC_H */
@ -0,0 +1,152 @@
/*
 * Copyright © 2023, VideoLAN and dav1d authors
 * Copyright © 2023, Loongson Technology Corporation Limited
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice, this
 *    list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 *    this list of conditions and the following disclaimer in the documentation
 *    and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include "src/loongarch/loongson_asm.S"

/*
static void splat_mv_c(refmvs_block **rr, const refmvs_block *const rmv,
                       const int bx4, const int bw4, int bh4)
*/

function splat_mv_lsx
    vld             vr0, a1, 0          // 0 1 ... 11 ...
    clz.w           t4, a3
    vaddi.bu        vr1, vr0, 0
    addi.w          t4, t4, -26
    vextrins.w      vr1, vr0, 0x30      // 0 1 2 ... 11 0 1 2 3
    la.local        t5, .SPLAT_LSX_JRTABLE
    vbsrl.v         vr2, vr1, 4         // 4 5 6 7 ... 11 0 1 2 3 0 0 0 0
    alsl.d          t6, t4, t5, 1
    vextrins.w      vr2, vr0, 0x31      // 4 5 6 7 ... 11 0 1 2 3 4 5 6 7
    ld.h            t7, t6, 0
    vbsrl.v         vr3, vr2, 4         // 8 9 10 11 0 1 2 3 4 5 6 7 0 0 0 0
    add.d           t8, t5, t7
    alsl.d          a2, a2, a2, 1
    vextrins.w      vr3, vr0, 0x32      // 8 9 10 11 0 1 2 3 4 5 6 7 8 9 10 11
    slli.w          a2, a2, 2
    jirl            $r0, t8, 0

.SPLAT_LSX_JRTABLE:
    .hword .SPLAT_W32_LSX - .SPLAT_LSX_JRTABLE
    .hword .SPLAT_W16_LSX - .SPLAT_LSX_JRTABLE
    .hword .SPLAT_W8_LSX - .SPLAT_LSX_JRTABLE
    .hword .SPLAT_W4_LSX - .SPLAT_LSX_JRTABLE
    .hword .SPLAT_W2_LSX - .SPLAT_LSX_JRTABLE
    .hword .SPLAT_W1_LSX - .SPLAT_LSX_JRTABLE

.SPLAT_W1_LSX:
    ld.d            t3, a0, 0
    addi.d          a0, a0, 8
    addi.d          a4, a4, -1
    add.d           t3, t3, a2

    fst.d           f1, t3, 0
    fst.s           f3, t3, 8
    blt             zero, a4, .SPLAT_W1_LSX
    b               .splat_end
.SPLAT_W2_LSX:
    ld.d            t3, a0, 0
    addi.d          a0, a0, 8
    addi.d          a4, a4, -1
    add.d           t3, t3, a2

    vst             vr1, t3, 0
    fst.d           f2, t3, 16
    blt             zero, a4, .SPLAT_W2_LSX
    b               .splat_end

.SPLAT_W4_LSX:
    ld.d            t3, a0, 0
    addi.d          a0, a0, 8
    addi.d          a4, a4, -1
    add.d           t3, t3, a2

    vst             vr1, t3, 0
    vst             vr2, t3, 16
    vst             vr3, t3, 32
    blt             zero, a4, .SPLAT_W4_LSX
    b               .splat_end

.SPLAT_W8_LSX:
    ld.d            t3, a0, 0
    addi.d          a0, a0, 8
    addi.d          a4, a4, -1
    add.d           t3, t3, a2

    vst             vr1, t3, 0
    vst             vr2, t3, 16
    vst             vr3, t3, 32

    vst             vr1, t3, 48
    vst             vr2, t3, 64
    vst             vr3, t3, 80
    blt             zero, a4, .SPLAT_W8_LSX
    b               .splat_end

.SPLAT_W16_LSX:
    ld.d            t3, a0, 0
    addi.d          a0, a0, 8
    addi.d          a4, a4, -1
    add.d           t3, t3, a2

.rept 2
    vst             vr1, t3, 0
    vst             vr2, t3, 16
    vst             vr3, t3, 32

    vst             vr1, t3, 48
    vst             vr2, t3, 64
    vst             vr3, t3, 80

    addi.d          t3, t3, 96
.endr

    blt             zero, a4, .SPLAT_W16_LSX
    b               .splat_end

.SPLAT_W32_LSX:
    ld.d            t3, a0, 0
    addi.d          a0, a0, 8
    addi.d          a4, a4, -1
    add.d           t3, t3, a2

.rept 4
    vst             vr1, t3, 0
    vst             vr2, t3, 16
    vst             vr3, t3, 32

    vst             vr1, t3, 48
    vst             vr2, t3, 64
    vst             vr3, t3, 80

    addi.d          t3, t3, 96
.endr

    blt             zero, a4, .SPLAT_W32_LSX

.splat_end:
endfunc
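A hedged C sketch of what the LSX routine above appears to compute, based on the splat_mv_c prototype quoted in the file and on the addressing in the asm (the a2*12 offset and the 12-byte store pattern suggest a 12-byte refmvs_block, which matches dav1d); the type and function names here are illustrative:

/* splat_mv_sketch.c */
typedef struct { unsigned char bytes[12]; } refmvs_block_sketch;

static void splat_mv_sketch(refmvs_block_sketch **rr,
                            const refmvs_block_sketch *rmv,
                            int bx4, int bw4, int bh4) {
    do {
        refmvs_block_sketch *row = *rr++ + bx4;  /* t3 = rr[i] + bx4 * 12 */
        for (int x = 0; x < bw4; x++)            /* replicate the 12-byte block */
            row[x] = *rmv;
    } while (--bh4);
}

The asm avoids the inner loop by pre-rotating the 12-byte pattern into three 16-byte vectors (vr1/vr2/vr3 cover 48 bytes, i.e. four blocks) and dispatching on bw4 through the .hword jump table.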

@ -0,0 +1,44 @@
/*
 * Copyright © 2023, VideoLAN and dav1d authors
 * Copyright © 2023, Loongson Technology Corporation Limited
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice, this
 *    list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 *    this list of conditions and the following disclaimer in the documentation
 *    and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#ifndef DAV1D_SRC_LOONGARCH_REFMVS_H
#define DAV1D_SRC_LOONGARCH_REFMVS_H

#include "src/cpu.h"
#include "src/refmvs.h"

decl_splat_mv_fn(dav1d_splat_mv_lsx);

static ALWAYS_INLINE void refmvs_dsp_init_loongarch(Dav1dRefmvsDSPContext *const c) {
    const unsigned flags = dav1d_get_cpu_flags();

    if (!(flags & DAV1D_LOONGARCH_CPU_FLAG_LSX)) return;

    c->splat_mv = dav1d_splat_mv_lsx;
}

#endif /* DAV1D_SRC_LOONGARCH_REFMVS_H */
@ -247,6 +247,8 @@ static void loop_filter_v_sb128uv_c(pixel *dst, const ptrdiff_t stride,
#if HAVE_ASM
#if ARCH_AARCH64 || ARCH_ARM
#include "src/arm/loopfilter.h"
#elif ARCH_LOONGARCH64
#include "src/loongarch/loopfilter.h"
#elif ARCH_X86
#include "src/x86/loopfilter.h"
#endif

@ -261,6 +263,8 @@ COLD void bitfn(dav1d_loop_filter_dsp_init)(Dav1dLoopFilterDSPContext *const c)
#if HAVE_ASM
#if ARCH_AARCH64 || ARCH_ARM
    loop_filter_dsp_init_arm(c);
#elif ARCH_LOONGARCH64
    loop_filter_dsp_init_loongarch(c);
#elif ARCH_X86
    loop_filter_dsp_init_x86(c);
#endif

@ -527,6 +527,8 @@ static void sgr_mix_c(pixel *p, const ptrdiff_t stride,
#if HAVE_ASM
#if ARCH_AARCH64 || ARCH_ARM
#include "src/arm/looprestoration.h"
#elif ARCH_LOONGARCH64
#include "src/loongarch/looprestoration.h"
#elif ARCH_PPC64LE
#include "src/ppc/looprestoration.h"
#elif ARCH_X86

@ -545,6 +547,8 @@ COLD void bitfn(dav1d_loop_restoration_dsp_init)(Dav1dLoopRestorationDSPContext
#if HAVE_ASM
#if ARCH_AARCH64 || ARCH_ARM
    loop_restoration_dsp_init_arm(c, bpc);
#elif ARCH_LOONGARCH64
    loop_restoration_dsp_init_loongarch(c, bpc);
#elif ARCH_PPC64LE
    loop_restoration_dsp_init_ppc(c, bpc);
#elif ARCH_X86

@ -905,6 +905,8 @@ static void resize_c(pixel *dst, const ptrdiff_t dst_stride,
#if HAVE_ASM
#if ARCH_AARCH64 || ARCH_ARM
#include "src/arm/mc.h"
#elif ARCH_LOONGARCH64
#include "src/loongarch/mc.h"
#elif ARCH_X86
#include "src/x86/mc.h"
#endif

@ -946,6 +948,8 @@ COLD void bitfn(dav1d_mc_dsp_init)(Dav1dMCDSPContext *const c) {
#if HAVE_ASM
#if ARCH_AARCH64 || ARCH_ARM
    mc_dsp_init_arm(c);
#elif ARCH_LOONGARCH64
    mc_dsp_init_loongarch(c);
#elif ARCH_X86
    mc_dsp_init_x86(c);
#endif
@ -226,6 +226,24 @@ if is_asm_enabled

        # Compile the ASM sources with NASM
        libdav1d_asm_objs = nasm_gen.process(libdav1d_sources_asm)
    elif host_machine.cpu_family().startswith('loongarch')
        libdav1d_sources += files(
            'loongarch/cpu.c',
        )

        libdav1d_arch_tmpl_sources += files(
            'loongarch/looprestoration_tmpl.c',
        )

        libdav1d_sources_asm = files(
            'loongarch/mc.S',
            'loongarch/loopfilter.S',
            'loongarch/looprestoration.S',
            'loongarch/msac.S',
            'loongarch/refmvs.S',
            'loongarch/itx.S',
        )
        libdav1d_asm_objs += libdav1d_sources_asm
    elif host_machine.cpu() == 'ppc64le'
        arch_flags = ['-maltivec', '-mvsx']
        libdav1d_sources += files(

@ -235,6 +253,15 @@ if is_asm_enabled
            'ppc/cdef_tmpl.c',
            'ppc/looprestoration_tmpl.c',
        )
    elif host_machine.cpu_family().startswith('riscv')
        libdav1d_sources += files(
            'riscv/cpu.c',
        )
        if host_machine.cpu_family() == 'riscv64'
            libdav1d_sources += files(
                'riscv/64/itx.S',
            )
        endif
    endif
endif

@ -51,6 +51,8 @@ typedef struct MsacContext {
#if HAVE_ASM
#if ARCH_AARCH64 || ARCH_ARM
#include "src/arm/msac.h"
#elif ARCH_LOONGARCH64
#include "src/loongarch/msac.h"
#elif ARCH_X86
#include "src/x86/msac.h"
#endif
(The diff for this file is not shown because of its large size.)
@ -919,6 +919,8 @@ static void splat_mv_c(refmvs_block **rr, const refmvs_block *const rmv,
#if HAVE_ASM
#if ARCH_AARCH64 || ARCH_ARM
#include "src/arm/refmvs.h"
#elif ARCH_LOONGARCH64
#include "src/loongarch/refmvs.h"
#elif ARCH_X86
#include "src/x86/refmvs.h"
#endif

@ -933,6 +935,8 @@ COLD void dav1d_refmvs_dsp_init(Dav1dRefmvsDSPContext *const c)
#if HAVE_ASM
#if ARCH_AARCH64 || ARCH_ARM
    refmvs_dsp_init_arm(c);
#elif ARCH_LOONGARCH64
    refmvs_dsp_init_loongarch(c);
#elif ARCH_X86
    refmvs_dsp_init_x86(c);
#endif

@ -171,6 +171,7 @@ void dav1d_refmvs_find(const refmvs_tile *rt,

void dav1d_refmvs_dsp_init(Dav1dRefmvsDSPContext *dsp);
void dav1d_refmvs_dsp_init_arm(Dav1dRefmvsDSPContext *dsp);
void dav1d_refmvs_dsp_init_loongarch(Dav1dRefmvsDSPContext *dsp);
void dav1d_refmvs_dsp_init_x86(Dav1dRefmvsDSPContext *dsp);

#endif /* DAV1D_SRC_REF_MVS_H */
@ -0,0 +1,662 @@
/******************************************************************************
 * Copyright © 2018, VideoLAN and dav1d authors
 * Copyright © 2023, Nathan Egge
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice, this
 *    list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 *    this list of conditions and the following disclaimer in the documentation
 *    and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *****************************************************************************/

#include "src/riscv/asm.S"

function inv_txfm_add_4x4_rvv, export=1, ext=v
    csrw            vxrm, zero

    vsetivli        zero, 4, e16, mf2, ta, ma
    vle16.v         v0, (a2)
    addi            t0, a2, 8
    vle16.v         v1, (t0)
    addi            t0, t0, 8
    vle16.v         v2, (t0)
    addi            t0, t0, 8
    vle16.v         v3, (t0)

    jalr            t0, a4

    vmv.v.x         v4, zero

    vsseg4e16.v     v0, (a2)
    vle16.v         v0, (a2)
    vse16.v         v4, (a2)
    addi            t0, a2, 8
    vle16.v         v1, (t0)
    vse16.v         v4, (t0)
    addi            t0, t0, 8
    vle16.v         v2, (t0)
    vse16.v         v4, (t0)
    addi            t0, t0, 8
    vle16.v         v3, (t0)
    vse16.v         v4, (t0)

    jalr            t0, a5

    vssra.vi        v0, v0, 4
    vssra.vi        v1, v1, 4
    vssra.vi        v2, v2, 4
    vssra.vi        v3, v3, 4

itx_4x4_end:
    vsetvli         zero, zero, e8, mf4, ta, ma
    vle8.v          v4, (a0)
    add             t0, a0, a1
    vle8.v          v5, (t0)
    add             t0, t0, a1
    vle8.v          v6, (t0)
    add             t0, t0, a1
    vle8.v          v7, (t0)

    vwaddu.wv       v0, v0, v4
    vwaddu.wv       v1, v1, v5
    vwaddu.wv       v2, v2, v6
    vwaddu.wv       v3, v3, v7

    vsetvli         zero, zero, e16, mf2, ta, ma
    vmax.vx         v0, v0, zero
    vmax.vx         v1, v1, zero
    vmax.vx         v2, v2, zero
    vmax.vx         v3, v3, zero

    vsetvli         zero, zero, e8, mf4, ta, ma

    vnclipu.wi      v4, v0, 0
    vnclipu.wi      v5, v1, 0
    vnclipu.wi      v6, v2, 0
    vnclipu.wi      v7, v3, 0

    vse8.v          v4, (a0)
    add             a0, a0, a1
    vse8.v          v5, (a0)
    add             a0, a0, a1
    vse8.v          v6, (a0)
    add             a0, a0, a1
    vse8.v          v7, (a0)

    ret
endfunc

function inv_identity_e16_x4_rvv, export=1, ext=v
    li              t1, (5793-4096)*8
    vsmul.vx        v4, v0, t1
    vsmul.vx        v5, v1, t1
    vsmul.vx        v6, v2, t1
    vsmul.vx        v7, v3, t1

    vsadd.vv        v0, v0, v4
    vsadd.vv        v1, v1, v5
    vsadd.vv        v2, v2, v6
    vsadd.vv        v3, v3, v7

    jr              t0
endfunc

.macro idct_4 o0, o1, o2, o3
    li              t1, 2896
    li              t2, 1567
    li              t3, 3784

    vwmul.vx        v8, \o0, t1
    vwmul.vx        v10, \o0, t1
    vwmacc.vx       v8, t1, \o2
    neg             t1, t1
    vwmacc.vx       v10, t1, \o2

    vwmul.vx        v12, \o1, t3
    neg             t3, t3
    vwmul.vx        v14, \o1, t2
    vwmacc.vx       v12, t2, \o3
    vwmacc.vx       v14, t3, \o3

    li              t1, 2048

    vwadd.wx        v8, v8, t1
    vwadd.wx        v10, v10, t1
    vwadd.wx        v12, v12, t1
    vwadd.wx        v14, v14, t1

    vnsra.wi        v8, v8, 12
    vnsra.wi        v10, v10, 12
    vnsra.wi        v12, v12, 12
    vnsra.wi        v14, v14, 12

    vsadd.vv        \o0, v8, v12
    vsadd.vv        \o1, v10, v14
    vssub.vv        \o2, v10, v14
    vssub.vv        \o3, v8, v12
.endm
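A note on the Q12 fixed-point constants in idct_4 above. The identities below are standard AV1 inverse-DCT background, not stated in this patch:

\[ 2896 \approx \frac{4096}{\sqrt{2}}, \qquad 1567 \approx 4096\cos\frac{6\pi}{16}, \qquad 3784 \approx 4096\sin\frac{6\pi}{16} \]

so each vwmul/vwmacc pair performs a fixed-point rotation, the added $2048 = 2^{11}$ is the rounding bias, and vnsra.wi with shift 12 narrows the widened product back to 16 bits after the $\gg 12$.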

.macro iadst_4 o0, o1, o2, o3
    li              t1, 1321
    li              t2, 3803
    li              t3, 2482

    vwmul.vx        v4, v0, t1
    vwmul.vx        v5, v0, t3
    neg             t1, t1
    vwmacc.vx       v4, t2, v2
    vwmacc.vx       v5, t1, v2
    neg             t2, t2
    vwmacc.vx       v4, t3, v3
    vwmacc.vx       v5, t2, v3

    vwsub.vv        v6, v0, v2
    vwadd.wv        v6, v6, v3

    li              t1, 3344
    vwmul.vx        v7, v1, t1

    vsetvli         zero, zero, e32, m1, ta, ma

    vmul.vx         v6, v6, t1

    vadd.vv         v8, v4, v5
    vadd.vv         v4, v4, v7
    vadd.vv         v5, v5, v7
    vsub.vv         v7, v8, v7

    li              t1, 2048

    vadd.vx         v4, v4, t1
    vadd.vx         v5, v5, t1
    vadd.vx         v6, v6, t1
    vadd.vx         v7, v7, t1

    vsetvli         zero, zero, e16, mf2, ta, ma

    vnsra.wi        \o0, v4, 12
    vnsra.wi        \o1, v5, 12
    vnsra.wi        \o2, v6, 12
    vnsra.wi        \o3, v7, 12
.endm

function inv_dct_e16_x4_rvv, export=1, ext=v
    idct_4          v0, v1, v2, v3
    jr              t0
endfunc

function inv_adst_e16_x4_rvv, export=1, ext=v
    iadst_4         v0, v1, v2, v3
    jr              t0
endfunc

function inv_flipadst_e16_x4_rvv, export=1, ext=v
    iadst_4         v3, v2, v1, v0
    jr              t0
endfunc

.macro def_fn_4x4 txfm1, txfm2
function inv_txfm_add_\txfm1\()_\txfm2\()_4x4_8bpc_rvv, export=1, ext=v
.ifc \txfm1\()_\txfm2, dct_dct
    beqz            a3, 1f
.endif
    la              a4, inv_\txfm1\()_e16_x4_rvv
    la              a5, inv_\txfm2\()_e16_x4_rvv
    j               inv_txfm_add_4x4_rvv
.ifc \txfm1\()_\txfm2, dct_dct
1:
    csrw            vxrm, zero
    vsetivli        zero, 4, e16, mf2, ta, ma
    ld              t2, (a2)
    li              t1, 2896*8
    vmv.v.x         v0, t2
    vsmul.vx        v0, v0, t1
    sd              x0, (a2)
    vsmul.vx        v0, v0, t1
    vssra.vi        v0, v0, 4
    vmv.v.v         v1, v0
    vmv.v.v         v2, v0
    vmv.v.v         v3, v0
    j               itx_4x4_end
.endif
endfunc
.endm

def_fn_4x4 dct, dct
def_fn_4x4 identity, identity
def_fn_4x4 dct, adst
def_fn_4x4 dct, flipadst
def_fn_4x4 dct, identity
def_fn_4x4 adst, dct
def_fn_4x4 adst, adst
def_fn_4x4 adst, flipadst
def_fn_4x4 flipadst, dct
def_fn_4x4 flipadst, adst
def_fn_4x4 flipadst, flipadst
def_fn_4x4 identity, dct
def_fn_4x4 adst, identity
def_fn_4x4 flipadst, identity
def_fn_4x4 identity, adst
def_fn_4x4 identity, flipadst

.macro def_fn_8x8_base variant
function inv_txfm_\variant\()add_8x8_rvv, export=1, ext=v
    csrw            vxrm, zero

    vsetivli        zero, 8, e16, m1, ta, ma
    vle16.v         v0, (a2)
    addi            t0, a2, 16
    vle16.v         v1, (t0)
    addi            t0, t0, 16
    vle16.v         v2, (t0)
    addi            t0, t0, 16
    vle16.v         v3, (t0)
    addi            t0, t0, 16
    vle16.v         v4, (t0)
    addi            t0, t0, 16
    vle16.v         v5, (t0)
    addi            t0, t0, 16
    vle16.v         v6, (t0)
    addi            t0, t0, 16
    vle16.v         v7, (t0)

.ifc \variant, identity_
    // The identity vsadd.vv and downshift vssra.vi 1 cancel out
.else
    jalr            t0, a4

    vssra.vi        v0, v0, 1
    vssra.vi        v1, v1, 1
    vssra.vi        v2, v2, 1
    vssra.vi        v3, v3, 1
    vssra.vi        v4, v4, 1
    vssra.vi        v5, v5, 1
    vssra.vi        v6, v6, 1
    vssra.vi        v7, v7, 1
.endif

    vsseg8e16.v     v0, (a2)
    vle16.v         v0, (a2)
    addi            t0, a2, 16
    vle16.v         v1, (t0)
    addi            t0, t0, 16
    vle16.v         v2, (t0)
    addi            t0, t0, 16
    vle16.v         v3, (t0)
    addi            t0, t0, 16
    vle16.v         v4, (t0)
    addi            t0, t0, 16
    vle16.v         v5, (t0)
    addi            t0, t0, 16
    vle16.v         v6, (t0)
    addi            t0, t0, 16
    vle16.v         v7, (t0)

    jalr            t0, a5

    vssra.vi        v0, v0, 4
    vssra.vi        v1, v1, 4
    vssra.vi        v2, v2, 4
    vssra.vi        v3, v3, 4
    vssra.vi        v4, v4, 4
    vssra.vi        v5, v5, 4
    vssra.vi        v6, v6, 4
    vssra.vi        v7, v7, 4

    li              t1, 64
    vsetvli         zero, t1, e16, m8, ta, ma
    vmv.v.x         v8, zero
    vse16.v         v8, (a2)

.ifc \variant, identity_
itx_8x8_end:
.endif
    vsetivli        zero, 8, e8, mf2, ta, ma
    vle8.v          v8, (a0)
    add             t0, a0, a1
    vle8.v          v9, (t0)
    add             t0, t0, a1
    vle8.v          v10, (t0)
    add             t0, t0, a1
    vle8.v          v11, (t0)
    add             t0, t0, a1
    vle8.v          v12, (t0)
    add             t0, t0, a1
    vle8.v          v13, (t0)
    add             t0, t0, a1
    vle8.v          v14, (t0)
    add             t0, t0, a1
    vle8.v          v15, (t0)

    vwaddu.wv       v0, v0, v8
    vwaddu.wv       v1, v1, v9
    vwaddu.wv       v2, v2, v10
    vwaddu.wv       v3, v3, v11
    vwaddu.wv       v4, v4, v12
    vwaddu.wv       v5, v5, v13
    vwaddu.wv       v6, v6, v14
    vwaddu.wv       v7, v7, v15

    vsetvli         zero, zero, e16, m1
    vmax.vx         v0, v0, zero
    vmax.vx         v1, v1, zero
    vmax.vx         v2, v2, zero
    vmax.vx         v3, v3, zero
    vmax.vx         v4, v4, zero
    vmax.vx         v5, v5, zero
    vmax.vx         v6, v6, zero
    vmax.vx         v7, v7, zero

    vsetvli         zero, zero, e8, mf2, ta, ma

    vnclipu.wi      v8, v0, 0
    vnclipu.wi      v9, v1, 0
    vnclipu.wi      v10, v2, 0
    vnclipu.wi      v11, v3, 0
    vnclipu.wi      v12, v4, 0
    vnclipu.wi      v13, v5, 0
    vnclipu.wi      v14, v6, 0
    vnclipu.wi      v15, v7, 0

    vse8.v          v8, (a0)
    add             a0, a0, a1
    vse8.v          v9, (a0)
    add             a0, a0, a1
    vse8.v          v10, (a0)
    add             a0, a0, a1
    vse8.v          v11, (a0)
    add             a0, a0, a1
    vse8.v          v12, (a0)
    add             a0, a0, a1
    vse8.v          v13, (a0)
    add             a0, a0, a1
    vse8.v          v14, (a0)
    add             a0, a0, a1
    vse8.v          v15, (a0)

    ret
endfunc
.endm

def_fn_8x8_base
def_fn_8x8_base identity_

function inv_identity_e16_x8_rvv, export=1, ext=v
    vsadd.vv        v0, v0, v0
    vsadd.vv        v1, v1, v1
    vsadd.vv        v2, v2, v2
    vsadd.vv        v3, v3, v3
    vsadd.vv        v4, v4, v4
    vsadd.vv        v5, v5, v5
    vsadd.vv        v6, v6, v6
    vsadd.vv        v7, v7, v7

    jr              t0
endfunc

function inv_dct_e16_x8_rvv, export=1, ext=v
    idct_4          v0, v2, v4, v6

    li              t1, 799
    li              t2, 4017
    li              t3, 3406
    li              t4, 2276

    vwmul.vx        v14, v1, t2
    neg             t2, t2
    vwmul.vx        v8, v1, t1
    vwmacc.vx       v14, t1, v7
    vwmacc.vx       v8, t2, v7

    vwmul.vx        v12, v5, t4
    neg             t4, t4
    vwmul.vx        v10, v5, t3
    vwmacc.vx       v12, t3, v3
    vwmacc.vx       v10, t4, v3

    li              t1, 2048

    vwadd.wx        v8, v8, t1
    vwadd.wx        v10, v10, t1
    vwadd.wx        v12, v12, t1
    vwadd.wx        v14, v14, t1

    vnsra.wi        v8, v8, 12
    vnsra.wi        v10, v10, 12
    vnsra.wi        v12, v12, 12
    vnsra.wi        v14, v14, 12

    vssub.vv        v7, v14, v12
    vsadd.vv        v14, v14, v12
    vssub.vv        v1, v8, v10
    vsadd.vv        v8, v8, v10

    li              t2, 2896

    vwmul.vx        v10, v7, t2
    vwmul.vx        v12, v7, t2
    vwmacc.vx       v12, t2, v1
    neg             t2, t2
    vwmacc.vx       v10, t2, v1

    vwadd.wx        v10, v10, t1
    vwadd.wx        v12, v12, t1

    vnsra.wi        v10, v10, 12
    vnsra.wi        v12, v12, 12

    vssub.vv        v7, v0, v14
    vsadd.vv        v0, v0, v14
    vssub.vv        v9, v2, v12
    vsadd.vv        v1, v2, v12
    vssub.vv        v5, v4, v10
    vsadd.vv        v2, v4, v10
    vssub.vv        v4, v6, v8
    vsadd.vv        v3, v6, v8
    vmv.v.v         v6, v9

    jr              t0
endfunc

.macro iadst_8 o0, o1, o2, o3, o4, o5, o6, o7
    li              t1, 4076
    li              t2, 401
    li              t3, 3612
    li              t4, 1931
    li              t5, 2598
    li              t6, 3166

    vwmul.vx        v8, v7, t1
    neg             t1, t1
    vwmul.vx        v10, v7, t2
    vwmacc.vx       v8, t2, v0
    vwmacc.vx       v10, t1, v0

    vwmul.vx        v12, v5, t3
    neg             t3, t3
    vwmul.vx        v14, v5, t4
    vwmacc.vx       v12, t4, v2
    vwmacc.vx       v14, t3, v2

    vwmul.vx        v16, v3, t5
    neg             t5, t5
    vwmul.vx        v18, v3, t6
    vwmacc.vx       v16, t6, v4
    vwmacc.vx       v18, t5, v4

    li              t1, 2048
    li              t2, 1189
    li              t3, 3920
    li              t4, 1567
    li              t5, 3784
    li              t6, 2896

    vwmul.vx        v20, v1, t2
    neg             t2, t2
    vwmul.vx        v22, v1, t3
    vwmacc.vx       v20, t3, v6
    vwmacc.vx       v22, t2, v6

    vwadd.wx        v8, v8, t1
    vwadd.wx        v10, v10, t1
    vwadd.wx        v12, v12, t1
    vwadd.wx        v14, v14, t1
    vwadd.wx        v16, v16, t1
    vwadd.wx        v18, v18, t1
    vwadd.wx        v20, v20, t1
    vwadd.wx        v22, v22, t1

    vnsra.wi        v8, v8, 12
    vnsra.wi        v10, v10, 12
    vnsra.wi        v12, v12, 12
    vnsra.wi        v14, v14, 12
    vnsra.wi        v16, v16, 12
    vnsra.wi        v18, v18, 12
    vnsra.wi        v20, v20, 12
    vnsra.wi        v22, v22, 12

    vssub.vv        v4, v8, v16
    vsadd.vv        v8, v8, v16
    vsadd.vv        v1, v10, v18
    vsadd.vv        v2, v12, v20
    vsadd.vv        v3, v14, v22
    vssub.vv        v5, v10, v18
    vssub.vv        v6, v12, v20
    vssub.vv        v22, v14, v22

    vsadd.vv        \o0, v8, v2
    vsadd.vv        \o7, v1, v3
    vssub.vv        v2, v8, v2
    vssub.vv        v3, v1, v3

    vwmul.vx        v8, v4, t5
    vwmul.vx        v10, v4, t4
    vwmul.vx        v12, v22, t5
    vwmul.vx        v14, v22, t4
    vwmacc.vx       v8, t4, v5
    neg             t4, t4
    vwmacc.vx       v14, t5, v6
    neg             t5, t5
    vwmacc.vx       v12, t4, v6
    vwmacc.vx       v10, t5, v5

    vwadd.wx        v8, v8, t1
    vwadd.wx        v10, v10, t1
    vwadd.wx        v12, v12, t1
    vwadd.wx        v14, v14, t1

    vnsra.wi        v8, v8, 12
    vnsra.wi        v10, v10, 12
    vnsra.wi        v12, v12, 12
    vnsra.wi        v14, v14, 12

    vsadd.vv        \o1, v8, v12
    vsadd.vv        \o6, v10, v14
    vssub.vv        v8, v8, v12
    vssub.vv        v9, v10, v14

    vwmul.vx        v10, v2, t6
    vwmul.vx        v12, v2, t6
    vwmul.vx        v14, v8, t6
    vwmul.vx        v16, v8, t6
    vwmacc.vx       v10, t6, v3
    vwmacc.vx       v14, t6, v9
    neg             t6, t6
    vwmacc.vx       v12, t6, v3
    vwmacc.vx       v16, t6, v9

    vwadd.wx        v10, v10, t1
    vwadd.wx        v12, v12, t1
    vwadd.wx        v14, v14, t1
    vwadd.wx        v16, v16, t1

    vnsra.wi        \o3, v10, 12
    vnsra.wi        \o4, v12, 12
    vnsra.wi        \o2, v14, 12
    vnsra.wi        \o5, v16, 12

    vmv.v.x         v8, zero
    vssub.vv        \o1, v8, \o1
    vssub.vv        \o3, v8, \o3
    vssub.vv        \o5, v8, \o5
    vssub.vv        \o7, v8, \o7
.endm

function inv_adst_e16_x8_rvv, export=1, ext=v
    iadst_8         v0, v1, v2, v3, v4, v5, v6, v7
    jr              t0
endfunc

function inv_flipadst_e16_x8_rvv, export=1, ext=v
    iadst_8         v7, v6, v5, v4, v3, v2, v1, v0
    jr              t0
endfunc

.macro def_fn_8x8 txfm1, txfm2
function inv_txfm_add_\txfm1\()_\txfm2\()_8x8_8bpc_rvv, export=1, ext=v
.ifc \txfm1\()_\txfm2, dct_dct
    beqz            a3, 1f
.endif
    la              a5, inv_\txfm2\()_e16_x8_rvv
.ifc \txfm1, identity
    j               inv_txfm_identity_add_8x8_rvv
.else
    la              a4, inv_\txfm1\()_e16_x8_rvv
    j               inv_txfm_add_8x8_rvv
.endif
.ifc \txfm1\()_\txfm2, dct_dct
1:
    csrw            vxrm, zero
    vsetivli        zero, 8, e16, m1, ta, ma
    ld              t2, (a2)
    li              t1, 2896*8
    vmv.v.x         v0, t2
    vsmul.vx        v0, v0, t1
    sd              x0, (a2)
    vssra.vi        v0, v0, 1
    vsmul.vx        v0, v0, t1
    vssra.vi        v0, v0, 4
    vmv.v.v         v1, v0
    vmv.v.v         v2, v0
    vmv.v.v         v3, v0
    vmv.v.v         v4, v0
    vmv.v.v         v5, v0
    vmv.v.v         v6, v0
    vmv.v.v         v7, v0
    j               itx_8x8_end
.endif
endfunc
.endm

def_fn_8x8 dct, dct
def_fn_8x8 identity, identity
def_fn_8x8 dct, adst
def_fn_8x8 dct, flipadst
def_fn_8x8 dct, identity
def_fn_8x8 adst, dct
def_fn_8x8 adst, adst
def_fn_8x8 adst, flipadst
def_fn_8x8 flipadst, dct
def_fn_8x8 flipadst, adst
def_fn_8x8 flipadst, flipadst
def_fn_8x8 identity, dct
def_fn_8x8 adst, identity
def_fn_8x8 flipadst, identity
def_fn_8x8 identity, adst
def_fn_8x8 identity, flipadst
@@ -0,0 +1,126 @@
/*
 * Copyright © 2018, VideoLAN and dav1d authors
 * Copyright © 2023, Nathan Egge
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice, this
 *    list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 *    this list of conditions and the following disclaimer in the documentation
 *    and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#ifndef DAV1D_SRC_RISCV_ASM_S
#define DAV1D_SRC_RISCV_ASM_S

#include "config.h"

#if !defined(PIC)
#if defined(__PIC__)
#define PIC __PIC__
#elif defined(__pic__)
#define PIC __pic__
#endif
#endif

#ifndef PRIVATE_PREFIX
#define PRIVATE_PREFIX dav1d_
#endif

#define PASTE(a,b) a ## b
#define CONCAT(a,b) PASTE(a,b)

#ifdef PREFIX
#define EXTERN CONCAT(_,PRIVATE_PREFIX)
#else
#define EXTERN PRIVATE_PREFIX
#endif

.macro function name, export=0, ext=
    .macro endfunc
#ifdef __ELF__
        .size \name, . - \name
#endif
        .option pop
        .purgem endfunc
    .endm
    .text
    .option push
    .ifnb \ext
        .option arch, +\ext
    .endif
    .if \export
        .global EXTERN\name
#ifdef __ELF__
        .type EXTERN\name, %function
        .hidden EXTERN\name
#elif defined(__MACH__)
        .private_extern EXTERN\name
#endif
EXTERN\name:
    .else
#ifdef __ELF__
        .type \name, %function
#endif
    .endif
\name:
.endm

.macro const name, export=0, align=2
    .macro endconst
#ifdef __ELF__
        .size \name, . - \name
#endif
        .purgem endconst
    .endm
#if defined(_WIN32)
    .section .rdata
#elif !defined(__MACH__)
    .section .rodata
#else
    .const_data
#endif
    .align \align
    .if \export
        .global EXTERN\name
#ifdef __ELF__
        .hidden EXTERN\name
#elif defined(__MACH__)
        .private_extern EXTERN\name
#endif
EXTERN\name:
    .endif
\name:
.endm

.macro thread_local name, align=3, quads=1
    .macro end_thread_local
        .size \name, . - \name
        .purgem end_thread_local
    .endm
    .section .tbss, "waT"
    .align \align
    .hidden \name
\name:
    .rept \quads
        .quad 0
    .endr
    end_thread_local
.endm

#endif /* DAV1D_SRC_RISCV_ASM_S */
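A note on the EXTERN definition in the header above: the PASTE/CONCAT pair is the standard two-step token-pasting idiom, needed because arguments of ## are pasted without prior macro expansion. A minimal C preprocessor illustration (foo is a placeholder name):

#define PASTE(a,b) a ## b
#define CONCAT(a,b) PASTE(a,b)
#define PRIVATE_PREFIX dav1d_

/* One-step pasting sees the unexpanded parameter name:  */
/*   PASTE(PRIVATE_PREFIX, foo)  -> PRIVATE_PREFIXfoo    */
/* The extra indirection expands PRIVATE_PREFIX first:   */
/*   CONCAT(PRIVATE_PREFIX, foo) -> dav1d_foo            */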
@@ -0,0 +1,49 @@
/*
 * Copyright © 2022, VideoLAN and dav1d authors
 * Copyright © 2022, Nathan Egge
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice, this
 *    list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 *    this list of conditions and the following disclaimer in the documentation
 *    and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include "config.h"

#include "common/attributes.h"

#include "src/riscv/cpu.h"

#if defined(HAVE_GETAUXVAL)
#include <sys/auxv.h>

#define HWCAP_RVV (1 << ('v' - 'a'))

#endif

COLD unsigned dav1d_get_cpu_flags_riscv(void) {
    unsigned flags = 0;
#if defined(HAVE_GETAUXVAL)
    unsigned long hw_cap = getauxval(AT_HWCAP);
    flags |= (hw_cap & HWCAP_RVV) ? DAV1D_RISCV_CPU_FLAG_V : 0;
#endif

    return flags;
}
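The HWCAP_RVV constant above encodes how Linux reports single-letter RISC-V ISA extensions in AT_HWCAP: extension 'x' occupies bit ('x' - 'a'), so 'v' lands on bit 21. A hedged usage sketch follows; the main() harness is illustrative, not dav1d code, and the flag value matches DAV1D_RISCV_CPU_FLAG_V from the header that follows.

#include <stdio.h>

unsigned dav1d_get_cpu_flags_riscv(void);

int main(void) {
    /* DAV1D_RISCV_CPU_FLAG_V == 1 << 0 */
    if (dav1d_get_cpu_flags_riscv() & (1u << 0))
        puts("RVV present: vector itx entry points get installed");
    else
        puts("no RVV: the C fallbacks stay in the DSP table");
    return 0;
}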
@@ -0,0 +1,37 @@
/*
 * Copyright © 2022, VideoLAN and dav1d authors
 * Copyright © 2022, Nathan Egge
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice, this
 *    list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 *    this list of conditions and the following disclaimer in the documentation
 *    and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#ifndef DAV1D_SRC_RISCV_CPU_H
#define DAV1D_SRC_RISCV_CPU_H

enum CpuFlags {
    DAV1D_RISCV_CPU_FLAG_V = 1 << 0,
};

unsigned dav1d_get_cpu_flags_riscv(void);

#endif /* DAV1D_SRC_RISCV_CPU_H */
@@ -0,0 +1,109 @@
/*
 * Copyright © 2018, VideoLAN and dav1d authors
 * Copyright © 2023, Nathan Egge
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice, this
 *    list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 *    this list of conditions and the following disclaimer in the documentation
 *    and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include "src/cpu.h"
#include "src/itx.h"

#define decl_itx2_fns(w, h, opt) \
    decl_itx_fn(BF(dav1d_inv_txfm_add_dct_dct_##w##x##h, opt)); \
    decl_itx_fn(BF(dav1d_inv_txfm_add_identity_identity_##w##x##h, opt))

#define decl_itx12_fns(w, h, opt) \
    decl_itx2_fns(w, h, opt); \
    decl_itx_fn(BF(dav1d_inv_txfm_add_dct_adst_##w##x##h, opt)); \
    decl_itx_fn(BF(dav1d_inv_txfm_add_dct_flipadst_##w##x##h, opt)); \
    decl_itx_fn(BF(dav1d_inv_txfm_add_dct_identity_##w##x##h, opt)); \
    decl_itx_fn(BF(dav1d_inv_txfm_add_adst_dct_##w##x##h, opt)); \
    decl_itx_fn(BF(dav1d_inv_txfm_add_adst_adst_##w##x##h, opt)); \
    decl_itx_fn(BF(dav1d_inv_txfm_add_adst_flipadst_##w##x##h, opt)); \
    decl_itx_fn(BF(dav1d_inv_txfm_add_flipadst_dct_##w##x##h, opt)); \
    decl_itx_fn(BF(dav1d_inv_txfm_add_flipadst_adst_##w##x##h, opt)); \
    decl_itx_fn(BF(dav1d_inv_txfm_add_flipadst_flipadst_##w##x##h, opt)); \
    decl_itx_fn(BF(dav1d_inv_txfm_add_identity_dct_##w##x##h, opt))

#define decl_itx16_fns(w, h, opt) \
    decl_itx12_fns(w, h, opt); \
    decl_itx_fn(BF(dav1d_inv_txfm_add_adst_identity_##w##x##h, opt)); \
    decl_itx_fn(BF(dav1d_inv_txfm_add_flipadst_identity_##w##x##h, opt)); \
    decl_itx_fn(BF(dav1d_inv_txfm_add_identity_adst_##w##x##h, opt)); \
    decl_itx_fn(BF(dav1d_inv_txfm_add_identity_flipadst_##w##x##h, opt))

#define decl_itx17_fns(w, h, opt) \
    decl_itx16_fns(w, h, opt); \
    decl_itx_fn(BF(dav1d_inv_txfm_add_wht_wht_##w##x##h, opt))

#define decl_itx_fns(ext) \
    decl_itx17_fns( 4, 4, ext); \
    decl_itx16_fns( 8, 8, ext)

decl_itx_fns(rvv);

static ALWAYS_INLINE void itx_dsp_init_riscv(Dav1dInvTxfmDSPContext *const c, int const bpc) {
#define assign_itx_fn(pfx, w, h, type, type_enum, ext) \
    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)

#define assign_itx1_fn(pfx, w, h, ext) \
    assign_itx_fn(pfx, w, h, dct_dct, DCT_DCT, ext)

#define assign_itx2_fn(pfx, w, h, ext) \
    assign_itx1_fn(pfx, w, h, ext); \
    assign_itx_fn(pfx, w, h, identity_identity, IDTX, ext)

#define assign_itx12_fn(pfx, w, h, ext) \
    assign_itx2_fn(pfx, w, h, ext); \
    assign_itx_fn(pfx, w, h, dct_adst, ADST_DCT, ext); \
    assign_itx_fn(pfx, w, h, dct_flipadst, FLIPADST_DCT, ext); \
    assign_itx_fn(pfx, w, h, dct_identity, H_DCT, ext); \
    assign_itx_fn(pfx, w, h, adst_dct, DCT_ADST, ext); \
    assign_itx_fn(pfx, w, h, adst_adst, ADST_ADST, ext); \
    assign_itx_fn(pfx, w, h, adst_flipadst, FLIPADST_ADST, ext); \
    assign_itx_fn(pfx, w, h, flipadst_dct, DCT_FLIPADST, ext); \
    assign_itx_fn(pfx, w, h, flipadst_adst, ADST_FLIPADST, ext); \
    assign_itx_fn(pfx, w, h, flipadst_flipadst, FLIPADST_FLIPADST, ext); \
    assign_itx_fn(pfx, w, h, identity_dct, V_DCT, ext)

#define assign_itx16_fn(pfx, w, h, ext) \
    assign_itx12_fn(pfx, w, h, ext); \
    assign_itx_fn(pfx, w, h, adst_identity, H_ADST, ext); \
    assign_itx_fn(pfx, w, h, flipadst_identity, H_FLIPADST, ext); \
    assign_itx_fn(pfx, w, h, identity_adst, V_ADST, ext); \
    assign_itx_fn(pfx, w, h, identity_flipadst, V_FLIPADST, ext)

#define assign_itx17_fn(pfx, w, h, ext) \
    assign_itx16_fn(pfx, w, h, ext); \
    assign_itx_fn(pfx, w, h, wht_wht, WHT_WHT, ext)

    const unsigned flags = dav1d_get_cpu_flags();

    if (!(flags & DAV1D_RISCV_CPU_FLAG_V)) return;

#if BITDEPTH == 8
    assign_itx16_fn( , 4, 4, rvv);
    assign_itx16_fn( , 8, 8, rvv);
#endif
}
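The macro pyramid above only fans a single assignment out across dav1d's transform-type table; nothing else happens at init time. For orientation, one hand-expanded instance, assuming BITDEPTH == 8 so that BF() appends the _8bpc suffix:

/* assign_itx_fn( , 4, 4, dct_dct, DCT_DCT, rvv) expands to roughly: */
c->itxfm_add[TX_4X4][DCT_DCT] =
    dav1d_inv_txfm_add_dct_dct_4x4_8bpc_rvv;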
@@ -138,13 +138,13 @@ static ALWAYS_INLINE void intra_pred_dsp_init_x86(Dav1dIntraPredDSPContext *cons
    init_angular_ipred_fn(HOR_PRED, ipred_h, avx512icl);
    init_angular_ipred_fn(VERT_PRED, ipred_v, avx512icl);
    init_angular_ipred_fn(Z2_PRED, ipred_z2, avx512icl);
    init_angular_ipred_fn(Z3_PRED, ipred_z3, avx512icl);
#endif
    init_angular_ipred_fn(PAETH_PRED, ipred_paeth, avx512icl);
    init_angular_ipred_fn(SMOOTH_PRED, ipred_smooth, avx512icl);
    init_angular_ipred_fn(SMOOTH_H_PRED, ipred_smooth_h, avx512icl);
    init_angular_ipred_fn(SMOOTH_V_PRED, ipred_smooth_v, avx512icl);
    init_angular_ipred_fn(Z1_PRED, ipred_z1, avx512icl);
    init_angular_ipred_fn(Z3_PRED, ipred_z3, avx512icl);
    init_angular_ipred_fn(FILTER_PRED, ipred_filter, avx512icl);

    c->pal_pred = BF(dav1d_pal_pred, avx512icl);
@@ -1,5 +1,5 @@
; Copyright © 2022, VideoLAN and dav1d authors
; Copyright © 2022, Two Orioles, LLC
; Copyright © 2022-2024, VideoLAN and dav1d authors
; Copyright © 2022-2024, Two Orioles, LLC
; All rights reserved.
;
; Redistribution and use in source and binary forms, with or without
@@ -42,12 +42,16 @@ pal_pred_perm: db 0, 16, 32, 48, 1, 17, 33, 49, 2, 18, 34, 50, 3, 19, 35, 51
               db 4, 20, 36, 52, 5, 21, 37, 53, 6, 22, 38, 54, 7, 23, 39, 55
               db 8, 24, 40, 56, 9, 25, 41, 57, 10, 26, 42, 58, 11, 27, 43, 59
               db 12, 28, 44, 60, 13, 29, 45, 61, 14, 30, 46, 62, 15, 31, 47, 63
pw_0to31: dw 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
          dw 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
pw_31to0: dw 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16
          dw 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
pw_1to32: dw 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16
          dw 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32
z_upsample: dw 0, -1, 1, 0, 2, 1, 3, 2, 4, 3, 5, 4, 6, 5, 7, 6
            dw 8, 7, 9, 8, 10, 9, 11, 10, 12, 11, 13, 12, 14, 13, 15, 14
z_xpos_mul: dw 1, 1, 1, 1, 2, 2, 1, 1, 3, 3, 2, 2, 4, 4, 2, 2
            dw 5, 5, 3, 3, 6, 6, 3, 3, 7, 7, 4, 4, 8, 8, 4, 4
z_ypos_mul: dw 0, 0, 0, 0, 1, 1, 0, 0, 2, 2, 1, 1, 3, 3, 1, 1
            dw 4, 4, 2, 2, 5, 5, 2, 2, 6, 6, 3, 3, 7, 7, 3, 3
z_filter_t0: db 55,127, 39,127, 39,127, 7, 15, 31, 7, 15, 31, 0, 3, 31, 0
z_filter_t1: db 39, 63, 19, 47, 19, 47, 3, 3, 3, 3, 3, 3, 0, 0, 0, 0
z_xpos_off1a: dw 30720, 30784, 30848, 30912, 30976, 31040, 31104, 31168
@@ -75,13 +79,25 @@ z_filter_wh: db 7, 7, 11, 11, 15, 15, 19, 19, 19, 23, 23, 23, 31, 31, 31, 39
z_filter_k: dw 8, 8, 6, 6, 4, 4
            dw 4, 4, 5, 5, 4, 4
            dw 0, 0, 0, 0, 2, 2
pw_15: times 2 dw 15
pw_16: times 2 dw 16
pw_17: times 2 dw 17
pw_24: times 2 dw 24
pw_32: times 2 dw 32
pw_63: times 2 dw 63
pw_64: times 2 dw 64
pw_512: times 2 dw 512
pw_31806: times 2 dw 31806
pw_32640: times 2 dw 32640
pw_32672: times 2 dw 32672
pw_32704: times 2 dw 32704
pw_32735: times 2 dw 32735
pw_32736: times 2 dw 32736

%define pw_2 (z_xpos_mul+4* 2)
%define pw_3 (z_xpos_mul+4* 4)
%define pw_7 (z_xpos_mul+4*12)
%define pw_0to31 (pw_1to32-2)

%macro JMP_TABLE 3-*
    %xdefine %1_%2_table (%%table - 2*4)
@@ -98,6 +114,7 @@ JMP_TABLE ipred_smooth_16bpc, avx512icl, w4, w8, w16, w32, w64
JMP_TABLE ipred_smooth_h_16bpc, avx512icl, w4, w8, w16, w32, w64
JMP_TABLE ipred_smooth_v_16bpc, avx512icl, w4, w8, w16, w32, w64
JMP_TABLE ipred_z1_16bpc, avx512icl, w4, w8, w16, w32, w64
JMP_TABLE ipred_z3_16bpc, avx512icl, w4, w8, w16, w32, w64
JMP_TABLE pal_pred_16bpc, avx512icl, w4, w8, w16, w32, w64

cextern smooth_weights_1d_16bpc
@@ -757,7 +774,7 @@ cglobal ipred_z1_16bpc, 3, 8, 16, dst, stride, tl, w, h, angle, dx
    lea r3d, [angleq+216]
    movu ym5, [tlq]
    mov r3b, hb
    mova m10, [base+pw_0to31]
    movu m10, [base+pw_0to31]
    cmp r3d, 8
    ja .w8_no_upsample ; !enable_intra_edge_filter || is_sm || d >= 40 || h > 8
    lea r3d, [hq+7]
@@ -1157,6 +1174,638 @@ cglobal ipred_z1_16bpc, 3, 8, 16, dst, stride, tl, w, h, angle, dx
    mov rsp, r7
    RET

cglobal ipred_z3_16bpc, 3, 8, 16, dst, stride, tl, w, h, angle, dy
    lea r7, [z_filter_t0]
    tzcnt wd, wm
    movifnidn angled, anglem
    lea t0, [dr_intra_derivative+45*2-1]
    movsxd wq, [base+ipred_z3_16bpc_avx512icl_table+wq*4]
    sub angled, 180
    mov dyd, angled
    neg dyd
    xor angled, 0x400
    or dyq, ~0x7e
    mova m0, [base+pw_31to0]
    movzx dyd, word [t0+dyq]
    lea wq, [base+ipred_z3_16bpc_avx512icl_table+wq]
    movifnidn hd, hm
    vpbroadcastd m14, [base+pw_31806]
    vpbroadcastd m15, [base+pw_1]
    jmp wq
.w4:
    lea r3d, [hq+3]
    xor r3d, 31 ; 32 - (h + imin(w, h))
    vpbroadcastw m7, r3d
    pmaxuw m7, m0
    vpermw m6, m7, [tlq-64*1]
    test angled, 0x400 ; !enable_intra_edge_filter
    jnz .w4_main
    cmp angleb, 40
    jae .w4_filter
    lea r3d, [angleq-1024]
    sar r3d, 7
    add r3d, hd
    jg .w4_filter ; h > 8 || (h == 8 && is_sm)
    call .upsample
    movsldup m1, [base+z_ypos_mul]
    paddw m1, m1
    jmp .w4_main2
.w4_filter:
    lea r3d, [hq+3]
    call .filter32
.w4_main:
    movsldup m1, [base+z_ypos_mul]
.w4_main2:
    vpbroadcastq m0, [base+pw_1to32]
    vpbroadcastw m4, dyd
    lea r2d, [hq+4]
    shr r2d, 3
    pmullw m4, m0 ; ypos
    vpbroadcastw m0, r2d
    imul r2, strideq ; stride * imax(height / 8, 1)
    pmullw m1, m0
    lea r3, [r2*3]
    paddd m1, [base+pw_32736] {1to16}
    psrlw m2, m4, 6
    psllw m4, 9
    paddsw m2, m1 ; base+0
    vpandd m4, m14 ; frac << 9
    vpermw m3, m2, m6 ; left[base+0]
.w4_loop:
    paddsw m2, m15 ; base+1
    vpermw m1, m2, m6 ; left[base+1]
    psubw m0, m1, m3
    pmulhrsw m0, m4
    paddw m0, m3
    movq [dstq+r2*0], xm0
    movhps [dstq+r2*1], xm0
    vextracti32x4 xm3, ym0, 1
    movq [dstq+r2*2], xm3
    movhps [dstq+r3 ], xm3
    sub hd, 8
    jl .w4_end
    lea r5, [dstq+r2*4]
    vextracti32x8 ym0, m0, 1
    mova m3, m1
    movq [r5+r2*0], xm0
    movhps [r5+r2*1], xm0
    vextracti32x4 xm1, ym0, 1
    movq [r5+r2*2], xm1
    movhps [r5+r3 ], xm1
    add dstq, strideq
    test hd, hd
    jnz .w4_loop
.w4_end:
    RET
.upsample:
    vinserti32x4 m6, [tlq-14], 3
    mova m3, [base+z_upsample]
    vpbroadcastd m4, [base+pd_65536]
    add dyd, dyd
    vpermw m0, m3, m6
    paddw m3, m4
    vpermw m1, m3, m6
    paddw m3, m4
    vpermw m2, m3, m6
    paddw m3, m4
    vpermw m3, m3, m6
    vpbroadcastw m6, r9m ; pixel_max
    paddw m1, m2 ; b+c
    paddw m0, m3 ; a+d
    psubw m0, m1, m0
    psraw m0, 3
    pxor m2, m2
    paddw m0, m1
    pmaxsw m0, m2
    pavgw m0, m2
    pminsw m6, m0
    ret
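The .upsample helper just above is AV1's intra edge upsampling filter: each interpolated half-sample is a 4-tap blend of its neighbours, clipped to the bit-depth range. A scalar C sketch, accurate up to rounding-order details of the psraw/pavgw sequence (the vector permutes that gather a..d are omitted, and upsample_sample is a made-up name):

static inline int upsample_sample(int a, int b, int c, int d, int pixel_max) {
    int s = (9 * (b + c) - (a + d) + 8) >> 4; /* psubw/psraw/paddw + pavgw */
    if (s < 0) s = 0;                         /* pmaxsw with zero          */
    return s < pixel_max ? s : pixel_max;     /* pminsw with pixel_max     */
}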
.w8:
    mova m6, [tlq-64*1]
    cmp hd, 32
    je .w8_h32
    mov r3d, 8
    cmp hd, 4
    cmove r3d, hd
    lea r3d, [r3+hq-1]
    xor r3d, 31 ; 32 - (h + imin(w, h))
    vpbroadcastw m1, r3d
    vpermw m7, m1, m6
    pmaxuw m1, m0
    vpermw m6, m1, m6
    test angled, 0x400
    jnz .w8_main
    lea r3d, [angleq+216]
    mov r3b, hb
    cmp r3d, 8
    ja .w8_filter ; is_sm || d >= 40 || h > 8
    call .upsample
    movshdup m1, [base+z_ypos_mul]
    paddw m1, m1
    call .w8_main_setup
.w8_upsample_loop:
    vpermw m3, m2, m6 ; left[base+0]
    paddw m2, m15 ; base+1
    vpermw m1, m2, m6 ; left[base+1]
    psubw m0, m1, m3
    pmulhrsw m0, m4
    paddw m2, m15 ; base+2
    paddw m0, m3
    mova m3, m1
    mova [dstq+r2*0], xm0
    vextracti32x4 [dstq+r2*1], ym0, 1
    vextracti32x4 [dstq+r2*2], m0, 2
    vextracti32x4 [dstq+r3 ], m0, 3
    add dstq, strideq
    sub hd, 4
    jg .w8_upsample_loop
    RET
.w8_main_setup:
    vbroadcasti32x4 m0, [base+pw_1to32]
    vpbroadcastw m4, dyd
    rorx r2d, hd, 2
    pmullw m4, m0 ; ypos
    vpbroadcastw m0, r2d
    imul r2, strideq ; stride * height / 4
    lea r3, [r2*3]
    pmullw m1, m0 ; 0 1 2 3
    paddd m1, [base+pw_32704] {1to16}
    psrlw m2, m4, 6
    psllw m4, 9
    paddsw m2, m1 ; base+0
    vpandd m4, m14 ; frac << 9
    ret
.w8_h32:
    pmaxud m7, m0, [base+pw_24] {1to16}
    vpermw m6, m0, m6
    vpermw m7, m7, [tlq-64*2]
    test angled, 0x400
    jnz .w8_main
    call .filter64
    vpbroadcastd m0, [base+pw_7]
    pminuw m0, [base+pw_0to31]
    vpermw m7, m0, m7
    jmp .w8_main
.w8_filter:
    lea r3d, [hq+7]
    call .filter32
.w8_main:
    movshdup m1, [base+z_ypos_mul]
    call .w8_main_setup
    mova m3, m6
    vpermt2w m3, m2, m7 ; left[base+0]
.w8_loop:
    paddsw m2, m15 ; base+1
    mova m1, m6
    vpermt2w m1, m2, m7 ; left[base+1]
    psubw m0, m1, m3
    pmulhrsw m0, m4
    paddw m0, m3
    mova m3, m1
    mova [dstq+r2*0], xm0
    vextracti32x4 [dstq+r2*1], ym0, 1
    vextracti32x4 [dstq+r2*2], m0, 2
    vextracti32x4 [dstq+r3 ], m0, 3
    add dstq, strideq
    sub hd, 4
    jg .w8_loop
    RET
.filter32:
    vpbroadcastb ym10, r3d
    vpbroadcastb ym1, angled
    shr angled, 8
    vpcmpeqb k1, ym10, [base+z_filter_wh]
    mova xm2, [base+z_filter_t0+angleq*8]
    vpcmpgtb k1{k1}, ym1, ym2
    kmovd r5d, k1
    test r5d, r5d
    jz .filter32_end
    vpbroadcastw m2, [tlq]
    popcnt r5d, r5d
    vpbroadcastd m5, [base+z_filter_k+(r5-1)*4+12*0]
    valignq m2, m6, m2, 6
    vpbroadcastd m8, [base+z_filter_k+(r5-1)*4+12*1]
    valignq m4, m7, m6, 2
    vpbroadcastd m9, [base+z_filter_k+(r5-1)*4+12*2]
    palignr m1, m6, m2, 14
    pmullw m5, m6
    palignr m3, m4, m6, 2
    paddw m1, m3
    palignr m2, m6, m2, 12
    pmullw m1, m8
    palignr m4, m6, 4
    paddw m2, m4
    pmullw m2, m9
    pmovzxbw m10, ym10
    pxor m6, m6
    paddw m5, m1
    pminuw m1, m10, [base+pw_0to31]
    paddw m5, m2
    psrlw m5, 3
    pavgw m6, m5
    vpermw m7, m10, m6
    vpermw m6, m1, m6
.filter32_end:
    ret
.w16:
    mova m6, [tlq-64*1]
    cmp hd, 32
    jl .w16_h16
    pmaxud m8, m0, [base+pw_16] {1to16}
    mova m7, [tlq-64*2]
    vpermw m6, m0, m6
    jg .w16_h64
    vpermw m7, m8, m7
    test angled, 0x400
    jnz .w16_main
    call .filter64
    vpbroadcastd m0, [base+pw_15]
    vinserti32x8 m0, [base+pw_0to31], 0
    vpermw m7, m0, m7
    jmp .w16_main
.w16_h16:
    lea r3d, [hq*2-1]
    xor r3d, 31 ; 32 - (h + imin(w, h))
    vpbroadcastw m1, r3d
    vpermw m7, m1, m6
    pmaxuw m1, m0
    vpermw m6, m1, m6
    test angled, 0x400
    jnz .w16_main
    lea r3d, [hq+15]
    call .filter32
.w16_main:
    vbroadcasti32x8 m0, [base+pw_1to32]
    vpbroadcastw m4, dyd
    rorx r2d, hd, 1
    pmullw m4, m0 ; ypos
    vpbroadcastw ym1, r2d
    imul r2, strideq ; stride * height / 2
    paddd m1, [base+pw_32704] {1to16}
    lea r3, [r2+strideq]
    psrlw m2, m4, 6
    psllw m4, 9
    paddsw m2, m1 ; base+0
    vpandd m4, m14 ; frac << 9
    mova m3, m6
    vpermt2w m3, m2, m7 ; left[base+0]
.w16_loop:
    paddsw m1, m2, m15 ; base+1
    paddsw m2, m1, m15 ; base+2
    vpermi2w m1, m6, m7 ; left[base+1]
    psubw m0, m1, m3
    pmulhrsw m0, m4
    paddw m0, m3
    mova m3, m6
    vpermt2w m3, m2, m7 ; left[base+2]
    vextracti32x8 [dstq+strideq*0], m0, 1
    mova [dstq+r2 ], ym0
    psubw m0, m3, m1
    pmulhrsw m0, m4
    paddw m0, m1
    vextracti32x8 [dstq+strideq*1], m0, 1
    mova [dstq+r3 ], ym0
    lea dstq, [dstq+strideq*2]
    sub hd, 4
    jg .w16_loop
    RET
.w16_h64:
    vpermw m7, m0, m7
    vpermw m8, m8, [tlq-64*3]
    test angled, 0x400
    jnz .w16_h64_main
    valignq m11, m8, m7, 6
    call .filter64
    vshufi32x4 m2, m8, m8, q3321
    vpbroadcastd m0, [base+pw_15]
    palignr ym3, ym8, ym11, 12
    vinserti32x8 m0, [base+pw_0to31], 0
    palignr ym4, ym8, ym11, 14
    palignr ym1, ym2, ym8, 4
    paddw ym3, ym5
    palignr ym2, ym8, 2
    paddw ym8, ym4
    pavgw ym3, ym1
    paddw ym8, ym2
    paddw ym8, ym3
    psrlw ym8, 2
    vpermw m8, m0, m8
.w16_h64_main:
    vbroadcasti32x8 m0, [base+pw_1to32]
    vpbroadcastw m4, dyd
    pmullw m4, m0 ; ypos
    vpbroadcastd ym1, [base+pw_32]
    paddd m1, [base+pw_32672] {1to16}
    mov r2, strideq
    shl r2, 5 ; stride*32
    vpbroadcastd m9, [base+pw_32735]
    lea r3, [r2+strideq]
    psrlw m2, m4, 6
    psllw m4, 9
    paddsw m2, m1 ; base+0
    vpandd m4, m14 ; frac << 9
    mova m3, m7
    vpermt2w m3, m2, m6
    vpcmpgtw k1, m2, m9
    vpermw m3{k1}, m2, m8 ; left[base+0]
.w16_h64_loop:
    paddsw m2, m15 ; base+1
    mova m1, m7
    vpermt2w m1, m2, m6
    vpcmpgtw k1, m2, m9
    vpermw m1{k1}, m2, m8 ; left[base+1]
    psubw m0, m1, m3
    pmulhrsw m0, m4
    paddsw m2, m15 ; base+2
    paddw m0, m3
    mova m3, m7
    vpermt2w m3, m2, m6
    vpcmpgtw k1, m2, m9
    vpermw m3{k1}, m2, m8 ; left[base+2]
    vextracti32x8 [dstq+strideq*0], m0, 1
    mova [dstq+r2 ], ym0
    psubw m0, m3, m1
    pmulhrsw m0, m4
    paddw m0, m1
    vextracti32x8 [dstq+strideq*1], m0, 1
    mova [dstq+r3 ], ym0
    lea dstq, [dstq+strideq*2]
    sub hd, 4
    jg .w16_h64_loop
    RET
.filter64:
    vpbroadcastw m2, [tlq]
    vpbroadcastd m5, [base+pw_3]
    valignq m2, m6, m2, 6
    valignq m4, m7, m6, 2
    valignq m10, m7, m6, 6
    palignr m1, m6, m2, 12
    palignr m2, m6, m2, 14
    palignr m3, m4, m6, 4
    paddw m1, m5
    palignr m4, m6, 2
    paddw m6, m2
    valignq m2, m8, m7, 2
    pavgw m1, m3
    palignr m3, m7, m10, 12
    paddw m6, m4
    palignr m4, m7, m10, 14
    paddw m6, m1
    palignr m1, m2, m7, 4
    psrlw m6, 2
    palignr m2, m7, 2
    paddw m3, m5
    paddw m7, m4
    pavgw m3, m1
    paddw m7, m2
    paddw m7, m3
    psrlw m7, 2
    ret
.w32:
    mova m6, [tlq-64*1]
    cmp hd, 32
    jl .w32_h16
    mova m8, [tlq-64*2]
    vpermw m6, m0, m6
    vpermw m7, m0, m8
    jg .w32_h64
    test angled, 0x400
    jnz .w32_main
    vpbroadcastw xm8, xm8
    jmp .w32_filter
.w32_h16:
    lea r3d, [hq*2-1]
    xor r3d, 31 ; 32 - (h + imin(w, h))
    vpbroadcastw m1, r3d
    vpermw m7, m1, m6
    pmaxuw m1, m0
    vpermw m6, m1, m6
    test angled, 0x400
    jnz .w32_main
    vextracti32x4 xm8, m7, 3
.w32_filter:
    call .filter64
.w32_main:
    vpbroadcastw m4, dyd
    vpbroadcastd m1, [base+pw_32704]
    pmullw m4, [base+pw_1to32] ; ypos
    psrlw m2, m4, 6
    psllw m4, 9
    paddsw m2, m1 ; base+0
    vpandd m4, m14 ; frac << 9
    mova m3, m6
    vpermt2w m3, m2, m7 ; left[base+0]
.w32_loop:
    paddsw m1, m2, m15 ; base+1
    paddsw m2, m1, m15 ; base+2
    vpermi2w m1, m6, m7 ; left[base+1]
    psubw m0, m1, m3
    pmulhrsw m0, m4
    paddw m0, m3
    mova m3, m6
    vpermt2w m3, m2, m7 ; left[base+2]
    mova [dstq+strideq*0], m0
    psubw m0, m3, m1
    pmulhrsw m0, m4
    paddw m0, m1
    mova [dstq+strideq*1], m0
    lea dstq, [dstq+strideq*2]
    sub hd, 2
    jg .w32_loop
    RET
.w32_h64:
    mova m9, [tlq-64*3]
    vpermw m8, m0, m9
    test angled, 0x400
    jnz .w32_h64_main
    vpbroadcastw xm9, xm9
    call .filter96
.w32_h64_main:
    vpbroadcastw m4, dyd
    vpbroadcastd m1, [base+pw_32672]
    pmullw m4, [base+pw_1to32] ; ypos
    vpbroadcastd m9, [base+pw_32735]
    psrlw m2, m4, 6
    psllw m4, 9
    paddsw m2, m1 ; base+0
    vpandd m4, m14 ; frac << 9
    mova m3, m7
    vpermt2w m3, m2, m6
    vpcmpgtw k1, m2, m9
    vpermw m3{k1}, m2, m8 ; left[base+0]
.w32_h64_loop:
    paddsw m2, m15 ; base+1
    mova m1, m7
    vpermt2w m1, m2, m6
    vpcmpgtw k1, m2, m9
    vpermw m1{k1}, m2, m8 ; left[base+1]
    psubw m0, m1, m3
    pmulhrsw m0, m4
    paddsw m2, m15 ; base+2
    paddw m0, m3
    mova m3, m7
    vpermt2w m3, m2, m6
    vpcmpgtw k1, m2, m9
    vpermw m3{k1}, m2, m8 ; left[base+2]
    mova [dstq+strideq*0], m0
    psubw m0, m3, m1
    pmulhrsw m0, m4
    paddw m0, m1
    mova [dstq+strideq*1], m0
    lea dstq, [dstq+strideq*2]
    sub hd, 2
    jg .w32_h64_loop
    RET
.filter96:
    valignq m11, m8, m7, 6
    call .filter64
    valignq m2, m9, m8, 2
    palignr m3, m8, m11, 12
    palignr m4, m8, m11, 14
    palignr m1, m2, m8, 4
    paddw m3, m5
    palignr m2, m8, 2
    paddw m8, m4
    pavgw m3, m1
    paddw m8, m2
    paddw m8, m3
    psrlw m8, 2
    ret
.w64:
    mova m7, [tlq-64*1]
    vpermw m6, m0, m7
    cmp hd, 32
    jl .w64_h16
    mova m8, [tlq-64*2]
    vpermw m7, m0, m8
    jg .w64_h64
    test angled, 0x400
    jnz .w64_main
    vpbroadcastw m8, xm8
    mova m9, m8
    call .filter96
    vshufi32x4 m9, m8, m8, q3333
    jmp .w64_h64_main
.w64_h16:
    vpbroadcastw m7, xm7
    test angled, 0x400
    jnz .w64_main
    mova m8, m7
    call .filter64
.w64_main:
    vpbroadcastw m11, dyd
    vpbroadcastd m1, [base+pw_32704]
    pmullw m10, m11, [base+pw_1to32] ; ypos
    psllw m11, 5
    psrlw m8, m10, 6
    paddw m11, m10
    psllw m10, 9
    psrlw m9, m11, 6
    psllw m11, 9
    psubw m9, m8
    paddsw m8, m1 ; base+0
    vpandd m10, m14 ; frac << 9
    vpandd m11, m14 ; frac << 9
    mova m4, m6
    vpermt2w m4, m8, m7 ; left[base+0] ( 0..31)
    paddsw m5, m8, m9
    vpermi2w m5, m6, m7 ; left[base+0] (32..63)
.w64_loop:
    paddsw m8, m15 ; base+1 ( 0..31)
    mova m2, m6
    vpermt2w m2, m8, m7 ; left[base+1] ( 0..31)
    paddsw m3, m8, m9 ; base+1 (32..63)
    vpermi2w m3, m6, m7 ; left[base+1] (32..63)
    psubw m0, m2, m4
    psubw m1, m3, m5
    pmulhrsw m0, m10
    pmulhrsw m1, m11
    paddw m0, m4
    paddw m1, m5
    mova m4, m2
    mova [dstq+64*0], m0
    mova m5, m3
    mova [dstq+64*1], m1
    add dstq, strideq
    dec hd
    jg .w64_loop
    RET
.w64_h64:
    vpermw m8, m0, [tlq-64*3]
    mova m13, [tlq-64*4]
    vpermw m9, m0, m13
    test angled, 0x400
    jnz .w64_h64_main
    valignq m12, m9, m8, 6
    call .filter96
    vpbroadcastw xm2, xm13
    valignq m2, m9, 2
    palignr m3, m9, m12, 12
    palignr m4, m9, m12, 14
    palignr m1, m2, m9, 4
    paddw m3, m5
    palignr m2, m9, 2
    paddw m9, m4
    pavgw m3, m1
    paddw m9, m2
    paddw m9, m3
    psrlw m9, 2
.w64_h64_main:
    vpbroadcastw m11, dyd
    vpbroadcastd m1, [base+pw_32640]
    pmullw m10, m11, [base+pw_1to32] ; ypos
    psllw m11, 5
    psrlw m12, m10, 6
    paddw m11, m10
    psllw m10, 9
    psrlw m13, m11, 6
    psllw m11, 9
    psubw m13, m12
    paddsw m12, m1 ; base+0
    vpandd m10, m14 ; frac << 9
    vpandd m11, m14 ; frac << 9
    vpbroadcastd m14, [base+pw_64]
    mova m4, m6
    vpermt2w m4, m12, m7
    vptestmw k1, m12, m14
    mova m0, m8
    vpermt2w m0, m12, m9
    paddsw m1, m12, m13
    mova m5, m6
    vpermt2w m5, m1, m7
    vptestmw k2, m1, m14
    vpermi2w m1, m8, m9
    vmovdqu16 m4{k1}, m0 ; left[base+0] ( 0..31)
    vmovdqu16 m5{k2}, m1 ; left[base+0] (32..63)
.w64_h64_loop:
    paddsw m12, m15 ; base+1
    mova m2, m6
    vpermt2w m2, m12, m7
    vptestmw k1, m12, m14
    mova m0, m8
    vpermt2w m0, m12, m9
    paddsw m1, m12, m13
    mova m3, m6
    vpermt2w m3, m1, m7
    vptestmw k2, m1, m14
    vpermi2w m1, m8, m9
    vmovdqu16 m2{k1}, m0 ; left[base+1] ( 0..31)
    vmovdqu16 m3{k2}, m1 ; left[base+1] (32..63)
    psubw m0, m2, m4
    psubw m1, m3, m5
    pmulhrsw m0, m10
    pmulhrsw m1, m11
    paddw m0, m4
    paddw m1, m5
    mova m4, m2
    mova [dstq+64*0], m0
    mova m5, m3
    mova [dstq+64*1], m1
    add dstq, strideq
    dec hd
    jg .w64_h64_loop
    RET
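All of the z3 loops above share one per-pixel operation: a linear blend between two left-edge samples. The 6-bit fraction is pre-shifted left by 9 (psllw m4, 9) so that pmulhrsw's rounding multiply, (x*y + 0x4000) >> 15, lands exactly on (diff*frac + 32) >> 6. As a scalar C sketch (edge-sample gathering via vpermw/vpermt2w is omitted; z3_blend is a made-up name):

static inline int16_t z3_blend(int16_t lo, int16_t hi, int frac /* 0..63 */) {
    const int16_t f = (int16_t)(frac << 9);     /* psllw m4, 9     */
    int32_t d = ((hi - lo) * f + 0x4000) >> 15; /* pmulhrsw m0, m4 */
    return (int16_t)(lo + d);                   /* paddw m0, m3    */
}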

cglobal pal_pred_16bpc, 4, 7, 7, dst, stride, pal, idx, w, h, stride3
    lea r6, [pal_pred_16bpc_avx512icl_table]
    tzcnt wd, wm
@@ -132,7 +132,8 @@ for d in "${dirs[@]}"; do
    fi
done

if [ ${#files[@]} -eq 0 ]; then
num_files="${#files[@]}"
if [ "$num_files" -eq 0 ]; then
    error "Error! No files found at ${dirs[*]}"
fi

@@ -148,17 +149,17 @@ for i in "${!files[@]}"; do
    md5=$(<"${md5/%obu/md5}") || error "Error! Can't read md5 ${md5} for file ${f}"
    md5=${md5/ */}

    printf "\033[1K\r[%3d%% %d/%d] Verifying %s" "$(((i+1)*100/${#files[@]}))" "$((i+1))" "${#files[@]}" "$f"
    printf '\033[1K\r[%3d%% %*d/%d] Verifying %s' "$(((i+1)*100/num_files))" "${#num_files}" "$((i+1))" "$num_files" "${f#"$ARGON_DIR"/}"
    cmd=("$DAV1D" -i "$f" --filmgrain "$FILMGRAIN" --verify "$md5" --cpumask "$CPUMASK" --threads "$THREADS" -q)
    if [ "$JOBS" -gt 1 ]; then
        "${cmd[@]}" 2>/dev/null &
        p=$!
        pids+=("$p")
        declare "file$p=$f"
        declare "file$p=${f#"$ARGON_DIR"/}"
        block_pids
    else
        if ! "${cmd[@]}" 2>/dev/null; then
            fail "$f"
            fail "${f#"$ARGON_DIR"/}"
        fi
    fi
done

@@ -166,9 +167,9 @@ done
wait_all_pids

if [ "$failed" -ne 0 ]; then
    printf "\033[1K\r%d/%d files \033[1;91mfailed\033[0m to verify" "$failed" "${#files[@]}"
    printf "\033[1K\r%d/%d files \033[1;91mfailed\033[0m to verify" "$failed" "$num_files"
else
    printf "\033[1K\r%d files \033[1;92msuccessfully\033[0m verified" "${#files[@]}"
    printf "\033[1K\r%d files \033[1;92msuccessfully\033[0m verified" "$num_files"
fi
printf " in %dm%ds (%s)\n" "$((SECONDS/60))" "$((SECONDS%60))" "$ver_info"
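The reworked progress printf above uses a computed field width: %*d reads its width from the next argument, and ${#num_files} is the digit count of the total, so the running counter stays right-aligned and the progress line keeps a stable width. The same idiom in C terms (the file count is illustrative):

#include <stdio.h>

int main(void) {
    const int total = 2462; /* illustrative file count */
    const int width = snprintf(NULL, 0, "%d", total); /* digits in total */
    for (int i = 1; i <= total; i += 1000)
        printf("[%*d/%d]\n", width, i, total); /* "[   1/2462]" etc. */
    return 0;
}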
@@ -69,6 +69,8 @@ if is_asm_enabled
        checkasm_asm_sources += files('checkasm/arm/checkasm_64.S')
    elif host_machine.cpu_family().startswith('arm')
        checkasm_asm_sources += files('checkasm/arm/checkasm_32.S')
    elif host_machine.cpu_family() == 'riscv64'
        checkasm_asm_sources += files('checkasm/riscv/checkasm_64.S')
    elif host_machine.cpu_family().startswith('x86')
        checkasm_asm_objs += nasm_gen.process(files('checkasm/x86/checkasm.asm'))
    endif

@@ -128,7 +130,7 @@ endforeach
subdir('libfuzzer')

# seek stress test binary, depends on dav1d cli tool
if get_option('enable_tools')
if (get_option('enable_tools') and get_option('enable_seek_stress'))
    seek_stress_sources = files('seek_stress.c')
    seek_stress = executable('seek_stress',
        seek_stress_sources, rev_target,