Mirror of https://github.com/mozilla/gecko-dev.git
Bug 1875883 - Update dav1d to a6878be7e07114f5a2915ad46300700f0db55197 r=media-playback-reviewers,padenot
Differential Revision: https://phabricator.services.mozilla.com/D200241
This commit is contained in:
Parent
80236c4ec3
Commit
8966466027
@@ -20,11 +20,11 @@ origin:
 
   # Human-readable identifier for this version/release
   # Generally "version NNN", "tag SSS", "bookmark SSS"
-  release: 746ab8b4f3021d7263c64d6b5d6f1e2c281c7acc (2023-12-19T13:15:43.000+01:00).
+  release: a6878be7e07114f5a2915ad46300700f0db55197 (2024-01-31T06:04:21.000-05:00).
 
   # Revision to pull in
   # Must be a long or short commit SHA (long preferred)
-  revision: 746ab8b4f3021d7263c64d6b5d6f1e2c281c7acc
+  revision: a6878be7e07114f5a2915ad46300700f0db55197
 
   # The package's license, where possible using the mnemonic from
   # https://spdx.org/licenses/

@@ -1,2 +1,2 @@
 /* auto-generated, do not edit */
-#define DAV1D_VERSION "746ab8b4f3021d7263c64d6b5d6f1e2c281c7acc"
+#define DAV1D_VERSION "a6878be7e07114f5a2915ad46300700f0db55197"

@@ -60,7 +60,7 @@
 #define ALIGN_64_VAL 64
 #define ALIGN_32_VAL 32
 #define ALIGN_16_VAL 16
-#elif ARCH_X86_32 || ARCH_ARM || ARCH_AARCH64 || ARCH_PPC64LE
+#elif ARCH_AARCH64 || ARCH_ARM || ARCH_LOONGARCH || ARCH_PPC64LE || ARCH_X86_32
 /* ARM doesn't benefit from anything more than 16-byte alignment. */
 #define ALIGN_64_VAL 16
 #define ALIGN_32_VAL 16

@@ -62,11 +62,13 @@ endforeach
 
 # ASM option
 is_asm_enabled = (get_option('enable_asm') == true and
-    (host_machine.cpu_family() == 'x86' or
-     (host_machine.cpu_family() == 'x86_64' and cc.get_define('__ILP32__').strip() == '') or
-     host_machine.cpu_family() == 'aarch64' or
+    (host_machine.cpu_family() == 'aarch64' or
      host_machine.cpu_family().startswith('arm') or
-     host_machine.cpu() == 'ppc64le'))
+     host_machine.cpu() == 'ppc64le' or
+     host_machine.cpu_family().startswith('riscv') or
+     host_machine.cpu_family().startswith('loongarch') or
+     host_machine.cpu_family() == 'x86' or
+     (host_machine.cpu_family() == 'x86_64' and cc.get_define('__ILP32__').strip() == '')))
 cdata.set10('HAVE_ASM', is_asm_enabled)
 
 if is_asm_enabled and get_option('b_sanitize') == 'memory'

@@ -232,7 +234,9 @@ endif
 
 if (host_machine.cpu_family() == 'aarch64' or
     host_machine.cpu_family().startswith('arm') or
-    host_machine.cpu() == 'ppc64le')
+    host_machine.cpu_family().startswith('loongarch') or
+    host_machine.cpu() == 'ppc64le' or
+    host_machine.cpu_family().startswith('riscv'))
     if cc.has_function('getauxval', prefix : '#include <sys/auxv.h>', args : test_args)
         cdata.set('HAVE_GETAUXVAL', 1)
     endif

@@ -379,6 +383,14 @@ endif
 
 cdata.set10('ARCH_PPC64LE', host_machine.cpu() == 'ppc64le')
 
+cdata.set10('ARCH_RISCV', host_machine.cpu_family().startswith('riscv'))
+cdata.set10('ARCH_RV32', host_machine.cpu_family() == 'riscv32')
+cdata.set10('ARCH_RV64', host_machine.cpu_family() == 'riscv64')
+
+cdata.set10('ARCH_LOONGARCH', host_machine.cpu_family().startswith('loongarch'))
+cdata.set10('ARCH_LOONGARCH32', host_machine.cpu_family() == 'loongarch32')
+cdata.set10('ARCH_LOONGARCH64', host_machine.cpu_family() == 'loongarch64')
+
 # meson's cc.symbols_have_underscore_prefix() is unfortunately unreliable
 # when additional flags like '-fprofile-instr-generate' are passed via CFLAGS
 # see following meson issue https://github.com/mesonbuild/meson/issues/5482

@@ -25,6 +25,11 @@ option('enable_tests',
     value: true,
     description: 'Build dav1d tests')
 
+option('enable_seek_stress',
+    type: 'boolean',
+    value: false,
+    description: 'Build seek_stress test tool')
+
 option('enable_docs',
     type: 'boolean',
     value: false,

@@ -56,8 +56,12 @@ COLD void dav1d_init_cpu(void) {
     // memory sanitizer is inherently incompatible with asm
 #if ARCH_AARCH64 || ARCH_ARM
     dav1d_cpu_flags = dav1d_get_cpu_flags_arm();
+#elif ARCH_LOONGARCH
+    dav1d_cpu_flags = dav1d_get_cpu_flags_loongarch();
 #elif ARCH_PPC64LE
     dav1d_cpu_flags = dav1d_get_cpu_flags_ppc();
+#elif ARCH_RISCV
+    dav1d_cpu_flags = dav1d_get_cpu_flags_riscv();
 #elif ARCH_X86
     dav1d_cpu_flags = dav1d_get_cpu_flags_x86();
 #endif

@@ -37,8 +37,12 @@
 
 #if ARCH_AARCH64 || ARCH_ARM
 #include "src/arm/cpu.h"
+#elif ARCH_LOONGARCH
+#include "src/loongarch/cpu.h"
 #elif ARCH_PPC64LE
 #include "src/ppc/cpu.h"
+#elif ARCH_RISCV
+#include "src/riscv/cpu.h"
 #elif ARCH_X86
 #include "src/x86/cpu.h"
 #endif

@@ -64,6 +68,10 @@ static ALWAYS_INLINE unsigned dav1d_get_cpu_flags(void) {
 #if defined(__VSX__)
     flags |= DAV1D_PPC_CPU_FLAG_VSX;
 #endif
+#elif ARCH_RISCV
+#if defined(__riscv_v)
+    flags |= DAV1D_RISCV_CPU_FLAG_V;
+#endif
 #elif ARCH_X86
 #if defined(__AVX512F__) && defined(__AVX512CD__) && \
     defined(__AVX512BW__) && defined(__AVX512DQ__) && \

@@ -183,6 +183,10 @@ static void inv_txfm_add_wht_wht_4x4_c(pixel *dst, const ptrdiff_t stride,
 #if HAVE_ASM
 #if ARCH_AARCH64 || ARCH_ARM
 #include "src/arm/itx.h"
+#elif ARCH_LOONGARCH64
+#include "src/loongarch/itx.h"
+#elif ARCH_RISCV
+#include "src/riscv/itx.h"
 #elif ARCH_X86
 #include "src/x86/itx.h"
 #endif

@@ -257,6 +261,12 @@ COLD void bitfn(dav1d_itx_dsp_init)(Dav1dInvTxfmDSPContext *const c, int bpc) {
 #if ARCH_AARCH64 || ARCH_ARM
     itx_dsp_init_arm(c, bpc);
 #endif
+#if ARCH_LOONGARCH64
+    itx_dsp_init_loongarch(c, bpc);
+#endif
+#if ARCH_RISCV
+    itx_dsp_init_riscv(c, bpc);
+#endif
 #if ARCH_X86
     itx_dsp_init_x86(c, bpc);
 #endif

@@ -0,0 +1,47 @@
/*
 * Copyright © 2023, VideoLAN and dav1d authors
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice, this
 *    list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 *    this list of conditions and the following disclaimer in the documentation
 *    and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include "config.h"
#include "common/attributes.h"
#include "src/loongarch/cpu.h"

#if defined(HAVE_GETAUXVAL)
#include <sys/auxv.h>

#define LA_HWCAP_LSX  ( 1 << 4 )
#define LA_HWCAP_LASX ( 1 << 5 )
#endif

COLD unsigned dav1d_get_cpu_flags_loongarch(void) {
    unsigned flags = 0;
#if defined(HAVE_GETAUXVAL)
    unsigned long hw_cap = getauxval(AT_HWCAP);
    flags |= (hw_cap & LA_HWCAP_LSX) ? DAV1D_LOONGARCH_CPU_FLAG_LSX : 0;
    flags |= (hw_cap & LA_HWCAP_LASX) ? DAV1D_LOONGARCH_CPU_FLAG_LASX : 0;
#endif

    return flags;
}
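
For context, a minimal self-contained sketch (not part of this commit) of how the flags returned above are consumed. The enum values and the query function are mirrored from the new LoongArch cpu files in this commit; the main() harness is hypothetical:

    #include <stdio.h>

    /* Mirrored from the new src/loongarch/cpu.h below; assumes linking
     * against src/loongarch/cpu.c on a LoongArch build. */
    enum CpuFlags {
        DAV1D_LOONGARCH_CPU_FLAG_LSX  = 1 << 0,
        DAV1D_LOONGARCH_CPU_FLAG_LASX = 1 << 1,
    };
    unsigned dav1d_get_cpu_flags_loongarch(void);

    int main(void) {
        const unsigned flags = dav1d_get_cpu_flags_loongarch();
        /* dav1d only installs SIMD entry points when the matching bit is set. */
        printf("LSX:  %s\n", (flags & DAV1D_LOONGARCH_CPU_FLAG_LSX)  ? "yes" : "no");
        printf("LASX: %s\n", (flags & DAV1D_LOONGARCH_CPU_FLAG_LASX) ? "yes" : "no");
        return 0;
    }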

@@ -0,0 +1,37 @@
/*
 * Copyright © 2023, VideoLAN and dav1d authors
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice, this
 *    list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 *    this list of conditions and the following disclaimer in the documentation
 *    and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#ifndef DAV1D_SRC_LOONGARCH_CPU_H
#define DAV1D_SRC_LOONGARCH_CPU_H

enum CpuFlags {
    DAV1D_LOONGARCH_CPU_FLAG_LSX  = 1 << 0,
    DAV1D_LOONGARCH_CPU_FLAG_LASX = 1 << 1,
};

unsigned dav1d_get_cpu_flags_loongarch(void);

#endif /* DAV1D_SRC_LOONGARCH_CPU_H */

(File diff suppressed because it is too large.)

@@ -0,0 +1,195 @@
/*
 * Copyright © 2023, VideoLAN and dav1d authors
 * Copyright © 2023, Loongson Technology Corporation Limited
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice, this
 *    list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 *    this list of conditions and the following disclaimer in the documentation
 *    and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#ifndef DAV1D_SRC_LOONGARCH_ITX_H
#define DAV1D_SRC_LOONGARCH_ITX_H

#include "src/cpu.h"
#include "src/itx.h"

decl_itx_fn(BF(dav1d_inv_txfm_add_wht_wht_4x4, lsx));
decl_itx_fn(BF(dav1d_inv_txfm_add_dct_dct_4x4, lsx));
decl_itx_fn(BF(dav1d_inv_txfm_add_identity_identity_4x4, lsx));
decl_itx_fn(BF(dav1d_inv_txfm_add_adst_dct_4x4, lsx));
decl_itx_fn(BF(dav1d_inv_txfm_add_dct_adst_4x4, lsx));
decl_itx_fn(BF(dav1d_inv_txfm_add_adst_adst_4x4, lsx));
decl_itx_fn(BF(dav1d_inv_txfm_add_dct_flipadst_4x4, lsx));
decl_itx_fn(BF(dav1d_inv_txfm_add_flipadst_adst_4x4, lsx));
decl_itx_fn(BF(dav1d_inv_txfm_add_adst_flipadst_4x4, lsx));
decl_itx_fn(BF(dav1d_inv_txfm_add_flipadst_dct_4x4, lsx));
decl_itx_fn(BF(dav1d_inv_txfm_add_flipadst_flipadst_4x4, lsx));
decl_itx_fn(BF(dav1d_inv_txfm_add_dct_identity_4x4, lsx));
decl_itx_fn(BF(dav1d_inv_txfm_add_identity_dct_4x4, lsx));
decl_itx_fn(BF(dav1d_inv_txfm_add_flipadst_identity_4x4, lsx));
decl_itx_fn(BF(dav1d_inv_txfm_add_identity_flipadst_4x4, lsx));
decl_itx_fn(BF(dav1d_inv_txfm_add_identity_adst_4x4, lsx));
decl_itx_fn(BF(dav1d_inv_txfm_add_adst_identity_4x4, lsx));

decl_itx_fn(BF(dav1d_inv_txfm_add_dct_dct_4x8, lsx));

decl_itx_fn(BF(dav1d_inv_txfm_add_dct_dct_8x4, lsx));
decl_itx_fn(BF(dav1d_inv_txfm_add_identity_identity_8x4, lsx));
decl_itx_fn(BF(dav1d_inv_txfm_add_adst_dct_8x4, lsx));
decl_itx_fn(BF(dav1d_inv_txfm_add_dct_adst_8x4, lsx));
decl_itx_fn(BF(dav1d_inv_txfm_add_adst_adst_8x4, lsx));
decl_itx_fn(BF(dav1d_inv_txfm_add_flipadst_adst_8x4, lsx));
decl_itx_fn(BF(dav1d_inv_txfm_add_adst_flipadst_8x4, lsx));
decl_itx_fn(BF(dav1d_inv_txfm_add_flipadst_dct_8x4, lsx));
decl_itx_fn(BF(dav1d_inv_txfm_add_dct_flipadst_8x4, lsx));
decl_itx_fn(BF(dav1d_inv_txfm_add_flipadst_flipadst_8x4, lsx));
decl_itx_fn(BF(dav1d_inv_txfm_add_dct_identity_8x4, lsx));
decl_itx_fn(BF(dav1d_inv_txfm_add_identity_dct_8x4, lsx));
decl_itx_fn(BF(dav1d_inv_txfm_add_flipadst_identity_8x4, lsx));
decl_itx_fn(BF(dav1d_inv_txfm_add_identity_flipadst_8x4, lsx));
decl_itx_fn(BF(dav1d_inv_txfm_add_adst_identity_8x4, lsx));
decl_itx_fn(BF(dav1d_inv_txfm_add_identity_adst_8x4, lsx));

decl_itx_fn(BF(dav1d_inv_txfm_add_dct_dct_8x8, lsx));
decl_itx_fn(BF(dav1d_inv_txfm_add_identity_identity_8x8, lsx));
decl_itx_fn(BF(dav1d_inv_txfm_add_adst_dct_8x8, lsx));
decl_itx_fn(BF(dav1d_inv_txfm_add_dct_adst_8x8, lsx));
decl_itx_fn(BF(dav1d_inv_txfm_add_adst_adst_8x8, lsx));
decl_itx_fn(BF(dav1d_inv_txfm_add_flipadst_adst_8x8, lsx));
decl_itx_fn(BF(dav1d_inv_txfm_add_adst_flipadst_8x8, lsx));
decl_itx_fn(BF(dav1d_inv_txfm_add_flipadst_dct_8x8, lsx));
decl_itx_fn(BF(dav1d_inv_txfm_add_dct_flipadst_8x8, lsx));
decl_itx_fn(BF(dav1d_inv_txfm_add_identity_adst_8x8, lsx));
decl_itx_fn(BF(dav1d_inv_txfm_add_flipadst_identity_8x8, lsx));
decl_itx_fn(BF(dav1d_inv_txfm_add_dct_identity_8x8, lsx));
decl_itx_fn(BF(dav1d_inv_txfm_add_identity_dct_8x8, lsx));
decl_itx_fn(BF(dav1d_inv_txfm_add_identity_flipadst_8x8, lsx));
decl_itx_fn(BF(dav1d_inv_txfm_add_adst_identity_8x8, lsx));
decl_itx_fn(BF(dav1d_inv_txfm_add_flipadst_flipadst_8x8, lsx));

decl_itx_fn(BF(dav1d_inv_txfm_add_dct_dct_8x16, lsx));
decl_itx_fn(BF(dav1d_inv_txfm_add_identity_identity_8x16, lsx));
decl_itx_fn(BF(dav1d_inv_txfm_add_adst_dct_8x16, lsx));
decl_itx_fn(BF(dav1d_inv_txfm_add_dct_adst_8x16, lsx));

decl_itx_fn(BF(dav1d_inv_txfm_add_dct_dct_16x8, lsx));
decl_itx_fn(BF(dav1d_inv_txfm_add_adst_dct_16x8, lsx));

decl_itx_fn(BF(dav1d_inv_txfm_add_dct_dct_16x16, lsx));
decl_itx_fn(BF(dav1d_inv_txfm_add_adst_adst_16x16, lsx));
decl_itx_fn(BF(dav1d_inv_txfm_add_adst_dct_16x16, lsx));
decl_itx_fn(BF(dav1d_inv_txfm_add_dct_adst_16x16, lsx));
decl_itx_fn(BF(dav1d_inv_txfm_add_flipadst_dct_16x16, lsx));
decl_itx_fn(BF(dav1d_inv_txfm_add_dct_flipadst_16x16, lsx));

decl_itx_fn(BF(dav1d_inv_txfm_add_dct_dct_8x32, lsx));
decl_itx_fn(BF(dav1d_inv_txfm_add_dct_dct_32x32, lsx));

decl_itx_fn(BF(dav1d_inv_txfm_add_dct_dct_32x32, lsx));

decl_itx_fn(BF(dav1d_inv_txfm_add_dct_dct_64x64, lsx));

static ALWAYS_INLINE void itx_dsp_init_loongarch(Dav1dInvTxfmDSPContext *const c, int bpc) {
#if BITDEPTH == 8
    const unsigned flags = dav1d_get_cpu_flags();

    if (!(flags & DAV1D_LOONGARCH_CPU_FLAG_LSX)) return;

    if (BITDEPTH != 8) return;

    c->itxfm_add[TX_4X4][WHT_WHT] = dav1d_inv_txfm_add_wht_wht_4x4_8bpc_lsx;
    c->itxfm_add[TX_4X4][DCT_DCT] = dav1d_inv_txfm_add_dct_dct_4x4_8bpc_lsx;
    c->itxfm_add[TX_4X4][IDTX] = dav1d_inv_txfm_add_identity_identity_4x4_8bpc_lsx;
    c->itxfm_add[TX_4X4][DCT_ADST] = dav1d_inv_txfm_add_adst_dct_4x4_8bpc_lsx;
    c->itxfm_add[TX_4X4][ADST_DCT] = dav1d_inv_txfm_add_dct_adst_4x4_8bpc_lsx;
    c->itxfm_add[TX_4X4][ADST_ADST] = dav1d_inv_txfm_add_adst_adst_4x4_8bpc_lsx;
    c->itxfm_add[TX_4X4][FLIPADST_DCT] = dav1d_inv_txfm_add_dct_flipadst_4x4_8bpc_lsx;
    c->itxfm_add[TX_4X4][ADST_FLIPADST] = dav1d_inv_txfm_add_flipadst_adst_4x4_8bpc_lsx;
    c->itxfm_add[TX_4X4][FLIPADST_ADST] = dav1d_inv_txfm_add_adst_flipadst_4x4_8bpc_lsx;
    c->itxfm_add[TX_4X4][DCT_FLIPADST] = dav1d_inv_txfm_add_flipadst_dct_4x4_8bpc_lsx;
    c->itxfm_add[TX_4X4][FLIPADST_FLIPADST] = dav1d_inv_txfm_add_flipadst_flipadst_4x4_8bpc_lsx;
    c->itxfm_add[TX_4X4][H_DCT] = dav1d_inv_txfm_add_dct_identity_4x4_8bpc_lsx;
    c->itxfm_add[TX_4X4][V_DCT] = dav1d_inv_txfm_add_identity_dct_4x4_8bpc_lsx;
    c->itxfm_add[TX_4X4][H_FLIPADST] = dav1d_inv_txfm_add_flipadst_identity_4x4_8bpc_lsx;
    c->itxfm_add[TX_4X4][V_FLIPADST] = dav1d_inv_txfm_add_identity_flipadst_4x4_8bpc_lsx;
    c->itxfm_add[TX_4X4][V_ADST] = dav1d_inv_txfm_add_identity_adst_4x4_8bpc_lsx;
    c->itxfm_add[TX_4X4][H_ADST] = dav1d_inv_txfm_add_adst_identity_4x4_8bpc_lsx;

    c->itxfm_add[RTX_4X8][DCT_DCT] = dav1d_inv_txfm_add_dct_dct_4x8_8bpc_lsx;

    c->itxfm_add[RTX_8X4][DCT_DCT] = dav1d_inv_txfm_add_dct_dct_8x4_8bpc_lsx;
    c->itxfm_add[RTX_8X4][IDTX] = dav1d_inv_txfm_add_identity_identity_8x4_8bpc_lsx;
    c->itxfm_add[RTX_8X4][DCT_ADST] = dav1d_inv_txfm_add_adst_dct_8x4_8bpc_lsx;
    c->itxfm_add[RTX_8X4][ADST_DCT] = dav1d_inv_txfm_add_dct_adst_8x4_8bpc_lsx;
    c->itxfm_add[RTX_8X4][ADST_ADST] = dav1d_inv_txfm_add_adst_adst_8x4_8bpc_lsx;
    c->itxfm_add[RTX_8X4][ADST_FLIPADST] = dav1d_inv_txfm_add_flipadst_adst_8x4_8bpc_lsx;
    c->itxfm_add[RTX_8X4][FLIPADST_ADST] = dav1d_inv_txfm_add_adst_flipadst_8x4_8bpc_lsx;
    c->itxfm_add[RTX_8X4][DCT_FLIPADST] = dav1d_inv_txfm_add_flipadst_dct_8x4_8bpc_lsx;
    c->itxfm_add[RTX_8X4][FLIPADST_DCT] = dav1d_inv_txfm_add_dct_flipadst_8x4_8bpc_lsx;
    c->itxfm_add[RTX_8X4][FLIPADST_FLIPADST] = dav1d_inv_txfm_add_flipadst_flipadst_8x4_8bpc_lsx;
    c->itxfm_add[RTX_8X4][H_DCT] = dav1d_inv_txfm_add_dct_identity_8x4_8bpc_lsx;
    c->itxfm_add[RTX_8X4][V_DCT] = dav1d_inv_txfm_add_identity_dct_8x4_8bpc_lsx;
    c->itxfm_add[RTX_8X4][H_FLIPADST] = dav1d_inv_txfm_add_flipadst_identity_8x4_8bpc_lsx;
    c->itxfm_add[RTX_8X4][V_FLIPADST] = dav1d_inv_txfm_add_identity_flipadst_8x4_8bpc_lsx;
    c->itxfm_add[RTX_8X4][H_ADST] = dav1d_inv_txfm_add_adst_identity_8x4_8bpc_lsx;
    c->itxfm_add[RTX_8X4][V_ADST] = dav1d_inv_txfm_add_identity_adst_8x4_8bpc_lsx;

    c->itxfm_add[TX_8X8][DCT_DCT] = dav1d_inv_txfm_add_dct_dct_8x8_8bpc_lsx;
    c->itxfm_add[TX_8X8][IDTX] = dav1d_inv_txfm_add_identity_identity_8x8_8bpc_lsx;
    c->itxfm_add[TX_8X8][DCT_ADST] = dav1d_inv_txfm_add_adst_dct_8x8_8bpc_lsx;
    c->itxfm_add[TX_8X8][ADST_DCT] = dav1d_inv_txfm_add_dct_adst_8x8_8bpc_lsx;
    c->itxfm_add[TX_8X8][ADST_ADST] = dav1d_inv_txfm_add_adst_adst_8x8_8bpc_lsx;
    c->itxfm_add[TX_8X8][ADST_FLIPADST] = dav1d_inv_txfm_add_flipadst_adst_8x8_8bpc_lsx;
    c->itxfm_add[TX_8X8][FLIPADST_ADST] = dav1d_inv_txfm_add_adst_flipadst_8x8_8bpc_lsx;
    c->itxfm_add[TX_8X8][DCT_FLIPADST] = dav1d_inv_txfm_add_flipadst_dct_8x8_8bpc_lsx;
    c->itxfm_add[TX_8X8][FLIPADST_DCT] = dav1d_inv_txfm_add_dct_flipadst_8x8_8bpc_lsx;
    c->itxfm_add[TX_8X8][FLIPADST_FLIPADST] = dav1d_inv_txfm_add_flipadst_flipadst_8x8_8bpc_lsx;
    c->itxfm_add[TX_8X8][H_DCT] = dav1d_inv_txfm_add_dct_identity_8x8_8bpc_lsx;
    c->itxfm_add[TX_8X8][V_DCT] = dav1d_inv_txfm_add_identity_dct_8x8_8bpc_lsx;
    c->itxfm_add[TX_8X8][H_FLIPADST] = dav1d_inv_txfm_add_flipadst_identity_8x8_8bpc_lsx;
    c->itxfm_add[TX_8X8][V_FLIPADST] = dav1d_inv_txfm_add_identity_flipadst_8x8_8bpc_lsx;
    c->itxfm_add[TX_8X8][H_ADST] = dav1d_inv_txfm_add_adst_identity_8x8_8bpc_lsx;
    c->itxfm_add[TX_8X8][V_ADST] = dav1d_inv_txfm_add_identity_adst_8x8_8bpc_lsx;

    c->itxfm_add[RTX_8X16][DCT_DCT] = dav1d_inv_txfm_add_dct_dct_8x16_8bpc_lsx;
    c->itxfm_add[RTX_8X16][IDTX] = dav1d_inv_txfm_add_identity_identity_8x16_8bpc_lsx;
    c->itxfm_add[RTX_8X16][DCT_ADST] = dav1d_inv_txfm_add_adst_dct_8x16_8bpc_lsx;
    c->itxfm_add[RTX_8X16][ADST_DCT] = dav1d_inv_txfm_add_dct_adst_8x16_8bpc_lsx;

    c->itxfm_add[RTX_16X8][DCT_DCT] = dav1d_inv_txfm_add_dct_dct_16x8_8bpc_lsx;
    c->itxfm_add[RTX_16X8][DCT_ADST] = dav1d_inv_txfm_add_adst_dct_16x8_8bpc_lsx;

    c->itxfm_add[TX_16X16][DCT_DCT] = dav1d_inv_txfm_add_dct_dct_16x16_8bpc_lsx;
    c->itxfm_add[TX_16X16][ADST_ADST] = dav1d_inv_txfm_add_adst_adst_16x16_8bpc_lsx;
    c->itxfm_add[TX_16X16][DCT_ADST] = dav1d_inv_txfm_add_adst_dct_16x16_8bpc_lsx;
    c->itxfm_add[TX_16X16][ADST_DCT] = dav1d_inv_txfm_add_dct_adst_16x16_8bpc_lsx;
    c->itxfm_add[TX_16X16][DCT_FLIPADST] = dav1d_inv_txfm_add_flipadst_dct_16x16_8bpc_lsx;
    c->itxfm_add[TX_16X16][FLIPADST_DCT] = dav1d_inv_txfm_add_dct_flipadst_16x16_8bpc_lsx;

    c->itxfm_add[RTX_8X32][DCT_DCT] = dav1d_inv_txfm_add_dct_dct_8x32_8bpc_lsx;

    c->itxfm_add[TX_32X32][DCT_DCT] = dav1d_inv_txfm_add_dct_dct_32x32_8bpc_lsx;

    c->itxfm_add[TX_64X64][DCT_DCT] = dav1d_inv_txfm_add_dct_dct_64x64_8bpc_lsx;
#endif
}

#endif /* DAV1D_SRC_LOONGARCH_ITX_H */
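
The initializer above follows dav1d's usual DSP-dispatch pattern: a table of function pointers indexed by transform size and type, pre-filled with C fallbacks, then selectively overwritten when the CPU flag is present. A reduced, self-contained C sketch of that pattern (all names here are illustrative, not dav1d's):

    #include <stddef.h>
    #include <stdint.h>
    #include <stdio.h>

    typedef void (*itx_fn)(uint8_t *dst, ptrdiff_t stride);

    static void dct_dct_4x4_c(uint8_t *dst, ptrdiff_t stride)   { (void)dst; (void)stride; puts("C 4x4"); }
    static void dct_dct_4x4_lsx(uint8_t *dst, ptrdiff_t stride) { (void)dst; (void)stride; puts("LSX 4x4"); }

    enum { TX_4X4, N_TX_SIZES };
    enum { DCT_DCT, N_TX_TYPES };

    int main(void) {
        itx_fn itxfm_add[N_TX_SIZES][N_TX_TYPES];
        const int have_lsx = 1;                           /* from the CPU-flag query */

        itxfm_add[TX_4X4][DCT_DCT] = dct_dct_4x4_c;       /* C fallback first */
        if (have_lsx)
            itxfm_add[TX_4X4][DCT_DCT] = dct_dct_4x4_lsx; /* SIMD override */

        itxfm_add[TX_4X4][DCT_DCT](NULL, 0);              /* dispatch through the table */
        return 0;
    }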

@@ -0,0 +1,776 @@
/*********************************************************************
 * Copyright (c) 2022 Loongson Technology Corporation Limited
 * Contributed by Gu Xiwei(guxiwei-hf@loongson.cn)
 *                Shiyou Yin(yinshiyou-hf@loongson.cn)
 *
 * Permission to use, copy, modify, and/or distribute this software for any
 * purpose with or without fee is hereby granted, provided that the above
 * copyright notice and this permission notice appear in all copies.
 *
 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
 *********************************************************************/

/*
 * This file is a LoongArch assembly helper file, available under the ISC
 * license. It provides a large number of macros and aliases to simplify
 * writing assembly code, especially for LSX and LASX optimizations.
 *
 * Anyone can modify it or add new features for his/her own purposes.
 * Contributing a patch will be appreciated as it might be useful for
 * others as well. Send patches to the Loongson contributors mentioned above.
 *
 * MAJOR version: Usage changes, incompatible with previous version.
 * MINOR version: Add new macros/functions, or bug fixes.
 * MICRO version: Comment changes or implementation changes.
 */

#define LML_VERSION_MAJOR 0
#define LML_VERSION_MINOR 4
#define LML_VERSION_MICRO 0

#define DEFAULT_ALIGN 5

/* Set prefix as needed. */
#ifndef PRIVATE_PREFIX
#define PRIVATE_PREFIX dav1d_
#endif

#define PASTE(a,b) a ## b
#define CONCAT(a,b) PASTE(a,b)

#ifdef PREFIX
#define ASM_PREF CONCAT(_,PRIVATE_PREFIX)
#else
#define ASM_PREF PRIVATE_PREFIX
#endif

.macro function name, align=DEFAULT_ALIGN
.macro endfunc
    jirl $r0, $r1, 0x0
    .size ASM_PREF\name, . - ASM_PREF\name
.purgem endfunc
.endm
.text ;
.align \align ;
.globl ASM_PREF\name ;
.type ASM_PREF\name, @function ;
ASM_PREF\name: ;
.endm

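For orientation, a hypothetical `function foo` ... `endfunc` pair expands to `.text`, `.align 5`, `.globl dav1d_foo`, `.type dav1d_foo, @function` and the `dav1d_foo:` label, with `endfunc` emitting the `jirl $r0, $r1, 0x0` return and the matching `.size` directive (the symbol becomes `_dav1d_foo` when `PREFIX` is defined).
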
.macro const name, align=DEFAULT_ALIGN
.macro endconst
    .size \name, . - \name
.purgem endconst
.endm
.section .rodata
.align \align
\name:
.endm

/*
 *============================================================================
 * LoongArch register alias
 *============================================================================
 */

#define a0 $a0
#define a1 $a1
#define a2 $a2
#define a3 $a3
#define a4 $a4
#define a5 $a5
#define a6 $a6
#define a7 $a7

#define t0 $t0
#define t1 $t1
#define t2 $t2
#define t3 $t3
#define t4 $t4
#define t5 $t5
#define t6 $t6
#define t7 $t7
#define t8 $t8

#define s0 $s0
#define s1 $s1
#define s2 $s2
#define s3 $s3
#define s4 $s4
#define s5 $s5
#define s6 $s6
#define s7 $s7
#define s8 $s8

#define zero $zero
#define sp $sp
#define ra $ra

#define fa0 $fa0
#define fa1 $fa1
#define fa2 $fa2
#define fa3 $fa3
#define fa4 $fa4
#define fa5 $fa5
#define fa6 $fa6
#define fa7 $fa7
#define ft0 $ft0
#define ft1 $ft1
#define ft2 $ft2
#define ft3 $ft3
#define ft4 $ft4
#define ft5 $ft5
#define ft6 $ft6
#define ft7 $ft7
#define ft8 $ft8
#define ft9 $ft9
#define ft10 $ft10
#define ft11 $ft11
#define ft12 $ft12
#define ft13 $ft13
#define ft14 $ft14
#define ft15 $ft15
#define fs0 $fs0
#define fs1 $fs1
#define fs2 $fs2
#define fs3 $fs3
#define fs4 $fs4
#define fs5 $fs5
#define fs6 $fs6
#define fs7 $fs7

#define f0 $f0
#define f1 $f1
#define f2 $f2
#define f3 $f3
#define f4 $f4
#define f5 $f5
#define f6 $f6
#define f7 $f7
#define f8 $f8
#define f9 $f9
#define f10 $f10
#define f11 $f11
#define f12 $f12
#define f13 $f13
#define f14 $f14
#define f15 $f15
#define f16 $f16
#define f17 $f17
#define f18 $f18
#define f19 $f19
#define f20 $f20
#define f21 $f21
#define f22 $f22
#define f23 $f23
#define f24 $f24
#define f25 $f25
#define f26 $f26
#define f27 $f27
#define f28 $f28
#define f29 $f29
#define f30 $f30
#define f31 $f31

#define vr0 $vr0
#define vr1 $vr1
#define vr2 $vr2
#define vr3 $vr3
#define vr4 $vr4
#define vr5 $vr5
#define vr6 $vr6
#define vr7 $vr7
#define vr8 $vr8
#define vr9 $vr9
#define vr10 $vr10
#define vr11 $vr11
#define vr12 $vr12
#define vr13 $vr13
#define vr14 $vr14
#define vr15 $vr15
#define vr16 $vr16
#define vr17 $vr17
#define vr18 $vr18
#define vr19 $vr19
#define vr20 $vr20
#define vr21 $vr21
#define vr22 $vr22
#define vr23 $vr23
#define vr24 $vr24
#define vr25 $vr25
#define vr26 $vr26
#define vr27 $vr27
#define vr28 $vr28
#define vr29 $vr29
#define vr30 $vr30
#define vr31 $vr31

#define xr0 $xr0
#define xr1 $xr1
#define xr2 $xr2
#define xr3 $xr3
#define xr4 $xr4
#define xr5 $xr5
#define xr6 $xr6
#define xr7 $xr7
#define xr8 $xr8
#define xr9 $xr9
#define xr10 $xr10
#define xr11 $xr11
#define xr12 $xr12
#define xr13 $xr13
#define xr14 $xr14
#define xr15 $xr15
#define xr16 $xr16
#define xr17 $xr17
#define xr18 $xr18
#define xr19 $xr19
#define xr20 $xr20
#define xr21 $xr21
#define xr22 $xr22
#define xr23 $xr23
#define xr24 $xr24
#define xr25 $xr25
#define xr26 $xr26
#define xr27 $xr27
#define xr28 $xr28
#define xr29 $xr29
#define xr30 $xr30
#define xr31 $xr31

/*
 *============================================================================
 * LSX/LASX synthesize instructions
 *============================================================================
 */

/*
 * Description : Dot product of byte vector elements
 * Arguments   : Inputs  - vj, vk
 *               Outputs - vd
 *               Return Type - halfword
 */
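
In scalar terms, each halfword lane of the destination receives the sum of one even/odd pair of unsigned-byte products, wrapping modulo 2^16 like the underlying vmulwev/vmaddwod pair. A C model of the vdp2.h.bu macro below (illustrative, not part of the file):

    #include <stdint.h>

    /* vd.h[i] = vj.bu[2i] * vk.bu[2i] + vj.bu[2i+1] * vk.bu[2i+1]  (mod 2^16) */
    static void vdp2_h_bu_model(uint16_t vd[8], const uint8_t vj[16],
                                const uint8_t vk[16])
    {
        for (int i = 0; i < 8; i++)
            vd[i] = (uint16_t)(vj[2 * i] * vk[2 * i] +
                               vj[2 * i + 1] * vk[2 * i + 1]);
    }
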
.macro vdp2.h.bu vd, vj, vk
    vmulwev.h.bu \vd, \vj, \vk
    vmaddwod.h.bu \vd, \vj, \vk
.endm

.macro vdp2.h.bu.b vd, vj, vk
    vmulwev.h.bu.b \vd, \vj, \vk
    vmaddwod.h.bu.b \vd, \vj, \vk
.endm

.macro vdp2.w.h vd, vj, vk
    vmulwev.w.h \vd, \vj, \vk
    vmaddwod.w.h \vd, \vj, \vk
.endm

.macro xvdp2.h.bu xd, xj, xk
    xvmulwev.h.bu \xd, \xj, \xk
    xvmaddwod.h.bu \xd, \xj, \xk
.endm

.macro xvdp2.h.bu.b xd, xj, xk
    xvmulwev.h.bu.b \xd, \xj, \xk
    xvmaddwod.h.bu.b \xd, \xj, \xk
.endm

.macro xvdp2.w.h xd, xj, xk
    xvmulwev.w.h \xd, \xj, \xk
    xvmaddwod.w.h \xd, \xj, \xk
.endm

/*
 * Description : Dot product & addition of halfword vector elements
 * Arguments   : Inputs  - vj, vk
 *               Outputs - vd
 *               Return Type - twice size of input
 */
.macro vdp2add.h.bu vd, vj, vk
    vmaddwev.h.bu \vd, \vj, \vk
    vmaddwod.h.bu \vd, \vj, \vk
.endm

.macro vdp2add.h.bu.b vd, vj, vk
    vmaddwev.h.bu.b \vd, \vj, \vk
    vmaddwod.h.bu.b \vd, \vj, \vk
.endm

.macro vdp2add.w.h vd, vj, vk
    vmaddwev.w.h \vd, \vj, \vk
    vmaddwod.w.h \vd, \vj, \vk
.endm

.macro xvdp2add.h.bu.b xd, xj, xk
    xvmaddwev.h.bu.b \xd, \xj, \xk
    xvmaddwod.h.bu.b \xd, \xj, \xk
.endm

.macro xvdp2add.w.h xd, xj, xk
    xvmaddwev.w.h \xd, \xj, \xk
    xvmaddwod.w.h \xd, \xj, \xk
.endm

/*
 * Description : Clamp element vj[i] to the range vk[i] ~ va[i]
 *               clip: vj > vk ? vj : vk && vj < va ? vj : va
 */
.macro vclip.h vd, vj, vk, va
    vmax.h \vd, \vj, \vk
    vmin.h \vd, \vd, \va
.endm

.macro vclip.w vd, vj, vk, va
    vmax.w \vd, \vj, \vk
    vmin.w \vd, \vd, \va
.endm

.macro xvclip.h xd, xj, xk, xa
    xvmax.h \xd, \xj, \xk
    xvmin.h \xd, \xd, \xa
.endm

.macro xvclip.w xd, xj, xk, xa
    xvmax.w \xd, \xj, \xk
    xvmin.w \xd, \xd, \xa
.endm

/*
 * Description : Clamp element vj[i] to the range 0 ~ 255
 *               clip255: vj < 255 ? vj : 255 && vj > 0 ? vj : 0
 */
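
Equivalently in C: vmaxi.* zeroes negative lanes and vsat.*u with immediate 7 saturates to (1 << (7 + 1)) - 1 = 255. A scalar model of the vclip255.h macro below (illustrative):

    #include <stdint.h>

    static uint16_t clip255_model(int16_t v)
    {
        if (v < 0)   return 0;    /* vmaxi.h \vd, \vj, 0 */
        if (v > 255) return 255;  /* vsat.hu \vd, \vd, 7 */
        return (uint16_t)v;
    }
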
.macro vclip255.h vd, vj
    vmaxi.h \vd, \vj, 0
    vsat.hu \vd, \vd, 7
.endm

.macro vclip255.w vd, vj
    vmaxi.w \vd, \vj, 0
    vsat.wu \vd, \vd, 7
.endm

.macro xvclip255.h xd, xj
    xvmaxi.h \xd, \xj, 0
    xvsat.hu \xd, \xd, 7
.endm

.macro xvclip255.w xd, xj
    xvmaxi.w \xd, \xj, 0
    xvsat.wu \xd, \xd, 7
.endm

/*
 * Description : Store elements of vector
 *               vd : Data vector to be stored
 *               rk : Address of data storage
 *               ra : Offset of address
 *               si : Index of data in vd
 */
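
Note that each vstelmx.* macro below first advances the base register (`add.d \rk, \rk, \ra`) and only then stores element `\si` at offset 0, so the address register is updated in place, which is convenient for walking rows separated by a stride.
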
.macro vstelmx.b vd, rk, ra, si
    add.d \rk, \rk, \ra
    vstelm.b \vd, \rk, 0, \si
.endm

.macro vstelmx.h vd, rk, ra, si
    add.d \rk, \rk, \ra
    vstelm.h \vd, \rk, 0, \si
.endm

.macro vstelmx.w vd, rk, ra, si
    add.d \rk, \rk, \ra
    vstelm.w \vd, \rk, 0, \si
.endm

.macro vstelmx.d vd, rk, ra, si
    add.d \rk, \rk, \ra
    vstelm.d \vd, \rk, 0, \si
.endm

.macro vmov xd, xj
    vor.v \xd, \xj, \xj
.endm

.macro xmov xd, xj
    xvor.v \xd, \xj, \xj
.endm

.macro xvstelmx.d xd, rk, ra, si
    add.d \rk, \rk, \ra
    xvstelm.d \xd, \rk, 0, \si
.endm

/*
 *============================================================================
 * LSX/LASX custom macros
 *============================================================================
 */

/*
 * Load 4 float, double, V128, V256 elements with stride.
 */
.macro FLDS_LOADX_4 src, stride, stride2, stride3, out0, out1, out2, out3
    fld.s \out0, \src, 0
    fldx.s \out1, \src, \stride
    fldx.s \out2, \src, \stride2
    fldx.s \out3, \src, \stride3
.endm

.macro FLDD_LOADX_4 src, stride, stride2, stride3, out0, out1, out2, out3
    fld.d \out0, \src, 0
    fldx.d \out1, \src, \stride
    fldx.d \out2, \src, \stride2
    fldx.d \out3, \src, \stride3
.endm

.macro LSX_LOADX_4 src, stride, stride2, stride3, out0, out1, out2, out3
    vld \out0, \src, 0
    vldx \out1, \src, \stride
    vldx \out2, \src, \stride2
    vldx \out3, \src, \stride3
.endm

.macro LASX_LOADX_4 src, stride, stride2, stride3, out0, out1, out2, out3
    xvld \out0, \src, 0
    xvldx \out1, \src, \stride
    xvldx \out2, \src, \stride2
    xvldx \out3, \src, \stride3
.endm

/*
 * Description : Transpose 4x4 block with half-word elements in vectors
 * Arguments   : Inputs  - in0, in1, in2, in3
 *               Outputs - out0, out1, out2, out3
 */
.macro LSX_TRANSPOSE4x4_H in0, in1, in2, in3, out0, out1, out2, out3, \
                          tmp0, tmp1
    vilvl.h \tmp0, \in1, \in0
    vilvl.h \tmp1, \in3, \in2
    vilvl.w \out0, \tmp1, \tmp0
    vilvh.w \out2, \tmp1, \tmp0
    vilvh.d \out1, \out0, \out0
    vilvh.d \out3, \out0, \out2
.endm

/*
 * Description : Transpose 4x4 block with word elements in vectors
 * Arguments   : Inputs  - in0, in1, in2, in3
 *               Outputs - out0, out1, out2, out3
 * Details     :
 * Example     :
 *               1, 2, 3, 4            1, 5, 9,13
 *               5, 6, 7, 8    to      2, 6,10,14
 *               9,10,11,12   =====>   3, 7,11,15
 *              13,14,15,16            4, 8,12,16
 */
.macro LSX_TRANSPOSE4x4_W in0, in1, in2, in3, out0, out1, out2, out3, \
                          tmp0, tmp1

    vilvl.w \tmp0, \in1, \in0
    vilvh.w \out1, \in1, \in0
    vilvl.w \tmp1, \in3, \in2
    vilvh.w \out3, \in3, \in2

    vilvl.d \out0, \tmp1, \tmp0
    vilvl.d \out2, \out3, \out1
    vilvh.d \out3, \out3, \out1
    vilvh.d \out1, \tmp1, \tmp0
.endm

/*
 * Description : Transpose 8x8 block with half-word elements in vectors
 * Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7
 *               Outputs - out0, out1, out2, out3, out4, out5, out6, out7
 */
.macro LSX_TRANSPOSE8x8_H in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \
                          out2, out3, out4, out5, out6, out7, tmp0, tmp1, tmp2, \
                          tmp3, tmp4, tmp5, tmp6, tmp7
    vilvl.h \tmp0, \in6, \in4
    vilvl.h \tmp1, \in7, \in5
    vilvl.h \tmp2, \in2, \in0
    vilvl.h \tmp3, \in3, \in1

    vilvl.h \tmp4, \tmp1, \tmp0
    vilvh.h \tmp5, \tmp1, \tmp0
    vilvl.h \tmp6, \tmp3, \tmp2
    vilvh.h \tmp7, \tmp3, \tmp2

    vilvh.h \tmp0, \in6, \in4
    vilvh.h \tmp1, \in7, \in5
    vilvh.h \tmp2, \in2, \in0
    vilvh.h \tmp3, \in3, \in1

    vpickev.d \out0, \tmp4, \tmp6
    vpickod.d \out1, \tmp4, \tmp6
    vpickev.d \out2, \tmp5, \tmp7
    vpickod.d \out3, \tmp5, \tmp7

    vilvl.h \tmp4, \tmp1, \tmp0
    vilvh.h \tmp5, \tmp1, \tmp0
    vilvl.h \tmp6, \tmp3, \tmp2
    vilvh.h \tmp7, \tmp3, \tmp2

    vpickev.d \out4, \tmp4, \tmp6
    vpickod.d \out5, \tmp4, \tmp6
    vpickev.d \out6, \tmp5, \tmp7
    vpickod.d \out7, \tmp5, \tmp7
.endm

/*
 * Description : Transpose 16x8 block with byte elements in vectors
 * Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7
 *               Outputs - out0, out1, out2, out3, out4, out5, out6, out7
 */
.macro LASX_TRANSPOSE16X8_B in0, in1, in2, in3, in4, in5, in6, in7, \
                            in8, in9, in10, in11, in12, in13, in14, in15, \
                            out0, out1, out2, out3, out4, out5, out6, out7,\
                            tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7
    xvilvl.b \tmp0, \in2, \in0
    xvilvl.b \tmp1, \in3, \in1
    xvilvl.b \tmp2, \in6, \in4
    xvilvl.b \tmp3, \in7, \in5
    xvilvl.b \tmp4, \in10, \in8
    xvilvl.b \tmp5, \in11, \in9
    xvilvl.b \tmp6, \in14, \in12
    xvilvl.b \tmp7, \in15, \in13
    xvilvl.b \out0, \tmp1, \tmp0
    xvilvh.b \out1, \tmp1, \tmp0
    xvilvl.b \out2, \tmp3, \tmp2
    xvilvh.b \out3, \tmp3, \tmp2
    xvilvl.b \out4, \tmp5, \tmp4
    xvilvh.b \out5, \tmp5, \tmp4
    xvilvl.b \out6, \tmp7, \tmp6
    xvilvh.b \out7, \tmp7, \tmp6
    xvilvl.w \tmp0, \out2, \out0
    xvilvh.w \tmp2, \out2, \out0
    xvilvl.w \tmp4, \out3, \out1
    xvilvh.w \tmp6, \out3, \out1
    xvilvl.w \tmp1, \out6, \out4
    xvilvh.w \tmp3, \out6, \out4
    xvilvl.w \tmp5, \out7, \out5
    xvilvh.w \tmp7, \out7, \out5
    xvilvl.d \out0, \tmp1, \tmp0
    xvilvh.d \out1, \tmp1, \tmp0
    xvilvl.d \out2, \tmp3, \tmp2
    xvilvh.d \out3, \tmp3, \tmp2
    xvilvl.d \out4, \tmp5, \tmp4
    xvilvh.d \out5, \tmp5, \tmp4
    xvilvl.d \out6, \tmp7, \tmp6
    xvilvh.d \out7, \tmp7, \tmp6
.endm

/*
 * Description : Transpose 4x4 block with half-word elements in vectors
 * Arguments   : Inputs  - in0, in1, in2, in3
 *               Outputs - out0, out1, out2, out3
 */
.macro LASX_TRANSPOSE4x4_H in0, in1, in2, in3, out0, out1, out2, out3, \
                           tmp0, tmp1
    xvilvl.h \tmp0, \in1, \in0
    xvilvl.h \tmp1, \in3, \in2
    xvilvl.w \out0, \tmp1, \tmp0
    xvilvh.w \out2, \tmp1, \tmp0
    xvilvh.d \out1, \out0, \out0
    xvilvh.d \out3, \out0, \out2
.endm

/*
 * Description : Transpose 4x8 block with half-word elements in vectors
 * Arguments   : Inputs  - in0, in1, in2, in3
 *               Outputs - out0, out1, out2, out3
 */
.macro LASX_TRANSPOSE4x8_H in0, in1, in2, in3, out0, out1, out2, out3, \
                           tmp0, tmp1
    xvilvl.h \tmp0, \in2, \in0
    xvilvl.h \tmp1, \in3, \in1
    xvilvl.h \out2, \tmp1, \tmp0
    xvilvh.h \out3, \tmp1, \tmp0

    xvilvl.d \out0, \out2, \out2
    xvilvh.d \out1, \out2, \out2
    xvilvl.d \out2, \out3, \out3
    xvilvh.d \out3, \out3, \out3
.endm

/*
 * Description : Transpose 8x8 block with half-word elements in vectors
 * Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7
 *               Outputs - out0, out1, out2, out3, out4, out5, out6, out7
 */
.macro LASX_TRANSPOSE8x8_H in0, in1, in2, in3, in4, in5, in6, in7, \
                           out0, out1, out2, out3, out4, out5, out6, out7, \
                           tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7
    xvilvl.h \tmp0, \in6, \in4
    xvilvl.h \tmp1, \in7, \in5
    xvilvl.h \tmp2, \in2, \in0
    xvilvl.h \tmp3, \in3, \in1

    xvilvl.h \tmp4, \tmp1, \tmp0
    xvilvh.h \tmp5, \tmp1, \tmp0
    xvilvl.h \tmp6, \tmp3, \tmp2
    xvilvh.h \tmp7, \tmp3, \tmp2

    xvilvh.h \tmp0, \in6, \in4
    xvilvh.h \tmp1, \in7, \in5
    xvilvh.h \tmp2, \in2, \in0
    xvilvh.h \tmp3, \in3, \in1

    xvpickev.d \out0, \tmp4, \tmp6
    xvpickod.d \out1, \tmp4, \tmp6
    xvpickev.d \out2, \tmp5, \tmp7
    xvpickod.d \out3, \tmp5, \tmp7

    xvilvl.h \tmp4, \tmp1, \tmp0
    xvilvh.h \tmp5, \tmp1, \tmp0
    xvilvl.h \tmp6, \tmp3, \tmp2
    xvilvh.h \tmp7, \tmp3, \tmp2

    xvpickev.d \out4, \tmp4, \tmp6
    xvpickod.d \out5, \tmp4, \tmp6
    xvpickev.d \out6, \tmp5, \tmp7
    xvpickod.d \out7, \tmp5, \tmp7
.endm

/*
 * Description : Transpose 2x4x4 block with half-word elements in vectors
 * Arguments   : Inputs  - in0, in1, in2, in3
 *               Outputs - out0, out1, out2, out3
 */
.macro LASX_TRANSPOSE2x4x4_H in0, in1, in2, in3, out0, out1, out2, out3, \
                             tmp0, tmp1, tmp2
    xvilvh.h \tmp1, \in0, \in1
    xvilvl.h \out1, \in0, \in1
    xvilvh.h \tmp0, \in2, \in3
    xvilvl.h \out3, \in2, \in3

    xvilvh.w \tmp2, \out3, \out1
    xvilvl.w \out3, \out3, \out1

    xvilvl.w \out2, \tmp0, \tmp1
    xvilvh.w \tmp1, \tmp0, \tmp1

    xvilvh.d \out0, \out2, \out3
    xvilvl.d \out2, \out2, \out3
    xvilvh.d \out1, \tmp1, \tmp2
    xvilvl.d \out3, \tmp1, \tmp2
.endm

/*
 * Description : Transpose 4x4 block with word elements in vectors
 * Arguments   : Inputs  - in0, in1, in2, in3
 *               Outputs - out0, out1, out2, out3
 * Details     :
 * Example     :
 *               1, 2, 3, 4,  1, 2, 3, 4           1,5, 9,13, 1,5, 9,13
 *               5, 6, 7, 8,  5, 6, 7, 8    to     2,6,10,14, 2,6,10,14
 *               9,10,11,12,  9,10,11,12  =====>   3,7,11,15, 3,7,11,15
 *              13,14,15,16, 13,14,15,16           4,8,12,16, 4,8,12,16
 */
.macro LASX_TRANSPOSE4x4_W in0, in1, in2, in3, out0, out1, out2, out3, \
                           tmp0, tmp1

    xvilvl.w \tmp0, \in1, \in0
    xvilvh.w \out1, \in1, \in0
    xvilvl.w \tmp1, \in3, \in2
    xvilvh.w \out3, \in3, \in2

    xvilvl.d \out0, \tmp1, \tmp0
    xvilvl.d \out2, \out3, \out1
    xvilvh.d \out3, \out3, \out1
    xvilvh.d \out1, \tmp1, \tmp0
.endm

/*
 * Description : Transpose 8x8 block with word elements in vectors
 * Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7
 *               Outputs - out0, out1, out2, out3, out4, out5, out6, out7
 * Example     : LASX_TRANSPOSE8x8_W
 *               in0 : 1,2,3,4,5,6,7,8
 *               in1 : 2,2,3,4,5,6,7,8
 *               in2 : 3,2,3,4,5,6,7,8
 *               in3 : 4,2,3,4,5,6,7,8
 *               in4 : 5,2,3,4,5,6,7,8
 *               in5 : 6,2,3,4,5,6,7,8
 *               in6 : 7,2,3,4,5,6,7,8
 *               in7 : 8,2,3,4,5,6,7,8
 *
 *               out0 : 1,2,3,4,5,6,7,8
 *               out1 : 2,2,2,2,2,2,2,2
 *               out2 : 3,3,3,3,3,3,3,3
 *               out3 : 4,4,4,4,4,4,4,4
 *               out4 : 5,5,5,5,5,5,5,5
 *               out5 : 6,6,6,6,6,6,6,6
 *               out6 : 7,7,7,7,7,7,7,7
 *               out7 : 8,8,8,8,8,8,8,8
 */
.macro LASX_TRANSPOSE8x8_W in0, in1, in2, in3, in4, in5, in6, in7,\
                           out0, out1, out2, out3, out4, out5, out6, out7,\
                           tmp0, tmp1, tmp2, tmp3
    xvilvl.w \tmp0, \in2, \in0
    xvilvl.w \tmp1, \in3, \in1
    xvilvh.w \tmp2, \in2, \in0
    xvilvh.w \tmp3, \in3, \in1
    xvilvl.w \out0, \tmp1, \tmp0
    xvilvh.w \out1, \tmp1, \tmp0
    xvilvl.w \out2, \tmp3, \tmp2
    xvilvh.w \out3, \tmp3, \tmp2

    xvilvl.w \tmp0, \in6, \in4
    xvilvl.w \tmp1, \in7, \in5
    xvilvh.w \tmp2, \in6, \in4
    xvilvh.w \tmp3, \in7, \in5
    xvilvl.w \out4, \tmp1, \tmp0
    xvilvh.w \out5, \tmp1, \tmp0
    xvilvl.w \out6, \tmp3, \tmp2
    xvilvh.w \out7, \tmp3, \tmp2

    xmov \tmp0, \out0
    xmov \tmp1, \out1
    xmov \tmp2, \out2
    xmov \tmp3, \out3
    xvpermi.q \out0, \out4, 0x02
    xvpermi.q \out1, \out5, 0x02
    xvpermi.q \out2, \out6, 0x02
    xvpermi.q \out3, \out7, 0x02
    xvpermi.q \out4, \tmp0, 0x31
    xvpermi.q \out5, \tmp1, 0x31
    xvpermi.q \out6, \tmp2, 0x31
    xvpermi.q \out7, \tmp3, 0x31
.endm

/*
 * Description : Transpose 4x4 block with double-word elements in vectors
 * Arguments   : Inputs  - in0, in1, in2, in3
 *               Outputs - out0, out1, out2, out3
 * Example     : LASX_TRANSPOSE4x4_D
 *               in0 : 1,2,3,4
 *               in1 : 1,2,3,4
 *               in2 : 1,2,3,4
 *               in3 : 1,2,3,4
 *
 *               out0 : 1,1,1,1
 *               out1 : 2,2,2,2
 *               out2 : 3,3,3,3
 *               out3 : 4,4,4,4
 */
.macro LASX_TRANSPOSE4x4_D in0, in1, in2, in3, out0, out1, out2, out3, \
                           tmp0, tmp1
    xvilvl.d \tmp0, \in1, \in0
    xvilvh.d \out1, \in1, \in0
    xvilvh.d \tmp1, \in3, \in2
    xvilvl.d \out2, \in3, \in2

    xvor.v \out0, \tmp0, \tmp0
    xvor.v \out3, \tmp1, \tmp1

    xvpermi.q \out0, \out2, 0x02
    xvpermi.q \out2, \tmp0, 0x31
    xvpermi.q \out3, \out1, 0x31
    xvpermi.q \out1, \tmp1, 0x02
.endm

(File diff suppressed because it is too large.)

@@ -0,0 +1,52 @@
/*
 * Copyright © 2023, VideoLAN and dav1d authors
 * Copyright © 2023, Loongson Technology Corporation Limited
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice, this
 *    list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 *    this list of conditions and the following disclaimer in the documentation
 *    and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#ifndef DAV1D_SRC_LOONGARCH_LOOPFILTER_H
#define DAV1D_SRC_LOONGARCH_LOOPFILTER_H

#include "src/cpu.h"
#include "src/loopfilter.h"

decl_loopfilter_sb_fn(BF(dav1d_lpf_h_sb_y, lsx));
decl_loopfilter_sb_fn(BF(dav1d_lpf_v_sb_y, lsx));
decl_loopfilter_sb_fn(BF(dav1d_lpf_h_sb_uv, lsx));
decl_loopfilter_sb_fn(BF(dav1d_lpf_v_sb_uv, lsx));

static ALWAYS_INLINE void loop_filter_dsp_init_loongarch(Dav1dLoopFilterDSPContext *const c) {
    const unsigned flags = dav1d_get_cpu_flags();

    if (!(flags & DAV1D_LOONGARCH_CPU_FLAG_LSX)) return;

#if BITDEPTH == 8
    c->loop_filter_sb[0][0] = BF(dav1d_lpf_h_sb_y, lsx);
    c->loop_filter_sb[0][1] = BF(dav1d_lpf_v_sb_y, lsx);
    c->loop_filter_sb[1][0] = BF(dav1d_lpf_h_sb_uv, lsx);
    c->loop_filter_sb[1][1] = BF(dav1d_lpf_v_sb_uv, lsx);
#endif
}

#endif /* DAV1D_SRC_LOONGARCH_LOOPFILTER_H */

(File diff suppressed because it is too large.)

@@ -0,0 +1,78 @@
/*
 * Copyright © 2023, VideoLAN and dav1d authors
 * Copyright © 2023, Loongson Technology Corporation Limited
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice, this
 *    list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 *    this list of conditions and the following disclaimer in the documentation
 *    and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#ifndef DAV1D_SRC_LOONGARCH_LOOPRESTORATION_H
#define DAV1D_SRC_LOONGARCH_LOOPRESTORATION_H

#include "common/intops.h"
#include "src/cpu.h"
#include "src/looprestoration.h"

void dav1d_wiener_filter_lsx(uint8_t *p, const ptrdiff_t stride,
                             const uint8_t (*const left)[4],
                             const uint8_t *lpf,
                             const int w, const int h,
                             const LooprestorationParams *const params,
                             const enum LrEdgeFlags edges HIGHBD_DECL_SUFFIX);

void dav1d_sgr_filter_3x3_lsx(pixel *p, const ptrdiff_t p_stride,
                              const pixel (*const left)[4],
                              const pixel *lpf,
                              const int w, const int h,
                              const LooprestorationParams *const params,
                              const enum LrEdgeFlags edges HIGHBD_DECL_SUFFIX);

void dav1d_sgr_filter_5x5_lsx(pixel *p, const ptrdiff_t p_stride,
                              const pixel (*const left)[4],
                              const pixel *lpf,
                              const int w, const int h,
                              const LooprestorationParams *const params,
                              const enum LrEdgeFlags edges HIGHBD_DECL_SUFFIX);

void dav1d_sgr_filter_mix_lsx(pixel *p, const ptrdiff_t p_stride,
                              const pixel (*const left)[4],
                              const pixel *lpf,
                              const int w, const int h,
                              const LooprestorationParams *const params,
                              const enum LrEdgeFlags edges HIGHBD_DECL_SUFFIX);

static ALWAYS_INLINE void loop_restoration_dsp_init_loongarch(Dav1dLoopRestorationDSPContext *const c, int bpc)
{
    const unsigned flags = dav1d_get_cpu_flags();

    if (!(flags & DAV1D_LOONGARCH_CPU_FLAG_LSX)) return;

#if BITDEPTH == 8
    c->wiener[0] = c->wiener[1] = dav1d_wiener_filter_lsx;

    c->sgr[0] = dav1d_sgr_filter_5x5_lsx;
    c->sgr[1] = dav1d_sgr_filter_3x3_lsx;
    c->sgr[2] = dav1d_sgr_filter_mix_lsx;
#endif
}

#endif /* DAV1D_SRC_LOONGARCH_LOOPRESTORATION_H */

@@ -0,0 +1,274 @@
/*
 * Copyright © 2023, VideoLAN and dav1d authors
 * Copyright © 2023, Loongson Technology Corporation Limited
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice, this
 *    list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 *    this list of conditions and the following disclaimer in the documentation
 *    and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include "src/loongarch/looprestoration.h"

#if BITDEPTH == 8

#define REST_UNIT_STRIDE (400)

void BF(dav1d_wiener_filter_h, lsx)(int32_t *hor_ptr,
                                    uint8_t *tmp_ptr,
                                    const int16_t filterh[8],
                                    const int w, const int h);

void BF(dav1d_wiener_filter_v, lsx)(uint8_t *p,
                                    const ptrdiff_t p_stride,
                                    const int32_t *hor,
                                    const int16_t filterv[8],
                                    const int w, const int h);

// This function refers to the function in the ppc/looprestoration_init_tmpl.c.
static inline void padding(uint8_t *dst, const uint8_t *p,
                           const ptrdiff_t stride, const uint8_t (*left)[4],
                           const uint8_t *lpf, int unit_w, const int stripe_h,
                           const enum LrEdgeFlags edges)
{
    const int have_left = !!(edges & LR_HAVE_LEFT);
    const int have_right = !!(edges & LR_HAVE_RIGHT);

    // Copy more pixels if we don't have to pad them
    unit_w += 3 * have_left + 3 * have_right;
    uint8_t *dst_l = dst + 3 * !have_left;
    p -= 3 * have_left;
    lpf -= 3 * have_left;

    if (edges & LR_HAVE_TOP) {
        // Copy previous loop filtered rows
        const uint8_t *const above_1 = lpf;
        const uint8_t *const above_2 = above_1 + PXSTRIDE(stride);
        pixel_copy(dst_l, above_1, unit_w);
        pixel_copy(dst_l + REST_UNIT_STRIDE, above_1, unit_w);
        pixel_copy(dst_l + 2 * REST_UNIT_STRIDE, above_2, unit_w);
    } else {
        // Pad with first row
        pixel_copy(dst_l, p, unit_w);
        pixel_copy(dst_l + REST_UNIT_STRIDE, p, unit_w);
        pixel_copy(dst_l + 2 * REST_UNIT_STRIDE, p, unit_w);
        if (have_left) {
            pixel_copy(dst_l, &left[0][1], 3);
            pixel_copy(dst_l + REST_UNIT_STRIDE, &left[0][1], 3);
            pixel_copy(dst_l + 2 * REST_UNIT_STRIDE, &left[0][1], 3);
        }
    }

    uint8_t *dst_tl = dst_l + 3 * REST_UNIT_STRIDE;
    if (edges & LR_HAVE_BOTTOM) {
        // Copy next loop filtered rows
        const uint8_t *const below_1 = lpf + 6 * PXSTRIDE(stride);
        const uint8_t *const below_2 = below_1 + PXSTRIDE(stride);
        pixel_copy(dst_tl + stripe_h * REST_UNIT_STRIDE, below_1, unit_w);
        pixel_copy(dst_tl + (stripe_h + 1) * REST_UNIT_STRIDE, below_2, unit_w);
        pixel_copy(dst_tl + (stripe_h + 2) * REST_UNIT_STRIDE, below_2, unit_w);
    } else {
        // Pad with last row
        const uint8_t *const src = p + (stripe_h - 1) * PXSTRIDE(stride);
        pixel_copy(dst_tl + stripe_h * REST_UNIT_STRIDE, src, unit_w);
        pixel_copy(dst_tl + (stripe_h + 1) * REST_UNIT_STRIDE, src, unit_w);
        pixel_copy(dst_tl + (stripe_h + 2) * REST_UNIT_STRIDE, src, unit_w);
        if (have_left) {
            pixel_copy(dst_tl + stripe_h * REST_UNIT_STRIDE, &left[stripe_h - 1][1], 3);
            pixel_copy(dst_tl + (stripe_h + 1) * REST_UNIT_STRIDE, &left[stripe_h - 1][1], 3);
            pixel_copy(dst_tl + (stripe_h + 2) * REST_UNIT_STRIDE, &left[stripe_h - 1][1], 3);
        }
    }

    // Inner UNIT_WxSTRIPE_H
    for (int j = 0; j < stripe_h; j++) {
        pixel_copy(dst_tl + 3 * have_left, p + 3 * have_left, unit_w - 3 * have_left);
        dst_tl += REST_UNIT_STRIDE;
        p += PXSTRIDE(stride);
    }

    if (!have_right) {
        uint8_t *pad = dst_l + unit_w;
        uint8_t *row_last = &dst_l[unit_w - 1];
        // Pad 3x(STRIPE_H+6) with last column
        for (int j = 0; j < stripe_h + 6; j++) {
            pixel_set(pad, *row_last, 3);
            pad += REST_UNIT_STRIDE;
            row_last += REST_UNIT_STRIDE;
        }
    }

    if (!have_left) {
        // Pad 3x(STRIPE_H+6) with first column
        for (int j = 0; j < stripe_h + 6; j++) {
            pixel_set(dst, *dst_l, 3);
            dst += REST_UNIT_STRIDE;
            dst_l += REST_UNIT_STRIDE;
        }
    } else {
        dst += 3 * REST_UNIT_STRIDE;
        for (int j = 0; j < stripe_h; j++) {
            pixel_copy(dst, &left[j][1], 3);
            dst += REST_UNIT_STRIDE;
        }
    }
}

// This function refers to the function in the ppc/looprestoration_init_tmpl.c.
|
||||
|
||||
// FIXME Could split into luma and chroma specific functions,
|
||||
// (since first and last tops are always 0 for chroma)
|
||||
// FIXME Could implement a version that requires less temporary memory
|
||||
// (should be possible to implement with only 6 rows of temp storage)
|
||||
void dav1d_wiener_filter_lsx(uint8_t *p, const ptrdiff_t p_stride,
|
||||
const uint8_t (*const left)[4],
|
||||
const uint8_t *lpf,
|
||||
const int w, const int h,
|
||||
const LooprestorationParams *const params,
|
||||
const enum LrEdgeFlags edges HIGHBD_DECL_SUFFIX)
|
||||
{
|
||||
const int16_t (*const filter)[8] = params->filter;
|
||||
|
||||
// Wiener filtering is applied to a maximum stripe height of 64 + 3 pixels
|
||||
// of padding above and below
|
||||
ALIGN_STK_16(uint8_t, tmp, 70 /*(64 + 3 + 3)*/ * REST_UNIT_STRIDE,);
|
||||
padding(tmp, p, p_stride, left, lpf, w, h, edges);
|
||||
ALIGN_STK_16(int32_t, hor, 70 /*(64 + 3 + 3)*/ * REST_UNIT_STRIDE + 64,);
|
||||
|
||||
BF(dav1d_wiener_filter_h, lsx)(hor, tmp, filter[0], w, h + 6);
|
||||
BF(dav1d_wiener_filter_v, lsx)(p, p_stride, hor, filter[1], w, h);
|
||||
}
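A minimal sketch of the temporary-buffer arithmetic used above, under the assumptions stated in the source itself (70 rows = 64 max stripe height + 3 rows of padding above and below, row stride fixed at REST_UNIT_STRIDE = 400); the names here are illustrative, not dav1d API:

/* buffer_sizing_sketch.c */
#include <stdint.h>
#include <stdio.h>

#define REST_UNIT_STRIDE 400   /* padded row stride used by the LSX code */

int main(void) {
    const int max_stripe_h = 64, pad = 3;       /* per the source comment */
    const int rows = max_stripe_h + 2 * pad;    /* = 70, the figure in ALIGN_STK_16 */
    const size_t tmp_bytes = (size_t)rows * REST_UNIT_STRIDE * sizeof(uint8_t);
    const size_t hor_elems = (size_t)rows * REST_UNIT_STRIDE + 64; /* int32 slots */
    printf("tmp: %zu bytes, hor: %zu int32 elements\n", tmp_bytes, hor_elems);
    return 0;
}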

void BF(dav1d_boxsum3_h, lsx)(int32_t *sumsq, int16_t *sum, pixel *src,
                              const int w, const int h);
void BF(dav1d_boxsum3_v, lsx)(int32_t *sumsq, int16_t *sum,
                              const int w, const int h);

void BF(dav1d_boxsum3_sgf_h, lsx)(int32_t *sumsq, int16_t *sum,
                                  const int w, const int h, const int w1);
void BF(dav1d_boxsum3_sgf_v, lsx)(int16_t *dst, uint8_t *tmp,
                                  int32_t *sumsq, int16_t *sum,
                                  const int w, const int h);
void BF(dav1d_sgr_3x3_finish, lsx)(pixel *p, const ptrdiff_t p_stride,
                                   int16_t *dst, int w1,
                                   const int w, const int h);


static inline void boxsum3_lsx(int32_t *sumsq, coef *sum, pixel *src,
                               const int w, const int h)
{
    BF(dav1d_boxsum3_h, lsx)(sumsq, sum, src, w + 6, h + 6);
    BF(dav1d_boxsum3_v, lsx)(sumsq, sum, w + 6, h + 6);
}

void dav1d_sgr_filter_3x3_lsx(pixel *p, const ptrdiff_t p_stride,
                              const pixel (*const left)[4],
                              const pixel *lpf,
                              const int w, const int h,
                              const LooprestorationParams *const params,
                              const enum LrEdgeFlags edges HIGHBD_DECL_SUFFIX)
{
    ALIGN_STK_16(uint8_t, tmp, 70 /*(64 + 3 + 3)*/ * REST_UNIT_STRIDE,);
    padding(tmp, p, p_stride, left, lpf, w, h, edges);
    coef dst[64 * 384];

    ALIGN_STK_16(int32_t, sumsq, 68 * REST_UNIT_STRIDE + 8, );
    ALIGN_STK_16(int16_t, sum, 68 * REST_UNIT_STRIDE + 16, );

    boxsum3_lsx(sumsq, sum, tmp, w, h);
    BF(dav1d_boxsum3_sgf_h, lsx)(sumsq, sum, w, h, params->sgr.s1);
    BF(dav1d_boxsum3_sgf_v, lsx)(dst, tmp, sumsq, sum, w, h);
    BF(dav1d_sgr_3x3_finish, lsx)(p, p_stride, dst, params->sgr.w1, w, h);
}

void BF(dav1d_boxsum5_h, lsx)(int32_t *sumsq, int16_t *sum,
                              const uint8_t *const src,
                              const int w, const int h);

void BF(dav1d_boxsum5_v, lsx)(int32_t *sumsq, int16_t *sum,
                              const int w, const int h);

void BF(dav1d_boxsum5_sgf_h, lsx)(int32_t *sumsq, int16_t *sum,
                                  const int w, const int h,
                                  const unsigned s);

void BF(dav1d_boxsum5_sgf_v, lsx)(int16_t *dst, uint8_t *src,
                                  int32_t *sumsq, int16_t *sum,
                                  const int w, const int h);

void BF(dav1d_sgr_mix_finish, lsx)(uint8_t *p, const ptrdiff_t stride,
                                   const int16_t *dst0, const int16_t *dst1,
                                   const int w0, const int w1,
                                   const int w, const int h);

static inline void boxsum5_lsx(int32_t *sumsq, coef *sum, pixel *src,
                               const int w, const int h)
{
    BF(dav1d_boxsum5_h, lsx)(sumsq, sum, src, w + 6, h + 6);
    BF(dav1d_boxsum5_v, lsx)(sumsq, sum, w + 6, h + 6);
}

void dav1d_sgr_filter_5x5_lsx(pixel *p, const ptrdiff_t p_stride,
                              const pixel (*const left)[4],
                              const pixel *lpf,
                              const int w, const int h,
                              const LooprestorationParams *const params,
                              const enum LrEdgeFlags edges HIGHBD_DECL_SUFFIX)
{
    ALIGN_STK_16(uint8_t, tmp, 70 /*(64 + 3 + 3)*/ * REST_UNIT_STRIDE,);
    padding(tmp, p, p_stride, left, lpf, w, h, edges);
    coef dst[64 * 384];

    ALIGN_STK_16(int32_t, sumsq, 68 * REST_UNIT_STRIDE + 8, );
    ALIGN_STK_16(int16_t, sum, 68 * REST_UNIT_STRIDE + 16, );

    boxsum5_lsx(sumsq, sum, tmp, w, h);
    BF(dav1d_boxsum5_sgf_h, lsx)(sumsq, sum, w, h, params->sgr.s0);
    BF(dav1d_boxsum5_sgf_v, lsx)(dst, tmp, sumsq, sum, w, h);
    BF(dav1d_sgr_3x3_finish, lsx)(p, p_stride, dst, params->sgr.w0, w, h);
}

void dav1d_sgr_filter_mix_lsx(pixel *p, const ptrdiff_t p_stride,
                              const pixel (*const left)[4],
                              const pixel *lpf,
                              const int w, const int h,
                              const LooprestorationParams *const params,
                              const enum LrEdgeFlags edges HIGHBD_DECL_SUFFIX)
{
    ALIGN_STK_16(uint8_t, tmp, 70 /*(64 + 3 + 3)*/ * REST_UNIT_STRIDE,);
    padding(tmp, p, p_stride, left, lpf, w, h, edges);
    coef dst0[64 * 384];
    coef dst1[64 * 384];

    ALIGN_STK_16(int32_t, sumsq0, 68 * REST_UNIT_STRIDE + 8, );
    ALIGN_STK_16(int16_t, sum0, 68 * REST_UNIT_STRIDE + 16, );

    boxsum5_lsx(sumsq0, sum0, tmp, w, h);
    BF(dav1d_boxsum5_sgf_h, lsx)(sumsq0, sum0, w, h, params->sgr.s0);
    BF(dav1d_boxsum5_sgf_v, lsx)(dst0, tmp, sumsq0, sum0, w, h);

    boxsum3_lsx(sumsq0, sum0, tmp, w, h);
    BF(dav1d_boxsum3_sgf_h, lsx)(sumsq0, sum0, w, h, params->sgr.s1);
    BF(dav1d_boxsum3_sgf_v, lsx)(dst1, tmp, sumsq0, sum0, w, h);

    BF(dav1d_sgr_mix_finish, lsx)(p, p_stride, dst0, dst1, params->sgr.w0,
                                  params->sgr.w1, w, h);
}
#endif
(The diff for this file is not shown because of its large size.)
@ -0,0 +1,118 @@
/*
 * Copyright © 2023, VideoLAN and dav1d authors
 * Copyright © 2023, Loongson Technology Corporation Limited
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice, this
 *    list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 *    this list of conditions and the following disclaimer in the documentation
 *    and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#ifndef DAV1D_SRC_LOONGARCH_MC_H
#define DAV1D_SRC_LOONGARCH_MC_H

#include "config.h"
#include "src/mc.h"
#include "src/cpu.h"

#define init_mc_fn(type, name, suffix) \
    c->mc[type] = BF(dav1d_put_##name, suffix)
#define init_mct_fn(type, name, suffix) \
    c->mct[type] = BF(dav1d_prep_##name, suffix)

decl_avg_fn(BF(dav1d_avg, lsx));
decl_w_avg_fn(BF(dav1d_w_avg, lsx));
decl_mask_fn(BF(dav1d_mask, lsx));
decl_warp8x8_fn(BF(dav1d_warp_affine_8x8, lsx));
decl_warp8x8t_fn(BF(dav1d_warp_affine_8x8t, lsx));
decl_w_mask_fn(BF(dav1d_w_mask_420, lsx));

decl_mc_fn(BF(dav1d_put_8tap_regular, lsx));
decl_mc_fn(BF(dav1d_put_8tap_regular_smooth, lsx));
decl_mc_fn(BF(dav1d_put_8tap_regular_sharp, lsx));
decl_mc_fn(BF(dav1d_put_8tap_smooth, lsx));
decl_mc_fn(BF(dav1d_put_8tap_smooth_regular, lsx));
decl_mc_fn(BF(dav1d_put_8tap_smooth_sharp, lsx));
decl_mc_fn(BF(dav1d_put_8tap_sharp, lsx));
decl_mc_fn(BF(dav1d_put_8tap_sharp_regular, lsx));
decl_mc_fn(BF(dav1d_put_8tap_sharp_smooth, lsx));

decl_avg_fn(BF(dav1d_avg, lasx));
decl_w_avg_fn(BF(dav1d_w_avg, lasx));
decl_mask_fn(BF(dav1d_mask, lasx));
decl_warp8x8_fn(BF(dav1d_warp_affine_8x8, lasx));
decl_warp8x8t_fn(BF(dav1d_warp_affine_8x8t, lasx));
decl_w_mask_fn(BF(dav1d_w_mask_420, lasx));

decl_mct_fn(BF(dav1d_prep_8tap_regular, lasx));
decl_mct_fn(BF(dav1d_prep_8tap_regular_smooth, lasx));
decl_mct_fn(BF(dav1d_prep_8tap_regular_sharp, lasx));
decl_mct_fn(BF(dav1d_prep_8tap_smooth, lasx));
decl_mct_fn(BF(dav1d_prep_8tap_smooth_regular, lasx));
decl_mct_fn(BF(dav1d_prep_8tap_smooth_sharp, lasx));
decl_mct_fn(BF(dav1d_prep_8tap_sharp, lasx));
decl_mct_fn(BF(dav1d_prep_8tap_sharp_regular, lasx));
decl_mct_fn(BF(dav1d_prep_8tap_sharp_smooth, lasx));

static ALWAYS_INLINE void mc_dsp_init_loongarch(Dav1dMCDSPContext *const c) {
#if BITDEPTH == 8
    const unsigned flags = dav1d_get_cpu_flags();

    if (!(flags & DAV1D_LOONGARCH_CPU_FLAG_LSX)) return;

    c->avg = BF(dav1d_avg, lsx);
    c->w_avg = BF(dav1d_w_avg, lsx);
    c->mask = BF(dav1d_mask, lsx);
    c->warp8x8 = BF(dav1d_warp_affine_8x8, lsx);
    c->warp8x8t = BF(dav1d_warp_affine_8x8t, lsx);
    c->w_mask[2] = BF(dav1d_w_mask_420, lsx);

    init_mc_fn(FILTER_2D_8TAP_REGULAR, 8tap_regular, lsx);
    init_mc_fn(FILTER_2D_8TAP_REGULAR_SMOOTH, 8tap_regular_smooth, lsx);
    init_mc_fn(FILTER_2D_8TAP_REGULAR_SHARP, 8tap_regular_sharp, lsx);
    init_mc_fn(FILTER_2D_8TAP_SMOOTH_REGULAR, 8tap_smooth_regular, lsx);
    init_mc_fn(FILTER_2D_8TAP_SMOOTH, 8tap_smooth, lsx);
    init_mc_fn(FILTER_2D_8TAP_SMOOTH_SHARP, 8tap_smooth_sharp, lsx);
    init_mc_fn(FILTER_2D_8TAP_SHARP_REGULAR, 8tap_sharp_regular, lsx);
    init_mc_fn(FILTER_2D_8TAP_SHARP_SMOOTH, 8tap_sharp_smooth, lsx);
    init_mc_fn(FILTER_2D_8TAP_SHARP, 8tap_sharp, lsx);

    if (!(flags & DAV1D_LOONGARCH_CPU_FLAG_LASX)) return;

    c->avg = BF(dav1d_avg, lasx);
    c->w_avg = BF(dav1d_w_avg, lasx);
    c->mask = BF(dav1d_mask, lasx);
    c->warp8x8 = BF(dav1d_warp_affine_8x8, lasx);
    c->warp8x8t = BF(dav1d_warp_affine_8x8t, lasx);
    c->w_mask[2] = BF(dav1d_w_mask_420, lasx);

    init_mct_fn(FILTER_2D_8TAP_REGULAR, 8tap_regular, lasx);
    init_mct_fn(FILTER_2D_8TAP_REGULAR_SMOOTH, 8tap_regular_smooth, lasx);
    init_mct_fn(FILTER_2D_8TAP_REGULAR_SHARP, 8tap_regular_sharp, lasx);
    init_mct_fn(FILTER_2D_8TAP_SMOOTH_REGULAR, 8tap_smooth_regular, lasx);
    init_mct_fn(FILTER_2D_8TAP_SMOOTH, 8tap_smooth, lasx);
    init_mct_fn(FILTER_2D_8TAP_SMOOTH_SHARP, 8tap_smooth_sharp, lasx);
    init_mct_fn(FILTER_2D_8TAP_SHARP_REGULAR, 8tap_sharp_regular, lasx);
    init_mct_fn(FILTER_2D_8TAP_SHARP_SMOOTH, 8tap_sharp_smooth, lasx);
    init_mct_fn(FILTER_2D_8TAP_SHARP, 8tap_sharp, lasx);
#endif
}

#endif /* DAV1D_SRC_LOONGARCH_MC_H */
@ -0,0 +1,368 @@
/*
 * Copyright © 2023, VideoLAN and dav1d authors
 * Copyright © 2023, Loongson Technology Corporation Limited
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice, this
 *    list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 *    this list of conditions and the following disclaimer in the documentation
 *    and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include "loongson_asm.S"

const min_prob
    .short 60, 56, 52, 48, 44, 40, 36, 32, 28, 24, 20, 16, 12, 8, 4, 0
endconst

.macro decode_symbol_adapt w
    addi.d          sp, sp, -48
    addi.d          a4, a0, 24
    vldrepl.h       vr0, a4, 0          // rng
    fst.s           f0, sp, 0           // val == 0
    vld             vr1, a1, 0          // cdf
.if \w == 16
    li.w            t4, 16
    vldx            vr11, a1, t4
.endif
    addi.d          a6, a0, 16
    vldrepl.d       vr2, a6, 0          // dif
    addi.d          t0, a0, 32
    ld.w            t1, t0, 0           // allow_update_cdf
    la.local        t2, min_prob
    addi.d          t2, t2, 32
    addi.w          t3, a2, 1
    slli.w          t3, t3, 1
    sub.d           t2, t2, t3
    vld             vr3, t2, 0          // min_prob
.if \w == 16
    vldx            vr13, t2, t4
.endif
    vsrli.h         vr4, vr0, 8         // r = s->rng >> 8
    vslli.h         vr4, vr4, 8         // r << 8
    vsrli.h         vr5, vr1, 6
    vslli.h         vr5, vr5, 7
.if \w == 16
    vsrli.h         vr15, vr11, 6
    vslli.h         vr15, vr15, 7
.endif
    vmuh.hu         vr5, vr4, vr5
    vadd.h          vr5, vr5, vr3       // v
.if \w == 16
    vmuh.hu         vr15, vr4, vr15
    vadd.h          vr15, vr15, vr13
.endif
    addi.d          t8, sp, 4
    vst             vr5, t8, 0          // store v
.if \w == 16
    vstx            vr15, t8, t4
.endif
    vreplvei.h      vr20, vr2, 3        // c
    vssub.hu        vr6, vr5, vr20      // c >= v
    vseqi.h         vr6, vr6, 0
.if \w == 16
    vssub.hu        vr16, vr15, vr20    // c >= v
    vseqi.h         vr16, vr16, 0
    vpickev.b       vr21, vr16, vr6
.endif
.if \w <= 8
    vmskltz.h       vr10, vr6
.else
    vmskltz.b       vr10, vr21
.endif
    beqz            t1, .renorm\()\w

    // update_cdf
    alsl.d          t1, a2, a1, 1
    ld.h            t2, t1, 0           // count
    srli.w          t3, t2, 4           // count >> 4
    addi.w          t3, t3, 4
    li.w            t5, 2
    sltu            t5, t5, a2
    add.w           t3, t3, t5          // rate
    sltui           t5, t2, 32
    add.w           t2, t2, t5          // count + (count < 32)
    vreplgr2vr.h    vr9, t3
    vseq.h          vr7, vr7, vr7
    vavgr.hu        vr5, vr6, vr7       // i >= val ? -1 : 32768
    vsub.h          vr5, vr5, vr1
    vsub.h          vr8, vr1, vr6
.if \w == 16
    vavgr.hu        vr15, vr16, vr7
    vsub.h          vr15, vr15, vr11
    vsub.h          vr18, vr11, vr16
.endif
    vsra.h          vr5, vr5, vr9
    vadd.h          vr8, vr8, vr5
.if \w == 4
    fst.d           f8, a1, 0
.else
    vst             vr8, a1, 0
.endif
.if \w == 16
    vsra.h          vr15, vr15, vr9
    vadd.h          vr18, vr18, vr15
    vstx            vr18, a1, t4
.endif
    st.h            t2, t1, 0

.renorm\()\w:
    vpickve2gr.h    t3, vr10, 0
    ctz.w           a7, t3              // ret
    alsl.d          t3, a7, t8, 1
    ld.hu           t4, t3, 0           // v
    addi.d          t3, t3, -2
    ld.hu           t5, t3, 0           // u
    sub.w           t5, t5, t4          // rng
    slli.d          t4, t4, 48
    vpickve2gr.d    t6, vr2, 0
    sub.d           t6, t6, t4          // dif
    addi.d          t6, t6, 1
    clz.w           t4, t5              // d
    xori            t4, t4, 16          // d
    sll.d           t6, t6, t4
    addi.d          t6, t6, -1          // dif
    addi.d          a5, a0, 28          // cnt
    ld.w            t7, a5, 0
    sub.w           t7, t7, t4          // cnt - d
    sll.w           t5, t5, t4
    st.w            t5, a4, 0           // store rng
    bge             t7, zero, 9f

    // refill
    ld.d            t0, a0, 0           // buf_pos
    addi.d          t1, a0, 8
    ld.d            t1, t1, 0           // buf_end
    addi.d          t2, t0, 8
    blt             t1, t2, 1f

    ld.d            t0, t0, 0           // next_bits
    addi.w          t3, t7, 23          // shift_bits = cnt + 23
    addi.w          t7, t7, 16          // cnt += 16
    revb.d          t0, t0              // next_bits = bswap(next_bits)
    srli.w          t4, t3, 3
    sub.d           t2, t2, t4          // buf_pos -= shift_bits >> 3
    st.d            t2, a0, 0
    andi            t3, t3, 24          // shift_bits &= 24
    srl.d           t0, t0, t3          // next_bits >>= shift_bits
    sub.w           t3, t3, t7          // shift_bits -= 16 + cnt
    sll.d           t0, t0, t3          // next_bits <<= shift_bits
    li.w            t5, 48
    sub.w           t7, t5, t3          // cnt = cnt + 64 - shift_bits
    xor             t6, t6, t0          // dif ^= next_bits
    b               9f
1:
    li.w            t4, 40
    sub.w           t5, t4, t7          // c = 40 - cnt
2:
    bge             t0, t1, 3f
    ld.bu           t2, t0, 0
    addi.d          t0, t0, 1
    sll.d           t2, t2, t5
    xor             t6, t6, t2
    addi.w          t5, t5, -8
    bge             t5, zero, 2b
    // refill_eob_end
3:
    st.d            t0, a0, 0           // s->buf_pos = buf_pos
    sub.w           t7, t4, t5          // cnt = 40 - c
9:
    st.w            t7, a5, 0           // store cnt
    st.d            t6, a6, 0           // store dif
    move            a0, a7
    addi.d          sp, sp, 48
.endm
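For reference, here is a hedged C rendering of the byte-at-a-time refill fallback above (the 1:/2:/3: labels); it follows the asm comments line by line, but the struct and function names are illustrative stand-ins, not dav1d's MsacContext API:

/* refill_bytewise_sketch.c */
#include <stdint.h>

typedef struct {
    const uint8_t *buf_pos, *buf_end;  /* the fields the asm loads at a0+0/a0+8 */
    uint64_t dif;
    int cnt;
} RangeDecoderSketch;

static void refill_bytewise(RangeDecoderSketch *s) {
    int c = 40 - s->cnt;                            /* c = 40 - cnt */
    while (s->buf_pos < s->buf_end) {               /* bge t0, t1, 3f */
        s->dif ^= (uint64_t)*s->buf_pos++ << c;     /* merge next byte into dif */
        c -= 8;
        if (c < 0) break;                           /* bge t5, zero, 2b */
    }
    s->cnt = 40 - c;                                /* cnt = 40 - c */
}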

function msac_decode_symbol_adapt4_lsx
    decode_symbol_adapt 4
endfunc

function msac_decode_symbol_adapt8_lsx
    decode_symbol_adapt 8
endfunc

function msac_decode_symbol_adapt16_lsx
    decode_symbol_adapt 16
endfunc

function msac_decode_bool_lsx
    ld.w            t0, a0, 24          // rng
    srli.w          a1, a1, 6
    ld.d            t1, a0, 16          // dif
    srli.w          t2, t0, 8           // r >> 8
    mul.w           t2, t2, a1
    ld.w            a5, a0, 28          // cnt
    addi.d          t1, t1, 1           // dif + 1
    srli.w          t2, t2, 1
    addi.w          t2, t2, 4           // v
    slli.d          t3, t2, 48          // vw
    sltu            t4, t1, t3
    move            t8, t4              // ret
    xori            t4, t4, 1
    maskeqz         t6, t3, t4          // if (ret) vw
    sub.d           t6, t1, t6          // dif
    slli.w          t5, t2, 1
    sub.w           t5, t0, t5          // r - 2v
    maskeqz         t7, t5, t4          // if (ret) r - 2v
    add.w           t5, t2, t7          // v(rng)

    // renorm
    clz.w           t4, t5              // d
    xori            t4, t4, 16          // d
    sll.d           t6, t6, t4
    addi.d          t6, t6, -1          // dif
    sub.w           t7, a5, t4          // cnt - d
    sll.w           t5, t5, t4
    st.w            t5, a0, 24          // store rng
    bge             t7, zero, 9f

    // refill
    ld.d            t0, a0, 0           // buf_pos
    addi.d          t1, a0, 8
    ld.d            t1, t1, 0           // buf_end
    addi.d          t2, t0, 8
    blt             t1, t2, 1f

    ld.d            t0, t0, 0           // next_bits
    addi.w          t3, t7, 23          // shift_bits = cnt + 23
    addi.w          t7, t7, 16          // cnt += 16
    revb.d          t0, t0              // next_bits = bswap(next_bits)
    srli.w          t4, t3, 3
    sub.d           t2, t2, t4          // buf_pos -= shift_bits >> 3
    st.d            t2, a0, 0
    andi            t3, t3, 24          // shift_bits &= 24
    srl.d           t0, t0, t3          // next_bits >>= shift_bits
    sub.w           t3, t3, t7          // shift_bits -= 16 + cnt
    sll.d           t0, t0, t3          // next_bits <<= shift_bits
    li.w            t5, 48
    sub.w           t7, t5, t3          // cnt = cnt + 64 - shift_bits
    xor             t6, t6, t0          // dif ^= next_bits
    b               9f
1:
    li.w            t4, 40
    sub.w           t5, t4, t7          // c = 40 - cnt
2:
    bge             t0, t1, 3f
    ld.bu           t2, t0, 0
    addi.d          t0, t0, 1
    sll.d           t2, t2, t5
    xor             t6, t6, t2
    addi.w          t5, t5, -8
    bge             t5, zero, 2b
    // refill_eob_end
3:
    st.d            t0, a0, 0           // s->buf_pos = buf_pos
    sub.w           t7, t4, t5          // cnt = 40 - c
9:
    st.w            t7, a0, 28          // store cnt
    st.d            t6, a0, 16          // store dif
    move            a0, t8
endfunc

function msac_decode_bool_adapt_lsx
    ld.hu           a3, a1, 0           // cdf[0]
    ld.w            t0, a0, 24          // rng
    ld.d            t1, a0, 16          // dif
    srli.w          t2, t0, 8           // r >> 8
    srli.w          a7, a3, 6
    mul.w           t2, t2, a7
    ld.w            a4, a0, 32          // allow_update_cdf
    ld.w            a5, a0, 28          // cnt
    srli.w          t2, t2, 1
    addi.w          t2, t2, 4           // v
    slli.d          t3, t2, 48          // vw
    sltu            t4, t1, t3
    move            t8, t4              // bit
    xori            t4, t4, 1
    maskeqz         t6, t3, t4          // if (ret) vw
    sub.d           t6, t1, t6          // dif
    slli.w          t5, t2, 1
    sub.w           t5, t0, t5          // r - 2v
    maskeqz         t7, t5, t4          // if (ret) r - 2v
    add.w           t5, t2, t7          // v(rng)
    beqz            a4, .renorm

    // update_cdf
    ld.hu           t0, a1, 2           // cdf[1]
    srli.w          t1, t0, 4
    addi.w          t1, t1, 4           // rate
    sltui           t2, t0, 32          // count < 32
    add.w           t0, t0, t2          // count + (count < 32)
    sub.w           a3, a3, t8          // cdf[0] -= bit
    slli.w          t4, t8, 15
    sub.w           t7, a3, t4          // cdf[0] - bit - 32768
    sra.w           t7, t7, t1          // (cdf[0] - bit - 32768) >> rate
    sub.w           t7, a3, t7          // cdf[0]
    st.h            t7, a1, 0
    st.h            t0, a1, 2

.renorm:
    // renorm
    addi.d          t6, t6, 1
    clz.w           t4, t5              // d
    xori            t4, t4, 16          // d
    sll.d           t6, t6, t4
    addi.d          t6, t6, -1          // dif
    sub.w           t7, a5, t4          // cnt - d
    sll.w           t5, t5, t4
    st.w            t5, a0, 24          // store rng
    bge             t7, zero, 9f

    // refill
    ld.d            t0, a0, 0           // buf_pos
    addi.d          t1, a0, 8
    ld.d            t1, t1, 0           // buf_end
    addi.d          t2, t0, 8
    blt             t1, t2, 1f

    ld.d            t0, t0, 0           // next_bits
    addi.w          t3, t7, 23          // shift_bits = cnt + 23
    addi.w          t7, t7, 16          // cnt += 16
    revb.d          t0, t0              // next_bits = bswap(next_bits)
    srli.w          t4, t3, 3
    sub.d           t2, t2, t4          // buf_pos -= shift_bits >> 3
    st.d            t2, a0, 0
    andi            t3, t3, 24          // shift_bits &= 24
    srl.d           t0, t0, t3          // next_bits >>= shift_bits
    sub.w           t3, t3, t7          // shift_bits -= 16 + cnt
    sll.d           t0, t0, t3          // next_bits <<= shift_bits
    li.w            t5, 48
    sub.w           t7, t5, t3          // cnt = cnt + 64 - shift_bits
    xor             t6, t6, t0          // dif ^= next_bits
    b               9f
1:
    li.w            t4, 40
    sub.w           t5, t4, t7          // c = 40 - cnt
2:
    bge             t0, t1, 3f
    ld.bu           t2, t0, 0
    addi.d          t0, t0, 1
    sll.d           t2, t2, t5
    xor             t6, t6, t2
    addi.w          t5, t5, -8
    bge             t5, zero, 2b
    // refill_eob_end
3:
    st.d            t0, a0, 0           // s->buf_pos = buf_pos
    sub.w           t7, t4, t5          // cnt = 40 - c
9:
    st.w            t7, a0, 28          // store cnt
    st.d            t6, a0, 16          // store dif
    move            a0, t8
endfunc
@ -0,0 +1,46 @@
/*
 * Copyright © 2023, VideoLAN and dav1d authors
 * Copyright © 2023, Loongson Technology Corporation Limited
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice, this
 *    list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 *    this list of conditions and the following disclaimer in the documentation
 *    and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#ifndef DAV1D_SRC_LOONGARCH_MSAC_H
#define DAV1D_SRC_LOONGARCH_MSAC_H

unsigned dav1d_msac_decode_symbol_adapt4_lsx(MsacContext *s, uint16_t *cdf,
                                             size_t n_symbols);
unsigned dav1d_msac_decode_symbol_adapt8_lsx(MsacContext *s, uint16_t *cdf,
                                             size_t n_symbols);
unsigned dav1d_msac_decode_symbol_adapt16_lsx(MsacContext *s, uint16_t *cdf,
                                              size_t n_symbols);
unsigned dav1d_msac_decode_bool_adapt_lsx(MsacContext *s, uint16_t *cdf);
unsigned dav1d_msac_decode_bool_lsx(MsacContext *s, unsigned f);

#define dav1d_msac_decode_symbol_adapt4 dav1d_msac_decode_symbol_adapt4_lsx
#define dav1d_msac_decode_symbol_adapt8 dav1d_msac_decode_symbol_adapt8_lsx
#define dav1d_msac_decode_symbol_adapt16 dav1d_msac_decode_symbol_adapt16_lsx
#define dav1d_msac_decode_bool_adapt dav1d_msac_decode_bool_adapt_lsx
#define dav1d_msac_decode_bool dav1d_msac_decode_bool_lsx

#endif /* DAV1D_SRC_LOONGARCH_MSAC_H */
@ -0,0 +1,152 @@
/*
 * Copyright © 2023, VideoLAN and dav1d authors
 * Copyright © 2023, Loongson Technology Corporation Limited
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice, this
 *    list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 *    this list of conditions and the following disclaimer in the documentation
 *    and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include "src/loongarch/loongson_asm.S"

/*
static void splat_mv_c(refmvs_block **rr, const refmvs_block *const rmv,
                       const int bx4, const int bw4, int bh4)
*/

function splat_mv_lsx
    vld             vr0, a1, 0          // 0 1 ... 11 ...
    clz.w           t4, a3
    vaddi.bu        vr1, vr0, 0
    addi.w          t4, t4, -26
    vextrins.w      vr1, vr0, 0x30      // 0 1 2 ... 11 0 1 2 3
    la.local        t5, .SPLAT_LSX_JRTABLE
    vbsrl.v         vr2, vr1, 4         // 4 5 6 7 ... 11 0 1 2 3 0 0 0 0
    alsl.d          t6, t4, t5, 1
    vextrins.w      vr2, vr0, 0x31      // 4 5 6 7 ... 11 0 1 2 3 4 5 6 7
    ld.h            t7, t6, 0
    vbsrl.v         vr3, vr2, 4         // 8 9 10 11 0 1 2 3 4 5 6 7 0 0 0 0
    add.d           t8, t5, t7
    alsl.d          a2, a2, a2, 1
    vextrins.w      vr3, vr0, 0x32      // 8 9 10 11 0 1 2 3 4 5 6 7 8 9 10 11
    slli.w          a2, a2, 2
    jirl            $r0, t8, 0

.SPLAT_LSX_JRTABLE:
    .hword .SPLAT_W32_LSX - .SPLAT_LSX_JRTABLE
    .hword .SPLAT_W16_LSX - .SPLAT_LSX_JRTABLE
    .hword .SPLAT_W8_LSX - .SPLAT_LSX_JRTABLE
    .hword .SPLAT_W4_LSX - .SPLAT_LSX_JRTABLE
    .hword .SPLAT_W2_LSX - .SPLAT_LSX_JRTABLE
    .hword .SPLAT_W1_LSX - .SPLAT_LSX_JRTABLE

.SPLAT_W1_LSX:
    ld.d            t3, a0, 0
    addi.d          a0, a0, 8
    addi.d          a4, a4, -1
    add.d           t3, t3, a2

    fst.d           f1, t3, 0
    fst.s           f3, t3, 8
    blt             zero, a4, .SPLAT_W1_LSX
    b               .splat_end
.SPLAT_W2_LSX:
    ld.d            t3, a0, 0
    addi.d          a0, a0, 8
    addi.d          a4, a4, -1
    add.d           t3, t3, a2

    vst             vr1, t3, 0
    fst.d           f2, t3, 16
    blt             zero, a4, .SPLAT_W2_LSX
    b               .splat_end

.SPLAT_W4_LSX:
    ld.d            t3, a0, 0
    addi.d          a0, a0, 8
    addi.d          a4, a4, -1
    add.d           t3, t3, a2

    vst             vr1, t3, 0
    vst             vr2, t3, 16
    vst             vr3, t3, 32
    blt             zero, a4, .SPLAT_W4_LSX
    b               .splat_end

.SPLAT_W8_LSX:
    ld.d            t3, a0, 0
    addi.d          a0, a0, 8
    addi.d          a4, a4, -1
    add.d           t3, t3, a2

    vst             vr1, t3, 0
    vst             vr2, t3, 16
    vst             vr3, t3, 32

    vst             vr1, t3, 48
    vst             vr2, t3, 64
    vst             vr3, t3, 80
    blt             zero, a4, .SPLAT_W8_LSX
    b               .splat_end

.SPLAT_W16_LSX:
    ld.d            t3, a0, 0
    addi.d          a0, a0, 8
    addi.d          a4, a4, -1
    add.d           t3, t3, a2

.rept 2
    vst             vr1, t3, 0
    vst             vr2, t3, 16
    vst             vr3, t3, 32

    vst             vr1, t3, 48
    vst             vr2, t3, 64
    vst             vr3, t3, 80

    addi.d          t3, t3, 96
.endr

    blt             zero, a4, .SPLAT_W16_LSX
    b               .splat_end

.SPLAT_W32_LSX:
    ld.d            t3, a0, 0
    addi.d          a0, a0, 8
    addi.d          a4, a4, -1
    add.d           t3, t3, a2

.rept 4
    vst             vr1, t3, 0
    vst             vr2, t3, 16
    vst             vr3, t3, 32

    vst             vr1, t3, 48
    vst             vr2, t3, 64
    vst             vr3, t3, 80

    addi.d          t3, t3, 96
.endr

    blt             zero, a4, .SPLAT_W32_LSX

.splat_end:
endfunc
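A hedged C sketch of what the LSX routine above appears to compute, based on the splat_mv_c prototype quoted in the file and on the addressing in the asm (the a2*12 offset and the 12-byte store pattern suggest a 12-byte refmvs_block, which matches dav1d); the type and function names here are illustrative:

/* splat_mv_sketch.c */
typedef struct { unsigned char bytes[12]; } refmvs_block_sketch;

static void splat_mv_sketch(refmvs_block_sketch **rr,
                            const refmvs_block_sketch *rmv,
                            int bx4, int bw4, int bh4) {
    do {
        refmvs_block_sketch *row = *rr++ + bx4;  /* t3 = rr[i] + bx4 * 12 */
        for (int x = 0; x < bw4; x++)            /* replicate the 12-byte block */
            row[x] = *rmv;
    } while (--bh4);
}

The asm avoids the inner loop by pre-rotating the 12-byte pattern into three 16-byte vectors (vr1/vr2/vr3 cover 48 bytes, i.e. four blocks) and dispatching on bw4 through the .hword jump table.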

@ -0,0 +1,44 @@
/*
 * Copyright © 2023, VideoLAN and dav1d authors
 * Copyright © 2023, Loongson Technology Corporation Limited
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice, this
 *    list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 *    this list of conditions and the following disclaimer in the documentation
 *    and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#ifndef DAV1D_SRC_LOONGARCH_REFMVS_H
#define DAV1D_SRC_LOONGARCH_REFMVS_H

#include "src/cpu.h"
#include "src/refmvs.h"

decl_splat_mv_fn(dav1d_splat_mv_lsx);

static ALWAYS_INLINE void refmvs_dsp_init_loongarch(Dav1dRefmvsDSPContext *const c) {
    const unsigned flags = dav1d_get_cpu_flags();

    if (!(flags & DAV1D_LOONGARCH_CPU_FLAG_LSX)) return;

    c->splat_mv = dav1d_splat_mv_lsx;
}

#endif /* DAV1D_SRC_LOONGARCH_REFMVS_H */
@ -247,6 +247,8 @@ static void loop_filter_v_sb128uv_c(pixel *dst, const ptrdiff_t stride,
#if HAVE_ASM
#if ARCH_AARCH64 || ARCH_ARM
#include "src/arm/loopfilter.h"
#elif ARCH_LOONGARCH64
#include "src/loongarch/loopfilter.h"
#elif ARCH_X86
#include "src/x86/loopfilter.h"
#endif

@ -261,6 +263,8 @@ COLD void bitfn(dav1d_loop_filter_dsp_init)(Dav1dLoopFilterDSPContext *const c)
#if HAVE_ASM
#if ARCH_AARCH64 || ARCH_ARM
    loop_filter_dsp_init_arm(c);
#elif ARCH_LOONGARCH64
    loop_filter_dsp_init_loongarch(c);
#elif ARCH_X86
    loop_filter_dsp_init_x86(c);
#endif

@ -527,6 +527,8 @@ static void sgr_mix_c(pixel *p, const ptrdiff_t stride,
#if HAVE_ASM
#if ARCH_AARCH64 || ARCH_ARM
#include "src/arm/looprestoration.h"
#elif ARCH_LOONGARCH64
#include "src/loongarch/looprestoration.h"
#elif ARCH_PPC64LE
#include "src/ppc/looprestoration.h"
#elif ARCH_X86

@ -545,6 +547,8 @@ COLD void bitfn(dav1d_loop_restoration_dsp_init)(Dav1dLoopRestorationDSPContext
#if HAVE_ASM
#if ARCH_AARCH64 || ARCH_ARM
    loop_restoration_dsp_init_arm(c, bpc);
#elif ARCH_LOONGARCH64
    loop_restoration_dsp_init_loongarch(c, bpc);
#elif ARCH_PPC64LE
    loop_restoration_dsp_init_ppc(c, bpc);
#elif ARCH_X86

@ -905,6 +905,8 @@ static void resize_c(pixel *dst, const ptrdiff_t dst_stride,
#if HAVE_ASM
#if ARCH_AARCH64 || ARCH_ARM
#include "src/arm/mc.h"
#elif ARCH_LOONGARCH64
#include "src/loongarch/mc.h"
#elif ARCH_X86
#include "src/x86/mc.h"
#endif

@ -946,6 +948,8 @@ COLD void bitfn(dav1d_mc_dsp_init)(Dav1dMCDSPContext *const c) {
#if HAVE_ASM
#if ARCH_AARCH64 || ARCH_ARM
    mc_dsp_init_arm(c);
#elif ARCH_LOONGARCH64
    mc_dsp_init_loongarch(c);
#elif ARCH_X86
    mc_dsp_init_x86(c);
#endif
@ -226,6 +226,24 @@ if is_asm_enabled

        # Compile the ASM sources with NASM
        libdav1d_asm_objs = nasm_gen.process(libdav1d_sources_asm)
    elif host_machine.cpu_family().startswith('loongarch')
        libdav1d_sources += files(
            'loongarch/cpu.c',
        )

        libdav1d_arch_tmpl_sources += files(
            'loongarch/looprestoration_tmpl.c',
        )

        libdav1d_sources_asm = files(
            'loongarch/mc.S',
            'loongarch/loopfilter.S',
            'loongarch/looprestoration.S',
            'loongarch/msac.S',
            'loongarch/refmvs.S',
            'loongarch/itx.S',
        )
        libdav1d_asm_objs += libdav1d_sources_asm
    elif host_machine.cpu() == 'ppc64le'
        arch_flags = ['-maltivec', '-mvsx']
        libdav1d_sources += files(

@ -235,6 +253,15 @@ if is_asm_enabled
            'ppc/cdef_tmpl.c',
            'ppc/looprestoration_tmpl.c',
        )
    elif host_machine.cpu_family().startswith('riscv')
        libdav1d_sources += files(
            'riscv/cpu.c',
        )
        if host_machine.cpu_family() == 'riscv64'
            libdav1d_sources += files(
                'riscv/64/itx.S',
            )
        endif
    endif
endif

@ -51,6 +51,8 @@ typedef struct MsacContext {
#if HAVE_ASM
#if ARCH_AARCH64 || ARCH_ARM
#include "src/arm/msac.h"
#elif ARCH_LOONGARCH64
#include "src/loongarch/msac.h"
#elif ARCH_X86
#include "src/x86/msac.h"
#endif
(The diff for this file is not shown because of its large size.)
@ -919,6 +919,8 @@ static void splat_mv_c(refmvs_block **rr, const refmvs_block *const rmv,
#if HAVE_ASM
#if ARCH_AARCH64 || ARCH_ARM
#include "src/arm/refmvs.h"
#elif ARCH_LOONGARCH64
#include "src/loongarch/refmvs.h"
#elif ARCH_X86
#include "src/x86/refmvs.h"
#endif

@ -933,6 +935,8 @@ COLD void dav1d_refmvs_dsp_init(Dav1dRefmvsDSPContext *const c)
#if HAVE_ASM
#if ARCH_AARCH64 || ARCH_ARM
    refmvs_dsp_init_arm(c);
#elif ARCH_LOONGARCH64
    refmvs_dsp_init_loongarch(c);
#elif ARCH_X86
    refmvs_dsp_init_x86(c);
#endif

@ -171,6 +171,7 @@ void dav1d_refmvs_find(const refmvs_tile *rt,

void dav1d_refmvs_dsp_init(Dav1dRefmvsDSPContext *dsp);
void dav1d_refmvs_dsp_init_arm(Dav1dRefmvsDSPContext *dsp);
void dav1d_refmvs_dsp_init_loongarch(Dav1dRefmvsDSPContext *dsp);
void dav1d_refmvs_dsp_init_x86(Dav1dRefmvsDSPContext *dsp);

#endif /* DAV1D_SRC_REF_MVS_H */
@ -0,0 +1,662 @@
/******************************************************************************
 * Copyright © 2018, VideoLAN and dav1d authors
 * Copyright © 2023, Nathan Egge
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice, this
 *    list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 *    this list of conditions and the following disclaimer in the documentation
 *    and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *****************************************************************************/

#include "src/riscv/asm.S"

function inv_txfm_add_4x4_rvv, export=1, ext=v
    csrw            vxrm, zero

    vsetivli        zero, 4, e16, mf2, ta, ma
    vle16.v         v0, (a2)
    addi            t0, a2, 8
    vle16.v         v1, (t0)
    addi            t0, t0, 8
    vle16.v         v2, (t0)
    addi            t0, t0, 8
    vle16.v         v3, (t0)

    jalr            t0, a4

    vmv.v.x         v4, zero

    vsseg4e16.v     v0, (a2)
    vle16.v         v0, (a2)
    vse16.v         v4, (a2)
    addi            t0, a2, 8
    vle16.v         v1, (t0)
    vse16.v         v4, (t0)
    addi            t0, t0, 8
    vle16.v         v2, (t0)
    vse16.v         v4, (t0)
    addi            t0, t0, 8
    vle16.v         v3, (t0)
    vse16.v         v4, (t0)

    jalr            t0, a5

    vssra.vi        v0, v0, 4
    vssra.vi        v1, v1, 4
    vssra.vi        v2, v2, 4
    vssra.vi        v3, v3, 4

itx_4x4_end:
    vsetvli         zero, zero, e8, mf4, ta, ma
    vle8.v          v4, (a0)
    add             t0, a0, a1
    vle8.v          v5, (t0)
    add             t0, t0, a1
    vle8.v          v6, (t0)
    add             t0, t0, a1
    vle8.v          v7, (t0)

    vwaddu.wv       v0, v0, v4
    vwaddu.wv       v1, v1, v5
    vwaddu.wv       v2, v2, v6
    vwaddu.wv       v3, v3, v7

    vsetvli         zero, zero, e16, mf2, ta, ma
    vmax.vx         v0, v0, zero
    vmax.vx         v1, v1, zero
    vmax.vx         v2, v2, zero
    vmax.vx         v3, v3, zero

    vsetvli         zero, zero, e8, mf4, ta, ma

    vnclipu.wi      v4, v0, 0
    vnclipu.wi      v5, v1, 0
    vnclipu.wi      v6, v2, 0
    vnclipu.wi      v7, v3, 0

    vse8.v          v4, (a0)
    add             a0, a0, a1
    vse8.v          v5, (a0)
    add             a0, a0, a1
    vse8.v          v6, (a0)
    add             a0, a0, a1
    vse8.v          v7, (a0)

    ret
endfunc

function inv_identity_e16_x4_rvv, export=1, ext=v
    li              t1, (5793-4096)*8
    vsmul.vx        v4, v0, t1
    vsmul.vx        v5, v1, t1
    vsmul.vx        v6, v2, t1
    vsmul.vx        v7, v3, t1

    vsadd.vv        v0, v0, v4
    vsadd.vv        v1, v1, v5
    vsadd.vv        v2, v2, v6
    vsadd.vv        v3, v3, v7

    jr              t0
endfunc

.macro idct_4 o0, o1, o2, o3
    li              t1, 2896
    li              t2, 1567
    li              t3, 3784

    vwmul.vx        v8, \o0, t1
    vwmul.vx        v10, \o0, t1
    vwmacc.vx       v8, t1, \o2
    neg             t1, t1
    vwmacc.vx       v10, t1, \o2

    vwmul.vx        v12, \o1, t3
    neg             t3, t3
    vwmul.vx        v14, \o1, t2
    vwmacc.vx       v12, t2, \o3
    vwmacc.vx       v14, t3, \o3

    li              t1, 2048

    vwadd.wx        v8, v8, t1
    vwadd.wx        v10, v10, t1
    vwadd.wx        v12, v12, t1
    vwadd.wx        v14, v14, t1

    vnsra.wi        v8, v8, 12
    vnsra.wi        v10, v10, 12
    vnsra.wi        v12, v12, 12
    vnsra.wi        v14, v14, 12

    vsadd.vv        \o0, v8, v12
    vsadd.vv        \o1, v10, v14
    vssub.vv        \o2, v10, v14
    vssub.vv        \o3, v8, v12
.endm
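A note on the Q12 fixed-point constants in idct_4 above. The identities below are standard AV1 inverse-DCT background, not stated in this patch:

\[ 2896 \approx \frac{4096}{\sqrt{2}}, \qquad 1567 \approx 4096\cos\frac{6\pi}{16}, \qquad 3784 \approx 4096\sin\frac{6\pi}{16} \]

so each vwmul/vwmacc pair performs a fixed-point rotation, the added $2048 = 2^{11}$ is the rounding bias, and vnsra.wi with shift 12 narrows the widened product back to 16 bits after the $\gg 12$.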

.macro iadst_4 o0, o1, o2, o3
    li              t1, 1321
    li              t2, 3803
    li              t3, 2482

    vwmul.vx        v4, v0, t1
    vwmul.vx        v5, v0, t3
    neg             t1, t1
    vwmacc.vx       v4, t2, v2
    vwmacc.vx       v5, t1, v2
    neg             t2, t2
    vwmacc.vx       v4, t3, v3
    vwmacc.vx       v5, t2, v3

    vwsub.vv        v6, v0, v2
    vwadd.wv        v6, v6, v3

    li              t1, 3344
    vwmul.vx        v7, v1, t1

    vsetvli         zero, zero, e32, m1, ta, ma

    vmul.vx         v6, v6, t1

    vadd.vv         v8, v4, v5
    vadd.vv         v4, v4, v7
    vadd.vv         v5, v5, v7
    vsub.vv         v7, v8, v7

    li              t1, 2048

    vadd.vx         v4, v4, t1
    vadd.vx         v5, v5, t1
    vadd.vx         v6, v6, t1
    vadd.vx         v7, v7, t1

    vsetvli         zero, zero, e16, mf2, ta, ma

    vnsra.wi        \o0, v4, 12
    vnsra.wi        \o1, v5, 12
    vnsra.wi        \o2, v6, 12
    vnsra.wi        \o3, v7, 12
.endm

function inv_dct_e16_x4_rvv, export=1, ext=v
    idct_4          v0, v1, v2, v3
    jr              t0
endfunc

function inv_adst_e16_x4_rvv, export=1, ext=v
    iadst_4         v0, v1, v2, v3
    jr              t0
endfunc

function inv_flipadst_e16_x4_rvv, export=1, ext=v
    iadst_4         v3, v2, v1, v0
    jr              t0
endfunc

.macro def_fn_4x4 txfm1, txfm2
function inv_txfm_add_\txfm1\()_\txfm2\()_4x4_8bpc_rvv, export=1, ext=v
.ifc \txfm1\()_\txfm2, dct_dct
    beqz            a3, 1f
.endif
    la              a4, inv_\txfm1\()_e16_x4_rvv
    la              a5, inv_\txfm2\()_e16_x4_rvv
    j               inv_txfm_add_4x4_rvv
.ifc \txfm1\()_\txfm2, dct_dct
1:
    csrw            vxrm, zero
    vsetivli        zero, 4, e16, mf2, ta, ma
    ld              t2, (a2)
    li              t1, 2896*8
    vmv.v.x         v0, t2
    vsmul.vx        v0, v0, t1
    sd              x0, (a2)
    vsmul.vx        v0, v0, t1
    vssra.vi        v0, v0, 4
    vmv.v.v         v1, v0
    vmv.v.v         v2, v0
    vmv.v.v         v3, v0
    j               itx_4x4_end
.endif
endfunc
.endm

def_fn_4x4 dct, dct
def_fn_4x4 identity, identity
def_fn_4x4 dct, adst
def_fn_4x4 dct, flipadst
def_fn_4x4 dct, identity
def_fn_4x4 adst, dct
def_fn_4x4 adst, adst
def_fn_4x4 adst, flipadst
def_fn_4x4 flipadst, dct
def_fn_4x4 flipadst, adst
def_fn_4x4 flipadst, flipadst
def_fn_4x4 identity, dct
def_fn_4x4 adst, identity
def_fn_4x4 flipadst, identity
def_fn_4x4 identity, adst
def_fn_4x4 identity, flipadst

.macro def_fn_8x8_base variant
function inv_txfm_\variant\()add_8x8_rvv, export=1, ext=v
    csrw            vxrm, zero

    vsetivli        zero, 8, e16, m1, ta, ma
    vle16.v         v0, (a2)
    addi            t0, a2, 16
    vle16.v         v1, (t0)
    addi            t0, t0, 16
    vle16.v         v2, (t0)
    addi            t0, t0, 16
    vle16.v         v3, (t0)
    addi            t0, t0, 16
    vle16.v         v4, (t0)
    addi            t0, t0, 16
    vle16.v         v5, (t0)
    addi            t0, t0, 16
    vle16.v         v6, (t0)
    addi            t0, t0, 16
    vle16.v         v7, (t0)

.ifc \variant, identity_
    // The identity vsadd.vv and downshift vssra.vi 1 cancel out
.else
    jalr            t0, a4

    vssra.vi        v0, v0, 1
    vssra.vi        v1, v1, 1
    vssra.vi        v2, v2, 1
    vssra.vi        v3, v3, 1
    vssra.vi        v4, v4, 1
    vssra.vi        v5, v5, 1
    vssra.vi        v6, v6, 1
    vssra.vi        v7, v7, 1
.endif

    vsseg8e16.v     v0, (a2)
    vle16.v         v0, (a2)
    addi            t0, a2, 16
    vle16.v         v1, (t0)
    addi            t0, t0, 16
    vle16.v         v2, (t0)
    addi            t0, t0, 16
    vle16.v         v3, (t0)
    addi            t0, t0, 16
    vle16.v         v4, (t0)
    addi            t0, t0, 16
    vle16.v         v5, (t0)
    addi            t0, t0, 16
    vle16.v         v6, (t0)
    addi            t0, t0, 16
    vle16.v         v7, (t0)

    jalr            t0, a5

    vssra.vi        v0, v0, 4
    vssra.vi        v1, v1, 4
    vssra.vi        v2, v2, 4
    vssra.vi        v3, v3, 4
    vssra.vi        v4, v4, 4
    vssra.vi        v5, v5, 4
    vssra.vi        v6, v6, 4
    vssra.vi        v7, v7, 4

    li              t1, 64
    vsetvli         zero, t1, e16, m8, ta, ma
    vmv.v.x         v8, zero
    vse16.v         v8, (a2)

.ifc \variant, identity_
itx_8x8_end:
.endif
    vsetivli        zero, 8, e8, mf2, ta, ma
    vle8.v          v8, (a0)
    add             t0, a0, a1
    vle8.v          v9, (t0)
    add             t0, t0, a1
    vle8.v          v10, (t0)
    add             t0, t0, a1
    vle8.v          v11, (t0)
    add             t0, t0, a1
    vle8.v          v12, (t0)
    add             t0, t0, a1
    vle8.v          v13, (t0)
    add             t0, t0, a1
    vle8.v          v14, (t0)
    add             t0, t0, a1
    vle8.v          v15, (t0)

    vwaddu.wv       v0, v0, v8
    vwaddu.wv       v1, v1, v9
    vwaddu.wv       v2, v2, v10
    vwaddu.wv       v3, v3, v11
    vwaddu.wv       v4, v4, v12
    vwaddu.wv       v5, v5, v13
    vwaddu.wv       v6, v6, v14
    vwaddu.wv       v7, v7, v15

    vsetvli         zero, zero, e16, m1
    vmax.vx         v0, v0, zero
    vmax.vx         v1, v1, zero
    vmax.vx         v2, v2, zero
    vmax.vx         v3, v3, zero
    vmax.vx         v4, v4, zero
    vmax.vx         v5, v5, zero
    vmax.vx         v6, v6, zero
    vmax.vx         v7, v7, zero

    vsetvli         zero, zero, e8, mf2, ta, ma

    vnclipu.wi      v8, v0, 0
    vnclipu.wi      v9, v1, 0
    vnclipu.wi      v10, v2, 0
    vnclipu.wi      v11, v3, 0
    vnclipu.wi      v12, v4, 0
    vnclipu.wi      v13, v5, 0
    vnclipu.wi      v14, v6, 0
    vnclipu.wi      v15, v7, 0

    vse8.v          v8, (a0)
    add             a0, a0, a1
    vse8.v          v9, (a0)
    add             a0, a0, a1
    vse8.v          v10, (a0)
    add             a0, a0, a1
    vse8.v          v11, (a0)
    add             a0, a0, a1
    vse8.v          v12, (a0)
    add             a0, a0, a1
    vse8.v          v13, (a0)
    add             a0, a0, a1
    vse8.v          v14, (a0)
    add             a0, a0, a1
    vse8.v          v15, (a0)

    ret
endfunc
.endm

def_fn_8x8_base
def_fn_8x8_base identity_

function inv_identity_e16_x8_rvv, export=1, ext=v
    vsadd.vv        v0, v0, v0
    vsadd.vv        v1, v1, v1
    vsadd.vv        v2, v2, v2
    vsadd.vv        v3, v3, v3
    vsadd.vv        v4, v4, v4
    vsadd.vv        v5, v5, v5
    vsadd.vv        v6, v6, v6
    vsadd.vv        v7, v7, v7

    jr              t0
endfunc

function inv_dct_e16_x8_rvv, export=1, ext=v
    idct_4          v0, v2, v4, v6

    li              t1, 799
    li              t2, 4017
    li              t3, 3406
    li              t4, 2276

    vwmul.vx        v14, v1, t2
    neg             t2, t2
    vwmul.vx        v8, v1, t1
    vwmacc.vx       v14, t1, v7
    vwmacc.vx       v8, t2, v7

    vwmul.vx        v12, v5, t4
    neg             t4, t4
    vwmul.vx        v10, v5, t3
    vwmacc.vx       v12, t3, v3
    vwmacc.vx       v10, t4, v3

    li              t1, 2048

    vwadd.wx        v8, v8, t1
    vwadd.wx        v10, v10, t1
    vwadd.wx        v12, v12, t1
    vwadd.wx        v14, v14, t1

    vnsra.wi        v8, v8, 12
    vnsra.wi        v10, v10, 12
    vnsra.wi        v12, v12, 12
    vnsra.wi        v14, v14, 12

    vssub.vv        v7, v14, v12
    vsadd.vv        v14, v14, v12
    vssub.vv        v1, v8, v10
    vsadd.vv        v8, v8, v10

    li              t2, 2896

    vwmul.vx        v10, v7, t2
    vwmul.vx        v12, v7, t2
    vwmacc.vx       v12, t2, v1
    neg             t2, t2
    vwmacc.vx       v10, t2, v1

    vwadd.wx        v10, v10, t1
    vwadd.wx        v12, v12, t1

    vnsra.wi        v10, v10, 12
    vnsra.wi        v12, v12, 12

    vssub.vv        v7, v0, v14
    vsadd.vv        v0, v0, v14
    vssub.vv        v9, v2, v12
    vsadd.vv        v1, v2, v12
    vssub.vv        v5, v4, v10
    vsadd.vv        v2, v4, v10
    vssub.vv        v4, v6, v8
    vsadd.vv        v3, v6, v8
    vmv.v.v         v6, v9

    jr              t0
endfunc

.macro iadst_8 o0, o1, o2, o3, o4, o5, o6, o7
    li              t1, 4076
    li              t2, 401
    li              t3, 3612
    li              t4, 1931
    li              t5, 2598
    li              t6, 3166

    vwmul.vx        v8, v7, t1
    neg             t1, t1
    vwmul.vx        v10, v7, t2
    vwmacc.vx       v8, t2, v0
    vwmacc.vx       v10, t1, v0

    vwmul.vx        v12, v5, t3
    neg             t3, t3
    vwmul.vx        v14, v5, t4
    vwmacc.vx       v12, t4, v2
    vwmacc.vx       v14, t3, v2

    vwmul.vx        v16, v3, t5
    neg             t5, t5
    vwmul.vx        v18, v3, t6
    vwmacc.vx       v16, t6, v4
    vwmacc.vx       v18, t5, v4

    li              t1, 2048
    li              t2, 1189
    li              t3, 3920
    li              t4, 1567
    li              t5, 3784
    li              t6, 2896

    vwmul.vx        v20, v1, t2
    neg             t2, t2
    vwmul.vx        v22, v1, t3
    vwmacc.vx       v20, t3, v6
    vwmacc.vx       v22, t2, v6

    vwadd.wx        v8, v8, t1
    vwadd.wx        v10, v10, t1
    vwadd.wx        v12, v12, t1
    vwadd.wx        v14, v14, t1
    vwadd.wx        v16, v16, t1
    vwadd.wx        v18, v18, t1
    vwadd.wx        v20, v20, t1
    vwadd.wx        v22, v22, t1

    vnsra.wi        v8, v8, 12
    vnsra.wi        v10, v10, 12
    vnsra.wi        v12, v12, 12
    vnsra.wi        v14, v14, 12
    vnsra.wi        v16, v16, 12
    vnsra.wi        v18, v18, 12
    vnsra.wi        v20, v20, 12
    vnsra.wi        v22, v22, 12

    vssub.vv        v4, v8, v16
    vsadd.vv        v8, v8, v16
    vsadd.vv        v1, v10, v18
    vsadd.vv        v2, v12, v20
    vsadd.vv        v3, v14, v22
    vssub.vv        v5, v10, v18
    vssub.vv        v6, v12, v20
    vssub.vv        v22, v14, v22

    vsadd.vv        \o0, v8, v2
    vsadd.vv        \o7, v1, v3
    vssub.vv        v2, v8, v2
    vssub.vv        v3, v1, v3

    vwmul.vx        v8, v4, t5
    vwmul.vx        v10, v4, t4
    vwmul.vx        v12, v22, t5
    vwmul.vx        v14, v22, t4
    vwmacc.vx       v8, t4, v5
    neg             t4, t4
    vwmacc.vx       v14, t5, v6
    neg             t5, t5
    vwmacc.vx       v12, t4, v6
    vwmacc.vx       v10, t5, v5

    vwadd.wx        v8, v8, t1
    vwadd.wx        v10, v10, t1
    vwadd.wx        v12, v12, t1
    vwadd.wx        v14, v14, t1

    vnsra.wi        v8, v8, 12
    vnsra.wi        v10, v10, 12
    vnsra.wi        v12, v12, 12
    vnsra.wi        v14, v14, 12

    vsadd.vv        \o1, v8, v12
    vsadd.vv        \o6, v10, v14
    vssub.vv        v8, v8, v12
    vssub.vv        v9, v10, v14

    vwmul.vx        v10, v2, t6
    vwmul.vx        v12, v2, t6
    vwmul.vx        v14, v8, t6
    vwmul.vx        v16, v8, t6
    vwmacc.vx       v10, t6, v3
    vwmacc.vx       v14, t6, v9
    neg             t6, t6
    vwmacc.vx       v12, t6, v3
    vwmacc.vx       v16, t6, v9

    vwadd.wx        v10, v10, t1
    vwadd.wx        v12, v12, t1
    vwadd.wx        v14, v14, t1
    vwadd.wx        v16, v16, t1

    vnsra.wi        \o3, v10, 12
    vnsra.wi        \o4, v12, 12
    vnsra.wi        \o2, v14, 12
    vnsra.wi        \o5, v16, 12

    vmv.v.x         v8, zero
    vssub.vv        \o1, v8, \o1
    vssub.vv        \o3, v8, \o3
    vssub.vv        \o5, v8, \o5
    vssub.vv        \o7, v8, \o7
.endm

function inv_adst_e16_x8_rvv, export=1, ext=v
    iadst_8         v0, v1, v2, v3, v4, v5, v6, v7
    jr              t0
endfunc

function inv_flipadst_e16_x8_rvv, export=1, ext=v
    iadst_8         v7, v6, v5, v4, v3, v2, v1, v0
    jr              t0
endfunc

.macro def_fn_8x8 txfm1, txfm2
function inv_txfm_add_\txfm1\()_\txfm2\()_8x8_8bpc_rvv, export=1, ext=v
.ifc \txfm1\()_\txfm2, dct_dct
    beqz            a3, 1f
.endif
    la              a5, inv_\txfm2\()_e16_x8_rvv
.ifc \txfm1, identity
    j               inv_txfm_identity_add_8x8_rvv
.else
    la              a4, inv_\txfm1\()_e16_x8_rvv
    j               inv_txfm_add_8x8_rvv
.endif
.ifc \txfm1\()_\txfm2, dct_dct
1:
    csrw            vxrm, zero
    vsetivli        zero, 8, e16, m1, ta, ma
    ld              t2, (a2)
    li              t1, 2896*8
    vmv.v.x         v0, t2
    vsmul.vx        v0, v0, t1
    sd              x0, (a2)
    vssra.vi        v0, v0, 1
    vsmul.vx        v0, v0, t1
    vssra.vi        v0, v0, 4
    vmv.v.v         v1, v0
    vmv.v.v         v2, v0
    vmv.v.v         v3, v0
    vmv.v.v         v4, v0
    vmv.v.v         v5, v0
    vmv.v.v         v6, v0
    vmv.v.v         v7, v0
    j               itx_8x8_end
.endif
endfunc
.endm

def_fn_8x8 dct, dct
def_fn_8x8 identity, identity
def_fn_8x8 dct, adst
def_fn_8x8 dct, flipadst
def_fn_8x8 dct, identity
def_fn_8x8 adst, dct
def_fn_8x8 adst, adst
def_fn_8x8 adst, flipadst
def_fn_8x8 flipadst, dct
def_fn_8x8 flipadst, adst
def_fn_8x8 flipadst, flipadst
def_fn_8x8 identity, dct
def_fn_8x8 adst, identity
def_fn_8x8 flipadst, identity
def_fn_8x8 identity, adst
def_fn_8x8 identity, flipadst
@@ -0,0 +1,126 @@
/*
 * Copyright © 2018, VideoLAN and dav1d authors
 * Copyright © 2023, Nathan Egge
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice, this
 *    list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 *    this list of conditions and the following disclaimer in the documentation
 *    and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#ifndef DAV1D_SRC_RISCV_ASM_S
#define DAV1D_SRC_RISCV_ASM_S

#include "config.h"

#if !defined(PIC)
#if defined(__PIC__)
#define PIC __PIC__
#elif defined(__pic__)
#define PIC __pic__
#endif
#endif

#ifndef PRIVATE_PREFIX
#define PRIVATE_PREFIX dav1d_
#endif

#define PASTE(a,b) a ## b
#define CONCAT(a,b) PASTE(a,b)

#ifdef PREFIX
#define EXTERN CONCAT(_,PRIVATE_PREFIX)
#else
#define EXTERN PRIVATE_PREFIX
#endif

.macro function name, export=0, ext=
    .macro endfunc
#ifdef __ELF__
        .size \name, . - \name
#endif
        .option pop
        .purgem endfunc
    .endm
    .text
    .option push
    .ifnb \ext
        .option arch, +\ext
    .endif
    .if \export
        .global EXTERN\name
#ifdef __ELF__
        .type EXTERN\name, %function
        .hidden EXTERN\name
#elif defined(__MACH__)
        .private_extern EXTERN\name
#endif
EXTERN\name:
    .else
#ifdef __ELF__
        .type \name, %function
#endif
    .endif
\name:
.endm

.macro const name, export=0, align=2
    .macro endconst
#ifdef __ELF__
        .size \name, . - \name
#endif
        .purgem endconst
    .endm
#if defined(_WIN32)
    .section .rdata
#elif !defined(__MACH__)
    .section .rodata
#else
    .const_data
#endif
    .align \align
    .if \export
        .global EXTERN\name
#ifdef __ELF__
        .hidden EXTERN\name
#elif defined(__MACH__)
        .private_extern EXTERN\name
#endif
EXTERN\name:
    .endif
\name:
.endm

.macro thread_local name, align=3, quads=1
    .macro end_thread_local
        .size \name, . - \name
        .purgem end_thread_local
    .endm
    .section .tbss, "waT"
    .align \align
    .hidden \name
\name:
    .rept \quads
        .quad 0
    .endr
    end_thread_local
.endm

#endif /* DAV1D_SRC_RISCV_ASM_S */
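A note on the EXTERN definition in the header above: the PASTE/CONCAT pair is the standard two-step token-pasting idiom, needed because arguments of ## are pasted without prior macro expansion. A minimal C preprocessor illustration (foo is a placeholder name):

#define PASTE(a,b) a ## b
#define CONCAT(a,b) PASTE(a,b)
#define PRIVATE_PREFIX dav1d_

/* One-step pasting sees the unexpanded parameter name:  */
/*   PASTE(PRIVATE_PREFIX, foo)  -> PRIVATE_PREFIXfoo    */
/* The extra indirection expands PRIVATE_PREFIX first:   */
/*   CONCAT(PRIVATE_PREFIX, foo) -> dav1d_foo            */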
@@ -0,0 +1,49 @@
/*
 * Copyright © 2022, VideoLAN and dav1d authors
 * Copyright © 2022, Nathan Egge
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice, this
 *    list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 *    this list of conditions and the following disclaimer in the documentation
 *    and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include "config.h"

#include "common/attributes.h"

#include "src/riscv/cpu.h"

#if defined(HAVE_GETAUXVAL)
#include <sys/auxv.h>

#define HWCAP_RVV (1 << ('v' - 'a'))

#endif

COLD unsigned dav1d_get_cpu_flags_riscv(void) {
    unsigned flags = 0;
#if defined(HAVE_GETAUXVAL)
    unsigned long hw_cap = getauxval(AT_HWCAP);
    flags |= (hw_cap & HWCAP_RVV) ? DAV1D_RISCV_CPU_FLAG_V : 0;
#endif

    return flags;
}
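The HWCAP_RVV constant above encodes how Linux reports single-letter RISC-V ISA extensions in AT_HWCAP: extension 'x' occupies bit ('x' - 'a'), so 'v' lands on bit 21. A hedged usage sketch follows; the main() harness is illustrative, not dav1d code, and the flag value matches DAV1D_RISCV_CPU_FLAG_V from the header that follows.

#include <stdio.h>

unsigned dav1d_get_cpu_flags_riscv(void);

int main(void) {
    /* DAV1D_RISCV_CPU_FLAG_V == 1 << 0 */
    if (dav1d_get_cpu_flags_riscv() & (1u << 0))
        puts("RVV present: vector itx entry points get installed");
    else
        puts("no RVV: the C fallbacks stay in the DSP table");
    return 0;
}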
@@ -0,0 +1,37 @@
/*
 * Copyright © 2022, VideoLAN and dav1d authors
 * Copyright © 2022, Nathan Egge
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice, this
 *    list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 *    this list of conditions and the following disclaimer in the documentation
 *    and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#ifndef DAV1D_SRC_RISCV_CPU_H
#define DAV1D_SRC_RISCV_CPU_H

enum CpuFlags {
    DAV1D_RISCV_CPU_FLAG_V = 1 << 0,
};

unsigned dav1d_get_cpu_flags_riscv(void);

#endif /* DAV1D_SRC_RISCV_CPU_H */
@@ -0,0 +1,109 @@
/*
 * Copyright © 2018, VideoLAN and dav1d authors
 * Copyright © 2023, Nathan Egge
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice, this
 *    list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 *    this list of conditions and the following disclaimer in the documentation
 *    and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include "src/cpu.h"
#include "src/itx.h"

#define decl_itx2_fns(w, h, opt) \
    decl_itx_fn(BF(dav1d_inv_txfm_add_dct_dct_##w##x##h, opt)); \
    decl_itx_fn(BF(dav1d_inv_txfm_add_identity_identity_##w##x##h, opt))

#define decl_itx12_fns(w, h, opt) \
    decl_itx2_fns(w, h, opt); \
    decl_itx_fn(BF(dav1d_inv_txfm_add_dct_adst_##w##x##h, opt)); \
    decl_itx_fn(BF(dav1d_inv_txfm_add_dct_flipadst_##w##x##h, opt)); \
    decl_itx_fn(BF(dav1d_inv_txfm_add_dct_identity_##w##x##h, opt)); \
    decl_itx_fn(BF(dav1d_inv_txfm_add_adst_dct_##w##x##h, opt)); \
    decl_itx_fn(BF(dav1d_inv_txfm_add_adst_adst_##w##x##h, opt)); \
    decl_itx_fn(BF(dav1d_inv_txfm_add_adst_flipadst_##w##x##h, opt)); \
    decl_itx_fn(BF(dav1d_inv_txfm_add_flipadst_dct_##w##x##h, opt)); \
    decl_itx_fn(BF(dav1d_inv_txfm_add_flipadst_adst_##w##x##h, opt)); \
    decl_itx_fn(BF(dav1d_inv_txfm_add_flipadst_flipadst_##w##x##h, opt)); \
    decl_itx_fn(BF(dav1d_inv_txfm_add_identity_dct_##w##x##h, opt))

#define decl_itx16_fns(w, h, opt) \
    decl_itx12_fns(w, h, opt); \
    decl_itx_fn(BF(dav1d_inv_txfm_add_adst_identity_##w##x##h, opt)); \
    decl_itx_fn(BF(dav1d_inv_txfm_add_flipadst_identity_##w##x##h, opt)); \
    decl_itx_fn(BF(dav1d_inv_txfm_add_identity_adst_##w##x##h, opt)); \
    decl_itx_fn(BF(dav1d_inv_txfm_add_identity_flipadst_##w##x##h, opt))

#define decl_itx17_fns(w, h, opt) \
    decl_itx16_fns(w, h, opt); \
    decl_itx_fn(BF(dav1d_inv_txfm_add_wht_wht_##w##x##h, opt))

#define decl_itx_fns(ext) \
    decl_itx17_fns( 4, 4, ext); \
    decl_itx16_fns( 8, 8, ext)

decl_itx_fns(rvv);

static ALWAYS_INLINE void itx_dsp_init_riscv(Dav1dInvTxfmDSPContext *const c, int const bpc) {
#define assign_itx_fn(pfx, w, h, type, type_enum, ext) \
    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)

#define assign_itx1_fn(pfx, w, h, ext) \
    assign_itx_fn(pfx, w, h, dct_dct, DCT_DCT, ext)

#define assign_itx2_fn(pfx, w, h, ext) \
    assign_itx1_fn(pfx, w, h, ext); \
    assign_itx_fn(pfx, w, h, identity_identity, IDTX, ext)

#define assign_itx12_fn(pfx, w, h, ext) \
    assign_itx2_fn(pfx, w, h, ext); \
    assign_itx_fn(pfx, w, h, dct_adst, ADST_DCT, ext); \
    assign_itx_fn(pfx, w, h, dct_flipadst, FLIPADST_DCT, ext); \
    assign_itx_fn(pfx, w, h, dct_identity, H_DCT, ext); \
    assign_itx_fn(pfx, w, h, adst_dct, DCT_ADST, ext); \
    assign_itx_fn(pfx, w, h, adst_adst, ADST_ADST, ext); \
    assign_itx_fn(pfx, w, h, adst_flipadst, FLIPADST_ADST, ext); \
    assign_itx_fn(pfx, w, h, flipadst_dct, DCT_FLIPADST, ext); \
    assign_itx_fn(pfx, w, h, flipadst_adst, ADST_FLIPADST, ext); \
    assign_itx_fn(pfx, w, h, flipadst_flipadst, FLIPADST_FLIPADST, ext); \
    assign_itx_fn(pfx, w, h, identity_dct, V_DCT, ext)

#define assign_itx16_fn(pfx, w, h, ext) \
    assign_itx12_fn(pfx, w, h, ext); \
    assign_itx_fn(pfx, w, h, adst_identity, H_ADST, ext); \
    assign_itx_fn(pfx, w, h, flipadst_identity, H_FLIPADST, ext); \
    assign_itx_fn(pfx, w, h, identity_adst, V_ADST, ext); \
    assign_itx_fn(pfx, w, h, identity_flipadst, V_FLIPADST, ext)

#define assign_itx17_fn(pfx, w, h, ext) \
    assign_itx16_fn(pfx, w, h, ext); \
    assign_itx_fn(pfx, w, h, wht_wht, WHT_WHT, ext)

    const unsigned flags = dav1d_get_cpu_flags();

    if (!(flags & DAV1D_RISCV_CPU_FLAG_V)) return;

#if BITDEPTH == 8
    assign_itx16_fn( , 4, 4, rvv);
    assign_itx16_fn( , 8, 8, rvv);
#endif
}
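The macro pyramid above only fans a single assignment out across dav1d's transform-type table; nothing else happens at init time. For orientation, one hand-expanded instance, assuming BITDEPTH == 8 so that BF() appends the _8bpc suffix:

/* assign_itx_fn( , 4, 4, dct_dct, DCT_DCT, rvv) expands to roughly: */
c->itxfm_add[TX_4X4][DCT_DCT] =
    dav1d_inv_txfm_add_dct_dct_4x4_8bpc_rvv;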
@@ -138,13 +138,13 @@ static ALWAYS_INLINE void intra_pred_dsp_init_x86(Dav1dIntraPredDSPContext *cons
    init_angular_ipred_fn(HOR_PRED, ipred_h, avx512icl);
    init_angular_ipred_fn(VERT_PRED, ipred_v, avx512icl);
    init_angular_ipred_fn(Z2_PRED, ipred_z2, avx512icl);
    init_angular_ipred_fn(Z3_PRED, ipred_z3, avx512icl);
#endif
    init_angular_ipred_fn(PAETH_PRED, ipred_paeth, avx512icl);
    init_angular_ipred_fn(SMOOTH_PRED, ipred_smooth, avx512icl);
    init_angular_ipred_fn(SMOOTH_H_PRED, ipred_smooth_h, avx512icl);
    init_angular_ipred_fn(SMOOTH_V_PRED, ipred_smooth_v, avx512icl);
    init_angular_ipred_fn(Z1_PRED, ipred_z1, avx512icl);
    init_angular_ipred_fn(Z3_PRED, ipred_z3, avx512icl);
    init_angular_ipred_fn(FILTER_PRED, ipred_filter, avx512icl);

    c->pal_pred = BF(dav1d_pal_pred, avx512icl);
@@ -1,5 +1,5 @@
; Copyright © 2022, VideoLAN and dav1d authors
; Copyright © 2022, Two Orioles, LLC
; Copyright © 2022-2024, VideoLAN and dav1d authors
; Copyright © 2022-2024, Two Orioles, LLC
; All rights reserved.
;
; Redistribution and use in source and binary forms, with or without
@@ -42,12 +42,16 @@ pal_pred_perm: db 0, 16, 32, 48, 1, 17, 33, 49, 2, 18, 34, 50, 3, 19, 35, 51
               db 4, 20, 36, 52, 5, 21, 37, 53, 6, 22, 38, 54, 7, 23, 39, 55
               db 8, 24, 40, 56, 9, 25, 41, 57, 10, 26, 42, 58, 11, 27, 43, 59
               db 12, 28, 44, 60, 13, 29, 45, 61, 14, 30, 46, 62, 15, 31, 47, 63
pw_0to31: dw 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
          dw 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
pw_31to0: dw 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16
          dw 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
pw_1to32: dw 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16
          dw 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32
z_upsample: dw 0, -1, 1, 0, 2, 1, 3, 2, 4, 3, 5, 4, 6, 5, 7, 6
            dw 8, 7, 9, 8, 10, 9, 11, 10, 12, 11, 13, 12, 14, 13, 15, 14
z_xpos_mul: dw 1, 1, 1, 1, 2, 2, 1, 1, 3, 3, 2, 2, 4, 4, 2, 2
            dw 5, 5, 3, 3, 6, 6, 3, 3, 7, 7, 4, 4, 8, 8, 4, 4
z_ypos_mul: dw 0, 0, 0, 0, 1, 1, 0, 0, 2, 2, 1, 1, 3, 3, 1, 1
            dw 4, 4, 2, 2, 5, 5, 2, 2, 6, 6, 3, 3, 7, 7, 3, 3
z_filter_t0: db 55,127, 39,127, 39,127, 7, 15, 31, 7, 15, 31, 0, 3, 31, 0
z_filter_t1: db 39, 63, 19, 47, 19, 47, 3, 3, 3, 3, 3, 3, 0, 0, 0, 0
z_xpos_off1a: dw 30720, 30784, 30848, 30912, 30976, 31040, 31104, 31168
@@ -75,13 +79,25 @@ z_filter_wh: db 7, 7, 11, 11, 15, 15, 19, 19, 19, 23, 23, 23, 31, 31, 31, 39
z_filter_k: dw 8, 8, 6, 6, 4, 4
            dw 4, 4, 5, 5, 4, 4
            dw 0, 0, 0, 0, 2, 2
pw_15: times 2 dw 15
pw_16: times 2 dw 16
pw_17: times 2 dw 17
pw_24: times 2 dw 24
pw_32: times 2 dw 32
pw_63: times 2 dw 63
pw_64: times 2 dw 64
pw_512: times 2 dw 512
pw_31806: times 2 dw 31806
pw_32640: times 2 dw 32640
pw_32672: times 2 dw 32672
pw_32704: times 2 dw 32704
pw_32735: times 2 dw 32735
pw_32736: times 2 dw 32736

%define pw_2 (z_xpos_mul+4* 2)
%define pw_3 (z_xpos_mul+4* 4)
%define pw_7 (z_xpos_mul+4*12)
%define pw_0to31 (pw_1to32-2)

%macro JMP_TABLE 3-*
    %xdefine %1_%2_table (%%table - 2*4)
@@ -98,6 +114,7 @@ JMP_TABLE ipred_smooth_16bpc, avx512icl, w4, w8, w16, w32, w64
JMP_TABLE ipred_smooth_h_16bpc, avx512icl, w4, w8, w16, w32, w64
JMP_TABLE ipred_smooth_v_16bpc, avx512icl, w4, w8, w16, w32, w64
JMP_TABLE ipred_z1_16bpc, avx512icl, w4, w8, w16, w32, w64
JMP_TABLE ipred_z3_16bpc, avx512icl, w4, w8, w16, w32, w64
JMP_TABLE pal_pred_16bpc, avx512icl, w4, w8, w16, w32, w64

cextern smooth_weights_1d_16bpc
@@ -757,7 +774,7 @@ cglobal ipred_z1_16bpc, 3, 8, 16, dst, stride, tl, w, h, angle, dx
    lea r3d, [angleq+216]
    movu ym5, [tlq]
    mov r3b, hb
    mova m10, [base+pw_0to31]
    movu m10, [base+pw_0to31]
    cmp r3d, 8
    ja .w8_no_upsample ; !enable_intra_edge_filter || is_sm || d >= 40 || h > 8
    lea r3d, [hq+7]
@@ -1157,6 +1174,638 @@ cglobal ipred_z1_16bpc, 3, 8, 16, dst, stride, tl, w, h, angle, dx
    mov rsp, r7
    RET

cglobal ipred_z3_16bpc, 3, 8, 16, dst, stride, tl, w, h, angle, dy
    lea r7, [z_filter_t0]
    tzcnt wd, wm
    movifnidn angled, anglem
    lea t0, [dr_intra_derivative+45*2-1]
    movsxd wq, [base+ipred_z3_16bpc_avx512icl_table+wq*4]
    sub angled, 180
    mov dyd, angled
    neg dyd
    xor angled, 0x400
    or dyq, ~0x7e
    mova m0, [base+pw_31to0]
    movzx dyd, word [t0+dyq]
    lea wq, [base+ipred_z3_16bpc_avx512icl_table+wq]
    movifnidn hd, hm
    vpbroadcastd m14, [base+pw_31806]
    vpbroadcastd m15, [base+pw_1]
    jmp wq
.w4:
    lea r3d, [hq+3]
    xor r3d, 31 ; 32 - (h + imin(w, h))
    vpbroadcastw m7, r3d
    pmaxuw m7, m0
    vpermw m6, m7, [tlq-64*1]
    test angled, 0x400 ; !enable_intra_edge_filter
    jnz .w4_main
    cmp angleb, 40
    jae .w4_filter
    lea r3d, [angleq-1024]
    sar r3d, 7
    add r3d, hd
    jg .w4_filter ; h > 8 || (h == 8 && is_sm)
    call .upsample
    movsldup m1, [base+z_ypos_mul]
    paddw m1, m1
    jmp .w4_main2
.w4_filter:
    lea r3d, [hq+3]
    call .filter32
.w4_main:
    movsldup m1, [base+z_ypos_mul]
.w4_main2:
    vpbroadcastq m0, [base+pw_1to32]
    vpbroadcastw m4, dyd
    lea r2d, [hq+4]
    shr r2d, 3
    pmullw m4, m0 ; ypos
    vpbroadcastw m0, r2d
    imul r2, strideq ; stride * imax(height / 8, 1)
    pmullw m1, m0
    lea r3, [r2*3]
    paddd m1, [base+pw_32736] {1to16}
    psrlw m2, m4, 6
    psllw m4, 9
    paddsw m2, m1 ; base+0
    vpandd m4, m14 ; frac << 9
    vpermw m3, m2, m6 ; left[base+0]
.w4_loop:
    paddsw m2, m15 ; base+1
    vpermw m1, m2, m6 ; left[base+1]
    psubw m0, m1, m3
    pmulhrsw m0, m4
    paddw m0, m3
    movq [dstq+r2*0], xm0
    movhps [dstq+r2*1], xm0
    vextracti32x4 xm3, ym0, 1
    movq [dstq+r2*2], xm3
    movhps [dstq+r3 ], xm3
    sub hd, 8
    jl .w4_end
    lea r5, [dstq+r2*4]
    vextracti32x8 ym0, m0, 1
    mova m3, m1
    movq [r5+r2*0], xm0
    movhps [r5+r2*1], xm0
    vextracti32x4 xm1, ym0, 1
    movq [r5+r2*2], xm1
    movhps [r5+r3 ], xm1
    add dstq, strideq
    test hd, hd
    jnz .w4_loop
.w4_end:
    RET
.upsample:
    vinserti32x4 m6, [tlq-14], 3
    mova m3, [base+z_upsample]
    vpbroadcastd m4, [base+pd_65536]
    add dyd, dyd
    vpermw m0, m3, m6
    paddw m3, m4
    vpermw m1, m3, m6
    paddw m3, m4
    vpermw m2, m3, m6
    paddw m3, m4
    vpermw m3, m3, m6
    vpbroadcastw m6, r9m ; pixel_max
    paddw m1, m2 ; b+c
    paddw m0, m3 ; a+d
    psubw m0, m1, m0
    psraw m0, 3
    pxor m2, m2
    paddw m0, m1
    pmaxsw m0, m2
    pavgw m0, m2
    pminsw m6, m0
    ret
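The .upsample helper just above is AV1's intra edge upsampling filter: each interpolated half-sample is a 4-tap blend of its neighbours, clipped to the bit-depth range. A scalar C sketch, accurate up to rounding-order details of the psraw/pavgw sequence (the vector permutes that gather a..d are omitted, and upsample_sample is a made-up name):

static inline int upsample_sample(int a, int b, int c, int d, int pixel_max) {
    int s = (9 * (b + c) - (a + d) + 8) >> 4; /* psubw/psraw/paddw + pavgw */
    if (s < 0) s = 0;                         /* pmaxsw with zero          */
    return s < pixel_max ? s : pixel_max;     /* pminsw with pixel_max     */
}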
.w8:
    mova m6, [tlq-64*1]
    cmp hd, 32
    je .w8_h32
    mov r3d, 8
    cmp hd, 4
    cmove r3d, hd
    lea r3d, [r3+hq-1]
    xor r3d, 31 ; 32 - (h + imin(w, h))
    vpbroadcastw m1, r3d
    vpermw m7, m1, m6
    pmaxuw m1, m0
    vpermw m6, m1, m6
    test angled, 0x400
    jnz .w8_main
    lea r3d, [angleq+216]
    mov r3b, hb
    cmp r3d, 8
    ja .w8_filter ; is_sm || d >= 40 || h > 8
    call .upsample
    movshdup m1, [base+z_ypos_mul]
    paddw m1, m1
    call .w8_main_setup
.w8_upsample_loop:
    vpermw m3, m2, m6 ; left[base+0]
    paddw m2, m15 ; base+1
    vpermw m1, m2, m6 ; left[base+1]
    psubw m0, m1, m3
    pmulhrsw m0, m4
    paddw m2, m15 ; base+2
    paddw m0, m3
    mova m3, m1
    mova [dstq+r2*0], xm0
    vextracti32x4 [dstq+r2*1], ym0, 1
    vextracti32x4 [dstq+r2*2], m0, 2
    vextracti32x4 [dstq+r3 ], m0, 3
    add dstq, strideq
    sub hd, 4
    jg .w8_upsample_loop
    RET
.w8_main_setup:
    vbroadcasti32x4 m0, [base+pw_1to32]
    vpbroadcastw m4, dyd
    rorx r2d, hd, 2
    pmullw m4, m0 ; ypos
    vpbroadcastw m0, r2d
    imul r2, strideq ; stride * height / 4
    lea r3, [r2*3]
    pmullw m1, m0 ; 0 1 2 3
    paddd m1, [base+pw_32704] {1to16}
    psrlw m2, m4, 6
    psllw m4, 9
    paddsw m2, m1 ; base+0
    vpandd m4, m14 ; frac << 9
    ret
.w8_h32:
    pmaxud m7, m0, [base+pw_24] {1to16}
    vpermw m6, m0, m6
    vpermw m7, m7, [tlq-64*2]
    test angled, 0x400
    jnz .w8_main
    call .filter64
    vpbroadcastd m0, [base+pw_7]
    pminuw m0, [base+pw_0to31]
    vpermw m7, m0, m7
    jmp .w8_main
.w8_filter:
    lea r3d, [hq+7]
    call .filter32
.w8_main:
    movshdup m1, [base+z_ypos_mul]
    call .w8_main_setup
    mova m3, m6
    vpermt2w m3, m2, m7 ; left[base+0]
.w8_loop:
    paddsw m2, m15 ; base+1
    mova m1, m6
    vpermt2w m1, m2, m7 ; left[base+1]
    psubw m0, m1, m3
    pmulhrsw m0, m4
    paddw m0, m3
    mova m3, m1
    mova [dstq+r2*0], xm0
    vextracti32x4 [dstq+r2*1], ym0, 1
    vextracti32x4 [dstq+r2*2], m0, 2
    vextracti32x4 [dstq+r3 ], m0, 3
    add dstq, strideq
    sub hd, 4
    jg .w8_loop
    RET
.filter32:
    vpbroadcastb ym10, r3d
    vpbroadcastb ym1, angled
    shr angled, 8
    vpcmpeqb k1, ym10, [base+z_filter_wh]
    mova xm2, [base+z_filter_t0+angleq*8]
    vpcmpgtb k1{k1}, ym1, ym2
    kmovd r5d, k1
    test r5d, r5d
    jz .filter32_end
    vpbroadcastw m2, [tlq]
    popcnt r5d, r5d
    vpbroadcastd m5, [base+z_filter_k+(r5-1)*4+12*0]
    valignq m2, m6, m2, 6
    vpbroadcastd m8, [base+z_filter_k+(r5-1)*4+12*1]
    valignq m4, m7, m6, 2
    vpbroadcastd m9, [base+z_filter_k+(r5-1)*4+12*2]
    palignr m1, m6, m2, 14
    pmullw m5, m6
    palignr m3, m4, m6, 2
    paddw m1, m3
    palignr m2, m6, m2, 12
    pmullw m1, m8
    palignr m4, m6, 4
    paddw m2, m4
    pmullw m2, m9
    pmovzxbw m10, ym10
    pxor m6, m6
    paddw m5, m1
    pminuw m1, m10, [base+pw_0to31]
    paddw m5, m2
    psrlw m5, 3
    pavgw m6, m5
    vpermw m7, m10, m6
    vpermw m6, m1, m6
.filter32_end:
    ret
.w16:
    mova m6, [tlq-64*1]
    cmp hd, 32
    jl .w16_h16
    pmaxud m8, m0, [base+pw_16] {1to16}
    mova m7, [tlq-64*2]
    vpermw m6, m0, m6
    jg .w16_h64
    vpermw m7, m8, m7
    test angled, 0x400
    jnz .w16_main
    call .filter64
    vpbroadcastd m0, [base+pw_15]
    vinserti32x8 m0, [base+pw_0to31], 0
    vpermw m7, m0, m7
    jmp .w16_main
.w16_h16:
    lea r3d, [hq*2-1]
    xor r3d, 31 ; 32 - (h + imin(w, h))
    vpbroadcastw m1, r3d
    vpermw m7, m1, m6
    pmaxuw m1, m0
    vpermw m6, m1, m6
    test angled, 0x400
    jnz .w16_main
    lea r3d, [hq+15]
    call .filter32
.w16_main:
    vbroadcasti32x8 m0, [base+pw_1to32]
    vpbroadcastw m4, dyd
    rorx r2d, hd, 1
    pmullw m4, m0 ; ypos
    vpbroadcastw ym1, r2d
    imul r2, strideq ; stride * height / 2
    paddd m1, [base+pw_32704] {1to16}
    lea r3, [r2+strideq]
    psrlw m2, m4, 6
    psllw m4, 9
    paddsw m2, m1 ; base+0
    vpandd m4, m14 ; frac << 9
    mova m3, m6
    vpermt2w m3, m2, m7 ; left[base+0]
.w16_loop:
    paddsw m1, m2, m15 ; base+1
    paddsw m2, m1, m15 ; base+2
    vpermi2w m1, m6, m7 ; left[base+1]
    psubw m0, m1, m3
    pmulhrsw m0, m4
    paddw m0, m3
    mova m3, m6
    vpermt2w m3, m2, m7 ; left[base+2]
    vextracti32x8 [dstq+strideq*0], m0, 1
    mova [dstq+r2 ], ym0
    psubw m0, m3, m1
    pmulhrsw m0, m4
    paddw m0, m1
    vextracti32x8 [dstq+strideq*1], m0, 1
    mova [dstq+r3 ], ym0
    lea dstq, [dstq+strideq*2]
    sub hd, 4
    jg .w16_loop
    RET
.w16_h64:
    vpermw m7, m0, m7
    vpermw m8, m8, [tlq-64*3]
    test angled, 0x400
    jnz .w16_h64_main
    valignq m11, m8, m7, 6
    call .filter64
    vshufi32x4 m2, m8, m8, q3321
    vpbroadcastd m0, [base+pw_15]
    palignr ym3, ym8, ym11, 12
    vinserti32x8 m0, [base+pw_0to31], 0
    palignr ym4, ym8, ym11, 14
    palignr ym1, ym2, ym8, 4
    paddw ym3, ym5
    palignr ym2, ym8, 2
    paddw ym8, ym4
    pavgw ym3, ym1
    paddw ym8, ym2
    paddw ym8, ym3
    psrlw ym8, 2
    vpermw m8, m0, m8
.w16_h64_main:
    vbroadcasti32x8 m0, [base+pw_1to32]
    vpbroadcastw m4, dyd
    pmullw m4, m0 ; ypos
    vpbroadcastd ym1, [base+pw_32]
    paddd m1, [base+pw_32672] {1to16}
    mov r2, strideq
    shl r2, 5 ; stride*32
    vpbroadcastd m9, [base+pw_32735]
    lea r3, [r2+strideq]
    psrlw m2, m4, 6
    psllw m4, 9
    paddsw m2, m1 ; base+0
    vpandd m4, m14 ; frac << 9
    mova m3, m7
    vpermt2w m3, m2, m6
    vpcmpgtw k1, m2, m9
    vpermw m3{k1}, m2, m8 ; left[base+0]
.w16_h64_loop:
    paddsw m2, m15 ; base+1
    mova m1, m7
    vpermt2w m1, m2, m6
    vpcmpgtw k1, m2, m9
    vpermw m1{k1}, m2, m8 ; left[base+1]
    psubw m0, m1, m3
    pmulhrsw m0, m4
    paddsw m2, m15 ; base+2
    paddw m0, m3
    mova m3, m7
    vpermt2w m3, m2, m6
    vpcmpgtw k1, m2, m9
    vpermw m3{k1}, m2, m8 ; left[base+2]
    vextracti32x8 [dstq+strideq*0], m0, 1
    mova [dstq+r2 ], ym0
    psubw m0, m3, m1
    pmulhrsw m0, m4
    paddw m0, m1
    vextracti32x8 [dstq+strideq*1], m0, 1
    mova [dstq+r3 ], ym0
    lea dstq, [dstq+strideq*2]
    sub hd, 4
    jg .w16_h64_loop
    RET
.filter64:
    vpbroadcastw m2, [tlq]
    vpbroadcastd m5, [base+pw_3]
    valignq m2, m6, m2, 6
    valignq m4, m7, m6, 2
    valignq m10, m7, m6, 6
    palignr m1, m6, m2, 12
    palignr m2, m6, m2, 14
    palignr m3, m4, m6, 4
    paddw m1, m5
    palignr m4, m6, 2
    paddw m6, m2
    valignq m2, m8, m7, 2
    pavgw m1, m3
    palignr m3, m7, m10, 12
    paddw m6, m4
    palignr m4, m7, m10, 14
    paddw m6, m1
    palignr m1, m2, m7, 4
    psrlw m6, 2
    palignr m2, m7, 2
    paddw m3, m5
    paddw m7, m4
    pavgw m3, m1
    paddw m7, m2
    paddw m7, m3
    psrlw m7, 2
    ret
.w32:
    mova m6, [tlq-64*1]
    cmp hd, 32
    jl .w32_h16
    mova m8, [tlq-64*2]
    vpermw m6, m0, m6
    vpermw m7, m0, m8
    jg .w32_h64
    test angled, 0x400
    jnz .w32_main
    vpbroadcastw xm8, xm8
    jmp .w32_filter
.w32_h16:
    lea r3d, [hq*2-1]
    xor r3d, 31 ; 32 - (h + imin(w, h))
    vpbroadcastw m1, r3d
    vpermw m7, m1, m6
    pmaxuw m1, m0
    vpermw m6, m1, m6
    test angled, 0x400
    jnz .w32_main
    vextracti32x4 xm8, m7, 3
.w32_filter:
    call .filter64
.w32_main:
    vpbroadcastw m4, dyd
    vpbroadcastd m1, [base+pw_32704]
    pmullw m4, [base+pw_1to32] ; ypos
    psrlw m2, m4, 6
    psllw m4, 9
    paddsw m2, m1 ; base+0
    vpandd m4, m14 ; frac << 9
    mova m3, m6
    vpermt2w m3, m2, m7 ; left[base+0]
.w32_loop:
    paddsw m1, m2, m15 ; base+1
    paddsw m2, m1, m15 ; base+2
    vpermi2w m1, m6, m7 ; left[base+1]
    psubw m0, m1, m3
    pmulhrsw m0, m4
    paddw m0, m3
    mova m3, m6
    vpermt2w m3, m2, m7 ; left[base+2]
    mova [dstq+strideq*0], m0
    psubw m0, m3, m1
    pmulhrsw m0, m4
    paddw m0, m1
    mova [dstq+strideq*1], m0
    lea dstq, [dstq+strideq*2]
    sub hd, 2
    jg .w32_loop
    RET
.w32_h64:
    mova m9, [tlq-64*3]
    vpermw m8, m0, m9
    test angled, 0x400
    jnz .w32_h64_main
    vpbroadcastw xm9, xm9
    call .filter96
.w32_h64_main:
    vpbroadcastw m4, dyd
    vpbroadcastd m1, [base+pw_32672]
    pmullw m4, [base+pw_1to32] ; ypos
    vpbroadcastd m9, [base+pw_32735]
    psrlw m2, m4, 6
    psllw m4, 9
    paddsw m2, m1 ; base+0
    vpandd m4, m14 ; frac << 9
    mova m3, m7
    vpermt2w m3, m2, m6
    vpcmpgtw k1, m2, m9
    vpermw m3{k1}, m2, m8 ; left[base+0]
.w32_h64_loop:
    paddsw m2, m15 ; base+1
    mova m1, m7
    vpermt2w m1, m2, m6
    vpcmpgtw k1, m2, m9
    vpermw m1{k1}, m2, m8 ; left[base+1]
    psubw m0, m1, m3
    pmulhrsw m0, m4
    paddsw m2, m15 ; base+2
    paddw m0, m3
    mova m3, m7
    vpermt2w m3, m2, m6
    vpcmpgtw k1, m2, m9
    vpermw m3{k1}, m2, m8 ; left[base+2]
    mova [dstq+strideq*0], m0
    psubw m0, m3, m1
    pmulhrsw m0, m4
    paddw m0, m1
    mova [dstq+strideq*1], m0
    lea dstq, [dstq+strideq*2]
    sub hd, 2
    jg .w32_h64_loop
    RET
.filter96:
    valignq m11, m8, m7, 6
    call .filter64
    valignq m2, m9, m8, 2
    palignr m3, m8, m11, 12
    palignr m4, m8, m11, 14
    palignr m1, m2, m8, 4
    paddw m3, m5
    palignr m2, m8, 2
    paddw m8, m4
    pavgw m3, m1
    paddw m8, m2
    paddw m8, m3
    psrlw m8, 2
    ret
.w64:
    mova m7, [tlq-64*1]
    vpermw m6, m0, m7
    cmp hd, 32
    jl .w64_h16
    mova m8, [tlq-64*2]
    vpermw m7, m0, m8
    jg .w64_h64
    test angled, 0x400
    jnz .w64_main
    vpbroadcastw m8, xm8
    mova m9, m8
    call .filter96
    vshufi32x4 m9, m8, m8, q3333
    jmp .w64_h64_main
.w64_h16:
    vpbroadcastw m7, xm7
    test angled, 0x400
    jnz .w64_main
    mova m8, m7
    call .filter64
.w64_main:
    vpbroadcastw m11, dyd
    vpbroadcastd m1, [base+pw_32704]
    pmullw m10, m11, [base+pw_1to32] ; ypos
    psllw m11, 5
    psrlw m8, m10, 6
    paddw m11, m10
    psllw m10, 9
    psrlw m9, m11, 6
    psllw m11, 9
    psubw m9, m8
    paddsw m8, m1 ; base+0
    vpandd m10, m14 ; frac << 9
    vpandd m11, m14 ; frac << 9
    mova m4, m6
    vpermt2w m4, m8, m7 ; left[base+0] ( 0..31)
    paddsw m5, m8, m9
    vpermi2w m5, m6, m7 ; left[base+0] (32..63)
.w64_loop:
    paddsw m8, m15 ; base+1 ( 0..31)
    mova m2, m6
    vpermt2w m2, m8, m7 ; left[base+1] ( 0..31)
    paddsw m3, m8, m9 ; base+1 (32..63)
    vpermi2w m3, m6, m7 ; left[base+1] (32..63)
    psubw m0, m2, m4
    psubw m1, m3, m5
    pmulhrsw m0, m10
    pmulhrsw m1, m11
    paddw m0, m4
    paddw m1, m5
    mova m4, m2
    mova [dstq+64*0], m0
    mova m5, m3
    mova [dstq+64*1], m1
    add dstq, strideq
    dec hd
    jg .w64_loop
    RET
.w64_h64:
    vpermw m8, m0, [tlq-64*3]
    mova m13, [tlq-64*4]
    vpermw m9, m0, m13
    test angled, 0x400
    jnz .w64_h64_main
    valignq m12, m9, m8, 6
    call .filter96
    vpbroadcastw xm2, xm13
    valignq m2, m9, 2
    palignr m3, m9, m12, 12
    palignr m4, m9, m12, 14
    palignr m1, m2, m9, 4
    paddw m3, m5
    palignr m2, m9, 2
    paddw m9, m4
    pavgw m3, m1
    paddw m9, m2
    paddw m9, m3
    psrlw m9, 2
.w64_h64_main:
    vpbroadcastw m11, dyd
    vpbroadcastd m1, [base+pw_32640]
    pmullw m10, m11, [base+pw_1to32] ; ypos
    psllw m11, 5
    psrlw m12, m10, 6
    paddw m11, m10
    psllw m10, 9
    psrlw m13, m11, 6
    psllw m11, 9
    psubw m13, m12
    paddsw m12, m1 ; base+0
    vpandd m10, m14 ; frac << 9
    vpandd m11, m14 ; frac << 9
    vpbroadcastd m14, [base+pw_64]
    mova m4, m6
    vpermt2w m4, m12, m7
    vptestmw k1, m12, m14
    mova m0, m8
    vpermt2w m0, m12, m9
    paddsw m1, m12, m13
    mova m5, m6
    vpermt2w m5, m1, m7
    vptestmw k2, m1, m14
    vpermi2w m1, m8, m9
    vmovdqu16 m4{k1}, m0 ; left[base+0] ( 0..31)
    vmovdqu16 m5{k2}, m1 ; left[base+0] (32..63)
.w64_h64_loop:
    paddsw m12, m15 ; base+1
    mova m2, m6
    vpermt2w m2, m12, m7
    vptestmw k1, m12, m14
    mova m0, m8
    vpermt2w m0, m12, m9
    paddsw m1, m12, m13
    mova m3, m6
    vpermt2w m3, m1, m7
    vptestmw k2, m1, m14
    vpermi2w m1, m8, m9
    vmovdqu16 m2{k1}, m0 ; left[base+1] ( 0..31)
    vmovdqu16 m3{k2}, m1 ; left[base+1] (32..63)
    psubw m0, m2, m4
    psubw m1, m3, m5
    pmulhrsw m0, m10
    pmulhrsw m1, m11
    paddw m0, m4
    paddw m1, m5
    mova m4, m2
    mova [dstq+64*0], m0
    mova m5, m3
    mova [dstq+64*1], m1
    add dstq, strideq
    dec hd
    jg .w64_h64_loop
    RET
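All of the z3 loops above share one per-pixel operation: a linear blend between two left-edge samples. The 6-bit fraction is pre-shifted left by 9 (psllw m4, 9) so that pmulhrsw's rounding multiply, (x*y + 0x4000) >> 15, lands exactly on (diff*frac + 32) >> 6. As a scalar C sketch (edge-sample gathering via vpermw/vpermt2w is omitted; z3_blend is a made-up name):

static inline int16_t z3_blend(int16_t lo, int16_t hi, int frac /* 0..63 */) {
    const int16_t f = (int16_t)(frac << 9);     /* psllw m4, 9     */
    int32_t d = ((hi - lo) * f + 0x4000) >> 15; /* pmulhrsw m0, m4 */
    return (int16_t)(lo + d);                   /* paddw m0, m3    */
}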

cglobal pal_pred_16bpc, 4, 7, 7, dst, stride, pal, idx, w, h, stride3
    lea r6, [pal_pred_16bpc_avx512icl_table]
    tzcnt wd, wm
@@ -132,7 +132,8 @@ for d in "${dirs[@]}"; do
    fi
done

if [ ${#files[@]} -eq 0 ]; then
num_files="${#files[@]}"
if [ "$num_files" -eq 0 ]; then
    error "Error! No files found at ${dirs[*]}"
fi

@@ -148,17 +149,17 @@ for i in "${!files[@]}"; do
    md5=$(<"${md5/%obu/md5}") || error "Error! Can't read md5 ${md5} for file ${f}"
    md5=${md5/ */}

    printf "\033[1K\r[%3d%% %d/%d] Verifying %s" "$(((i+1)*100/${#files[@]}))" "$((i+1))" "${#files[@]}" "$f"
    printf '\033[1K\r[%3d%% %*d/%d] Verifying %s' "$(((i+1)*100/num_files))" "${#num_files}" "$((i+1))" "$num_files" "${f#"$ARGON_DIR"/}"
    cmd=("$DAV1D" -i "$f" --filmgrain "$FILMGRAIN" --verify "$md5" --cpumask "$CPUMASK" --threads "$THREADS" -q)
    if [ "$JOBS" -gt 1 ]; then
        "${cmd[@]}" 2>/dev/null &
        p=$!
        pids+=("$p")
        declare "file$p=$f"
        declare "file$p=${f#"$ARGON_DIR"/}"
        block_pids
    else
        if ! "${cmd[@]}" 2>/dev/null; then
            fail "$f"
            fail "${f#"$ARGON_DIR"/}"
        fi
    fi
done

@@ -166,9 +167,9 @@ done
wait_all_pids

if [ "$failed" -ne 0 ]; then
    printf "\033[1K\r%d/%d files \033[1;91mfailed\033[0m to verify" "$failed" "${#files[@]}"
    printf "\033[1K\r%d/%d files \033[1;91mfailed\033[0m to verify" "$failed" "$num_files"
else
    printf "\033[1K\r%d files \033[1;92msuccessfully\033[0m verified" "${#files[@]}"
    printf "\033[1K\r%d files \033[1;92msuccessfully\033[0m verified" "$num_files"
fi
printf " in %dm%ds (%s)\n" "$((SECONDS/60))" "$((SECONDS%60))" "$ver_info"
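The reworked progress printf above uses a computed field width: %*d reads its width from the next argument, and ${#num_files} is the digit count of the total, so the running counter stays right-aligned and the progress line keeps a stable width. The same idiom in C terms (the file count is illustrative):

#include <stdio.h>

int main(void) {
    const int total = 2462; /* illustrative file count */
    const int width = snprintf(NULL, 0, "%d", total); /* digits in total */
    for (int i = 1; i <= total; i += 1000)
        printf("[%*d/%d]\n", width, i, total); /* "[   1/2462]" etc. */
    return 0;
}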
@@ -69,6 +69,8 @@ if is_asm_enabled
        checkasm_asm_sources += files('checkasm/arm/checkasm_64.S')
    elif host_machine.cpu_family().startswith('arm')
        checkasm_asm_sources += files('checkasm/arm/checkasm_32.S')
    elif host_machine.cpu_family() == 'riscv64'
        checkasm_asm_sources += files('checkasm/riscv/checkasm_64.S')
    elif host_machine.cpu_family().startswith('x86')
        checkasm_asm_objs += nasm_gen.process(files('checkasm/x86/checkasm.asm'))
    endif

@@ -128,7 +130,7 @@ endforeach
subdir('libfuzzer')

# seek stress test binary, depends on dav1d cli tool
if get_option('enable_tools')
if (get_option('enable_tools') and get_option('enable_seek_stress'))
    seek_stress_sources = files('seek_stress.c')
    seek_stress = executable('seek_stress',
        seek_stress_sources, rev_target,