Bug 1906715 - Update dav1d to 2355eeb8f254a1c34dbb0241be5c70cdf6ed46d1 r=media-playback-reviewers,alwu

Differential Revision: https://phabricator.services.mozilla.com/D216426
Chun-Min Chang 2024-07-15 16:14:09 +00:00
Parent 47f16e40a1
Commit a327f20ad9
12 changed files with 1961 additions and 370 deletions


@@ -20,11 +20,11 @@ origin:
# Human-readable identifier for this version/release
# Generally "version NNN", "tag SSS", "bookmark SSS"
release: 92f592ed104ba92ad35c781ee93f354525eef503 (2024-06-05T23:22:36.000+02:00).
release: 2355eeb8f254a1c34dbb0241be5c70cdf6ed46d1 (2024-06-26T11:20:43.000+02:00).
# Revision to pull in
# Must be a long or short commit SHA (long preferred)
revision: 92f592ed104ba92ad35c781ee93f354525eef503
revision: 2355eeb8f254a1c34dbb0241be5c70cdf6ed46d1
# The package's license, where possible using the mnemonic from
# https://spdx.org/licenses/


@@ -1,2 +1,2 @@
/* auto-generated, do not edit */
#define DAV1D_VERSION "92f592ed104ba92ad35c781ee93f354525eef503"
#define DAV1D_VERSION "2355eeb8f254a1c34dbb0241be5c70cdf6ed46d1"

83  third_party/dav1d/src/arm/64/mc_dotprod.S (vendored)

@@ -45,32 +45,33 @@ ENABLE_DOTPROD
#define LOOP_ALIGN 2
// Lookup table used to help conversion of shifted 32-bit values to 8-bit.
.align 4
L(hv_tbl_neon_dotprod):
.byte 1, 2, 5, 6, 9, 10, 13, 14, 17, 18, 21, 22, 25, 26, 29, 30
// Shuffle indices to permute horizontal samples in preparation for input to
// SDOT instructions. The 8-tap horizontal convolution uses sample indices in the
// interval of [-3, 4] relative to the current sample position.
.align 4
L(h_tbl_neon_dotprod):
const h_tbl_neon_dotprod, align=4
// Shuffle indices to permute horizontal samples in preparation for
// input to SDOT instructions. The 8-tap horizontal convolution uses
// sample indices in the interval of [-3, 4] relative to the current
// sample position.
.byte 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6
.byte 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10
.byte 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14
// Vertical convolutions are also using SDOT instructions, where a 128-bit
// register contains a transposed 4x4 matrix of values. Subsequent iterations of
// the vertical convolution can reuse the 3x4 sub-matrix from the previous loop
// iteration. These shuffle indices shift and merge this 4x4 matrix with the
// values of a new line.
.align 4
L(v_tbl_neon_dotprod):
// Lookup table used to help conversion of shifted 32-bit values to 8-bit.
#define OFFSET_CVT_32_8 48
.byte 1, 2, 5, 6, 9, 10, 13, 14, 17, 18, 21, 22, 25, 26, 29, 30
endconst
const v_tbl_neon_dotprod, align=4
// Vertical convolutions are also using SDOT instructions, where a
// 128-bit register contains a transposed 4x4 matrix of values.
// Subsequent iterations of the vertical convolution can reuse the
// 3x4 sub-matrix from the previous loop iteration. These shuffle
// indices shift and merge this 4x4 matrix with the values of a new
// line.
.byte 1, 2, 3, 16, 5, 6, 7, 20, 9, 10, 11, 24, 13, 14, 15, 28
.byte 1, 2, 3, 16, 5, 6, 7, 17, 9, 10, 11, 18, 13, 14, 15, 19
.byte 1, 2, 3, 20, 5, 6, 7, 21, 9, 10, 11, 22, 13, 14, 15, 23
.byte 1, 2, 3, 24, 5, 6, 7, 25, 9, 10, 11, 26, 13, 14, 15, 27
.byte 1, 2, 3, 28, 5, 6, 7, 29, 9, 10, 11, 30, 13, 14, 15, 31
endconst
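
For reference, the tables above serve the dot-product path as follows: each SDOT lane accumulates four byte products, so the 8-tap horizontal filter over the sample interval [-3, 4] is evaluated as two 4-sample dot products per output, and h_tbl_neon_dotprod gathers the overlapping 4-sample windows for adjacent outputs out of a single 16-byte load; v_tbl_neon_dotprod likewise merges a newly loaded line into the transposed 4x4 matrix kept from the previous iteration. A minimal scalar sketch of one horizontal output (illustrative only; it ignores the signed-range bias and rounding that the neon_dotprod path applies):

#include <stdint.h>

/* Scalar sketch, not the asm: an 8-tap filter at position x reads
 * src[x-3 .. x+4]; splitting it into two 4-tap dot products mirrors how
 * one SDOT lane accumulates four products at a time. */
static int32_t h_filter_8tap(const uint8_t *src, int x, const int8_t taps[8]) {
    int32_t lo = 0, hi = 0;
    for (int k = 0; k < 4; k++) {
        lo += taps[k]     * src[x - 3 + k];   /* first dot product: x-3 .. x    */
        hi += taps[k + 4] * src[x + 1 + k];   /* second dot product: x+1 .. x+4 */
    }
    return lo + hi;
}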
.macro make_8tap_fn op, type, type_h, type_v, isa, jump=1
@@ -109,7 +110,7 @@ function \type\()_8tap_\isa, align=FUNC_ALIGN
.align JUMP_ALIGN
L(\type\()_8tap_v_\isa):
madd \my, \my, w11, w10
ldr q6, L(v_tbl_neon_dotprod)
movrel x13, v_tbl_neon_dotprod
sub \src, \src, \s_strd
.ifc \isa, neon_dotprod
.ifc \type, prep
@@ -121,12 +122,12 @@ L(\type\()_8tap_v_\isa):
.endif
ubfx w11, \my, #7, #7
and \my, \my, #0x7F
ldr q28, L(v_tbl_neon_dotprod) + 16
ldp q6, q28, [x13]
cmp \h, #4
csel \my, \my, w11, le
sub \src, \src, \s_strd, lsl #1 // src - s_strd * 3
add \xmy, x12, \xmy, lsl #3 // subpel V filter address
ldr q29, L(v_tbl_neon_dotprod) + 32
ldr q29, [x13, #32]
.ifc \isa, neon_dotprod
movi v5.16b, #128
.endif
@@ -137,8 +138,7 @@ L(\type\()_8tap_v_\isa):
// .align JUMP_ALIGN // fallthrough
160: // V - 16xN+
ldr q30, L(v_tbl_neon_dotprod) + 48
ldr q31, L(v_tbl_neon_dotprod) + 64
ldp q30, q31, [x13, #48]
.ifc \type, prep
add \wd_strd, \w, \w
.endif
@@ -676,12 +676,13 @@ L(\type\()_8tap_v_\isa):
L(\type\()_8tap_h_hv_\isa):
madd \mx, \mx, w11, w9
madd w14, \my, w11, w10 // for HV
ldr q28, L(h_tbl_neon_dotprod)
.ifc \isa, neon_dotprod
mov w13, 0x2002 // FILTER_WEIGHT * 128 + rounding
dup v27.4s, w13 // put H overrides this
.endif
movrel x13, h_tbl_neon_dotprod
sub \src, \src, #3 // src - 3
ldr q28, [x13]
ubfx w9, \mx, #7, #7
and \mx, \mx, #0x7F
ubfx w11, w14, #7, #7 // for HV
@@ -702,8 +703,8 @@ L(\type\()_8tap_h_hv_\isa):
mov x15, x30
ldr d7, [\xmy]
.ifc \type, put
ldr q25, L(hv_tbl_neon_dotprod)
.endif
ldr q25, [x13, #(OFFSET_CVT_32_8)] // LUT to help conversion
.endif // of 32b values to 8b
sxtl v7.8h, v7.8b
cmp w10, SHARP1
b.ne L(\type\()_6tap_hv_\isa) // vertical != SHARP1
@@ -718,8 +719,7 @@ L(\type\()_8tap_h_hv_\isa):
// .align JUMP_ALIGN // fallthrough
80: // HV8 - 8xN+
ldr q29, L(h_tbl_neon_dotprod) + 16
ldr q30, L(h_tbl_neon_dotprod) + 32
ldp q29, q30, [x13, #16]
ldr d26, [\xmx]
.ifc \type, prep
add \wd_strd, \w, \w
@@ -860,7 +860,7 @@ L(\type\()_8tap_h_hv_\isa):
.align JUMP_ALIGN
40: // HV8 - 4xN
ldr s26, [\xmx, #2]
ldur s26, [\xmx, #2]
add \src, \src, #2
bl L(\type\()_hv_filter4_\isa)
@@ -930,7 +930,7 @@ L(\type\()_8tap_h_hv_\isa):
.ifc \type, put
.align JUMP_ALIGN
20: // HV8 - 2xN
ldr s26, [\xmx, #2]
ldur s26, [\xmx, #2]
add \src, \src, #2
bl L(\type\()_hv_filter4_\isa)
@@ -1005,13 +1005,11 @@ L(\type\()_6tap_hv_\isa):
// .align JUMP_ALIGN // fallthrough
80: // HV6 - 8xN+
ldr q29, L(h_tbl_neon_dotprod) + 16
ldr q30, L(h_tbl_neon_dotprod) + 32
ldp q29, q30, [x13, #16]
ldr d26, [\xmx]
.ifc \type, prep
add \wd_strd, \w, \w
.endif
.align LOOP_ALIGN
81:
mov \lsrc, \src
@@ -1145,7 +1143,7 @@ L(\type\()_hv_filter4_\isa):
.align JUMP_ALIGN
40: // HV6 - 4xN
ldr s26, [\xmx, #2]
ldur s26, [\xmx, #2]
add \src, \src, #2
bl L(\type\()_hv_filter4_\isa)
@@ -1206,7 +1204,7 @@ L(\type\()_hv_filter4_\isa):
.ifc \type, put
.align JUMP_ALIGN
20: // HV6 - 2xN
ldr s26, [\xmx, #2]
ldur s26, [\xmx, #2]
add \src, \src, #2
bl L(\type\()_hv_filter4_\isa)
@@ -1284,7 +1282,7 @@ L(\type\()_8tap_h_\isa):
20: // H - 2xN
AARCH64_VALID_JUMP_TARGET
add \src, \src, #2
ldr s26, [\xmx, #2]
ldur s26, [\xmx, #2]
.align LOOP_ALIGN
2:
@@ -1321,7 +1319,7 @@ L(\type\()_8tap_h_\isa):
40: // H - 4xN
AARCH64_VALID_JUMP_TARGET
add \src, \src, #2
ldr s26, [\xmx, #2]
ldur s26, [\xmx, #2]
.align LOOP_ALIGN
4:
@@ -1370,8 +1368,7 @@ L(\type\()_8tap_h_\isa):
.align JUMP_ALIGN
80: // H - 8xN
AARCH64_VALID_JUMP_TARGET
ldr q29, L(h_tbl_neon_dotprod) + 16
ldr q30, L(h_tbl_neon_dotprod) + 32
ldp q29, q30, [x13, #16]
ldr d26, [\xmx]
.align LOOP_ALIGN
@@ -1436,14 +1433,13 @@ L(\type\()_8tap_h_\isa):
.align JUMP_ALIGN
160: // H - 16xN
AARCH64_VALID_JUMP_TARGET
ldr q29, L(h_tbl_neon_dotprod) + 16
ldr q30, L(h_tbl_neon_dotprod) + 32
ldp q29, q30, [x13, #16]
ldr d26, [\xmx]
.align LOOP_ALIGN
16:
ldr q16, [\src]
ldr q17, [\src, #12] // avoid 2 register TBL for small cores
ldur q17, [\src, #12] // avoid 2 register TBL for small cores
add \src, \src, \s_strd
.ifc \type\()_\isa, prep_neon_i8mm
movi v6.4s, #0
@@ -1501,8 +1497,7 @@ L(\type\()_8tap_h_\isa):
640:
1280:
AARCH64_VALID_JUMP_TARGET
ldr q29, L(h_tbl_neon_dotprod) + 16
ldr q30, L(h_tbl_neon_dotprod) + 32
ldp q29, q30, [x13, #16]
ldr d26, [\xmx]
.ifc \type, put
sub \d_strd, \d_strd, \w, uxtw
@@ -1513,7 +1508,7 @@ L(\type\()_8tap_h_\isa):
.align LOOP_ALIGN
32:
ldr q16, [\src]
ldr q17, [\src, #12] // avoid 2 register TBL for small cores
ldur q17, [\src, #12] // avoid 2 register TBL for small cores
add \src, \src, #16
.ifc \type\()_\isa, prep_neon_i8mm
movi v6.4s, #0

46  third_party/dav1d/src/arm/cpu.c (vendored)

@@ -104,6 +104,52 @@ COLD unsigned dav1d_get_cpu_flags_arm(void) {
return flags;
}
#elif defined(__OpenBSD__)
#if ARCH_AARCH64
#include <machine/armreg.h>
#include <machine/cpu.h>
#include <sys/types.h>
#include <sys/sysctl.h>
COLD unsigned dav1d_get_cpu_flags_arm(void) {
unsigned flags = DAV1D_ARM_CPU_FLAG_NEON;
#ifdef CPU_ID_AA64ISAR0
int mib[2];
uint64_t isar0;
uint64_t isar1;
size_t len;
mib[0] = CTL_MACHDEP;
mib[1] = CPU_ID_AA64ISAR0;
len = sizeof(isar0);
if (sysctl(mib, 2, &isar0, &len, NULL, 0) != -1) {
if (ID_AA64ISAR0_DP(isar0) >= ID_AA64ISAR0_DP_IMPL)
flags |= DAV1D_ARM_CPU_FLAG_DOTPROD;
}
mib[0] = CTL_MACHDEP;
mib[1] = CPU_ID_AA64ISAR1;
len = sizeof(isar1);
if (sysctl(mib, 2, &isar1, &len, NULL, 0) != -1) {
#ifdef ID_AA64ISAR1_I8MM_IMPL
if (ID_AA64ISAR1_I8MM(isar1) >= ID_AA64ISAR1_I8MM_IMPL)
flags |= DAV1D_ARM_CPU_FLAG_I8MM;
#endif
}
#endif
return flags;
}
#else /* !ARCH_AARCH64 */
COLD unsigned dav1d_get_cpu_flags_arm(void) {
unsigned flags = DAV1D_ARM_CPU_FLAG_NEON;
return flags;
}
#endif /* ARCH_AARCH64 */
#elif defined(_WIN32)
#include <windows.h>

5  third_party/dav1d/src/arm/itx.h (vendored)

@@ -49,7 +49,9 @@ decl_itx_fn(BF(dav1d_inv_txfm_add_dct_dct_64x16, neon));
decl_itx_fn(BF(dav1d_inv_txfm_add_dct_dct_64x32, neon));
decl_itx_fn(BF(dav1d_inv_txfm_add_dct_dct_64x64, neon));
static ALWAYS_INLINE void itx_dsp_init_arm(Dav1dInvTxfmDSPContext *const c, int bpc) {
static ALWAYS_INLINE void itx_dsp_init_arm(Dav1dInvTxfmDSPContext *const c, int bpc,
int *const all_simd)
{
const unsigned flags = dav1d_get_cpu_flags();
if (!(flags & DAV1D_ARM_CPU_FLAG_NEON)) return;
@@ -77,4 +79,5 @@ static ALWAYS_INLINE void itx_dsp_init_arm(Dav1dInvTxfmDSPContext *const c, int
assign_itx1_fn (R, 64, 16, neon);
assign_itx1_fn (R, 64, 32, neon);
assign_itx1_fn ( , 64, 64, neon);
*all_simd = 1;
}

87  third_party/dav1d/src/itx_1d.c (vendored)

@@ -89,8 +89,8 @@ inv_dct4_1d_internal_c(int32_t *const c, const ptrdiff_t stride,
c[3 * stride] = CLIP(t0 - t3);
}
void dav1d_inv_dct4_1d_c(int32_t *const c, const ptrdiff_t stride,
const int min, const int max)
static void inv_dct4_1d_c(int32_t *const c, const ptrdiff_t stride,
const int min, const int max)
{
inv_dct4_1d_internal_c(c, stride, min, max, 0);
}
@@ -142,8 +142,8 @@ inv_dct8_1d_internal_c(int32_t *const c, const ptrdiff_t stride,
c[7 * stride] = CLIP(t0 - t7);
}
void dav1d_inv_dct8_1d_c(int32_t *const c, const ptrdiff_t stride,
const int min, const int max)
static void inv_dct8_1d_c(int32_t *const c, const ptrdiff_t stride,
const int min, const int max)
{
inv_dct8_1d_internal_c(c, stride, min, max, 0);
}
@@ -237,8 +237,8 @@ inv_dct16_1d_internal_c(int32_t *const c, const ptrdiff_t stride,
c[15 * stride] = CLIP(t0 - t15a);
}
void dav1d_inv_dct16_1d_c(int32_t *const c, const ptrdiff_t stride,
const int min, const int max)
static void inv_dct16_1d_c(int32_t *const c, const ptrdiff_t stride,
const int min, const int max)
{
inv_dct16_1d_internal_c(c, stride, min, max, 0);
}
@@ -427,14 +427,14 @@ inv_dct32_1d_internal_c(int32_t *const c, const ptrdiff_t stride,
c[31 * stride] = CLIP(t0 - t31);
}
void dav1d_inv_dct32_1d_c(int32_t *const c, const ptrdiff_t stride,
const int min, const int max)
static void inv_dct32_1d_c(int32_t *const c, const ptrdiff_t stride,
const int min, const int max)
{
inv_dct32_1d_internal_c(c, stride, min, max, 0);
}
void dav1d_inv_dct64_1d_c(int32_t *const c, const ptrdiff_t stride,
const int min, const int max)
static void inv_dct64_1d_c(int32_t *const c, const ptrdiff_t stride,
const int min, const int max)
{
assert(stride > 0);
inv_dct32_1d_internal_c(c, stride << 1, min, max, 1);
@@ -962,13 +962,13 @@ inv_adst16_1d_internal_c(const int32_t *const in, const ptrdiff_t in_s,
}
#define inv_adst_1d(sz) \
void dav1d_inv_adst##sz##_1d_c(int32_t *const c, const ptrdiff_t stride, \
const int min, const int max) \
static void inv_adst##sz##_1d_c(int32_t *const c, const ptrdiff_t stride, \
const int min, const int max) \
{ \
inv_adst##sz##_1d_internal_c(c, stride, min, max, c, stride); \
} \
void dav1d_inv_flipadst##sz##_1d_c(int32_t *const c, const ptrdiff_t stride, \
const int min, const int max) \
static void inv_flipadst##sz##_1d_c(int32_t *const c, const ptrdiff_t stride, \
const int min, const int max) \
{ \
inv_adst##sz##_1d_internal_c(c, stride, min, max, \
&c[(sz - 1) * stride], -stride); \
@@ -980,8 +980,8 @@ inv_adst_1d(16)
#undef inv_adst_1d
void dav1d_inv_identity4_1d_c(int32_t *const c, const ptrdiff_t stride,
const int min, const int max)
static void inv_identity4_1d_c(int32_t *const c, const ptrdiff_t stride,
const int min, const int max)
{
assert(stride > 0);
for (int i = 0; i < 4; i++) {
@@ -990,16 +990,16 @@ void dav1d_inv_identity4_1d_c(int32_t *const c, const ptrdiff_t stride,
}
}
void dav1d_inv_identity8_1d_c(int32_t *const c, const ptrdiff_t stride,
const int min, const int max)
static void inv_identity8_1d_c(int32_t *const c, const ptrdiff_t stride,
const int min, const int max)
{
assert(stride > 0);
for (int i = 0; i < 8; i++)
c[stride * i] *= 2;
}
void dav1d_inv_identity16_1d_c(int32_t *const c, const ptrdiff_t stride,
const int min, const int max)
static void inv_identity16_1d_c(int32_t *const c, const ptrdiff_t stride,
const int min, const int max)
{
assert(stride > 0);
for (int i = 0; i < 16; i++) {
@@ -1008,14 +1008,57 @@ void dav1d_inv_identity16_1d_c(int32_t *const c, const ptrdiff_t stride,
}
}
void dav1d_inv_identity32_1d_c(int32_t *const c, const ptrdiff_t stride,
const int min, const int max)
static void inv_identity32_1d_c(int32_t *const c, const ptrdiff_t stride,
const int min, const int max)
{
assert(stride > 0);
for (int i = 0; i < 32; i++)
c[stride * i] *= 4;
}
const itx_1d_fn dav1d_tx1d_fns[N_TX_SIZES][N_TX_1D_TYPES] = {
[TX_4X4] = {
[DCT] = inv_dct4_1d_c,
[ADST] = inv_adst4_1d_c,
[FLIPADST] = inv_flipadst4_1d_c,
[IDENTITY] = inv_identity4_1d_c,
}, [TX_8X8] = {
[DCT] = inv_dct8_1d_c,
[ADST] = inv_adst8_1d_c,
[FLIPADST] = inv_flipadst8_1d_c,
[IDENTITY] = inv_identity8_1d_c,
}, [TX_16X16] = {
[DCT] = inv_dct16_1d_c,
[ADST] = inv_adst16_1d_c,
[FLIPADST] = inv_flipadst16_1d_c,
[IDENTITY] = inv_identity16_1d_c,
}, [TX_32X32] = {
[DCT] = inv_dct32_1d_c,
[IDENTITY] = inv_identity32_1d_c,
}, [TX_64X64] = {
[DCT] = inv_dct64_1d_c,
},
};
const uint8_t /* enum Tx1dType */ dav1d_tx1d_types[N_TX_TYPES][2] = {
[DCT_DCT] = { DCT, DCT },
[ADST_DCT] = { ADST, DCT },
[DCT_ADST] = { DCT, ADST },
[ADST_ADST] = { ADST, ADST },
[FLIPADST_DCT] = { FLIPADST, DCT },
[DCT_FLIPADST] = { DCT, FLIPADST },
[FLIPADST_FLIPADST] = { FLIPADST, FLIPADST },
[ADST_FLIPADST] = { ADST, FLIPADST },
[FLIPADST_ADST] = { FLIPADST, ADST },
[IDTX] = { IDENTITY, IDENTITY },
[V_DCT] = { DCT, IDENTITY },
[H_DCT] = { IDENTITY, DCT },
[V_ADST] = { ADST, IDENTITY },
[H_ADST] = { IDENTITY, ADST },
[V_FLIPADST] = { FLIPADST, IDENTITY },
[H_FLIPADST] = { IDENTITY, FLIPADST },
};
#if !(HAVE_ASM && TRIM_DSP_FUNCTIONS && ( \
ARCH_AARCH64 || \
(ARCH_ARM && (defined(__ARM_NEON) || defined(__APPLE__) || defined(_WIN32))) \

30  third_party/dav1d/src/itx_1d.h (vendored)

@@ -28,31 +28,25 @@
#include <stddef.h>
#include <stdint.h>
#include "src/levels.h"
#ifndef DAV1D_SRC_ITX_1D_H
#define DAV1D_SRC_ITX_1D_H
enum Tx1dType {
DCT,
ADST,
IDENTITY,
FLIPADST,
N_TX_1D_TYPES,
};
#define decl_itx_1d_fn(name) \
void (name)(int32_t *c, ptrdiff_t stride, int min, int max)
typedef decl_itx_1d_fn(*itx_1d_fn);
decl_itx_1d_fn(dav1d_inv_dct4_1d_c);
decl_itx_1d_fn(dav1d_inv_dct8_1d_c);
decl_itx_1d_fn(dav1d_inv_dct16_1d_c);
decl_itx_1d_fn(dav1d_inv_dct32_1d_c);
decl_itx_1d_fn(dav1d_inv_dct64_1d_c);
decl_itx_1d_fn(dav1d_inv_adst4_1d_c);
decl_itx_1d_fn(dav1d_inv_adst8_1d_c);
decl_itx_1d_fn(dav1d_inv_adst16_1d_c);
decl_itx_1d_fn(dav1d_inv_flipadst4_1d_c);
decl_itx_1d_fn(dav1d_inv_flipadst8_1d_c);
decl_itx_1d_fn(dav1d_inv_flipadst16_1d_c);
decl_itx_1d_fn(dav1d_inv_identity4_1d_c);
decl_itx_1d_fn(dav1d_inv_identity8_1d_c);
decl_itx_1d_fn(dav1d_inv_identity16_1d_c);
decl_itx_1d_fn(dav1d_inv_identity32_1d_c);
EXTERN const itx_1d_fn dav1d_tx1d_fns[N_TX_SIZES][N_TX_1D_TYPES];
EXTERN const uint8_t /* enum Tx1dType */ dav1d_tx1d_types[N_TX_TYPES][2];
void dav1d_inv_wht4_1d_c(int32_t *c, ptrdiff_t stride);

126  third_party/dav1d/src/itx_tmpl.c (vendored)

@@ -29,6 +29,7 @@
#include <stddef.h>
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include "common/attributes.h"
@@ -36,13 +37,17 @@
#include "src/itx.h"
#include "src/itx_1d.h"
#include "src/scan.h"
#include "src/tables.h"
static NOINLINE void
inv_txfm_add_c(pixel *dst, const ptrdiff_t stride, coef *const coeff,
const int eob, const int w, const int h, const int shift,
const itx_1d_fn first_1d_fn, const itx_1d_fn second_1d_fn,
const int has_dconly HIGHBD_DECL_SUFFIX)
const int eob, const /*enum RectTxfmSize*/ int tx, const int shift,
const enum TxfmType txtp HIGHBD_DECL_SUFFIX)
{
const TxfmInfo *const t_dim = &dav1d_txfm_dimensions[tx];
const int w = 4 * t_dim->w, h = 4 * t_dim->h;
const int has_dconly = txtp == DCT_DCT;
assert(w >= 4 && w <= 64);
assert(h >= 4 && h <= 64);
assert(eob >= 0);
@@ -64,6 +69,9 @@ inv_txfm_add_c(pixel *dst, const ptrdiff_t stride, coef *const coeff,
return;
}
const uint8_t *const txtps = dav1d_tx1d_types[txtp];
const itx_1d_fn first_1d_fn = dav1d_tx1d_fns[t_dim->lw][txtps[0]];
const itx_1d_fn second_1d_fn = dav1d_tx1d_fns[t_dim->lh][txtps[1]];
const int sh = imin(h, 32), sw = imin(w, 32);
#if BITDEPTH == 8
const int row_clip_min = INT16_MIN;
@@ -76,7 +84,16 @@ inv_txfm_add_c(pixel *dst, const ptrdiff_t stride, coef *const coeff,
const int col_clip_max = ~col_clip_min;
int32_t tmp[64 * 64], *c = tmp;
for (int y = 0; y < sh; y++, c += w) {
int last_nonzero_col; // in first 1d itx
if (txtps[1] == IDENTITY && txtps[0] != IDENTITY) {
last_nonzero_col = imin(sh - 1, eob);
} else if (txtps[0] == IDENTITY && txtps[1] != IDENTITY) {
last_nonzero_col = eob >> (t_dim->lw + 2);
} else {
last_nonzero_col = dav1d_last_nonzero_col_from_eob[tx][eob];
}
assert(last_nonzero_col < sh);
for (int y = 0; y <= last_nonzero_col; y++, c += w) {
if (is_rect2)
for (int x = 0; x < sw; x++)
c[x] = (coeff[y + x * sh] * 181 + 128) >> 8;
@@ -85,6 +102,8 @@ inv_txfm_add_c(pixel *dst, const ptrdiff_t stride, coef *const coeff,
c[x] = coeff[y + x * sh];
first_1d_fn(c, 1, row_clip_min, row_clip_max);
}
if (last_nonzero_col + 1 < sh)
memset(c, 0, sizeof(*c) * (sh - last_nonzero_col - 1) * w);
memset(coeff, 0, sizeof(*coeff) * sw * sh);
for (int i = 0; i < w * sh; i++)
@@ -99,7 +118,7 @@ inv_txfm_add_c(pixel *dst, const ptrdiff_t stride, coef *const coeff,
dst[x] = iclip_pixel(dst[x] + ((*c++ + 8) >> 4));
}
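
The new last_nonzero_col bound lets the first 1-D pass stop at the highest position that can still hold a nonzero coefficient for the given eob; everything past it is cleared by the added memset instead of being transformed. A rough standalone sketch of what the precomputed dav1d_last_nonzero_col_from_eob tables encode (illustrative only, using a hypothetical scan order; the real table construction is in scan.c further down):

#include <stdint.h>
#include <stdio.h>

/* Sketch, not dav1d code: given a block's scan order, its height h, and the
 * index of the last nonzero coefficient (eob), return the highest first-pass
 * index that can be nonzero. */
static int last_nonzero_from_eob(const uint16_t *scan, int h, int eob) {
    int max_col = 0;
    for (int n = 0; n <= eob; n++) {
        const int col = scan[n] & (h - 1);   /* same masking as init_tbl() in scan.c */
        if (col > max_col) max_col = col;
    }
    return max_col;
}

int main(void) {
    static const uint16_t scan[16] = {       /* hypothetical 4x4 scan, illustration only */
        0, 4, 1, 2, 5, 8, 12, 9, 6, 3, 7, 10, 13, 14, 11, 15
    };
    printf("eob=2 -> stop after index %d\n", last_nonzero_from_eob(scan, 4, 2));
    return 0;
}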
#define inv_txfm_fn(type1, type2, w, h, shift, has_dconly) \
#define inv_txfm_fn(type1, type2, type, pfx, w, h, shift) \
static void \
inv_txfm_add_##type1##_##type2##_##w##x##h##_c(pixel *dst, \
const ptrdiff_t stride, \
@@ -107,57 +126,56 @@ inv_txfm_add_##type1##_##type2##_##w##x##h##_c(pixel *dst, \
const int eob \
HIGHBD_DECL_SUFFIX) \
{ \
inv_txfm_add_c(dst, stride, coeff, eob, w, h, shift, \
dav1d_inv_##type1##w##_1d_c, dav1d_inv_##type2##h##_1d_c, \
has_dconly HIGHBD_TAIL_SUFFIX); \
inv_txfm_add_c(dst, stride, coeff, eob, pfx##TX_##w##X##h, shift, type \
HIGHBD_TAIL_SUFFIX); \
}
#define inv_txfm_fn64(w, h, shift) \
inv_txfm_fn(dct, dct, w, h, shift, 1)
#define inv_txfm_fn64(pfx, w, h, shift) \
inv_txfm_fn(dct, dct, DCT_DCT, pfx, w, h, shift)
#define inv_txfm_fn32(w, h, shift) \
inv_txfm_fn64(w, h, shift) \
inv_txfm_fn(identity, identity, w, h, shift, 0)
#define inv_txfm_fn32(pfx, w, h, shift) \
inv_txfm_fn64(pfx, w, h, shift) \
inv_txfm_fn(identity, identity, IDTX, pfx, w, h, shift)
#define inv_txfm_fn16(w, h, shift) \
inv_txfm_fn32(w, h, shift) \
inv_txfm_fn(adst, dct, w, h, shift, 0) \
inv_txfm_fn(dct, adst, w, h, shift, 0) \
inv_txfm_fn(adst, adst, w, h, shift, 0) \
inv_txfm_fn(dct, flipadst, w, h, shift, 0) \
inv_txfm_fn(flipadst, dct, w, h, shift, 0) \
inv_txfm_fn(adst, flipadst, w, h, shift, 0) \
inv_txfm_fn(flipadst, adst, w, h, shift, 0) \
inv_txfm_fn(flipadst, flipadst, w, h, shift, 0) \
inv_txfm_fn(identity, dct, w, h, shift, 0) \
inv_txfm_fn(dct, identity, w, h, shift, 0) \
#define inv_txfm_fn16(pfx, w, h, shift) \
inv_txfm_fn32(pfx, w, h, shift) \
inv_txfm_fn(adst, dct, ADST_DCT, pfx, w, h, shift) \
inv_txfm_fn(dct, adst, DCT_ADST, pfx, w, h, shift) \
inv_txfm_fn(adst, adst, ADST_ADST, pfx, w, h, shift) \
inv_txfm_fn(dct, flipadst, DCT_FLIPADST, pfx, w, h, shift) \
inv_txfm_fn(flipadst, dct, FLIPADST_DCT, pfx, w, h, shift) \
inv_txfm_fn(adst, flipadst, ADST_FLIPADST, pfx, w, h, shift) \
inv_txfm_fn(flipadst, adst, FLIPADST_ADST, pfx, w, h, shift) \
inv_txfm_fn(flipadst, flipadst, FLIPADST_FLIPADST, pfx, w, h, shift) \
inv_txfm_fn(identity, dct, H_DCT, pfx, w, h, shift) \
inv_txfm_fn(dct, identity, V_DCT, pfx, w, h, shift) \
#define inv_txfm_fn84(w, h, shift) \
inv_txfm_fn16(w, h, shift) \
inv_txfm_fn(identity, flipadst, w, h, shift, 0) \
inv_txfm_fn(flipadst, identity, w, h, shift, 0) \
inv_txfm_fn(identity, adst, w, h, shift, 0) \
inv_txfm_fn(adst, identity, w, h, shift, 0) \
#define inv_txfm_fn84(pfx, w, h, shift) \
inv_txfm_fn16(pfx, w, h, shift) \
inv_txfm_fn(identity, flipadst, H_FLIPADST, pfx, w, h, shift) \
inv_txfm_fn(flipadst, identity, V_FLIPADST, pfx, w, h, shift) \
inv_txfm_fn(identity, adst, H_ADST, pfx, w, h, shift) \
inv_txfm_fn(adst, identity, V_ADST, pfx, w, h, shift) \
inv_txfm_fn84( 4, 4, 0)
inv_txfm_fn84( 4, 8, 0)
inv_txfm_fn84( 4, 16, 1)
inv_txfm_fn84( 8, 4, 0)
inv_txfm_fn84( 8, 8, 1)
inv_txfm_fn84( 8, 16, 1)
inv_txfm_fn32( 8, 32, 2)
inv_txfm_fn84(16, 4, 1)
inv_txfm_fn84(16, 8, 1)
inv_txfm_fn16(16, 16, 2)
inv_txfm_fn32(16, 32, 1)
inv_txfm_fn64(16, 64, 2)
inv_txfm_fn32(32, 8, 2)
inv_txfm_fn32(32, 16, 1)
inv_txfm_fn32(32, 32, 2)
inv_txfm_fn64(32, 64, 1)
inv_txfm_fn64(64, 16, 2)
inv_txfm_fn64(64, 32, 1)
inv_txfm_fn64(64, 64, 2)
inv_txfm_fn84( , 4, 4, 0)
inv_txfm_fn84(R, 4, 8, 0)
inv_txfm_fn84(R, 4, 16, 1)
inv_txfm_fn84(R, 8, 4, 0)
inv_txfm_fn84( , 8, 8, 1)
inv_txfm_fn84(R, 8, 16, 1)
inv_txfm_fn32(R, 8, 32, 2)
inv_txfm_fn84(R, 16, 4, 1)
inv_txfm_fn84(R, 16, 8, 1)
inv_txfm_fn16( , 16, 16, 2)
inv_txfm_fn32(R, 16, 32, 1)
inv_txfm_fn64(R, 16, 64, 2)
inv_txfm_fn32(R, 32, 8, 2)
inv_txfm_fn32(R, 32, 16, 1)
inv_txfm_fn32( , 32, 32, 2)
inv_txfm_fn64(R, 32, 64, 1)
inv_txfm_fn64(R, 64, 16, 2)
inv_txfm_fn64(R, 64, 32, 1)
inv_txfm_fn64( , 64, 64, 2)
#if !(HAVE_ASM && TRIM_DSP_FUNCTIONS && ( \
ARCH_AARCH64 || \
@@ -267,9 +285,10 @@ COLD void bitfn(dav1d_itx_dsp_init)(Dav1dInvTxfmDSPContext *const c, int bpc) {
assign_itx_all_fn64(64, 32, R);
assign_itx_all_fn64(64, 64, );
int all_simd = 0;
#if HAVE_ASM
#if ARCH_AARCH64 || ARCH_ARM
itx_dsp_init_arm(c, bpc);
itx_dsp_init_arm(c, bpc, &all_simd);
#endif
#if ARCH_LOONGARCH64
itx_dsp_init_loongarch(c, bpc);
@@ -278,7 +297,10 @@ COLD void bitfn(dav1d_itx_dsp_init)(Dav1dInvTxfmDSPContext *const c, int bpc) {
itx_dsp_init_riscv(c, bpc);
#endif
#if ARCH_X86
itx_dsp_init_x86(c, bpc);
itx_dsp_init_x86(c, bpc, &all_simd);
#endif
#endif
if (!all_simd)
dav1d_init_last_nonzero_col_from_eob_tables();
}

76  third_party/dav1d/src/scan.c (vendored)

@@ -28,7 +28,10 @@
#include "config.h"
#include "common/attributes.h"
#include "common/intops.h"
#include "src/scan.h"
#include "src/thread.h"
static const uint16_t ALIGN(scan_4x4[], 32) = {
0, 4, 1, 2,
@@ -297,3 +300,76 @@ const uint16_t *const dav1d_scans[N_RECT_TX_SIZES] = {
[RTX_16X64] = scan_16x32,
[RTX_64X16] = scan_32x16,
};
static uint8_t last_nonzero_col_from_eob_4x4[16];
static uint8_t last_nonzero_col_from_eob_8x8[64];
static uint8_t last_nonzero_col_from_eob_16x16[256];
static uint8_t last_nonzero_col_from_eob_32x32[1024];
static uint8_t last_nonzero_col_from_eob_4x8[32];
static uint8_t last_nonzero_col_from_eob_8x4[32];
static uint8_t last_nonzero_col_from_eob_8x16[128];
static uint8_t last_nonzero_col_from_eob_16x8[128];
static uint8_t last_nonzero_col_from_eob_16x32[512];
static uint8_t last_nonzero_col_from_eob_32x16[512];
static uint8_t last_nonzero_col_from_eob_4x16[64];
static uint8_t last_nonzero_col_from_eob_16x4[64];
static uint8_t last_nonzero_col_from_eob_8x32[256];
static uint8_t last_nonzero_col_from_eob_32x8[256];
static COLD void init_tbl(uint8_t *const last_nonzero_col_from_eob,
const uint16_t *const scan, const int w, const int h)
{
int max_col = 0;
for (int y = 0, n = 0; y < h; y++) {
for (int x = 0; x < w; x++, n++) {
const int rc = scan[n];
const int rcx = rc & (h - 1);
max_col = imax(max_col, rcx);
last_nonzero_col_from_eob[n] = max_col;
}
}
}
static COLD void init_internal(void) {
init_tbl(last_nonzero_col_from_eob_4x4, scan_4x4, 4, 4);
init_tbl(last_nonzero_col_from_eob_8x8, scan_8x8, 8, 8);
init_tbl(last_nonzero_col_from_eob_16x16, scan_16x16, 16, 16);
init_tbl(last_nonzero_col_from_eob_32x32, scan_32x32, 32, 32);
init_tbl(last_nonzero_col_from_eob_4x8, scan_4x8, 4, 8);
init_tbl(last_nonzero_col_from_eob_8x4, scan_8x4, 8, 4);
init_tbl(last_nonzero_col_from_eob_8x16, scan_8x16, 8, 16);
init_tbl(last_nonzero_col_from_eob_16x8, scan_16x8, 16, 8);
init_tbl(last_nonzero_col_from_eob_16x32, scan_16x32, 16, 32);
init_tbl(last_nonzero_col_from_eob_32x16, scan_32x16, 32, 16);
init_tbl(last_nonzero_col_from_eob_4x16, scan_4x16, 4, 16);
init_tbl(last_nonzero_col_from_eob_16x4, scan_16x4, 16, 4);
init_tbl(last_nonzero_col_from_eob_8x32, scan_8x32, 8, 32);
init_tbl(last_nonzero_col_from_eob_32x8, scan_32x8, 32, 8);
}
COLD void dav1d_init_last_nonzero_col_from_eob_tables(void) {
static pthread_once_t initted = PTHREAD_ONCE_INIT;
pthread_once(&initted, init_internal);
}
const uint8_t *const dav1d_last_nonzero_col_from_eob[N_RECT_TX_SIZES] = {
[ TX_4X4 ] = last_nonzero_col_from_eob_4x4,
[ TX_8X8 ] = last_nonzero_col_from_eob_8x8,
[ TX_16X16] = last_nonzero_col_from_eob_16x16,
[ TX_32X32] = last_nonzero_col_from_eob_32x32,
[ TX_64X64] = last_nonzero_col_from_eob_32x32,
[RTX_4X8 ] = last_nonzero_col_from_eob_4x8,
[RTX_8X4 ] = last_nonzero_col_from_eob_8x4,
[RTX_8X16 ] = last_nonzero_col_from_eob_8x16,
[RTX_16X8 ] = last_nonzero_col_from_eob_16x8,
[RTX_16X32] = last_nonzero_col_from_eob_16x32,
[RTX_32X16] = last_nonzero_col_from_eob_32x16,
[RTX_32X64] = last_nonzero_col_from_eob_32x32,
[RTX_64X32] = last_nonzero_col_from_eob_32x32,
[RTX_4X16 ] = last_nonzero_col_from_eob_4x16,
[RTX_16X4 ] = last_nonzero_col_from_eob_16x4,
[RTX_8X32 ] = last_nonzero_col_from_eob_8x32,
[RTX_32X8 ] = last_nonzero_col_from_eob_32x8,
[RTX_16X64] = last_nonzero_col_from_eob_16x32,
[RTX_64X16] = last_nonzero_col_from_eob_32x16,
};
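
These tables are only needed by the C fallback path, so dav1d_itx_dsp_init now builds them on demand (only when all_simd stays 0), and dav1d_init_last_nonzero_col_from_eob_tables guards the work with pthread_once so that concurrent decoder initialization stays safe. A minimal sketch of that once-only pattern, assuming a generic stand-in table rather than the real scan-derived tables:

#include <pthread.h>
#include <stdio.h>

static int table[16];

static void build_table(void) {            /* stand-in for init_internal() */
    for (int i = 0; i < 16; i++)
        table[i] = i * i;
}

static void ensure_tables(void) {          /* same shape as the dav1d entry point */
    static pthread_once_t once = PTHREAD_ONCE_INIT;
    pthread_once(&once, build_table);
}

int main(void) {
    ensure_tables();                        /* first call runs build_table() */
    ensure_tables();                        /* later calls are no-ops        */
    printf("%d\n", table[3]);
    return 0;
}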

3  third_party/dav1d/src/scan.h (vendored)

@@ -33,5 +33,8 @@
#include "src/levels.h"
EXTERN const uint16_t *const dav1d_scans[N_RECT_TX_SIZES];
EXTERN const uint8_t *const dav1d_last_nonzero_col_from_eob[N_RECT_TX_SIZES];
void dav1d_init_last_nonzero_col_from_eob_tables(void);
#endif /* DAV1D_SRC_SCAN_H */

6  third_party/dav1d/src/x86/itx.h (vendored)

@@ -107,7 +107,9 @@ decl_itx_fns(ssse3);
decl_itx_fn(dav1d_inv_txfm_add_wht_wht_4x4_16bpc_avx2);
decl_itx_fn(BF(dav1d_inv_txfm_add_wht_wht_4x4, sse2));
static ALWAYS_INLINE void itx_dsp_init_x86(Dav1dInvTxfmDSPContext *const c, const int bpc) {
static ALWAYS_INLINE void itx_dsp_init_x86(Dav1dInvTxfmDSPContext *const c,
const int bpc, int *const all_simd)
{
#define assign_itx_bpc_fn(pfx, w, h, type, type_enum, bpc, ext) \
c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
BF_BPC(dav1d_inv_txfm_add_##type##_##w##x##h, bpc, ext)
@@ -167,6 +169,7 @@ static ALWAYS_INLINE void itx_dsp_init_x86(Dav1dInvTxfmDSPContext *const c, cons
assign_itx1_fn (R, 64, 16, ssse3);
assign_itx1_fn (R, 64, 32, ssse3);
assign_itx1_fn ( , 64, 64, ssse3);
*all_simd = 1;
#endif
if (!(flags & DAV1D_X86_CPU_FLAG_SSE41)) return;
@@ -192,6 +195,7 @@ static ALWAYS_INLINE void itx_dsp_init_x86(Dav1dInvTxfmDSPContext *const c, cons
assign_itx1_fn (R, 64, 16, sse4);
assign_itx1_fn (R, 64, 32, sse4);
assign_itx1_fn (, 64, 64, sse4);
*all_simd = 1;
}
#endif

1863  third_party/dav1d/src/x86/mc16_sse.asm (vendored)

The diff for this file is not shown because of its size.