Bug 1906715 - Update dav1d to 2355eeb8f254a1c34dbb0241be5c70cdf6ed46d1 r=media-playback-reviewers,alwu

Differential Revision: https://phabricator.services.mozilla.com/D216426
This commit is contained in:
Chun-Min Chang 2024-07-15 16:14:09 +00:00
Родитель 47f16e40a1
Коммит a327f20ad9
12 изменённых файлов: 1961 добавлений и 370 удалений

Просмотреть файл

@ -20,11 +20,11 @@ origin:
# Human-readable identifier for this version/release # Human-readable identifier for this version/release
# Generally "version NNN", "tag SSS", "bookmark SSS" # Generally "version NNN", "tag SSS", "bookmark SSS"
release: 92f592ed104ba92ad35c781ee93f354525eef503 (2024-06-05T23:22:36.000+02:00). release: 2355eeb8f254a1c34dbb0241be5c70cdf6ed46d1 (2024-06-26T11:20:43.000+02:00).
# Revision to pull in # Revision to pull in
# Must be a long or short commit SHA (long preferred) # Must be a long or short commit SHA (long preferred)
revision: 92f592ed104ba92ad35c781ee93f354525eef503 revision: 2355eeb8f254a1c34dbb0241be5c70cdf6ed46d1
# The package's license, where possible using the mnemonic from # The package's license, where possible using the mnemonic from
# https://spdx.org/licenses/ # https://spdx.org/licenses/

Просмотреть файл

@ -1,2 +1,2 @@
/* auto-generated, do not edit */ /* auto-generated, do not edit */
#define DAV1D_VERSION "92f592ed104ba92ad35c781ee93f354525eef503" #define DAV1D_VERSION "2355eeb8f254a1c34dbb0241be5c70cdf6ed46d1"

83
third_party/dav1d/src/arm/64/mc_dotprod.S поставляемый
Просмотреть файл

@ -45,32 +45,33 @@ ENABLE_DOTPROD
#define LOOP_ALIGN 2 #define LOOP_ALIGN 2
// Lookup table used to help conversion of shifted 32-bit values to 8-bit. const h_tbl_neon_dotprod, align=4
.align 4 // Shuffle indices to permute horizontal samples in preparation for
L(hv_tbl_neon_dotprod): // input to SDOT instructions. The 8-tap horizontal convolution uses
.byte 1, 2, 5, 6, 9, 10, 13, 14, 17, 18, 21, 22, 25, 26, 29, 30 // sample indices in the interval of [-3, 4] relative to the current
// sample position.
// Shuffle indices to permute horizontal samples in preparation for input to
// SDOT instructions. The 8-tap horizontal convolution uses sample indices in the
// interval of [-3, 4] relative to the current sample position.
.align 4
L(h_tbl_neon_dotprod):
.byte 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 .byte 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6
.byte 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 .byte 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10
.byte 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 .byte 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14
// Vertical convolutions are also using SDOT instructions, where a 128-bit // Lookup table used to help conversion of shifted 32-bit values to 8-bit.
// register contains a transposed 4x4 matrix of values. Subsequent iterations of #define OFFSET_CVT_32_8 48
// the vertical convolution can reuse the 3x4 sub-matrix from the previous loop .byte 1, 2, 5, 6, 9, 10, 13, 14, 17, 18, 21, 22, 25, 26, 29, 30
// iteration. These shuffle indices shift and merge this 4x4 matrix with the endconst
// values of a new line.
.align 4 const v_tbl_neon_dotprod, align=4
L(v_tbl_neon_dotprod): // Vertical convolutions are also using SDOT instructions, where a
// 128-bit register contains a transposed 4x4 matrix of values.
// Subsequent iterations of the vertical convolution can reuse the
// 3x4 sub-matrix from the previous loop iteration. These shuffle
// indices shift and merge this 4x4 matrix with the values of a new
// line.
.byte 1, 2, 3, 16, 5, 6, 7, 20, 9, 10, 11, 24, 13, 14, 15, 28 .byte 1, 2, 3, 16, 5, 6, 7, 20, 9, 10, 11, 24, 13, 14, 15, 28
.byte 1, 2, 3, 16, 5, 6, 7, 17, 9, 10, 11, 18, 13, 14, 15, 19 .byte 1, 2, 3, 16, 5, 6, 7, 17, 9, 10, 11, 18, 13, 14, 15, 19
.byte 1, 2, 3, 20, 5, 6, 7, 21, 9, 10, 11, 22, 13, 14, 15, 23 .byte 1, 2, 3, 20, 5, 6, 7, 21, 9, 10, 11, 22, 13, 14, 15, 23
.byte 1, 2, 3, 24, 5, 6, 7, 25, 9, 10, 11, 26, 13, 14, 15, 27 .byte 1, 2, 3, 24, 5, 6, 7, 25, 9, 10, 11, 26, 13, 14, 15, 27
.byte 1, 2, 3, 28, 5, 6, 7, 29, 9, 10, 11, 30, 13, 14, 15, 31 .byte 1, 2, 3, 28, 5, 6, 7, 29, 9, 10, 11, 30, 13, 14, 15, 31
endconst
.macro make_8tap_fn op, type, type_h, type_v, isa, jump=1 .macro make_8tap_fn op, type, type_h, type_v, isa, jump=1
@ -109,7 +110,7 @@ function \type\()_8tap_\isa, align=FUNC_ALIGN
.align JUMP_ALIGN .align JUMP_ALIGN
L(\type\()_8tap_v_\isa): L(\type\()_8tap_v_\isa):
madd \my, \my, w11, w10 madd \my, \my, w11, w10
ldr q6, L(v_tbl_neon_dotprod) movrel x13, v_tbl_neon_dotprod
sub \src, \src, \s_strd sub \src, \src, \s_strd
.ifc \isa, neon_dotprod .ifc \isa, neon_dotprod
.ifc \type, prep .ifc \type, prep
@ -121,12 +122,12 @@ L(\type\()_8tap_v_\isa):
.endif .endif
ubfx w11, \my, #7, #7 ubfx w11, \my, #7, #7
and \my, \my, #0x7F and \my, \my, #0x7F
ldr q28, L(v_tbl_neon_dotprod) + 16 ldp q6, q28, [x13]
cmp \h, #4 cmp \h, #4
csel \my, \my, w11, le csel \my, \my, w11, le
sub \src, \src, \s_strd, lsl #1 // src - s_strd * 3 sub \src, \src, \s_strd, lsl #1 // src - s_strd * 3
add \xmy, x12, \xmy, lsl #3 // subpel V filter address add \xmy, x12, \xmy, lsl #3 // subpel V filter address
ldr q29, L(v_tbl_neon_dotprod) + 32 ldr q29, [x13, #32]
.ifc \isa, neon_dotprod .ifc \isa, neon_dotprod
movi v5.16b, #128 movi v5.16b, #128
.endif .endif
@ -137,8 +138,7 @@ L(\type\()_8tap_v_\isa):
// .align JUMP_ALIGN // fallthrough // .align JUMP_ALIGN // fallthrough
160: // V - 16xN+ 160: // V - 16xN+
ldr q30, L(v_tbl_neon_dotprod) + 48 ldp q30, q31, [x13, #48]
ldr q31, L(v_tbl_neon_dotprod) + 64
.ifc \type, prep .ifc \type, prep
add \wd_strd, \w, \w add \wd_strd, \w, \w
.endif .endif
@ -676,12 +676,13 @@ L(\type\()_8tap_v_\isa):
L(\type\()_8tap_h_hv_\isa): L(\type\()_8tap_h_hv_\isa):
madd \mx, \mx, w11, w9 madd \mx, \mx, w11, w9
madd w14, \my, w11, w10 // for HV madd w14, \my, w11, w10 // for HV
ldr q28, L(h_tbl_neon_dotprod)
.ifc \isa, neon_dotprod .ifc \isa, neon_dotprod
mov w13, 0x2002 // FILTER_WEIGHT * 128 + rounding mov w13, 0x2002 // FILTER_WEIGHT * 128 + rounding
dup v27.4s, w13 // put H overrides this dup v27.4s, w13 // put H overrides this
.endif .endif
movrel x13, h_tbl_neon_dotprod
sub \src, \src, #3 // src - 3 sub \src, \src, #3 // src - 3
ldr q28, [x13]
ubfx w9, \mx, #7, #7 ubfx w9, \mx, #7, #7
and \mx, \mx, #0x7F and \mx, \mx, #0x7F
ubfx w11, w14, #7, #7 // for HV ubfx w11, w14, #7, #7 // for HV
@ -702,8 +703,8 @@ L(\type\()_8tap_h_hv_\isa):
mov x15, x30 mov x15, x30
ldr d7, [\xmy] ldr d7, [\xmy]
.ifc \type, put .ifc \type, put
ldr q25, L(hv_tbl_neon_dotprod) ldr q25, [x13, #(OFFSET_CVT_32_8)] // LUT to help conversion
.endif .endif // of 32b values to 8b
sxtl v7.8h, v7.8b sxtl v7.8h, v7.8b
cmp w10, SHARP1 cmp w10, SHARP1
b.ne L(\type\()_6tap_hv_\isa) // vertical != SHARP1 b.ne L(\type\()_6tap_hv_\isa) // vertical != SHARP1
@ -718,8 +719,7 @@ L(\type\()_8tap_h_hv_\isa):
// .align JUMP_ALIGN // fallthrough // .align JUMP_ALIGN // fallthrough
80: // HV8 - 8xN+ 80: // HV8 - 8xN+
ldr q29, L(h_tbl_neon_dotprod) + 16 ldp q29, q30, [x13, #16]
ldr q30, L(h_tbl_neon_dotprod) + 32
ldr d26, [\xmx] ldr d26, [\xmx]
.ifc \type, prep .ifc \type, prep
add \wd_strd, \w, \w add \wd_strd, \w, \w
@ -860,7 +860,7 @@ L(\type\()_8tap_h_hv_\isa):
.align JUMP_ALIGN .align JUMP_ALIGN
40: // HV8 - 4xN 40: // HV8 - 4xN
ldr s26, [\xmx, #2] ldur s26, [\xmx, #2]
add \src, \src, #2 add \src, \src, #2
bl L(\type\()_hv_filter4_\isa) bl L(\type\()_hv_filter4_\isa)
@ -930,7 +930,7 @@ L(\type\()_8tap_h_hv_\isa):
.ifc \type, put .ifc \type, put
.align JUMP_ALIGN .align JUMP_ALIGN
20: // HV8 - 2xN 20: // HV8 - 2xN
ldr s26, [\xmx, #2] ldur s26, [\xmx, #2]
add \src, \src, #2 add \src, \src, #2
bl L(\type\()_hv_filter4_\isa) bl L(\type\()_hv_filter4_\isa)
@ -1005,13 +1005,11 @@ L(\type\()_6tap_hv_\isa):
// .align JUMP_ALIGN // fallthrough // .align JUMP_ALIGN // fallthrough
80: // HV6 - 8xN+ 80: // HV6 - 8xN+
ldr q29, L(h_tbl_neon_dotprod) + 16 ldp q29, q30, [x13, #16]
ldr q30, L(h_tbl_neon_dotprod) + 32
ldr d26, [\xmx] ldr d26, [\xmx]
.ifc \type, prep .ifc \type, prep
add \wd_strd, \w, \w add \wd_strd, \w, \w
.endif .endif
.align LOOP_ALIGN .align LOOP_ALIGN
81: 81:
mov \lsrc, \src mov \lsrc, \src
@ -1145,7 +1143,7 @@ L(\type\()_hv_filter4_\isa):
.align JUMP_ALIGN .align JUMP_ALIGN
40: // HV6 - 4xN 40: // HV6 - 4xN
ldr s26, [\xmx, #2] ldur s26, [\xmx, #2]
add \src, \src, #2 add \src, \src, #2
bl L(\type\()_hv_filter4_\isa) bl L(\type\()_hv_filter4_\isa)
@ -1206,7 +1204,7 @@ L(\type\()_hv_filter4_\isa):
.ifc \type, put .ifc \type, put
.align JUMP_ALIGN .align JUMP_ALIGN
20: // HV6 - 2xN 20: // HV6 - 2xN
ldr s26, [\xmx, #2] ldur s26, [\xmx, #2]
add \src, \src, #2 add \src, \src, #2
bl L(\type\()_hv_filter4_\isa) bl L(\type\()_hv_filter4_\isa)
@ -1284,7 +1282,7 @@ L(\type\()_8tap_h_\isa):
20: // H - 2xN 20: // H - 2xN
AARCH64_VALID_JUMP_TARGET AARCH64_VALID_JUMP_TARGET
add \src, \src, #2 add \src, \src, #2
ldr s26, [\xmx, #2] ldur s26, [\xmx, #2]
.align LOOP_ALIGN .align LOOP_ALIGN
2: 2:
@ -1321,7 +1319,7 @@ L(\type\()_8tap_h_\isa):
40: // H - 4xN 40: // H - 4xN
AARCH64_VALID_JUMP_TARGET AARCH64_VALID_JUMP_TARGET
add \src, \src, #2 add \src, \src, #2
ldr s26, [\xmx, #2] ldur s26, [\xmx, #2]
.align LOOP_ALIGN .align LOOP_ALIGN
4: 4:
@ -1370,8 +1368,7 @@ L(\type\()_8tap_h_\isa):
.align JUMP_ALIGN .align JUMP_ALIGN
80: // H - 8xN 80: // H - 8xN
AARCH64_VALID_JUMP_TARGET AARCH64_VALID_JUMP_TARGET
ldr q29, L(h_tbl_neon_dotprod) + 16 ldp q29, q30, [x13, #16]
ldr q30, L(h_tbl_neon_dotprod) + 32
ldr d26, [\xmx] ldr d26, [\xmx]
.align LOOP_ALIGN .align LOOP_ALIGN
@ -1436,14 +1433,13 @@ L(\type\()_8tap_h_\isa):
.align JUMP_ALIGN .align JUMP_ALIGN
160: // H - 16xN 160: // H - 16xN
AARCH64_VALID_JUMP_TARGET AARCH64_VALID_JUMP_TARGET
ldr q29, L(h_tbl_neon_dotprod) + 16 ldp q29, q30, [x13, #16]
ldr q30, L(h_tbl_neon_dotprod) + 32
ldr d26, [\xmx] ldr d26, [\xmx]
.align LOOP_ALIGN .align LOOP_ALIGN
16: 16:
ldr q16, [\src] ldr q16, [\src]
ldr q17, [\src, #12] // avoid 2 register TBL for small cores ldur q17, [\src, #12] // avoid 2 register TBL for small cores
add \src, \src, \s_strd add \src, \src, \s_strd
.ifc \type\()_\isa, prep_neon_i8mm .ifc \type\()_\isa, prep_neon_i8mm
movi v6.4s, #0 movi v6.4s, #0
@ -1501,8 +1497,7 @@ L(\type\()_8tap_h_\isa):
640: 640:
1280: 1280:
AARCH64_VALID_JUMP_TARGET AARCH64_VALID_JUMP_TARGET
ldr q29, L(h_tbl_neon_dotprod) + 16 ldp q29, q30, [x13, #16]
ldr q30, L(h_tbl_neon_dotprod) + 32
ldr d26, [\xmx] ldr d26, [\xmx]
.ifc \type, put .ifc \type, put
sub \d_strd, \d_strd, \w, uxtw sub \d_strd, \d_strd, \w, uxtw
@ -1513,7 +1508,7 @@ L(\type\()_8tap_h_\isa):
.align LOOP_ALIGN .align LOOP_ALIGN
32: 32:
ldr q16, [\src] ldr q16, [\src]
ldr q17, [\src, #12] // avoid 2 register TBL for small cores ldur q17, [\src, #12] // avoid 2 register TBL for small cores
add \src, \src, #16 add \src, \src, #16
.ifc \type\()_\isa, prep_neon_i8mm .ifc \type\()_\isa, prep_neon_i8mm
movi v6.4s, #0 movi v6.4s, #0

46
third_party/dav1d/src/arm/cpu.c поставляемый
Просмотреть файл

@ -104,6 +104,52 @@ COLD unsigned dav1d_get_cpu_flags_arm(void) {
return flags; return flags;
} }
#elif defined(__OpenBSD__)
#if ARCH_AARCH64
#include <machine/armreg.h>
#include <machine/cpu.h>
#include <sys/types.h>
#include <sys/sysctl.h>
// Runtime CPU feature detection for OpenBSD/AArch64.
// Reads the AArch64 ISA feature registers (ID_AA64ISAR0/ID_AA64ISAR1)
// exported by the kernel through sysctl(CTL_MACHDEP, CPU_ID_*) and maps
// them onto dav1d CPU flags. The NEON flag is always reported; the
// sysctl-based probes are compiled only when the host's <machine/cpu.h>
// provides the CPU_ID_* / ID_AA64ISAR1_I8MM_* macros, so the code still
// builds on older OpenBSD releases that lack them.
COLD unsigned dav1d_get_cpu_flags_arm(void) {
unsigned flags = DAV1D_ARM_CPU_FLAG_NEON;
#ifdef CPU_ID_AA64ISAR0
int mib[2];
uint64_t isar0;
uint64_t isar1;
size_t len;
// ID_AA64ISAR0: dot-product (SDOT/UDOT, FEAT_DotProd) support level.
mib[0] = CTL_MACHDEP;
mib[1] = CPU_ID_AA64ISAR0;
len = sizeof(isar0);
if (sysctl(mib, 2, &isar0, &len, NULL, 0) != -1) {
if (ID_AA64ISAR0_DP(isar0) >= ID_AA64ISAR0_DP_IMPL)
flags |= DAV1D_ARM_CPU_FLAG_DOTPROD;
}
// ID_AA64ISAR1: 8-bit integer matrix multiply (FEAT_I8MM) support level.
mib[0] = CTL_MACHDEP;
mib[1] = CPU_ID_AA64ISAR1;
len = sizeof(isar1);
if (sysctl(mib, 2, &isar1, &len, NULL, 0) != -1) {
// Inner guard: the I8MM field macros are newer than the ISAR1 sysctl.
#ifdef ID_AA64ISAR1_I8MM_IMPL
if (ID_AA64ISAR1_I8MM(isar1) >= ID_AA64ISAR1_I8MM_IMPL)
flags |= DAV1D_ARM_CPU_FLAG_I8MM;
#endif
}
#endif
return flags;
}
#else /* !ARCH_AARCH64 */
// Fallback for OpenBSD on 32-bit Arm: no feature-register sysctls are
// queried; the NEON flag is reported unconditionally.
COLD unsigned dav1d_get_cpu_flags_arm(void) {
unsigned flags = DAV1D_ARM_CPU_FLAG_NEON;
return flags;
}
#endif /* ARCH_AARCH64 */
#elif defined(_WIN32) #elif defined(_WIN32)
#include <windows.h> #include <windows.h>

5
third_party/dav1d/src/arm/itx.h поставляемый
Просмотреть файл

@ -49,7 +49,9 @@ decl_itx_fn(BF(dav1d_inv_txfm_add_dct_dct_64x16, neon));
decl_itx_fn(BF(dav1d_inv_txfm_add_dct_dct_64x32, neon)); decl_itx_fn(BF(dav1d_inv_txfm_add_dct_dct_64x32, neon));
decl_itx_fn(BF(dav1d_inv_txfm_add_dct_dct_64x64, neon)); decl_itx_fn(BF(dav1d_inv_txfm_add_dct_dct_64x64, neon));
static ALWAYS_INLINE void itx_dsp_init_arm(Dav1dInvTxfmDSPContext *const c, int bpc) { static ALWAYS_INLINE void itx_dsp_init_arm(Dav1dInvTxfmDSPContext *const c, int bpc,
int *const all_simd)
{
const unsigned flags = dav1d_get_cpu_flags(); const unsigned flags = dav1d_get_cpu_flags();
if (!(flags & DAV1D_ARM_CPU_FLAG_NEON)) return; if (!(flags & DAV1D_ARM_CPU_FLAG_NEON)) return;
@ -77,4 +79,5 @@ static ALWAYS_INLINE void itx_dsp_init_arm(Dav1dInvTxfmDSPContext *const c, int
assign_itx1_fn (R, 64, 16, neon); assign_itx1_fn (R, 64, 16, neon);
assign_itx1_fn (R, 64, 32, neon); assign_itx1_fn (R, 64, 32, neon);
assign_itx1_fn ( , 64, 64, neon); assign_itx1_fn ( , 64, 64, neon);
*all_simd = 1;
} }

87
third_party/dav1d/src/itx_1d.c поставляемый
Просмотреть файл

@ -89,8 +89,8 @@ inv_dct4_1d_internal_c(int32_t *const c, const ptrdiff_t stride,
c[3 * stride] = CLIP(t0 - t3); c[3 * stride] = CLIP(t0 - t3);
} }
void dav1d_inv_dct4_1d_c(int32_t *const c, const ptrdiff_t stride, static void inv_dct4_1d_c(int32_t *const c, const ptrdiff_t stride,
const int min, const int max) const int min, const int max)
{ {
inv_dct4_1d_internal_c(c, stride, min, max, 0); inv_dct4_1d_internal_c(c, stride, min, max, 0);
} }
@ -142,8 +142,8 @@ inv_dct8_1d_internal_c(int32_t *const c, const ptrdiff_t stride,
c[7 * stride] = CLIP(t0 - t7); c[7 * stride] = CLIP(t0 - t7);
} }
void dav1d_inv_dct8_1d_c(int32_t *const c, const ptrdiff_t stride, static void inv_dct8_1d_c(int32_t *const c, const ptrdiff_t stride,
const int min, const int max) const int min, const int max)
{ {
inv_dct8_1d_internal_c(c, stride, min, max, 0); inv_dct8_1d_internal_c(c, stride, min, max, 0);
} }
@ -237,8 +237,8 @@ inv_dct16_1d_internal_c(int32_t *const c, const ptrdiff_t stride,
c[15 * stride] = CLIP(t0 - t15a); c[15 * stride] = CLIP(t0 - t15a);
} }
void dav1d_inv_dct16_1d_c(int32_t *const c, const ptrdiff_t stride, static void inv_dct16_1d_c(int32_t *const c, const ptrdiff_t stride,
const int min, const int max) const int min, const int max)
{ {
inv_dct16_1d_internal_c(c, stride, min, max, 0); inv_dct16_1d_internal_c(c, stride, min, max, 0);
} }
@ -427,14 +427,14 @@ inv_dct32_1d_internal_c(int32_t *const c, const ptrdiff_t stride,
c[31 * stride] = CLIP(t0 - t31); c[31 * stride] = CLIP(t0 - t31);
} }
void dav1d_inv_dct32_1d_c(int32_t *const c, const ptrdiff_t stride, static void inv_dct32_1d_c(int32_t *const c, const ptrdiff_t stride,
const int min, const int max) const int min, const int max)
{ {
inv_dct32_1d_internal_c(c, stride, min, max, 0); inv_dct32_1d_internal_c(c, stride, min, max, 0);
} }
void dav1d_inv_dct64_1d_c(int32_t *const c, const ptrdiff_t stride, static void inv_dct64_1d_c(int32_t *const c, const ptrdiff_t stride,
const int min, const int max) const int min, const int max)
{ {
assert(stride > 0); assert(stride > 0);
inv_dct32_1d_internal_c(c, stride << 1, min, max, 1); inv_dct32_1d_internal_c(c, stride << 1, min, max, 1);
@ -962,13 +962,13 @@ inv_adst16_1d_internal_c(const int32_t *const in, const ptrdiff_t in_s,
} }
#define inv_adst_1d(sz) \ #define inv_adst_1d(sz) \
void dav1d_inv_adst##sz##_1d_c(int32_t *const c, const ptrdiff_t stride, \ static void inv_adst##sz##_1d_c(int32_t *const c, const ptrdiff_t stride, \
const int min, const int max) \ const int min, const int max) \
{ \ { \
inv_adst##sz##_1d_internal_c(c, stride, min, max, c, stride); \ inv_adst##sz##_1d_internal_c(c, stride, min, max, c, stride); \
} \ } \
void dav1d_inv_flipadst##sz##_1d_c(int32_t *const c, const ptrdiff_t stride, \ static void inv_flipadst##sz##_1d_c(int32_t *const c, const ptrdiff_t stride, \
const int min, const int max) \ const int min, const int max) \
{ \ { \
inv_adst##sz##_1d_internal_c(c, stride, min, max, \ inv_adst##sz##_1d_internal_c(c, stride, min, max, \
&c[(sz - 1) * stride], -stride); \ &c[(sz - 1) * stride], -stride); \
@ -980,8 +980,8 @@ inv_adst_1d(16)
#undef inv_adst_1d #undef inv_adst_1d
void dav1d_inv_identity4_1d_c(int32_t *const c, const ptrdiff_t stride, static void inv_identity4_1d_c(int32_t *const c, const ptrdiff_t stride,
const int min, const int max) const int min, const int max)
{ {
assert(stride > 0); assert(stride > 0);
for (int i = 0; i < 4; i++) { for (int i = 0; i < 4; i++) {
@ -990,16 +990,16 @@ void dav1d_inv_identity4_1d_c(int32_t *const c, const ptrdiff_t stride,
} }
} }
void dav1d_inv_identity8_1d_c(int32_t *const c, const ptrdiff_t stride, static void inv_identity8_1d_c(int32_t *const c, const ptrdiff_t stride,
const int min, const int max) const int min, const int max)
{ {
assert(stride > 0); assert(stride > 0);
for (int i = 0; i < 8; i++) for (int i = 0; i < 8; i++)
c[stride * i] *= 2; c[stride * i] *= 2;
} }
void dav1d_inv_identity16_1d_c(int32_t *const c, const ptrdiff_t stride, static void inv_identity16_1d_c(int32_t *const c, const ptrdiff_t stride,
const int min, const int max) const int min, const int max)
{ {
assert(stride > 0); assert(stride > 0);
for (int i = 0; i < 16; i++) { for (int i = 0; i < 16; i++) {
@ -1008,14 +1008,57 @@ void dav1d_inv_identity16_1d_c(int32_t *const c, const ptrdiff_t stride,
} }
} }
void dav1d_inv_identity32_1d_c(int32_t *const c, const ptrdiff_t stride, static void inv_identity32_1d_c(int32_t *const c, const ptrdiff_t stride,
const int min, const int max) const int min, const int max)
{ {
assert(stride > 0); assert(stride > 0);
for (int i = 0; i < 32; i++) for (int i = 0; i < 32; i++)
c[stride * i] *= 4; c[stride * i] *= 4;
} }
// C implementations of the 1-D inverse transforms, indexed by
// [transform size][1-D transform type] (enum Tx1dType). Designated
// initializers leave unused combinations NULL — only DCT and IDENTITY
// exist at 32 points, and only DCT at 64 points.
const itx_1d_fn dav1d_tx1d_fns[N_TX_SIZES][N_TX_1D_TYPES] = {
[TX_4X4] = {
[DCT] = inv_dct4_1d_c,
[ADST] = inv_adst4_1d_c,
[FLIPADST] = inv_flipadst4_1d_c,
[IDENTITY] = inv_identity4_1d_c,
}, [TX_8X8] = {
[DCT] = inv_dct8_1d_c,
[ADST] = inv_adst8_1d_c,
[FLIPADST] = inv_flipadst8_1d_c,
[IDENTITY] = inv_identity8_1d_c,
}, [TX_16X16] = {
[DCT] = inv_dct16_1d_c,
[ADST] = inv_adst16_1d_c,
[FLIPADST] = inv_flipadst16_1d_c,
[IDENTITY] = inv_identity16_1d_c,
}, [TX_32X32] = {
[DCT] = inv_dct32_1d_c,
[IDENTITY] = inv_identity32_1d_c,
}, [TX_64X64] = {
[DCT] = inv_dct64_1d_c,
},
};
// Decomposition of each 2-D transform type into its two 1-D component
// types: index [0] selects the first-pass kernel and [1] the second-pass
// kernel looked up in dav1d_tx1d_fns[] (see inv_txfm_add_c in itx_tmpl.c).
const uint8_t /* enum Tx1dType */ dav1d_tx1d_types[N_TX_TYPES][2] = {
[DCT_DCT] = { DCT, DCT },
[ADST_DCT] = { ADST, DCT },
[DCT_ADST] = { DCT, ADST },
[ADST_ADST] = { ADST, ADST },
[FLIPADST_DCT] = { FLIPADST, DCT },
[DCT_FLIPADST] = { DCT, FLIPADST },
[FLIPADST_FLIPADST] = { FLIPADST, FLIPADST },
[ADST_FLIPADST] = { ADST, FLIPADST },
[FLIPADST_ADST] = { FLIPADST, ADST },
[IDTX] = { IDENTITY, IDENTITY },
[V_DCT] = { DCT, IDENTITY },
[H_DCT] = { IDENTITY, DCT },
[V_ADST] = { ADST, IDENTITY },
[H_ADST] = { IDENTITY, ADST },
[V_FLIPADST] = { FLIPADST, IDENTITY },
[H_FLIPADST] = { IDENTITY, FLIPADST },
};
#if !(HAVE_ASM && TRIM_DSP_FUNCTIONS && ( \ #if !(HAVE_ASM && TRIM_DSP_FUNCTIONS && ( \
ARCH_AARCH64 || \ ARCH_AARCH64 || \
(ARCH_ARM && (defined(__ARM_NEON) || defined(__APPLE__) || defined(_WIN32))) \ (ARCH_ARM && (defined(__ARM_NEON) || defined(__APPLE__) || defined(_WIN32))) \

30
third_party/dav1d/src/itx_1d.h поставляемый
Просмотреть файл

@ -28,31 +28,25 @@
#include <stddef.h> #include <stddef.h>
#include <stdint.h> #include <stdint.h>
#include "src/levels.h"
#ifndef DAV1D_SRC_ITX_1D_H #ifndef DAV1D_SRC_ITX_1D_H
#define DAV1D_SRC_ITX_1D_H #define DAV1D_SRC_ITX_1D_H
// The four 1-D transform kernels that 2-D transforms are composed of;
// used to index dav1d_tx1d_fns[] and the pairs in dav1d_tx1d_types[].
enum Tx1dType {
DCT,
ADST,
IDENTITY,
FLIPADST,
N_TX_1D_TYPES,
};
#define decl_itx_1d_fn(name) \ #define decl_itx_1d_fn(name) \
void (name)(int32_t *c, ptrdiff_t stride, int min, int max) void (name)(int32_t *c, ptrdiff_t stride, int min, int max)
typedef decl_itx_1d_fn(*itx_1d_fn); typedef decl_itx_1d_fn(*itx_1d_fn);
decl_itx_1d_fn(dav1d_inv_dct4_1d_c); EXTERN const itx_1d_fn dav1d_tx1d_fns[N_TX_SIZES][N_TX_1D_TYPES];
decl_itx_1d_fn(dav1d_inv_dct8_1d_c); EXTERN const uint8_t /* enum Tx1dType */ dav1d_tx1d_types[N_TX_TYPES][2];
decl_itx_1d_fn(dav1d_inv_dct16_1d_c);
decl_itx_1d_fn(dav1d_inv_dct32_1d_c);
decl_itx_1d_fn(dav1d_inv_dct64_1d_c);
decl_itx_1d_fn(dav1d_inv_adst4_1d_c);
decl_itx_1d_fn(dav1d_inv_adst8_1d_c);
decl_itx_1d_fn(dav1d_inv_adst16_1d_c);
decl_itx_1d_fn(dav1d_inv_flipadst4_1d_c);
decl_itx_1d_fn(dav1d_inv_flipadst8_1d_c);
decl_itx_1d_fn(dav1d_inv_flipadst16_1d_c);
decl_itx_1d_fn(dav1d_inv_identity4_1d_c);
decl_itx_1d_fn(dav1d_inv_identity8_1d_c);
decl_itx_1d_fn(dav1d_inv_identity16_1d_c);
decl_itx_1d_fn(dav1d_inv_identity32_1d_c);
void dav1d_inv_wht4_1d_c(int32_t *c, ptrdiff_t stride); void dav1d_inv_wht4_1d_c(int32_t *c, ptrdiff_t stride);

126
third_party/dav1d/src/itx_tmpl.c поставляемый
Просмотреть файл

@ -29,6 +29,7 @@
#include <stddef.h> #include <stddef.h>
#include <stdint.h> #include <stdint.h>
#include <stdlib.h>
#include <string.h> #include <string.h>
#include "common/attributes.h" #include "common/attributes.h"
@ -36,13 +37,17 @@
#include "src/itx.h" #include "src/itx.h"
#include "src/itx_1d.h" #include "src/itx_1d.h"
#include "src/scan.h"
#include "src/tables.h"
static NOINLINE void static NOINLINE void
inv_txfm_add_c(pixel *dst, const ptrdiff_t stride, coef *const coeff, inv_txfm_add_c(pixel *dst, const ptrdiff_t stride, coef *const coeff,
const int eob, const int w, const int h, const int shift, const int eob, const /*enum RectTxfmSize*/ int tx, const int shift,
const itx_1d_fn first_1d_fn, const itx_1d_fn second_1d_fn, const enum TxfmType txtp HIGHBD_DECL_SUFFIX)
const int has_dconly HIGHBD_DECL_SUFFIX)
{ {
const TxfmInfo *const t_dim = &dav1d_txfm_dimensions[tx];
const int w = 4 * t_dim->w, h = 4 * t_dim->h;
const int has_dconly = txtp == DCT_DCT;
assert(w >= 4 && w <= 64); assert(w >= 4 && w <= 64);
assert(h >= 4 && h <= 64); assert(h >= 4 && h <= 64);
assert(eob >= 0); assert(eob >= 0);
@ -64,6 +69,9 @@ inv_txfm_add_c(pixel *dst, const ptrdiff_t stride, coef *const coeff,
return; return;
} }
const uint8_t *const txtps = dav1d_tx1d_types[txtp];
const itx_1d_fn first_1d_fn = dav1d_tx1d_fns[t_dim->lw][txtps[0]];
const itx_1d_fn second_1d_fn = dav1d_tx1d_fns[t_dim->lh][txtps[1]];
const int sh = imin(h, 32), sw = imin(w, 32); const int sh = imin(h, 32), sw = imin(w, 32);
#if BITDEPTH == 8 #if BITDEPTH == 8
const int row_clip_min = INT16_MIN; const int row_clip_min = INT16_MIN;
@ -76,7 +84,16 @@ inv_txfm_add_c(pixel *dst, const ptrdiff_t stride, coef *const coeff,
const int col_clip_max = ~col_clip_min; const int col_clip_max = ~col_clip_min;
int32_t tmp[64 * 64], *c = tmp; int32_t tmp[64 * 64], *c = tmp;
for (int y = 0; y < sh; y++, c += w) { int last_nonzero_col; // in first 1d itx
if (txtps[1] == IDENTITY && txtps[0] != IDENTITY) {
last_nonzero_col = imin(sh - 1, eob);
} else if (txtps[0] == IDENTITY && txtps[1] != IDENTITY) {
last_nonzero_col = eob >> (t_dim->lw + 2);
} else {
last_nonzero_col = dav1d_last_nonzero_col_from_eob[tx][eob];
}
assert(last_nonzero_col < sh);
for (int y = 0; y <= last_nonzero_col; y++, c += w) {
if (is_rect2) if (is_rect2)
for (int x = 0; x < sw; x++) for (int x = 0; x < sw; x++)
c[x] = (coeff[y + x * sh] * 181 + 128) >> 8; c[x] = (coeff[y + x * sh] * 181 + 128) >> 8;
@ -85,6 +102,8 @@ inv_txfm_add_c(pixel *dst, const ptrdiff_t stride, coef *const coeff,
c[x] = coeff[y + x * sh]; c[x] = coeff[y + x * sh];
first_1d_fn(c, 1, row_clip_min, row_clip_max); first_1d_fn(c, 1, row_clip_min, row_clip_max);
} }
if (last_nonzero_col + 1 < sh)
memset(c, 0, sizeof(*c) * (sh - last_nonzero_col - 1) * w);
memset(coeff, 0, sizeof(*coeff) * sw * sh); memset(coeff, 0, sizeof(*coeff) * sw * sh);
for (int i = 0; i < w * sh; i++) for (int i = 0; i < w * sh; i++)
@ -99,7 +118,7 @@ inv_txfm_add_c(pixel *dst, const ptrdiff_t stride, coef *const coeff,
dst[x] = iclip_pixel(dst[x] + ((*c++ + 8) >> 4)); dst[x] = iclip_pixel(dst[x] + ((*c++ + 8) >> 4));
} }
#define inv_txfm_fn(type1, type2, w, h, shift, has_dconly) \ #define inv_txfm_fn(type1, type2, type, pfx, w, h, shift) \
static void \ static void \
inv_txfm_add_##type1##_##type2##_##w##x##h##_c(pixel *dst, \ inv_txfm_add_##type1##_##type2##_##w##x##h##_c(pixel *dst, \
const ptrdiff_t stride, \ const ptrdiff_t stride, \
@ -107,57 +126,56 @@ inv_txfm_add_##type1##_##type2##_##w##x##h##_c(pixel *dst, \
const int eob \ const int eob \
HIGHBD_DECL_SUFFIX) \ HIGHBD_DECL_SUFFIX) \
{ \ { \
inv_txfm_add_c(dst, stride, coeff, eob, w, h, shift, \ inv_txfm_add_c(dst, stride, coeff, eob, pfx##TX_##w##X##h, shift, type \
dav1d_inv_##type1##w##_1d_c, dav1d_inv_##type2##h##_1d_c, \ HIGHBD_TAIL_SUFFIX); \
has_dconly HIGHBD_TAIL_SUFFIX); \
} }
#define inv_txfm_fn64(w, h, shift) \ #define inv_txfm_fn64(pfx, w, h, shift) \
inv_txfm_fn(dct, dct, w, h, shift, 1) inv_txfm_fn(dct, dct, DCT_DCT, pfx, w, h, shift)
#define inv_txfm_fn32(w, h, shift) \ #define inv_txfm_fn32(pfx, w, h, shift) \
inv_txfm_fn64(w, h, shift) \ inv_txfm_fn64(pfx, w, h, shift) \
inv_txfm_fn(identity, identity, w, h, shift, 0) inv_txfm_fn(identity, identity, IDTX, pfx, w, h, shift)
#define inv_txfm_fn16(w, h, shift) \ #define inv_txfm_fn16(pfx, w, h, shift) \
inv_txfm_fn32(w, h, shift) \ inv_txfm_fn32(pfx, w, h, shift) \
inv_txfm_fn(adst, dct, w, h, shift, 0) \ inv_txfm_fn(adst, dct, ADST_DCT, pfx, w, h, shift) \
inv_txfm_fn(dct, adst, w, h, shift, 0) \ inv_txfm_fn(dct, adst, DCT_ADST, pfx, w, h, shift) \
inv_txfm_fn(adst, adst, w, h, shift, 0) \ inv_txfm_fn(adst, adst, ADST_ADST, pfx, w, h, shift) \
inv_txfm_fn(dct, flipadst, w, h, shift, 0) \ inv_txfm_fn(dct, flipadst, DCT_FLIPADST, pfx, w, h, shift) \
inv_txfm_fn(flipadst, dct, w, h, shift, 0) \ inv_txfm_fn(flipadst, dct, FLIPADST_DCT, pfx, w, h, shift) \
inv_txfm_fn(adst, flipadst, w, h, shift, 0) \ inv_txfm_fn(adst, flipadst, ADST_FLIPADST, pfx, w, h, shift) \
inv_txfm_fn(flipadst, adst, w, h, shift, 0) \ inv_txfm_fn(flipadst, adst, FLIPADST_ADST, pfx, w, h, shift) \
inv_txfm_fn(flipadst, flipadst, w, h, shift, 0) \ inv_txfm_fn(flipadst, flipadst, FLIPADST_FLIPADST, pfx, w, h, shift) \
inv_txfm_fn(identity, dct, w, h, shift, 0) \ inv_txfm_fn(identity, dct, H_DCT, pfx, w, h, shift) \
inv_txfm_fn(dct, identity, w, h, shift, 0) \ inv_txfm_fn(dct, identity, V_DCT, pfx, w, h, shift) \
#define inv_txfm_fn84(w, h, shift) \ #define inv_txfm_fn84(pfx, w, h, shift) \
inv_txfm_fn16(w, h, shift) \ inv_txfm_fn16(pfx, w, h, shift) \
inv_txfm_fn(identity, flipadst, w, h, shift, 0) \ inv_txfm_fn(identity, flipadst, H_FLIPADST, pfx, w, h, shift) \
inv_txfm_fn(flipadst, identity, w, h, shift, 0) \ inv_txfm_fn(flipadst, identity, V_FLIPADST, pfx, w, h, shift) \
inv_txfm_fn(identity, adst, w, h, shift, 0) \ inv_txfm_fn(identity, adst, H_ADST, pfx, w, h, shift) \
inv_txfm_fn(adst, identity, w, h, shift, 0) \ inv_txfm_fn(adst, identity, V_ADST, pfx, w, h, shift) \
inv_txfm_fn84( 4, 4, 0) inv_txfm_fn84( , 4, 4, 0)
inv_txfm_fn84( 4, 8, 0) inv_txfm_fn84(R, 4, 8, 0)
inv_txfm_fn84( 4, 16, 1) inv_txfm_fn84(R, 4, 16, 1)
inv_txfm_fn84( 8, 4, 0) inv_txfm_fn84(R, 8, 4, 0)
inv_txfm_fn84( 8, 8, 1) inv_txfm_fn84( , 8, 8, 1)
inv_txfm_fn84( 8, 16, 1) inv_txfm_fn84(R, 8, 16, 1)
inv_txfm_fn32( 8, 32, 2) inv_txfm_fn32(R, 8, 32, 2)
inv_txfm_fn84(16, 4, 1) inv_txfm_fn84(R, 16, 4, 1)
inv_txfm_fn84(16, 8, 1) inv_txfm_fn84(R, 16, 8, 1)
inv_txfm_fn16(16, 16, 2) inv_txfm_fn16( , 16, 16, 2)
inv_txfm_fn32(16, 32, 1) inv_txfm_fn32(R, 16, 32, 1)
inv_txfm_fn64(16, 64, 2) inv_txfm_fn64(R, 16, 64, 2)
inv_txfm_fn32(32, 8, 2) inv_txfm_fn32(R, 32, 8, 2)
inv_txfm_fn32(32, 16, 1) inv_txfm_fn32(R, 32, 16, 1)
inv_txfm_fn32(32, 32, 2) inv_txfm_fn32( , 32, 32, 2)
inv_txfm_fn64(32, 64, 1) inv_txfm_fn64(R, 32, 64, 1)
inv_txfm_fn64(64, 16, 2) inv_txfm_fn64(R, 64, 16, 2)
inv_txfm_fn64(64, 32, 1) inv_txfm_fn64(R, 64, 32, 1)
inv_txfm_fn64(64, 64, 2) inv_txfm_fn64( , 64, 64, 2)
#if !(HAVE_ASM && TRIM_DSP_FUNCTIONS && ( \ #if !(HAVE_ASM && TRIM_DSP_FUNCTIONS && ( \
ARCH_AARCH64 || \ ARCH_AARCH64 || \
@ -267,9 +285,10 @@ COLD void bitfn(dav1d_itx_dsp_init)(Dav1dInvTxfmDSPContext *const c, int bpc) {
assign_itx_all_fn64(64, 32, R); assign_itx_all_fn64(64, 32, R);
assign_itx_all_fn64(64, 64, ); assign_itx_all_fn64(64, 64, );
int all_simd = 0;
#if HAVE_ASM #if HAVE_ASM
#if ARCH_AARCH64 || ARCH_ARM #if ARCH_AARCH64 || ARCH_ARM
itx_dsp_init_arm(c, bpc); itx_dsp_init_arm(c, bpc, &all_simd);
#endif #endif
#if ARCH_LOONGARCH64 #if ARCH_LOONGARCH64
itx_dsp_init_loongarch(c, bpc); itx_dsp_init_loongarch(c, bpc);
@ -278,7 +297,10 @@ COLD void bitfn(dav1d_itx_dsp_init)(Dav1dInvTxfmDSPContext *const c, int bpc) {
itx_dsp_init_riscv(c, bpc); itx_dsp_init_riscv(c, bpc);
#endif #endif
#if ARCH_X86 #if ARCH_X86
itx_dsp_init_x86(c, bpc); itx_dsp_init_x86(c, bpc, &all_simd);
#endif #endif
#endif #endif
if (!all_simd)
dav1d_init_last_nonzero_col_from_eob_tables();
} }

76
third_party/dav1d/src/scan.c поставляемый
Просмотреть файл

@ -28,7 +28,10 @@
#include "config.h" #include "config.h"
#include "common/attributes.h" #include "common/attributes.h"
#include "common/intops.h"
#include "src/scan.h" #include "src/scan.h"
#include "src/thread.h"
static const uint16_t ALIGN(scan_4x4[], 32) = { static const uint16_t ALIGN(scan_4x4[], 32) = {
0, 4, 1, 2, 0, 4, 1, 2,
@ -297,3 +300,76 @@ const uint16_t *const dav1d_scans[N_RECT_TX_SIZES] = {
[RTX_16X64] = scan_16x32, [RTX_16X64] = scan_16x32,
[RTX_64X16] = scan_32x16, [RTX_64X16] = scan_32x16,
}; };
// Lazily-initialized lookup tables mapping a scan-order eob index to the
// last column that may hold a non-zero coefficient. One w*h-entry table
// per distinct scan size; filled once by
// dav1d_init_last_nonzero_col_from_eob_tables() below.
static uint8_t last_nonzero_col_from_eob_4x4[16];
static uint8_t last_nonzero_col_from_eob_8x8[64];
static uint8_t last_nonzero_col_from_eob_16x16[256];
static uint8_t last_nonzero_col_from_eob_32x32[1024];
static uint8_t last_nonzero_col_from_eob_4x8[32];
static uint8_t last_nonzero_col_from_eob_8x4[32];
static uint8_t last_nonzero_col_from_eob_8x16[128];
static uint8_t last_nonzero_col_from_eob_16x8[128];
static uint8_t last_nonzero_col_from_eob_16x32[512];
static uint8_t last_nonzero_col_from_eob_32x16[512];
static uint8_t last_nonzero_col_from_eob_4x16[64];
static uint8_t last_nonzero_col_from_eob_16x4[64];
static uint8_t last_nonzero_col_from_eob_8x32[256];
static uint8_t last_nonzero_col_from_eob_32x8[256];
// Fill one eob -> last-nonzero-column table: for each scan position n,
// record the maximum column index touched by any of the first n+1
// coefficients in scan order (a running maximum, so the table is
// monotonically non-decreasing in n). Init-time only (COLD).
static COLD void init_tbl(uint8_t *const last_nonzero_col_from_eob,
const uint16_t *const scan, const int w, const int h)
{
int max_col = 0;
for (int y = 0, n = 0; y < h; y++) {
for (int x = 0; x < w; x++, n++) {
// Raw coefficient position for scan index n.
const int rc = scan[n];
// rc & (h - 1) == rc % h (h is a power of two).
// NOTE(review): assumes the column of the (transposed) coefficient
// layout sits in the low bits of the scan value — confirm against
// the scan-table encoding in this file.
const int rcx = rc & (h - 1);
max_col = imax(max_col, rcx);
last_nonzero_col_from_eob[n] = max_col;
}
}
}
// Populate every eob -> last-nonzero-column table from its matching scan
// table. Invoked exactly once via pthread_once() (see the public wrapper
// below).
static COLD void init_internal(void) {
init_tbl(last_nonzero_col_from_eob_4x4, scan_4x4, 4, 4);
init_tbl(last_nonzero_col_from_eob_8x8, scan_8x8, 8, 8);
init_tbl(last_nonzero_col_from_eob_16x16, scan_16x16, 16, 16);
init_tbl(last_nonzero_col_from_eob_32x32, scan_32x32, 32, 32);
init_tbl(last_nonzero_col_from_eob_4x8, scan_4x8, 4, 8);
init_tbl(last_nonzero_col_from_eob_8x4, scan_8x4, 8, 4);
init_tbl(last_nonzero_col_from_eob_8x16, scan_8x16, 8, 16);
init_tbl(last_nonzero_col_from_eob_16x8, scan_16x8, 16, 8);
init_tbl(last_nonzero_col_from_eob_16x32, scan_16x32, 16, 32);
init_tbl(last_nonzero_col_from_eob_32x16, scan_32x16, 32, 16);
init_tbl(last_nonzero_col_from_eob_4x16, scan_4x16, 4, 16);
init_tbl(last_nonzero_col_from_eob_16x4, scan_16x4, 16, 4);
init_tbl(last_nonzero_col_from_eob_8x32, scan_8x32, 8, 32);
init_tbl(last_nonzero_col_from_eob_32x8, scan_32x8, 32, 8);
}
// Thread-safe one-time initialization of the eob tables. Only called when
// at least one transform size falls back to the C path (the !all_simd case
// in dav1d_itx_dsp_init); repeated calls are no-ops thanks to pthread_once.
COLD void dav1d_init_last_nonzero_col_from_eob_tables(void) {
static pthread_once_t initted = PTHREAD_ONCE_INIT;
pthread_once(&initted, init_internal);
}
// Per-transform-size views of the tables above. Sizes with a 64-point
// dimension alias the corresponding 32-point table: the C transform path
// only keeps at most 32 coefficients per dimension (cf. imin(w/h, 32) in
// inv_txfm_add_c), matching the shared 32-point scan tables.
const uint8_t *const dav1d_last_nonzero_col_from_eob[N_RECT_TX_SIZES] = {
[ TX_4X4 ] = last_nonzero_col_from_eob_4x4,
[ TX_8X8 ] = last_nonzero_col_from_eob_8x8,
[ TX_16X16] = last_nonzero_col_from_eob_16x16,
[ TX_32X32] = last_nonzero_col_from_eob_32x32,
[ TX_64X64] = last_nonzero_col_from_eob_32x32,
[RTX_4X8 ] = last_nonzero_col_from_eob_4x8,
[RTX_8X4 ] = last_nonzero_col_from_eob_8x4,
[RTX_8X16 ] = last_nonzero_col_from_eob_8x16,
[RTX_16X8 ] = last_nonzero_col_from_eob_16x8,
[RTX_16X32] = last_nonzero_col_from_eob_16x32,
[RTX_32X16] = last_nonzero_col_from_eob_32x16,
[RTX_32X64] = last_nonzero_col_from_eob_32x32,
[RTX_64X32] = last_nonzero_col_from_eob_32x32,
[RTX_4X16 ] = last_nonzero_col_from_eob_4x16,
[RTX_16X4 ] = last_nonzero_col_from_eob_16x4,
[RTX_8X32 ] = last_nonzero_col_from_eob_8x32,
[RTX_32X8 ] = last_nonzero_col_from_eob_32x8,
[RTX_16X64] = last_nonzero_col_from_eob_16x32,
[RTX_64X16] = last_nonzero_col_from_eob_32x16,
};

3
third_party/dav1d/src/scan.h поставляемый
Просмотреть файл

@ -33,5 +33,8 @@
#include "src/levels.h" #include "src/levels.h"
EXTERN const uint16_t *const dav1d_scans[N_RECT_TX_SIZES]; EXTERN const uint16_t *const dav1d_scans[N_RECT_TX_SIZES];
EXTERN const uint8_t *const dav1d_last_nonzero_col_from_eob[N_RECT_TX_SIZES];
void dav1d_init_last_nonzero_col_from_eob_tables(void);
#endif /* DAV1D_SRC_SCAN_H */ #endif /* DAV1D_SRC_SCAN_H */

6
third_party/dav1d/src/x86/itx.h поставляемый
Просмотреть файл

@ -107,7 +107,9 @@ decl_itx_fns(ssse3);
decl_itx_fn(dav1d_inv_txfm_add_wht_wht_4x4_16bpc_avx2); decl_itx_fn(dav1d_inv_txfm_add_wht_wht_4x4_16bpc_avx2);
decl_itx_fn(BF(dav1d_inv_txfm_add_wht_wht_4x4, sse2)); decl_itx_fn(BF(dav1d_inv_txfm_add_wht_wht_4x4, sse2));
static ALWAYS_INLINE void itx_dsp_init_x86(Dav1dInvTxfmDSPContext *const c, const int bpc) { static ALWAYS_INLINE void itx_dsp_init_x86(Dav1dInvTxfmDSPContext *const c,
const int bpc, int *const all_simd)
{
#define assign_itx_bpc_fn(pfx, w, h, type, type_enum, bpc, ext) \ #define assign_itx_bpc_fn(pfx, w, h, type, type_enum, bpc, ext) \
c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \ c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
BF_BPC(dav1d_inv_txfm_add_##type##_##w##x##h, bpc, ext) BF_BPC(dav1d_inv_txfm_add_##type##_##w##x##h, bpc, ext)
@ -167,6 +169,7 @@ static ALWAYS_INLINE void itx_dsp_init_x86(Dav1dInvTxfmDSPContext *const c, cons
assign_itx1_fn (R, 64, 16, ssse3); assign_itx1_fn (R, 64, 16, ssse3);
assign_itx1_fn (R, 64, 32, ssse3); assign_itx1_fn (R, 64, 32, ssse3);
assign_itx1_fn ( , 64, 64, ssse3); assign_itx1_fn ( , 64, 64, ssse3);
*all_simd = 1;
#endif #endif
if (!(flags & DAV1D_X86_CPU_FLAG_SSE41)) return; if (!(flags & DAV1D_X86_CPU_FLAG_SSE41)) return;
@ -192,6 +195,7 @@ static ALWAYS_INLINE void itx_dsp_init_x86(Dav1dInvTxfmDSPContext *const c, cons
assign_itx1_fn (R, 64, 16, sse4); assign_itx1_fn (R, 64, 16, sse4);
assign_itx1_fn (R, 64, 32, sse4); assign_itx1_fn (R, 64, 32, sse4);
assign_itx1_fn (, 64, 64, sse4); assign_itx1_fn (, 64, 64, sse4);
*all_simd = 1;
} }
#endif #endif

1863
third_party/dav1d/src/x86/mc16_sse.asm поставляемый

Разница между файлами не показана из-за своего большого размера Загрузить разницу