зеркало из https://github.com/mozilla/gecko-dev.git
Bug 1906715 - Update dav1d to 2355eeb8f254a1c34dbb0241be5c70cdf6ed46d1 r=media-playback-reviewers,alwu
Differential Revision: https://phabricator.services.mozilla.com/D216426
This commit is contained in:
Родитель
47f16e40a1
Коммит
a327f20ad9
|
@ -20,11 +20,11 @@ origin:
|
||||||
|
|
||||||
# Human-readable identifier for this version/release
|
# Human-readable identifier for this version/release
|
||||||
# Generally "version NNN", "tag SSS", "bookmark SSS"
|
# Generally "version NNN", "tag SSS", "bookmark SSS"
|
||||||
release: 92f592ed104ba92ad35c781ee93f354525eef503 (2024-06-05T23:22:36.000+02:00).
|
release: 2355eeb8f254a1c34dbb0241be5c70cdf6ed46d1 (2024-06-26T11:20:43.000+02:00).
|
||||||
|
|
||||||
# Revision to pull in
|
# Revision to pull in
|
||||||
# Must be a long or short commit SHA (long preferred)
|
# Must be a long or short commit SHA (long preferred)
|
||||||
revision: 92f592ed104ba92ad35c781ee93f354525eef503
|
revision: 2355eeb8f254a1c34dbb0241be5c70cdf6ed46d1
|
||||||
|
|
||||||
# The package's license, where possible using the mnemonic from
|
# The package's license, where possible using the mnemonic from
|
||||||
# https://spdx.org/licenses/
|
# https://spdx.org/licenses/
|
||||||
|
|
|
@ -1,2 +1,2 @@
|
||||||
/* auto-generated, do not edit */
|
/* auto-generated, do not edit */
|
||||||
#define DAV1D_VERSION "92f592ed104ba92ad35c781ee93f354525eef503"
|
#define DAV1D_VERSION "2355eeb8f254a1c34dbb0241be5c70cdf6ed46d1"
|
||||||
|
|
|
@ -45,32 +45,33 @@ ENABLE_DOTPROD
|
||||||
#define LOOP_ALIGN 2
|
#define LOOP_ALIGN 2
|
||||||
|
|
||||||
|
|
||||||
// Lookup table used to help conversion of shifted 32-bit values to 8-bit.
|
const h_tbl_neon_dotprod, align=4
|
||||||
.align 4
|
// Shuffle indices to permute horizontal samples in preparation for
|
||||||
L(hv_tbl_neon_dotprod):
|
// input to SDOT instructions. The 8-tap horizontal convolution uses
|
||||||
.byte 1, 2, 5, 6, 9, 10, 13, 14, 17, 18, 21, 22, 25, 26, 29, 30
|
// sample indices in the interval of [-3, 4] relative to the current
|
||||||
|
// sample position.
|
||||||
// Shuffle indices to permute horizontal samples in preparation for input to
|
|
||||||
// SDOT instructions. The 8-tap horizontal convolution uses sample indices in the
|
|
||||||
// interval of [-3, 4] relative to the current sample position.
|
|
||||||
.align 4
|
|
||||||
L(h_tbl_neon_dotprod):
|
|
||||||
.byte 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6
|
.byte 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6
|
||||||
.byte 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10
|
.byte 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10
|
||||||
.byte 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14
|
.byte 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14
|
||||||
|
|
||||||
// Vertical convolutions are also using SDOT instructions, where a 128-bit
|
// Lookup table used to help conversion of shifted 32-bit values to 8-bit.
|
||||||
// register contains a transposed 4x4 matrix of values. Subsequent iterations of
|
#define OFFSET_CVT_32_8 48
|
||||||
// the vertical convolution can reuse the 3x4 sub-matrix from the previous loop
|
.byte 1, 2, 5, 6, 9, 10, 13, 14, 17, 18, 21, 22, 25, 26, 29, 30
|
||||||
// iteration. These shuffle indices shift and merge this 4x4 matrix with the
|
endconst
|
||||||
// values of a new line.
|
|
||||||
.align 4
|
const v_tbl_neon_dotprod, align=4
|
||||||
L(v_tbl_neon_dotprod):
|
// Vertical convolutions are also using SDOT instructions, where a
|
||||||
|
// 128-bit register contains a transposed 4x4 matrix of values.
|
||||||
|
// Subsequent iterations of the vertical convolution can reuse the
|
||||||
|
// 3x4 sub-matrix from the previous loop iteration. These shuffle
|
||||||
|
// indices shift and merge this 4x4 matrix with the values of a new
|
||||||
|
// line.
|
||||||
.byte 1, 2, 3, 16, 5, 6, 7, 20, 9, 10, 11, 24, 13, 14, 15, 28
|
.byte 1, 2, 3, 16, 5, 6, 7, 20, 9, 10, 11, 24, 13, 14, 15, 28
|
||||||
.byte 1, 2, 3, 16, 5, 6, 7, 17, 9, 10, 11, 18, 13, 14, 15, 19
|
.byte 1, 2, 3, 16, 5, 6, 7, 17, 9, 10, 11, 18, 13, 14, 15, 19
|
||||||
.byte 1, 2, 3, 20, 5, 6, 7, 21, 9, 10, 11, 22, 13, 14, 15, 23
|
.byte 1, 2, 3, 20, 5, 6, 7, 21, 9, 10, 11, 22, 13, 14, 15, 23
|
||||||
.byte 1, 2, 3, 24, 5, 6, 7, 25, 9, 10, 11, 26, 13, 14, 15, 27
|
.byte 1, 2, 3, 24, 5, 6, 7, 25, 9, 10, 11, 26, 13, 14, 15, 27
|
||||||
.byte 1, 2, 3, 28, 5, 6, 7, 29, 9, 10, 11, 30, 13, 14, 15, 31
|
.byte 1, 2, 3, 28, 5, 6, 7, 29, 9, 10, 11, 30, 13, 14, 15, 31
|
||||||
|
endconst
|
||||||
|
|
||||||
|
|
||||||
.macro make_8tap_fn op, type, type_h, type_v, isa, jump=1
|
.macro make_8tap_fn op, type, type_h, type_v, isa, jump=1
|
||||||
|
@ -109,7 +110,7 @@ function \type\()_8tap_\isa, align=FUNC_ALIGN
|
||||||
.align JUMP_ALIGN
|
.align JUMP_ALIGN
|
||||||
L(\type\()_8tap_v_\isa):
|
L(\type\()_8tap_v_\isa):
|
||||||
madd \my, \my, w11, w10
|
madd \my, \my, w11, w10
|
||||||
ldr q6, L(v_tbl_neon_dotprod)
|
movrel x13, v_tbl_neon_dotprod
|
||||||
sub \src, \src, \s_strd
|
sub \src, \src, \s_strd
|
||||||
.ifc \isa, neon_dotprod
|
.ifc \isa, neon_dotprod
|
||||||
.ifc \type, prep
|
.ifc \type, prep
|
||||||
|
@ -121,12 +122,12 @@ L(\type\()_8tap_v_\isa):
|
||||||
.endif
|
.endif
|
||||||
ubfx w11, \my, #7, #7
|
ubfx w11, \my, #7, #7
|
||||||
and \my, \my, #0x7F
|
and \my, \my, #0x7F
|
||||||
ldr q28, L(v_tbl_neon_dotprod) + 16
|
ldp q6, q28, [x13]
|
||||||
cmp \h, #4
|
cmp \h, #4
|
||||||
csel \my, \my, w11, le
|
csel \my, \my, w11, le
|
||||||
sub \src, \src, \s_strd, lsl #1 // src - s_strd * 3
|
sub \src, \src, \s_strd, lsl #1 // src - s_strd * 3
|
||||||
add \xmy, x12, \xmy, lsl #3 // subpel V filter address
|
add \xmy, x12, \xmy, lsl #3 // subpel V filter address
|
||||||
ldr q29, L(v_tbl_neon_dotprod) + 32
|
ldr q29, [x13, #32]
|
||||||
.ifc \isa, neon_dotprod
|
.ifc \isa, neon_dotprod
|
||||||
movi v5.16b, #128
|
movi v5.16b, #128
|
||||||
.endif
|
.endif
|
||||||
|
@ -137,8 +138,7 @@ L(\type\()_8tap_v_\isa):
|
||||||
|
|
||||||
// .align JUMP_ALIGN // fallthrough
|
// .align JUMP_ALIGN // fallthrough
|
||||||
160: // V - 16xN+
|
160: // V - 16xN+
|
||||||
ldr q30, L(v_tbl_neon_dotprod) + 48
|
ldp q30, q31, [x13, #48]
|
||||||
ldr q31, L(v_tbl_neon_dotprod) + 64
|
|
||||||
.ifc \type, prep
|
.ifc \type, prep
|
||||||
add \wd_strd, \w, \w
|
add \wd_strd, \w, \w
|
||||||
.endif
|
.endif
|
||||||
|
@ -676,12 +676,13 @@ L(\type\()_8tap_v_\isa):
|
||||||
L(\type\()_8tap_h_hv_\isa):
|
L(\type\()_8tap_h_hv_\isa):
|
||||||
madd \mx, \mx, w11, w9
|
madd \mx, \mx, w11, w9
|
||||||
madd w14, \my, w11, w10 // for HV
|
madd w14, \my, w11, w10 // for HV
|
||||||
ldr q28, L(h_tbl_neon_dotprod)
|
|
||||||
.ifc \isa, neon_dotprod
|
.ifc \isa, neon_dotprod
|
||||||
mov w13, 0x2002 // FILTER_WEIGHT * 128 + rounding
|
mov w13, 0x2002 // FILTER_WEIGHT * 128 + rounding
|
||||||
dup v27.4s, w13 // put H overrides this
|
dup v27.4s, w13 // put H overrides this
|
||||||
.endif
|
.endif
|
||||||
|
movrel x13, h_tbl_neon_dotprod
|
||||||
sub \src, \src, #3 // src - 3
|
sub \src, \src, #3 // src - 3
|
||||||
|
ldr q28, [x13]
|
||||||
ubfx w9, \mx, #7, #7
|
ubfx w9, \mx, #7, #7
|
||||||
and \mx, \mx, #0x7F
|
and \mx, \mx, #0x7F
|
||||||
ubfx w11, w14, #7, #7 // for HV
|
ubfx w11, w14, #7, #7 // for HV
|
||||||
|
@ -702,8 +703,8 @@ L(\type\()_8tap_h_hv_\isa):
|
||||||
mov x15, x30
|
mov x15, x30
|
||||||
ldr d7, [\xmy]
|
ldr d7, [\xmy]
|
||||||
.ifc \type, put
|
.ifc \type, put
|
||||||
ldr q25, L(hv_tbl_neon_dotprod)
|
ldr q25, [x13, #(OFFSET_CVT_32_8)] // LUT to help conversion
|
||||||
.endif
|
.endif // of 32b values to 8b
|
||||||
sxtl v7.8h, v7.8b
|
sxtl v7.8h, v7.8b
|
||||||
cmp w10, SHARP1
|
cmp w10, SHARP1
|
||||||
b.ne L(\type\()_6tap_hv_\isa) // vertical != SHARP1
|
b.ne L(\type\()_6tap_hv_\isa) // vertical != SHARP1
|
||||||
|
@ -718,8 +719,7 @@ L(\type\()_8tap_h_hv_\isa):
|
||||||
|
|
||||||
// .align JUMP_ALIGN // fallthrough
|
// .align JUMP_ALIGN // fallthrough
|
||||||
80: // HV8 - 8xN+
|
80: // HV8 - 8xN+
|
||||||
ldr q29, L(h_tbl_neon_dotprod) + 16
|
ldp q29, q30, [x13, #16]
|
||||||
ldr q30, L(h_tbl_neon_dotprod) + 32
|
|
||||||
ldr d26, [\xmx]
|
ldr d26, [\xmx]
|
||||||
.ifc \type, prep
|
.ifc \type, prep
|
||||||
add \wd_strd, \w, \w
|
add \wd_strd, \w, \w
|
||||||
|
@ -860,7 +860,7 @@ L(\type\()_8tap_h_hv_\isa):
|
||||||
|
|
||||||
.align JUMP_ALIGN
|
.align JUMP_ALIGN
|
||||||
40: // HV8 - 4xN
|
40: // HV8 - 4xN
|
||||||
ldr s26, [\xmx, #2]
|
ldur s26, [\xmx, #2]
|
||||||
add \src, \src, #2
|
add \src, \src, #2
|
||||||
|
|
||||||
bl L(\type\()_hv_filter4_\isa)
|
bl L(\type\()_hv_filter4_\isa)
|
||||||
|
@ -930,7 +930,7 @@ L(\type\()_8tap_h_hv_\isa):
|
||||||
.ifc \type, put
|
.ifc \type, put
|
||||||
.align JUMP_ALIGN
|
.align JUMP_ALIGN
|
||||||
20: // HV8 - 2xN
|
20: // HV8 - 2xN
|
||||||
ldr s26, [\xmx, #2]
|
ldur s26, [\xmx, #2]
|
||||||
add \src, \src, #2
|
add \src, \src, #2
|
||||||
|
|
||||||
bl L(\type\()_hv_filter4_\isa)
|
bl L(\type\()_hv_filter4_\isa)
|
||||||
|
@ -1005,13 +1005,11 @@ L(\type\()_6tap_hv_\isa):
|
||||||
|
|
||||||
// .align JUMP_ALIGN // fallthrough
|
// .align JUMP_ALIGN // fallthrough
|
||||||
80: // HV6 - 8xN+
|
80: // HV6 - 8xN+
|
||||||
ldr q29, L(h_tbl_neon_dotprod) + 16
|
ldp q29, q30, [x13, #16]
|
||||||
ldr q30, L(h_tbl_neon_dotprod) + 32
|
|
||||||
ldr d26, [\xmx]
|
ldr d26, [\xmx]
|
||||||
.ifc \type, prep
|
.ifc \type, prep
|
||||||
add \wd_strd, \w, \w
|
add \wd_strd, \w, \w
|
||||||
.endif
|
.endif
|
||||||
|
|
||||||
.align LOOP_ALIGN
|
.align LOOP_ALIGN
|
||||||
81:
|
81:
|
||||||
mov \lsrc, \src
|
mov \lsrc, \src
|
||||||
|
@ -1145,7 +1143,7 @@ L(\type\()_hv_filter4_\isa):
|
||||||
|
|
||||||
.align JUMP_ALIGN
|
.align JUMP_ALIGN
|
||||||
40: // HV6 - 4xN
|
40: // HV6 - 4xN
|
||||||
ldr s26, [\xmx, #2]
|
ldur s26, [\xmx, #2]
|
||||||
add \src, \src, #2
|
add \src, \src, #2
|
||||||
|
|
||||||
bl L(\type\()_hv_filter4_\isa)
|
bl L(\type\()_hv_filter4_\isa)
|
||||||
|
@ -1206,7 +1204,7 @@ L(\type\()_hv_filter4_\isa):
|
||||||
.ifc \type, put
|
.ifc \type, put
|
||||||
.align JUMP_ALIGN
|
.align JUMP_ALIGN
|
||||||
20: // HV6 - 2xN
|
20: // HV6 - 2xN
|
||||||
ldr s26, [\xmx, #2]
|
ldur s26, [\xmx, #2]
|
||||||
add \src, \src, #2
|
add \src, \src, #2
|
||||||
|
|
||||||
bl L(\type\()_hv_filter4_\isa)
|
bl L(\type\()_hv_filter4_\isa)
|
||||||
|
@ -1284,7 +1282,7 @@ L(\type\()_8tap_h_\isa):
|
||||||
20: // H - 2xN
|
20: // H - 2xN
|
||||||
AARCH64_VALID_JUMP_TARGET
|
AARCH64_VALID_JUMP_TARGET
|
||||||
add \src, \src, #2
|
add \src, \src, #2
|
||||||
ldr s26, [\xmx, #2]
|
ldur s26, [\xmx, #2]
|
||||||
|
|
||||||
.align LOOP_ALIGN
|
.align LOOP_ALIGN
|
||||||
2:
|
2:
|
||||||
|
@ -1321,7 +1319,7 @@ L(\type\()_8tap_h_\isa):
|
||||||
40: // H - 4xN
|
40: // H - 4xN
|
||||||
AARCH64_VALID_JUMP_TARGET
|
AARCH64_VALID_JUMP_TARGET
|
||||||
add \src, \src, #2
|
add \src, \src, #2
|
||||||
ldr s26, [\xmx, #2]
|
ldur s26, [\xmx, #2]
|
||||||
|
|
||||||
.align LOOP_ALIGN
|
.align LOOP_ALIGN
|
||||||
4:
|
4:
|
||||||
|
@ -1370,8 +1368,7 @@ L(\type\()_8tap_h_\isa):
|
||||||
.align JUMP_ALIGN
|
.align JUMP_ALIGN
|
||||||
80: // H - 8xN
|
80: // H - 8xN
|
||||||
AARCH64_VALID_JUMP_TARGET
|
AARCH64_VALID_JUMP_TARGET
|
||||||
ldr q29, L(h_tbl_neon_dotprod) + 16
|
ldp q29, q30, [x13, #16]
|
||||||
ldr q30, L(h_tbl_neon_dotprod) + 32
|
|
||||||
ldr d26, [\xmx]
|
ldr d26, [\xmx]
|
||||||
|
|
||||||
.align LOOP_ALIGN
|
.align LOOP_ALIGN
|
||||||
|
@ -1436,14 +1433,13 @@ L(\type\()_8tap_h_\isa):
|
||||||
.align JUMP_ALIGN
|
.align JUMP_ALIGN
|
||||||
160: // H - 16xN
|
160: // H - 16xN
|
||||||
AARCH64_VALID_JUMP_TARGET
|
AARCH64_VALID_JUMP_TARGET
|
||||||
ldr q29, L(h_tbl_neon_dotprod) + 16
|
ldp q29, q30, [x13, #16]
|
||||||
ldr q30, L(h_tbl_neon_dotprod) + 32
|
|
||||||
ldr d26, [\xmx]
|
ldr d26, [\xmx]
|
||||||
|
|
||||||
.align LOOP_ALIGN
|
.align LOOP_ALIGN
|
||||||
16:
|
16:
|
||||||
ldr q16, [\src]
|
ldr q16, [\src]
|
||||||
ldr q17, [\src, #12] // avoid 2 register TBL for small cores
|
ldur q17, [\src, #12] // avoid 2 register TBL for small cores
|
||||||
add \src, \src, \s_strd
|
add \src, \src, \s_strd
|
||||||
.ifc \type\()_\isa, prep_neon_i8mm
|
.ifc \type\()_\isa, prep_neon_i8mm
|
||||||
movi v6.4s, #0
|
movi v6.4s, #0
|
||||||
|
@ -1501,8 +1497,7 @@ L(\type\()_8tap_h_\isa):
|
||||||
640:
|
640:
|
||||||
1280:
|
1280:
|
||||||
AARCH64_VALID_JUMP_TARGET
|
AARCH64_VALID_JUMP_TARGET
|
||||||
ldr q29, L(h_tbl_neon_dotprod) + 16
|
ldp q29, q30, [x13, #16]
|
||||||
ldr q30, L(h_tbl_neon_dotprod) + 32
|
|
||||||
ldr d26, [\xmx]
|
ldr d26, [\xmx]
|
||||||
.ifc \type, put
|
.ifc \type, put
|
||||||
sub \d_strd, \d_strd, \w, uxtw
|
sub \d_strd, \d_strd, \w, uxtw
|
||||||
|
@ -1513,7 +1508,7 @@ L(\type\()_8tap_h_\isa):
|
||||||
.align LOOP_ALIGN
|
.align LOOP_ALIGN
|
||||||
32:
|
32:
|
||||||
ldr q16, [\src]
|
ldr q16, [\src]
|
||||||
ldr q17, [\src, #12] // avoid 2 register TBL for small cores
|
ldur q17, [\src, #12] // avoid 2 register TBL for small cores
|
||||||
add \src, \src, #16
|
add \src, \src, #16
|
||||||
.ifc \type\()_\isa, prep_neon_i8mm
|
.ifc \type\()_\isa, prep_neon_i8mm
|
||||||
movi v6.4s, #0
|
movi v6.4s, #0
|
||||||
|
|
|
@ -104,6 +104,52 @@ COLD unsigned dav1d_get_cpu_flags_arm(void) {
|
||||||
return flags;
|
return flags;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#elif defined(__OpenBSD__)
|
||||||
|
|
||||||
|
#if ARCH_AARCH64
|
||||||
|
#include <machine/armreg.h>
|
||||||
|
#include <machine/cpu.h>
|
||||||
|
#include <sys/types.h>
|
||||||
|
#include <sys/sysctl.h>
|
||||||
|
|
||||||
|
COLD unsigned dav1d_get_cpu_flags_arm(void) {
|
||||||
|
unsigned flags = DAV1D_ARM_CPU_FLAG_NEON;
|
||||||
|
|
||||||
|
#ifdef CPU_ID_AA64ISAR0
|
||||||
|
int mib[2];
|
||||||
|
uint64_t isar0;
|
||||||
|
uint64_t isar1;
|
||||||
|
size_t len;
|
||||||
|
|
||||||
|
mib[0] = CTL_MACHDEP;
|
||||||
|
mib[1] = CPU_ID_AA64ISAR0;
|
||||||
|
len = sizeof(isar0);
|
||||||
|
if (sysctl(mib, 2, &isar0, &len, NULL, 0) != -1) {
|
||||||
|
if (ID_AA64ISAR0_DP(isar0) >= ID_AA64ISAR0_DP_IMPL)
|
||||||
|
flags |= DAV1D_ARM_CPU_FLAG_DOTPROD;
|
||||||
|
}
|
||||||
|
|
||||||
|
mib[0] = CTL_MACHDEP;
|
||||||
|
mib[1] = CPU_ID_AA64ISAR1;
|
||||||
|
len = sizeof(isar1);
|
||||||
|
if (sysctl(mib, 2, &isar1, &len, NULL, 0) != -1) {
|
||||||
|
#ifdef ID_AA64ISAR1_I8MM_IMPL
|
||||||
|
if (ID_AA64ISAR1_I8MM(isar1) >= ID_AA64ISAR1_I8MM_IMPL)
|
||||||
|
flags |= DAV1D_ARM_CPU_FLAG_I8MM;
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
|
return flags;
|
||||||
|
}
|
||||||
|
#else /* !ARCH_AARCH64 */
|
||||||
|
|
||||||
|
COLD unsigned dav1d_get_cpu_flags_arm(void) {
|
||||||
|
unsigned flags = DAV1D_ARM_CPU_FLAG_NEON;
|
||||||
|
return flags;
|
||||||
|
}
|
||||||
|
#endif /* ARCH_AARCH64 */
|
||||||
|
|
||||||
#elif defined(_WIN32)
|
#elif defined(_WIN32)
|
||||||
#include <windows.h>
|
#include <windows.h>
|
||||||
|
|
||||||
|
|
|
@ -49,7 +49,9 @@ decl_itx_fn(BF(dav1d_inv_txfm_add_dct_dct_64x16, neon));
|
||||||
decl_itx_fn(BF(dav1d_inv_txfm_add_dct_dct_64x32, neon));
|
decl_itx_fn(BF(dav1d_inv_txfm_add_dct_dct_64x32, neon));
|
||||||
decl_itx_fn(BF(dav1d_inv_txfm_add_dct_dct_64x64, neon));
|
decl_itx_fn(BF(dav1d_inv_txfm_add_dct_dct_64x64, neon));
|
||||||
|
|
||||||
static ALWAYS_INLINE void itx_dsp_init_arm(Dav1dInvTxfmDSPContext *const c, int bpc) {
|
static ALWAYS_INLINE void itx_dsp_init_arm(Dav1dInvTxfmDSPContext *const c, int bpc,
|
||||||
|
int *const all_simd)
|
||||||
|
{
|
||||||
const unsigned flags = dav1d_get_cpu_flags();
|
const unsigned flags = dav1d_get_cpu_flags();
|
||||||
|
|
||||||
if (!(flags & DAV1D_ARM_CPU_FLAG_NEON)) return;
|
if (!(flags & DAV1D_ARM_CPU_FLAG_NEON)) return;
|
||||||
|
@ -77,4 +79,5 @@ static ALWAYS_INLINE void itx_dsp_init_arm(Dav1dInvTxfmDSPContext *const c, int
|
||||||
assign_itx1_fn (R, 64, 16, neon);
|
assign_itx1_fn (R, 64, 16, neon);
|
||||||
assign_itx1_fn (R, 64, 32, neon);
|
assign_itx1_fn (R, 64, 32, neon);
|
||||||
assign_itx1_fn ( , 64, 64, neon);
|
assign_itx1_fn ( , 64, 64, neon);
|
||||||
|
*all_simd = 1;
|
||||||
}
|
}
|
||||||
|
|
|
@ -89,8 +89,8 @@ inv_dct4_1d_internal_c(int32_t *const c, const ptrdiff_t stride,
|
||||||
c[3 * stride] = CLIP(t0 - t3);
|
c[3 * stride] = CLIP(t0 - t3);
|
||||||
}
|
}
|
||||||
|
|
||||||
void dav1d_inv_dct4_1d_c(int32_t *const c, const ptrdiff_t stride,
|
static void inv_dct4_1d_c(int32_t *const c, const ptrdiff_t stride,
|
||||||
const int min, const int max)
|
const int min, const int max)
|
||||||
{
|
{
|
||||||
inv_dct4_1d_internal_c(c, stride, min, max, 0);
|
inv_dct4_1d_internal_c(c, stride, min, max, 0);
|
||||||
}
|
}
|
||||||
|
@ -142,8 +142,8 @@ inv_dct8_1d_internal_c(int32_t *const c, const ptrdiff_t stride,
|
||||||
c[7 * stride] = CLIP(t0 - t7);
|
c[7 * stride] = CLIP(t0 - t7);
|
||||||
}
|
}
|
||||||
|
|
||||||
void dav1d_inv_dct8_1d_c(int32_t *const c, const ptrdiff_t stride,
|
static void inv_dct8_1d_c(int32_t *const c, const ptrdiff_t stride,
|
||||||
const int min, const int max)
|
const int min, const int max)
|
||||||
{
|
{
|
||||||
inv_dct8_1d_internal_c(c, stride, min, max, 0);
|
inv_dct8_1d_internal_c(c, stride, min, max, 0);
|
||||||
}
|
}
|
||||||
|
@ -237,8 +237,8 @@ inv_dct16_1d_internal_c(int32_t *const c, const ptrdiff_t stride,
|
||||||
c[15 * stride] = CLIP(t0 - t15a);
|
c[15 * stride] = CLIP(t0 - t15a);
|
||||||
}
|
}
|
||||||
|
|
||||||
void dav1d_inv_dct16_1d_c(int32_t *const c, const ptrdiff_t stride,
|
static void inv_dct16_1d_c(int32_t *const c, const ptrdiff_t stride,
|
||||||
const int min, const int max)
|
const int min, const int max)
|
||||||
{
|
{
|
||||||
inv_dct16_1d_internal_c(c, stride, min, max, 0);
|
inv_dct16_1d_internal_c(c, stride, min, max, 0);
|
||||||
}
|
}
|
||||||
|
@ -427,14 +427,14 @@ inv_dct32_1d_internal_c(int32_t *const c, const ptrdiff_t stride,
|
||||||
c[31 * stride] = CLIP(t0 - t31);
|
c[31 * stride] = CLIP(t0 - t31);
|
||||||
}
|
}
|
||||||
|
|
||||||
void dav1d_inv_dct32_1d_c(int32_t *const c, const ptrdiff_t stride,
|
static void inv_dct32_1d_c(int32_t *const c, const ptrdiff_t stride,
|
||||||
const int min, const int max)
|
const int min, const int max)
|
||||||
{
|
{
|
||||||
inv_dct32_1d_internal_c(c, stride, min, max, 0);
|
inv_dct32_1d_internal_c(c, stride, min, max, 0);
|
||||||
}
|
}
|
||||||
|
|
||||||
void dav1d_inv_dct64_1d_c(int32_t *const c, const ptrdiff_t stride,
|
static void inv_dct64_1d_c(int32_t *const c, const ptrdiff_t stride,
|
||||||
const int min, const int max)
|
const int min, const int max)
|
||||||
{
|
{
|
||||||
assert(stride > 0);
|
assert(stride > 0);
|
||||||
inv_dct32_1d_internal_c(c, stride << 1, min, max, 1);
|
inv_dct32_1d_internal_c(c, stride << 1, min, max, 1);
|
||||||
|
@ -962,13 +962,13 @@ inv_adst16_1d_internal_c(const int32_t *const in, const ptrdiff_t in_s,
|
||||||
}
|
}
|
||||||
|
|
||||||
#define inv_adst_1d(sz) \
|
#define inv_adst_1d(sz) \
|
||||||
void dav1d_inv_adst##sz##_1d_c(int32_t *const c, const ptrdiff_t stride, \
|
static void inv_adst##sz##_1d_c(int32_t *const c, const ptrdiff_t stride, \
|
||||||
const int min, const int max) \
|
const int min, const int max) \
|
||||||
{ \
|
{ \
|
||||||
inv_adst##sz##_1d_internal_c(c, stride, min, max, c, stride); \
|
inv_adst##sz##_1d_internal_c(c, stride, min, max, c, stride); \
|
||||||
} \
|
} \
|
||||||
void dav1d_inv_flipadst##sz##_1d_c(int32_t *const c, const ptrdiff_t stride, \
|
static void inv_flipadst##sz##_1d_c(int32_t *const c, const ptrdiff_t stride, \
|
||||||
const int min, const int max) \
|
const int min, const int max) \
|
||||||
{ \
|
{ \
|
||||||
inv_adst##sz##_1d_internal_c(c, stride, min, max, \
|
inv_adst##sz##_1d_internal_c(c, stride, min, max, \
|
||||||
&c[(sz - 1) * stride], -stride); \
|
&c[(sz - 1) * stride], -stride); \
|
||||||
|
@ -980,8 +980,8 @@ inv_adst_1d(16)
|
||||||
|
|
||||||
#undef inv_adst_1d
|
#undef inv_adst_1d
|
||||||
|
|
||||||
void dav1d_inv_identity4_1d_c(int32_t *const c, const ptrdiff_t stride,
|
static void inv_identity4_1d_c(int32_t *const c, const ptrdiff_t stride,
|
||||||
const int min, const int max)
|
const int min, const int max)
|
||||||
{
|
{
|
||||||
assert(stride > 0);
|
assert(stride > 0);
|
||||||
for (int i = 0; i < 4; i++) {
|
for (int i = 0; i < 4; i++) {
|
||||||
|
@ -990,16 +990,16 @@ void dav1d_inv_identity4_1d_c(int32_t *const c, const ptrdiff_t stride,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
void dav1d_inv_identity8_1d_c(int32_t *const c, const ptrdiff_t stride,
|
static void inv_identity8_1d_c(int32_t *const c, const ptrdiff_t stride,
|
||||||
const int min, const int max)
|
const int min, const int max)
|
||||||
{
|
{
|
||||||
assert(stride > 0);
|
assert(stride > 0);
|
||||||
for (int i = 0; i < 8; i++)
|
for (int i = 0; i < 8; i++)
|
||||||
c[stride * i] *= 2;
|
c[stride * i] *= 2;
|
||||||
}
|
}
|
||||||
|
|
||||||
void dav1d_inv_identity16_1d_c(int32_t *const c, const ptrdiff_t stride,
|
static void inv_identity16_1d_c(int32_t *const c, const ptrdiff_t stride,
|
||||||
const int min, const int max)
|
const int min, const int max)
|
||||||
{
|
{
|
||||||
assert(stride > 0);
|
assert(stride > 0);
|
||||||
for (int i = 0; i < 16; i++) {
|
for (int i = 0; i < 16; i++) {
|
||||||
|
@ -1008,14 +1008,57 @@ void dav1d_inv_identity16_1d_c(int32_t *const c, const ptrdiff_t stride,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
void dav1d_inv_identity32_1d_c(int32_t *const c, const ptrdiff_t stride,
|
static void inv_identity32_1d_c(int32_t *const c, const ptrdiff_t stride,
|
||||||
const int min, const int max)
|
const int min, const int max)
|
||||||
{
|
{
|
||||||
assert(stride > 0);
|
assert(stride > 0);
|
||||||
for (int i = 0; i < 32; i++)
|
for (int i = 0; i < 32; i++)
|
||||||
c[stride * i] *= 4;
|
c[stride * i] *= 4;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
const itx_1d_fn dav1d_tx1d_fns[N_TX_SIZES][N_TX_1D_TYPES] = {
|
||||||
|
[TX_4X4] = {
|
||||||
|
[DCT] = inv_dct4_1d_c,
|
||||||
|
[ADST] = inv_adst4_1d_c,
|
||||||
|
[FLIPADST] = inv_flipadst4_1d_c,
|
||||||
|
[IDENTITY] = inv_identity4_1d_c,
|
||||||
|
}, [TX_8X8] = {
|
||||||
|
[DCT] = inv_dct8_1d_c,
|
||||||
|
[ADST] = inv_adst8_1d_c,
|
||||||
|
[FLIPADST] = inv_flipadst8_1d_c,
|
||||||
|
[IDENTITY] = inv_identity8_1d_c,
|
||||||
|
}, [TX_16X16] = {
|
||||||
|
[DCT] = inv_dct16_1d_c,
|
||||||
|
[ADST] = inv_adst16_1d_c,
|
||||||
|
[FLIPADST] = inv_flipadst16_1d_c,
|
||||||
|
[IDENTITY] = inv_identity16_1d_c,
|
||||||
|
}, [TX_32X32] = {
|
||||||
|
[DCT] = inv_dct32_1d_c,
|
||||||
|
[IDENTITY] = inv_identity32_1d_c,
|
||||||
|
}, [TX_64X64] = {
|
||||||
|
[DCT] = inv_dct64_1d_c,
|
||||||
|
},
|
||||||
|
};
|
||||||
|
|
||||||
|
const uint8_t /* enum Tx1dType */ dav1d_tx1d_types[N_TX_TYPES][2] = {
|
||||||
|
[DCT_DCT] = { DCT, DCT },
|
||||||
|
[ADST_DCT] = { ADST, DCT },
|
||||||
|
[DCT_ADST] = { DCT, ADST },
|
||||||
|
[ADST_ADST] = { ADST, ADST },
|
||||||
|
[FLIPADST_DCT] = { FLIPADST, DCT },
|
||||||
|
[DCT_FLIPADST] = { DCT, FLIPADST },
|
||||||
|
[FLIPADST_FLIPADST] = { FLIPADST, FLIPADST },
|
||||||
|
[ADST_FLIPADST] = { ADST, FLIPADST },
|
||||||
|
[FLIPADST_ADST] = { FLIPADST, ADST },
|
||||||
|
[IDTX] = { IDENTITY, IDENTITY },
|
||||||
|
[V_DCT] = { DCT, IDENTITY },
|
||||||
|
[H_DCT] = { IDENTITY, DCT },
|
||||||
|
[V_ADST] = { ADST, IDENTITY },
|
||||||
|
[H_ADST] = { IDENTITY, ADST },
|
||||||
|
[V_FLIPADST] = { FLIPADST, IDENTITY },
|
||||||
|
[H_FLIPADST] = { IDENTITY, FLIPADST },
|
||||||
|
};
|
||||||
|
|
||||||
#if !(HAVE_ASM && TRIM_DSP_FUNCTIONS && ( \
|
#if !(HAVE_ASM && TRIM_DSP_FUNCTIONS && ( \
|
||||||
ARCH_AARCH64 || \
|
ARCH_AARCH64 || \
|
||||||
(ARCH_ARM && (defined(__ARM_NEON) || defined(__APPLE__) || defined(_WIN32))) \
|
(ARCH_ARM && (defined(__ARM_NEON) || defined(__APPLE__) || defined(_WIN32))) \
|
||||||
|
|
|
@ -28,31 +28,25 @@
|
||||||
#include <stddef.h>
|
#include <stddef.h>
|
||||||
#include <stdint.h>
|
#include <stdint.h>
|
||||||
|
|
||||||
|
#include "src/levels.h"
|
||||||
|
|
||||||
#ifndef DAV1D_SRC_ITX_1D_H
|
#ifndef DAV1D_SRC_ITX_1D_H
|
||||||
#define DAV1D_SRC_ITX_1D_H
|
#define DAV1D_SRC_ITX_1D_H
|
||||||
|
|
||||||
|
enum Tx1dType {
|
||||||
|
DCT,
|
||||||
|
ADST,
|
||||||
|
IDENTITY,
|
||||||
|
FLIPADST,
|
||||||
|
N_TX_1D_TYPES,
|
||||||
|
};
|
||||||
|
|
||||||
#define decl_itx_1d_fn(name) \
|
#define decl_itx_1d_fn(name) \
|
||||||
void (name)(int32_t *c, ptrdiff_t stride, int min, int max)
|
void (name)(int32_t *c, ptrdiff_t stride, int min, int max)
|
||||||
typedef decl_itx_1d_fn(*itx_1d_fn);
|
typedef decl_itx_1d_fn(*itx_1d_fn);
|
||||||
|
|
||||||
decl_itx_1d_fn(dav1d_inv_dct4_1d_c);
|
EXTERN const itx_1d_fn dav1d_tx1d_fns[N_TX_SIZES][N_TX_1D_TYPES];
|
||||||
decl_itx_1d_fn(dav1d_inv_dct8_1d_c);
|
EXTERN const uint8_t /* enum Tx1dType */ dav1d_tx1d_types[N_TX_TYPES][2];
|
||||||
decl_itx_1d_fn(dav1d_inv_dct16_1d_c);
|
|
||||||
decl_itx_1d_fn(dav1d_inv_dct32_1d_c);
|
|
||||||
decl_itx_1d_fn(dav1d_inv_dct64_1d_c);
|
|
||||||
|
|
||||||
decl_itx_1d_fn(dav1d_inv_adst4_1d_c);
|
|
||||||
decl_itx_1d_fn(dav1d_inv_adst8_1d_c);
|
|
||||||
decl_itx_1d_fn(dav1d_inv_adst16_1d_c);
|
|
||||||
|
|
||||||
decl_itx_1d_fn(dav1d_inv_flipadst4_1d_c);
|
|
||||||
decl_itx_1d_fn(dav1d_inv_flipadst8_1d_c);
|
|
||||||
decl_itx_1d_fn(dav1d_inv_flipadst16_1d_c);
|
|
||||||
|
|
||||||
decl_itx_1d_fn(dav1d_inv_identity4_1d_c);
|
|
||||||
decl_itx_1d_fn(dav1d_inv_identity8_1d_c);
|
|
||||||
decl_itx_1d_fn(dav1d_inv_identity16_1d_c);
|
|
||||||
decl_itx_1d_fn(dav1d_inv_identity32_1d_c);
|
|
||||||
|
|
||||||
void dav1d_inv_wht4_1d_c(int32_t *c, ptrdiff_t stride);
|
void dav1d_inv_wht4_1d_c(int32_t *c, ptrdiff_t stride);
|
||||||
|
|
||||||
|
|
|
@ -29,6 +29,7 @@
|
||||||
|
|
||||||
#include <stddef.h>
|
#include <stddef.h>
|
||||||
#include <stdint.h>
|
#include <stdint.h>
|
||||||
|
#include <stdlib.h>
|
||||||
#include <string.h>
|
#include <string.h>
|
||||||
|
|
||||||
#include "common/attributes.h"
|
#include "common/attributes.h"
|
||||||
|
@ -36,13 +37,17 @@
|
||||||
|
|
||||||
#include "src/itx.h"
|
#include "src/itx.h"
|
||||||
#include "src/itx_1d.h"
|
#include "src/itx_1d.h"
|
||||||
|
#include "src/scan.h"
|
||||||
|
#include "src/tables.h"
|
||||||
|
|
||||||
static NOINLINE void
|
static NOINLINE void
|
||||||
inv_txfm_add_c(pixel *dst, const ptrdiff_t stride, coef *const coeff,
|
inv_txfm_add_c(pixel *dst, const ptrdiff_t stride, coef *const coeff,
|
||||||
const int eob, const int w, const int h, const int shift,
|
const int eob, const /*enum RectTxfmSize*/ int tx, const int shift,
|
||||||
const itx_1d_fn first_1d_fn, const itx_1d_fn second_1d_fn,
|
const enum TxfmType txtp HIGHBD_DECL_SUFFIX)
|
||||||
const int has_dconly HIGHBD_DECL_SUFFIX)
|
|
||||||
{
|
{
|
||||||
|
const TxfmInfo *const t_dim = &dav1d_txfm_dimensions[tx];
|
||||||
|
const int w = 4 * t_dim->w, h = 4 * t_dim->h;
|
||||||
|
const int has_dconly = txtp == DCT_DCT;
|
||||||
assert(w >= 4 && w <= 64);
|
assert(w >= 4 && w <= 64);
|
||||||
assert(h >= 4 && h <= 64);
|
assert(h >= 4 && h <= 64);
|
||||||
assert(eob >= 0);
|
assert(eob >= 0);
|
||||||
|
@ -64,6 +69,9 @@ inv_txfm_add_c(pixel *dst, const ptrdiff_t stride, coef *const coeff,
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
const uint8_t *const txtps = dav1d_tx1d_types[txtp];
|
||||||
|
const itx_1d_fn first_1d_fn = dav1d_tx1d_fns[t_dim->lw][txtps[0]];
|
||||||
|
const itx_1d_fn second_1d_fn = dav1d_tx1d_fns[t_dim->lh][txtps[1]];
|
||||||
const int sh = imin(h, 32), sw = imin(w, 32);
|
const int sh = imin(h, 32), sw = imin(w, 32);
|
||||||
#if BITDEPTH == 8
|
#if BITDEPTH == 8
|
||||||
const int row_clip_min = INT16_MIN;
|
const int row_clip_min = INT16_MIN;
|
||||||
|
@ -76,7 +84,16 @@ inv_txfm_add_c(pixel *dst, const ptrdiff_t stride, coef *const coeff,
|
||||||
const int col_clip_max = ~col_clip_min;
|
const int col_clip_max = ~col_clip_min;
|
||||||
|
|
||||||
int32_t tmp[64 * 64], *c = tmp;
|
int32_t tmp[64 * 64], *c = tmp;
|
||||||
for (int y = 0; y < sh; y++, c += w) {
|
int last_nonzero_col; // in first 1d itx
|
||||||
|
if (txtps[1] == IDENTITY && txtps[0] != IDENTITY) {
|
||||||
|
last_nonzero_col = imin(sh - 1, eob);
|
||||||
|
} else if (txtps[0] == IDENTITY && txtps[1] != IDENTITY) {
|
||||||
|
last_nonzero_col = eob >> (t_dim->lw + 2);
|
||||||
|
} else {
|
||||||
|
last_nonzero_col = dav1d_last_nonzero_col_from_eob[tx][eob];
|
||||||
|
}
|
||||||
|
assert(last_nonzero_col < sh);
|
||||||
|
for (int y = 0; y <= last_nonzero_col; y++, c += w) {
|
||||||
if (is_rect2)
|
if (is_rect2)
|
||||||
for (int x = 0; x < sw; x++)
|
for (int x = 0; x < sw; x++)
|
||||||
c[x] = (coeff[y + x * sh] * 181 + 128) >> 8;
|
c[x] = (coeff[y + x * sh] * 181 + 128) >> 8;
|
||||||
|
@ -85,6 +102,8 @@ inv_txfm_add_c(pixel *dst, const ptrdiff_t stride, coef *const coeff,
|
||||||
c[x] = coeff[y + x * sh];
|
c[x] = coeff[y + x * sh];
|
||||||
first_1d_fn(c, 1, row_clip_min, row_clip_max);
|
first_1d_fn(c, 1, row_clip_min, row_clip_max);
|
||||||
}
|
}
|
||||||
|
if (last_nonzero_col + 1 < sh)
|
||||||
|
memset(c, 0, sizeof(*c) * (sh - last_nonzero_col - 1) * w);
|
||||||
|
|
||||||
memset(coeff, 0, sizeof(*coeff) * sw * sh);
|
memset(coeff, 0, sizeof(*coeff) * sw * sh);
|
||||||
for (int i = 0; i < w * sh; i++)
|
for (int i = 0; i < w * sh; i++)
|
||||||
|
@ -99,7 +118,7 @@ inv_txfm_add_c(pixel *dst, const ptrdiff_t stride, coef *const coeff,
|
||||||
dst[x] = iclip_pixel(dst[x] + ((*c++ + 8) >> 4));
|
dst[x] = iclip_pixel(dst[x] + ((*c++ + 8) >> 4));
|
||||||
}
|
}
|
||||||
|
|
||||||
#define inv_txfm_fn(type1, type2, w, h, shift, has_dconly) \
|
#define inv_txfm_fn(type1, type2, type, pfx, w, h, shift) \
|
||||||
static void \
|
static void \
|
||||||
inv_txfm_add_##type1##_##type2##_##w##x##h##_c(pixel *dst, \
|
inv_txfm_add_##type1##_##type2##_##w##x##h##_c(pixel *dst, \
|
||||||
const ptrdiff_t stride, \
|
const ptrdiff_t stride, \
|
||||||
|
@ -107,57 +126,56 @@ inv_txfm_add_##type1##_##type2##_##w##x##h##_c(pixel *dst, \
|
||||||
const int eob \
|
const int eob \
|
||||||
HIGHBD_DECL_SUFFIX) \
|
HIGHBD_DECL_SUFFIX) \
|
||||||
{ \
|
{ \
|
||||||
inv_txfm_add_c(dst, stride, coeff, eob, w, h, shift, \
|
inv_txfm_add_c(dst, stride, coeff, eob, pfx##TX_##w##X##h, shift, type \
|
||||||
dav1d_inv_##type1##w##_1d_c, dav1d_inv_##type2##h##_1d_c, \
|
HIGHBD_TAIL_SUFFIX); \
|
||||||
has_dconly HIGHBD_TAIL_SUFFIX); \
|
|
||||||
}
|
}
|
||||||
|
|
||||||
#define inv_txfm_fn64(w, h, shift) \
|
#define inv_txfm_fn64(pfx, w, h, shift) \
|
||||||
inv_txfm_fn(dct, dct, w, h, shift, 1)
|
inv_txfm_fn(dct, dct, DCT_DCT, pfx, w, h, shift)
|
||||||
|
|
||||||
#define inv_txfm_fn32(w, h, shift) \
|
#define inv_txfm_fn32(pfx, w, h, shift) \
|
||||||
inv_txfm_fn64(w, h, shift) \
|
inv_txfm_fn64(pfx, w, h, shift) \
|
||||||
inv_txfm_fn(identity, identity, w, h, shift, 0)
|
inv_txfm_fn(identity, identity, IDTX, pfx, w, h, shift)
|
||||||
|
|
||||||
#define inv_txfm_fn16(w, h, shift) \
|
#define inv_txfm_fn16(pfx, w, h, shift) \
|
||||||
inv_txfm_fn32(w, h, shift) \
|
inv_txfm_fn32(pfx, w, h, shift) \
|
||||||
inv_txfm_fn(adst, dct, w, h, shift, 0) \
|
inv_txfm_fn(adst, dct, ADST_DCT, pfx, w, h, shift) \
|
||||||
inv_txfm_fn(dct, adst, w, h, shift, 0) \
|
inv_txfm_fn(dct, adst, DCT_ADST, pfx, w, h, shift) \
|
||||||
inv_txfm_fn(adst, adst, w, h, shift, 0) \
|
inv_txfm_fn(adst, adst, ADST_ADST, pfx, w, h, shift) \
|
||||||
inv_txfm_fn(dct, flipadst, w, h, shift, 0) \
|
inv_txfm_fn(dct, flipadst, DCT_FLIPADST, pfx, w, h, shift) \
|
||||||
inv_txfm_fn(flipadst, dct, w, h, shift, 0) \
|
inv_txfm_fn(flipadst, dct, FLIPADST_DCT, pfx, w, h, shift) \
|
||||||
inv_txfm_fn(adst, flipadst, w, h, shift, 0) \
|
inv_txfm_fn(adst, flipadst, ADST_FLIPADST, pfx, w, h, shift) \
|
||||||
inv_txfm_fn(flipadst, adst, w, h, shift, 0) \
|
inv_txfm_fn(flipadst, adst, FLIPADST_ADST, pfx, w, h, shift) \
|
||||||
inv_txfm_fn(flipadst, flipadst, w, h, shift, 0) \
|
inv_txfm_fn(flipadst, flipadst, FLIPADST_FLIPADST, pfx, w, h, shift) \
|
||||||
inv_txfm_fn(identity, dct, w, h, shift, 0) \
|
inv_txfm_fn(identity, dct, H_DCT, pfx, w, h, shift) \
|
||||||
inv_txfm_fn(dct, identity, w, h, shift, 0) \
|
inv_txfm_fn(dct, identity, V_DCT, pfx, w, h, shift) \
|
||||||
|
|
||||||
/*
 * inv_txfm_fn84: everything inv_txfm_fn16 expands to, plus the four
 * identity pairings with adst and flipadst.
 *
 * Removed form first, added form second in each row pair. The added form
 * passes H_FLIPADST, V_FLIPADST, H_ADST and V_ADST (plus `pfx`) where the
 * removed form passed a trailing literal 0.
 */
#define inv_txfm_fn84(w, h, shift) \
|
#define inv_txfm_fn84(pfx, w, h, shift) \
|
||||||
inv_txfm_fn16(w, h, shift) \
|
inv_txfm_fn16(pfx, w, h, shift) \
|
||||||
inv_txfm_fn(identity, flipadst, w, h, shift, 0) \
|
inv_txfm_fn(identity, flipadst, H_FLIPADST, pfx, w, h, shift) \
|
||||||
inv_txfm_fn(flipadst, identity, w, h, shift, 0) \
|
inv_txfm_fn(flipadst, identity, V_FLIPADST, pfx, w, h, shift) \
|
||||||
inv_txfm_fn(identity, adst, w, h, shift, 0) \
|
inv_txfm_fn(identity, adst, H_ADST, pfx, w, h, shift) \
|
||||||
inv_txfm_fn(adst, identity, w, h, shift, 0) \
|
inv_txfm_fn(adst, identity, V_ADST, pfx, w, h, shift) \
|
||||||
|
|
||||||
/*
 * Per-size instantiations, one row pair per block size (removed form first,
 * added form second). Arguments are width, height and shift; the added form
 * has a new first argument that is empty for the square sizes (4x4, 8x8,
 * 16x16, 32x32, 64x64) and `R` for every other size. The inv_txfm_fn macro
 * pastes it as pfx##TX_##w##X##h, so it selects between TX_* and RTX_*
 * names.
 *
 * Macro choice follows the block size: sizes with a 64 dimension use
 * inv_txfm_fn64, sizes with a 32 dimension (and no 64) use inv_txfm_fn32,
 * 16x16 uses inv_txfm_fn16, and all remaining sizes use inv_txfm_fn84.
 */
inv_txfm_fn84( 4, 4, 0)
|
inv_txfm_fn84( , 4, 4, 0)
|
||||||
inv_txfm_fn84( 4, 8, 0)
|
inv_txfm_fn84(R, 4, 8, 0)
|
||||||
inv_txfm_fn84( 4, 16, 1)
|
inv_txfm_fn84(R, 4, 16, 1)
|
||||||
inv_txfm_fn84( 8, 4, 0)
|
inv_txfm_fn84(R, 8, 4, 0)
|
||||||
inv_txfm_fn84( 8, 8, 1)
|
inv_txfm_fn84( , 8, 8, 1)
|
||||||
inv_txfm_fn84( 8, 16, 1)
|
inv_txfm_fn84(R, 8, 16, 1)
|
||||||
inv_txfm_fn32( 8, 32, 2)
|
inv_txfm_fn32(R, 8, 32, 2)
|
||||||
inv_txfm_fn84(16, 4, 1)
|
inv_txfm_fn84(R, 16, 4, 1)
|
||||||
inv_txfm_fn84(16, 8, 1)
|
inv_txfm_fn84(R, 16, 8, 1)
|
||||||
inv_txfm_fn16(16, 16, 2)
|
inv_txfm_fn16( , 16, 16, 2)
|
||||||
inv_txfm_fn32(16, 32, 1)
|
inv_txfm_fn32(R, 16, 32, 1)
|
||||||
inv_txfm_fn64(16, 64, 2)
|
inv_txfm_fn64(R, 16, 64, 2)
|
||||||
inv_txfm_fn32(32, 8, 2)
|
inv_txfm_fn32(R, 32, 8, 2)
|
||||||
inv_txfm_fn32(32, 16, 1)
|
inv_txfm_fn32(R, 32, 16, 1)
|
||||||
inv_txfm_fn32(32, 32, 2)
|
inv_txfm_fn32( , 32, 32, 2)
|
||||||
inv_txfm_fn64(32, 64, 1)
|
inv_txfm_fn64(R, 32, 64, 1)
|
||||||
inv_txfm_fn64(64, 16, 2)
|
inv_txfm_fn64(R, 64, 16, 2)
|
||||||
inv_txfm_fn64(64, 32, 1)
|
inv_txfm_fn64(R, 64, 32, 1)
|
||||||
inv_txfm_fn64(64, 64, 2)
|
inv_txfm_fn64( , 64, 64, 2)
|
||||||
|
|
||||||
#if !(HAVE_ASM && TRIM_DSP_FUNCTIONS && ( \
|
#if !(HAVE_ASM && TRIM_DSP_FUNCTIONS && ( \
|
||||||
ARCH_AARCH64 || \
|
ARCH_AARCH64 || \
|
||||||
|
@ -267,9 +285,10 @@ COLD void bitfn(dav1d_itx_dsp_init)(Dav1dInvTxfmDSPContext *const c, int bpc) {
|
||||||
assign_itx_all_fn64(64, 32, R);
|
assign_itx_all_fn64(64, 32, R);
|
||||||
assign_itx_all_fn64(64, 64, );
|
assign_itx_all_fn64(64, 64, );
|
||||||
|
|
||||||
|
int all_simd = 0;
|
||||||
#if HAVE_ASM
|
#if HAVE_ASM
|
||||||
#if ARCH_AARCH64 || ARCH_ARM
|
#if ARCH_AARCH64 || ARCH_ARM
|
||||||
itx_dsp_init_arm(c, bpc);
|
itx_dsp_init_arm(c, bpc, &all_simd);
|
||||||
#endif
|
#endif
|
||||||
#if ARCH_LOONGARCH64
|
#if ARCH_LOONGARCH64
|
||||||
itx_dsp_init_loongarch(c, bpc);
|
itx_dsp_init_loongarch(c, bpc);
|
||||||
|
@ -278,7 +297,10 @@ COLD void bitfn(dav1d_itx_dsp_init)(Dav1dInvTxfmDSPContext *const c, int bpc) {
|
||||||
itx_dsp_init_riscv(c, bpc);
|
itx_dsp_init_riscv(c, bpc);
|
||||||
#endif
|
#endif
|
||||||
#if ARCH_X86
|
#if ARCH_X86
|
||||||
itx_dsp_init_x86(c, bpc);
|
itx_dsp_init_x86(c, bpc, &all_simd);
|
||||||
#endif
|
#endif
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
if (!all_simd)
|
||||||
|
dav1d_init_last_nonzero_col_from_eob_tables();
|
||||||
}
|
}
|
||||||
|
|
|
@ -28,7 +28,10 @@
|
||||||
#include "config.h"
|
#include "config.h"
|
||||||
|
|
||||||
#include "common/attributes.h"
|
#include "common/attributes.h"
|
||||||
|
#include "common/intops.h"
|
||||||
|
|
||||||
#include "src/scan.h"
|
#include "src/scan.h"
|
||||||
|
#include "src/thread.h"
|
||||||
|
|
||||||
static const uint16_t ALIGN(scan_4x4[], 32) = {
|
static const uint16_t ALIGN(scan_4x4[], 32) = {
|
||||||
0, 4, 1, 2,
|
0, 4, 1, 2,
|
||||||
|
@ -297,3 +300,76 @@ const uint16_t *const dav1d_scans[N_RECT_TX_SIZES] = {
|
||||||
[RTX_16X64] = scan_16x32,
|
[RTX_16X64] = scan_16x32,
|
||||||
[RTX_64X16] = scan_32x16,
|
[RTX_64X16] = scan_32x16,
|
||||||
};
|
};
|
||||||
|
|
||||||
|
/*
 * Backing storage for the lookup tables, one array per WxH size, each
 * holding W * H bytes (4x4 -> 16, ..., 32x32 -> 1024). They have no
 * initializer, so as file-scope statics they start out zeroed; init_tbl()
 * fills them when init_internal() runs.
 */
static uint8_t last_nonzero_col_from_eob_4x4[16];
|
||||||
|
static uint8_t last_nonzero_col_from_eob_8x8[64];
|
||||||
|
static uint8_t last_nonzero_col_from_eob_16x16[256];
|
||||||
|
static uint8_t last_nonzero_col_from_eob_32x32[1024];
|
||||||
|
static uint8_t last_nonzero_col_from_eob_4x8[32];
|
||||||
|
static uint8_t last_nonzero_col_from_eob_8x4[32];
|
||||||
|
static uint8_t last_nonzero_col_from_eob_8x16[128];
|
||||||
|
static uint8_t last_nonzero_col_from_eob_16x8[128];
|
||||||
|
static uint8_t last_nonzero_col_from_eob_16x32[512];
|
||||||
|
static uint8_t last_nonzero_col_from_eob_32x16[512];
|
||||||
|
static uint8_t last_nonzero_col_from_eob_4x16[64];
|
||||||
|
static uint8_t last_nonzero_col_from_eob_16x4[64];
|
||||||
|
static uint8_t last_nonzero_col_from_eob_8x32[256];
|
||||||
|
static uint8_t last_nonzero_col_from_eob_32x8[256];
|
||||||
|
|
||||||
|
/*
 * Fill one lookup table from a scan order.
 *
 * Walks the first w * h entries of `scan` in order and stores at index n the
 * running maximum of (scan[k] & (h - 1)) over all k <= n. The x and y loop
 * counters only count iterations; n is the single index used for both the
 * read and the write.
 *
 *   last_nonzero_col_from_eob  output, written at indices 0 .. w * h - 1
 *   scan                       input, read at indices 0 .. w * h - 1
 *   w, h                       table dimensions; h - 1 is used as a bit
 *                              mask, and every call in init_internal()
 *                              passes a power of two for h
 */
static COLD void init_tbl(uint8_t *const last_nonzero_col_from_eob,
|
||||||
|
const uint16_t *const scan, const int w, const int h)
|
||||||
|
{
|
||||||
|
int max_col = 0;
|
||||||
|
for (int y = 0, n = 0; y < h; y++) {
|
||||||
|
for (int x = 0; x < w; x++, n++) {
|
||||||
|
const int rc = scan[n];
|
||||||
|
// Keep only the low bits of the scan position.
// NOTE(review): the name suggests a column index; which axis these
// bits encode depends on the coefficient layout, which is not
// visible in this listing - confirm before relying on it.
const int rcx = rc & (h - 1);
|
||||||
|
max_col = imax(max_col, rcx);
|
||||||
|
// max_col never decreases, so the table is monotonic in n.
last_nonzero_col_from_eob[n] = max_col;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
 * Fill all fourteen last_nonzero_col_from_eob_* tables, pairing each one
 * with the scan_* array and the (w, h) of the same size.
 *
 * Takes no arguments and returns nothing, which is the shape required of the
 * routine handed to pthread_once() in
 * dav1d_init_last_nonzero_col_from_eob_tables().
 */
static COLD void init_internal(void) {
|
||||||
|
init_tbl(last_nonzero_col_from_eob_4x4, scan_4x4, 4, 4);
|
||||||
|
init_tbl(last_nonzero_col_from_eob_8x8, scan_8x8, 8, 8);
|
||||||
|
init_tbl(last_nonzero_col_from_eob_16x16, scan_16x16, 16, 16);
|
||||||
|
init_tbl(last_nonzero_col_from_eob_32x32, scan_32x32, 32, 32);
|
||||||
|
init_tbl(last_nonzero_col_from_eob_4x8, scan_4x8, 4, 8);
|
||||||
|
init_tbl(last_nonzero_col_from_eob_8x4, scan_8x4, 8, 4);
|
||||||
|
init_tbl(last_nonzero_col_from_eob_8x16, scan_8x16, 8, 16);
|
||||||
|
init_tbl(last_nonzero_col_from_eob_16x8, scan_16x8, 16, 8);
|
||||||
|
init_tbl(last_nonzero_col_from_eob_16x32, scan_16x32, 16, 32);
|
||||||
|
init_tbl(last_nonzero_col_from_eob_32x16, scan_32x16, 32, 16);
|
||||||
|
init_tbl(last_nonzero_col_from_eob_4x16, scan_4x16, 4, 16);
|
||||||
|
init_tbl(last_nonzero_col_from_eob_16x4, scan_16x4, 16, 4);
|
||||||
|
init_tbl(last_nonzero_col_from_eob_8x32, scan_8x32, 8, 32);
|
||||||
|
init_tbl(last_nonzero_col_from_eob_32x8, scan_32x8, 32, 8);
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
 * Public entry point that fills the last_nonzero_col_from_eob tables.
 *
 * Hands init_internal() to pthread_once() with a function-local static
 * control variable, so the tables are filled by the first call and later
 * calls do not fill them again. The return value of pthread_once() is not
 * checked.
 */
COLD void dav1d_init_last_nonzero_col_from_eob_tables(void) {
|
||||||
|
static pthread_once_t initted = PTHREAD_ONCE_INIT;
|
||||||
|
pthread_once(&initted, init_internal);
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
 * Maps each transform size to its last_nonzero_col_from_eob table.
 *
 * There is no table with a 64 dimension; those sizes alias a smaller one:
 * TX_64X64, RTX_32X64 and RTX_64X32 use the 32x32 table, RTX_16X64 uses
 * 16x32 and RTX_64X16 uses 32x16. The pointed-to arrays hold zeros until
 * dav1d_init_last_nonzero_col_from_eob_tables() has run.
 */
const uint8_t *const dav1d_last_nonzero_col_from_eob[N_RECT_TX_SIZES] = {
|
||||||
|
[ TX_4X4 ] = last_nonzero_col_from_eob_4x4,
|
||||||
|
[ TX_8X8 ] = last_nonzero_col_from_eob_8x8,
|
||||||
|
[ TX_16X16] = last_nonzero_col_from_eob_16x16,
|
||||||
|
[ TX_32X32] = last_nonzero_col_from_eob_32x32,
|
||||||
|
[ TX_64X64] = last_nonzero_col_from_eob_32x32,
|
||||||
|
[RTX_4X8 ] = last_nonzero_col_from_eob_4x8,
|
||||||
|
[RTX_8X4 ] = last_nonzero_col_from_eob_8x4,
|
||||||
|
[RTX_8X16 ] = last_nonzero_col_from_eob_8x16,
|
||||||
|
[RTX_16X8 ] = last_nonzero_col_from_eob_16x8,
|
||||||
|
[RTX_16X32] = last_nonzero_col_from_eob_16x32,
|
||||||
|
[RTX_32X16] = last_nonzero_col_from_eob_32x16,
|
||||||
|
[RTX_32X64] = last_nonzero_col_from_eob_32x32,
|
||||||
|
[RTX_64X32] = last_nonzero_col_from_eob_32x32,
|
||||||
|
[RTX_4X16 ] = last_nonzero_col_from_eob_4x16,
|
||||||
|
[RTX_16X4 ] = last_nonzero_col_from_eob_16x4,
|
||||||
|
[RTX_8X32 ] = last_nonzero_col_from_eob_8x32,
|
||||||
|
[RTX_32X8 ] = last_nonzero_col_from_eob_32x8,
|
||||||
|
[RTX_16X64] = last_nonzero_col_from_eob_16x32,
|
||||||
|
[RTX_64X16] = last_nonzero_col_from_eob_32x16,
|
||||||
|
};
|
||||||
|
|
|
@ -33,5 +33,8 @@
|
||||||
#include "src/levels.h"
|
#include "src/levels.h"
|
||||||
|
|
||||||
EXTERN const uint16_t *const dav1d_scans[N_RECT_TX_SIZES];
|
EXTERN const uint16_t *const dav1d_scans[N_RECT_TX_SIZES];
|
||||||
|
/*
 * Declarations added to the header by this diff: a per-transform-size array
 * of byte-table pointers, and the routine that fills those tables. The
 * tables are populated at runtime, so call the init routine before reading
 * through the pointers.
 */
EXTERN const uint8_t *const dav1d_last_nonzero_col_from_eob[N_RECT_TX_SIZES];
|
||||||
|
|
||||||
|
void dav1d_init_last_nonzero_col_from_eob_tables(void);
|
||||||
|
|
||||||
#endif /* DAV1D_SRC_SCAN_H */
|
#endif /* DAV1D_SRC_SCAN_H */
|
||||||
|
|
|
@ -107,7 +107,9 @@ decl_itx_fns(ssse3);
|
||||||
decl_itx_fn(dav1d_inv_txfm_add_wht_wht_4x4_16bpc_avx2);
|
decl_itx_fn(dav1d_inv_txfm_add_wht_wht_4x4_16bpc_avx2);
|
||||||
decl_itx_fn(BF(dav1d_inv_txfm_add_wht_wht_4x4, sse2));
|
decl_itx_fn(BF(dav1d_inv_txfm_add_wht_wht_4x4, sse2));
|
||||||
|
|
||||||
static ALWAYS_INLINE void itx_dsp_init_x86(Dav1dInvTxfmDSPContext *const c, const int bpc) {
|
static ALWAYS_INLINE void itx_dsp_init_x86(Dav1dInvTxfmDSPContext *const c,
|
||||||
|
const int bpc, int *const all_simd)
|
||||||
|
{
|
||||||
#define assign_itx_bpc_fn(pfx, w, h, type, type_enum, bpc, ext) \
|
#define assign_itx_bpc_fn(pfx, w, h, type, type_enum, bpc, ext) \
|
||||||
c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
|
c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
|
||||||
BF_BPC(dav1d_inv_txfm_add_##type##_##w##x##h, bpc, ext)
|
BF_BPC(dav1d_inv_txfm_add_##type##_##w##x##h, bpc, ext)
|
||||||
|
@ -167,6 +169,7 @@ static ALWAYS_INLINE void itx_dsp_init_x86(Dav1dInvTxfmDSPContext *const c, cons
|
||||||
assign_itx1_fn (R, 64, 16, ssse3);
|
assign_itx1_fn (R, 64, 16, ssse3);
|
||||||
assign_itx1_fn (R, 64, 32, ssse3);
|
assign_itx1_fn (R, 64, 32, ssse3);
|
||||||
assign_itx1_fn ( , 64, 64, ssse3);
|
assign_itx1_fn ( , 64, 64, ssse3);
|
||||||
|
*all_simd = 1;
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
if (!(flags & DAV1D_X86_CPU_FLAG_SSE41)) return;
|
if (!(flags & DAV1D_X86_CPU_FLAG_SSE41)) return;
|
||||||
|
@ -192,6 +195,7 @@ static ALWAYS_INLINE void itx_dsp_init_x86(Dav1dInvTxfmDSPContext *const c, cons
|
||||||
assign_itx1_fn (R, 64, 16, sse4);
|
assign_itx1_fn (R, 64, 16, sse4);
|
||||||
assign_itx1_fn (R, 64, 32, sse4);
|
assign_itx1_fn (R, 64, 32, sse4);
|
||||||
assign_itx1_fn (, 64, 64, sse4);
|
assign_itx1_fn (, 64, 64, sse4);
|
||||||
|
*all_simd = 1;
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
|
Разница между файлами не показана из-за своего большого размера
Загрузить разницу
Загрузка…
Ссылка в новой задаче