Bug 1916282 - Update libdav1d to 79db1624878fa0f37841ddc2caf86f06738ae275 r=media-playback-reviewers,padenot

This patch updates the libdav1d source by running
`./mach vendor media/libdav1d/moz.yaml`

Differential Revision: https://phabricator.services.mozilla.com/D221340
Chun-Min Chang 2024-09-09 18:17:56 +00:00
Parent: ac8a2e1e1b
Commit: 1498592440
31 changed files, 3946 additions and 1643 deletions

@@ -20,11 +20,11 @@ origin:
# Human-readable identifier for this version/release
# Generally "version NNN", "tag SSS", "bookmark SSS"
release: 5ef6b241f05a2b9058b58136da4b25842aefba96 (2024-08-04T17:55:20.000-04:00).
release: 79db1624878fa0f37841ddc2caf86f06738ae275 (2024-09-06T09:04:24.000+00:00).
# Revision to pull in
# Must be a long or short commit SHA (long preferred)
revision: 5ef6b241f05a2b9058b58136da4b25842aefba96
revision: 79db1624878fa0f37841ddc2caf86f06738ae275
# The package's license, where possible using the mnemonic from
# https://spdx.org/licenses/

@@ -1,2 +1,2 @@
/* auto-generated, do not edit */
#define DAV1D_VERSION "5ef6b241f05a2b9058b58136da4b25842aefba96"
#define DAV1D_VERSION "79db1624878fa0f37841ddc2caf86f06738ae275"

@@ -189,9 +189,13 @@ static inline int clzll(const unsigned long long mask) {
#ifndef static_assert
#define CHECK_OFFSET(type, field, name) \
struct check_##type##_##field { int x[(name == offsetof(type, field)) ? 1 : -1]; }
#define CHECK_SIZE(type, size) \
struct check_##type##_size { int x[(size == sizeof(type)) ? 1 : -1]; }
#else
#define CHECK_OFFSET(type, field, name) \
static_assert(name == offsetof(type, field), #field)
#define CHECK_SIZE(type, size) \
static_assert(size == sizeof(type), #type)
#endif
#ifdef _MSC_VER
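The CHECK_OFFSET/CHECK_SIZE fallbacks give compile-time layout checks even without C11 static_assert: a false predicate yields a negative array size, which is a hard error. A minimal sketch of how they can be used (the example_t struct and the expected values are hypothetical, assuming a typical ABI with a 4-byte int):

#include <stddef.h>

typedef struct {
    int  a;
    char b;
} example_t;

/* Pre-C11 fallback: the struct only compiles when the predicate holds,
 * because a negative array size is a constraint violation. */
#define CHECK_OFFSET(type, field, name) \
    struct check_##type##_##field { int x[(name == offsetof(type, field)) ? 1 : -1]; }
#define CHECK_SIZE(type, size) \
    struct check_##type##_size { int x[(size == sizeof(type)) ? 1 : -1]; }

CHECK_OFFSET(example_t, b, 4); /* holds when int is 4 bytes */
CHECK_SIZE(example_t, 8);      /* holds when the struct pads to 8 bytes */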

2 third_party/dav1d/include/compat/getopt.h (vendored)

@@ -13,7 +13,9 @@
#define __GETOPT_H__
/* All the headers include this file. */
#ifdef _WIN32
#include <crtdefs.h>
#endif
#ifdef __cplusplus
extern "C" {

8 third_party/dav1d/include/dav1d/dav1d.h (vendored)

@@ -31,10 +31,10 @@
#include <errno.h>
#include <stdarg.h>
#include "common.h"
#include "picture.h"
#include "data.h"
#include "version.h"
#include "dav1d/common.h"
#include "dav1d/picture.h"
#include "dav1d/data.h"
#include "dav1d/version.h"
#ifdef __cplusplus
extern "C" {

16 third_party/dav1d/meson.build (vendored)

@@ -157,6 +157,12 @@ else
if cc.has_function('posix_memalign', prefix : '#include <stdlib.h>', args : test_args)
cdata.set('HAVE_POSIX_MEMALIGN', 1)
endif
if cc.has_function('memalign', prefix : '#include <malloc.h>', args : test_args)
cdata.set('HAVE_MEMALIGN', 1)
endif
if cc.has_function('aligned_alloc', prefix : '#include <stdlib.h>', args : test_args)
cdata.set('HAVE_ALIGNED_ALLOC', 1)
endif
endif
# check for fseeko on android. It is not always available if _FILE_OFFSET_BITS is defined to 64
@@ -209,6 +215,10 @@ if host_machine.cpu_family().startswith('wasm')
stdatomic_dependencies += thread_dependency.partial_dependency(compile_args: true)
endif
if cc.check_header('sys/types.h')
cdata.set('HAVE_SYS_TYPES_H', 1)
endif
if cc.check_header('unistd.h')
cdata.set('HAVE_UNISTD_H', 1)
endif
@@ -259,6 +269,12 @@ endif
if cc.has_function('pthread_setaffinity_np', prefix : pthread_np_prefix, args : test_args, dependencies : thread_dependency)
cdata.set('HAVE_PTHREAD_SETAFFINITY_NP', 1)
endif
if cc.has_function('pthread_setname_np', prefix : pthread_np_prefix, args : test_args, dependencies : thread_dependency)
cdata.set('HAVE_PTHREAD_SETNAME_NP', 1)
endif
if cc.has_function('pthread_set_name_np', prefix : pthread_np_prefix, args : test_args, dependencies : thread_dependency)
cdata.set('HAVE_PTHREAD_SET_NAME_NP', 1)
endif
if cc.compiles('int x = _Generic(0, default: 0);', name: '_Generic', args: test_args)
cdata.set('HAVE_C11_GENERIC', 1)

105 third_party/dav1d/src/arm/64/filmgrain.S (vendored)

@@ -884,12 +884,12 @@ function generate_grain_\type\()_8bpc_neon, export=1
.else
add x4, x1, #FGD_AR_COEFFS_UV
.endif
adr x16, L(gen_grain_\type\()_tbl)
movrel x16, gen_grain_\type\()_tbl
ldr w17, [x1, #FGD_AR_COEFF_LAG]
add w9, w9, #4
ldrh w17, [x16, w17, uxtw #1]
ldrsw x17, [x16, w17, uxtw #2]
dup v31.8h, w9 // 4 + data->grain_scale_shift
sub x16, x16, w17, uxtw
add x16, x16, x17
neg v31.8h, v31.8h
.ifc \type, uv_444
@@ -1075,13 +1075,14 @@ L(generate_grain_\type\()_lag3):
ldp x30, x19, [sp], #96
AARCH64_VALIDATE_LINK_REGISTER
ret
L(gen_grain_\type\()_tbl):
.hword L(gen_grain_\type\()_tbl) - L(generate_grain_\type\()_lag0)
.hword L(gen_grain_\type\()_tbl) - L(generate_grain_\type\()_lag1)
.hword L(gen_grain_\type\()_tbl) - L(generate_grain_\type\()_lag2)
.hword L(gen_grain_\type\()_tbl) - L(generate_grain_\type\()_lag3)
endfunc
jumptable gen_grain_\type\()_tbl
.word L(generate_grain_\type\()_lag0) - gen_grain_\type\()_tbl
.word L(generate_grain_\type\()_lag1) - gen_grain_\type\()_tbl
.word L(generate_grain_\type\()_lag2) - gen_grain_\type\()_tbl
.word L(generate_grain_\type\()_lag3) - gen_grain_\type\()_tbl
endjumptable
.endm
gen_grain_82 y
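This is the pattern repeated throughout these files: adr plus .hword tables (16-bit unsigned offsets, subtracted from the table address) become movrel plus .word tables (32-bit signed offsets, added to it), so the tables can move out of executable memory into a const section via the new jumptable/endjumptable macros in src/arm/asm.S further down. A rough C analogue of the new self-relative dispatch, with hypothetical handler names (the pointer/integer casts are implementation-defined, but mirror what the assembly does):

#include <stdint.h>
#include <stdio.h>

static void lag0(void) { puts("lag0"); }
static void lag1(void) { puts("lag1"); }

/* Entries are "target - table base", like the .word entries above.
 * The assembler computes them at build time; this sketch fills them
 * at startup since C has no link-time label subtraction. 32-bit
 * offsets assume targets within +/-2 GiB of the table. */
static int32_t tbl[2];

static void init_tbl(void) {
    tbl[0] = (int32_t)((intptr_t)lag0 - (intptr_t)tbl);
    tbl[1] = (int32_t)((intptr_t)lag1 - (intptr_t)tbl);
}

/* movrel x16, tbl; ldrsw x17, [x16, i, lsl #2]; add x16, x16, x17; br x16 */
static void dispatch(int i) {
    void (*target)(void) = (void (*)(void))((intptr_t)tbl + tbl[i]);
    target();
}

int main(void) {
    init_tbl();
    dispatch(1); /* prints "lag1" */
    return 0;
}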
@@ -1118,12 +1119,12 @@ function generate_grain_\type\()_8bpc_neon, export=1
ldr w2, [x1, #FGD_SEED]
ldr w9, [x1, #FGD_GRAIN_SCALE_SHIFT]
add x4, x1, #FGD_AR_COEFFS_UV
adr x16, L(gen_grain_\type\()_tbl)
movrel x16, gen_grain_\type\()_tbl
ldr w17, [x1, #FGD_AR_COEFF_LAG]
add w9, w9, #4
ldrh w17, [x16, w17, uxtw #1]
ldrsw x17, [x16, w17, uxtw #2]
dup v31.8h, w9 // 4 + data->grain_scale_shift
sub x16, x16, w17, uxtw
add x16, x16, x17
neg v31.8h, v31.8h
cmp w13, #0
@@ -1272,13 +1273,14 @@ L(generate_grain_\type\()_lag3):
ldp x30, x19, [sp], #96
AARCH64_VALIDATE_LINK_REGISTER
ret
L(gen_grain_\type\()_tbl):
.hword L(gen_grain_\type\()_tbl) - L(generate_grain_\type\()_lag0)
.hword L(gen_grain_\type\()_tbl) - L(generate_grain_\type\()_lag1)
.hword L(gen_grain_\type\()_tbl) - L(generate_grain_\type\()_lag2)
.hword L(gen_grain_\type\()_tbl) - L(generate_grain_\type\()_lag3)
endfunc
jumptable gen_grain_\type\()_tbl
.word L(generate_grain_\type\()_lag0) - gen_grain_\type\()_tbl
.word L(generate_grain_\type\()_lag1) - gen_grain_\type\()_tbl
.word L(generate_grain_\type\()_lag2) - gen_grain_\type\()_tbl
.word L(generate_grain_\type\()_lag3) - gen_grain_\type\()_tbl
endjumptable
.endm
gen_grain_44 uv_420
@@ -1407,18 +1409,18 @@ function fgy_32x32_8bpc_neon, export=1
add_offset x5, w6, x10, x5, x9
ldr w11, [sp, #24] // type
adr x13, L(fgy_loop_tbl)
movrel x13, fgy_loop_tbl
add x4, x12, #32 // grain_lut += FG_BLOCK_SIZE * bx
add x6, x14, x9, lsl #5 // grain_lut += grain_stride * FG_BLOCK_SIZE * by
tst w11, #1
ldrh w11, [x13, w11, uxtw #1]
ldrsw x11, [x13, w11, uxtw #2]
add x8, x16, x9, lsl #5 // grain_lut += grain_stride * FG_BLOCK_SIZE * by
add x8, x8, #32 // grain_lut += FG_BLOCK_SIZE * bx
sub x11, x13, w11, uxtw
add x11, x13, x11
b.eq 1f
// y overlap
@@ -1555,14 +1557,15 @@ L(loop_\ox\oy):
fgy 0, 1
fgy 1, 0
fgy 1, 1
L(fgy_loop_tbl):
.hword L(fgy_loop_tbl) - L(loop_00)
.hword L(fgy_loop_tbl) - L(loop_01)
.hword L(fgy_loop_tbl) - L(loop_10)
.hword L(fgy_loop_tbl) - L(loop_11)
endfunc
jumptable fgy_loop_tbl
.word L(loop_00) - fgy_loop_tbl
.word L(loop_01) - fgy_loop_tbl
.word L(loop_10) - fgy_loop_tbl
.word L(loop_11) - fgy_loop_tbl
endjumptable
// void dav1d_fguv_32x32_420_8bpc_neon(pixel *const dst,
// const pixel *const src,
// const ptrdiff_t stride,
@@ -1646,11 +1649,11 @@ function fguv_32x32_\layout\()_8bpc_neon, export=1
ldr w13, [sp, #64] // type
movrel x16, overlap_coeffs_\sx
adr x14, L(fguv_loop_sx\sx\()_tbl)
movrel x14, fguv_loop_sx\sx\()_tbl
ld1 {v27.8b, v28.8b}, [x16] // overlap_coeffs
tst w13, #1
ldrh w13, [x14, w13, uxtw #1]
ldrsw x13, [x14, w13, uxtw #2]
b.eq 1f
// y overlap
@@ -1658,7 +1661,7 @@ function fguv_32x32_\layout\()_8bpc_neon, export=1
mov w9, #(2 >> \sy)
1:
sub x13, x14, w13, uxtw
add x13, x14, x13
.if \sy
movi v25.16b, #23
@@ -1848,18 +1851,19 @@ L(fguv_loop_sx0_csfl\csfl\()_\ox\oy):
ldr x30, [sp], #32
AARCH64_VALIDATE_LINK_REGISTER
ret
L(fguv_loop_sx0_tbl):
.hword L(fguv_loop_sx0_tbl) - L(fguv_loop_sx0_csfl0_00)
.hword L(fguv_loop_sx0_tbl) - L(fguv_loop_sx0_csfl0_01)
.hword L(fguv_loop_sx0_tbl) - L(fguv_loop_sx0_csfl0_10)
.hword L(fguv_loop_sx0_tbl) - L(fguv_loop_sx0_csfl0_11)
.hword L(fguv_loop_sx0_tbl) - L(fguv_loop_sx0_csfl1_00)
.hword L(fguv_loop_sx0_tbl) - L(fguv_loop_sx0_csfl1_01)
.hword L(fguv_loop_sx0_tbl) - L(fguv_loop_sx0_csfl1_10)
.hword L(fguv_loop_sx0_tbl) - L(fguv_loop_sx0_csfl1_11)
endfunc
jumptable fguv_loop_sx0_tbl
.word L(fguv_loop_sx0_csfl0_00) - fguv_loop_sx0_tbl
.word L(fguv_loop_sx0_csfl0_01) - fguv_loop_sx0_tbl
.word L(fguv_loop_sx0_csfl0_10) - fguv_loop_sx0_tbl
.word L(fguv_loop_sx0_csfl0_11) - fguv_loop_sx0_tbl
.word L(fguv_loop_sx0_csfl1_00) - fguv_loop_sx0_tbl
.word L(fguv_loop_sx0_csfl1_01) - fguv_loop_sx0_tbl
.word L(fguv_loop_sx0_csfl1_10) - fguv_loop_sx0_tbl
.word L(fguv_loop_sx0_csfl1_11) - fguv_loop_sx0_tbl
endjumptable
function fguv_loop_sx1_neon
.macro fguv_loop_sx1 csfl, ox, oy
L(fguv_loop_sx1_csfl\csfl\()_\ox\oy):
@@ -1997,14 +2001,15 @@ L(fguv_loop_sx1_csfl\csfl\()_\ox\oy):
ldr x30, [sp], #32
AARCH64_VALIDATE_LINK_REGISTER
ret
L(fguv_loop_sx1_tbl):
.hword L(fguv_loop_sx1_tbl) - L(fguv_loop_sx1_csfl0_00)
.hword L(fguv_loop_sx1_tbl) - L(fguv_loop_sx1_csfl0_01)
.hword L(fguv_loop_sx1_tbl) - L(fguv_loop_sx1_csfl0_10)
.hword L(fguv_loop_sx1_tbl) - L(fguv_loop_sx1_csfl0_11)
.hword L(fguv_loop_sx1_tbl) - L(fguv_loop_sx1_csfl1_00)
.hword L(fguv_loop_sx1_tbl) - L(fguv_loop_sx1_csfl1_01)
.hword L(fguv_loop_sx1_tbl) - L(fguv_loop_sx1_csfl1_10)
.hword L(fguv_loop_sx1_tbl) - L(fguv_loop_sx1_csfl1_11)
endfunc
jumptable fguv_loop_sx1_tbl
.word L(fguv_loop_sx1_csfl0_00) - fguv_loop_sx1_tbl
.word L(fguv_loop_sx1_csfl0_01) - fguv_loop_sx1_tbl
.word L(fguv_loop_sx1_csfl0_10) - fguv_loop_sx1_tbl
.word L(fguv_loop_sx1_csfl0_11) - fguv_loop_sx1_tbl
.word L(fguv_loop_sx1_csfl1_00) - fguv_loop_sx1_tbl
.word L(fguv_loop_sx1_csfl1_01) - fguv_loop_sx1_tbl
.word L(fguv_loop_sx1_csfl1_10) - fguv_loop_sx1_tbl
.word L(fguv_loop_sx1_csfl1_11) - fguv_loop_sx1_tbl
endjumptable

105 third_party/dav1d/src/arm/64/filmgrain16.S (vendored)

@@ -740,12 +740,12 @@ function generate_grain_\type\()_16bpc_neon, export=1
add x4, x1, #FGD_AR_COEFFS_UV
.endif
add w9, w9, w15 // grain_scale_shift - bitdepth_min_8
adr x16, L(gen_grain_\type\()_tbl)
movrel x16, gen_grain_\type\()_tbl
ldr w17, [x1, #FGD_AR_COEFF_LAG]
add w9, w9, #4
ldrh w17, [x16, w17, uxtw #1]
ldrsw x17, [x16, w17, uxtw #2]
dup v31.8h, w9 // 4 - bitdepth_min_8 + data->grain_scale_shift
sub x16, x16, w17, uxtw
add x16, x16, x17
neg v31.8h, v31.8h
.ifc \type, uv_444
@@ -945,13 +945,14 @@ L(generate_grain_\type\()_lag3):
ldp x30, x19, [sp], #96
AARCH64_VALIDATE_LINK_REGISTER
ret
L(gen_grain_\type\()_tbl):
.hword L(gen_grain_\type\()_tbl) - L(generate_grain_\type\()_lag0)
.hword L(gen_grain_\type\()_tbl) - L(generate_grain_\type\()_lag1)
.hword L(gen_grain_\type\()_tbl) - L(generate_grain_\type\()_lag2)
.hword L(gen_grain_\type\()_tbl) - L(generate_grain_\type\()_lag3)
endfunc
jumptable gen_grain_\type\()_tbl
.word L(generate_grain_\type\()_lag0) - gen_grain_\type\()_tbl
.word L(generate_grain_\type\()_lag1) - gen_grain_\type\()_tbl
.word L(generate_grain_\type\()_lag2) - gen_grain_\type\()_tbl
.word L(generate_grain_\type\()_lag3) - gen_grain_\type\()_tbl
endjumptable
.endm
gen_grain_82 y
@@ -991,12 +992,12 @@ function generate_grain_\type\()_16bpc_neon, export=1
ldr w9, [x1, #FGD_GRAIN_SCALE_SHIFT]
add x4, x1, #FGD_AR_COEFFS_UV
add w9, w9, w15 // grain_scale_shift - bitdepth_min_8
adr x16, L(gen_grain_\type\()_tbl)
movrel x16, gen_grain_\type\()_tbl
ldr w17, [x1, #FGD_AR_COEFF_LAG]
add w9, w9, #4
ldrh w17, [x16, w17, uxtw #1]
ldrsw x17, [x16, w17, uxtw #2]
dup v31.8h, w9 // 4 - bitdepth_min_8 + data->grain_scale_shift
sub x16, x16, w17, uxtw
add x16, x16, x17
neg v31.8h, v31.8h
cmp w13, #0
@@ -1155,13 +1156,14 @@ L(generate_grain_\type\()_lag3):
ldp x30, x19, [sp], #96
AARCH64_VALIDATE_LINK_REGISTER
ret
L(gen_grain_\type\()_tbl):
.hword L(gen_grain_\type\()_tbl) - L(generate_grain_\type\()_lag0)
.hword L(gen_grain_\type\()_tbl) - L(generate_grain_\type\()_lag1)
.hword L(gen_grain_\type\()_tbl) - L(generate_grain_\type\()_lag2)
.hword L(gen_grain_\type\()_tbl) - L(generate_grain_\type\()_lag3)
endfunc
jumptable gen_grain_\type\()_tbl
.word L(generate_grain_\type\()_lag0) - gen_grain_\type\()_tbl
.word L(generate_grain_\type\()_lag1) - gen_grain_\type\()_tbl
.word L(generate_grain_\type\()_lag2) - gen_grain_\type\()_tbl
.word L(generate_grain_\type\()_lag3) - gen_grain_\type\()_tbl
endjumptable
.endm
gen_grain_44 uv_420
@@ -1306,18 +1308,18 @@ function fgy_32x32_16bpc_neon, export=1
add_offset x5, w6, x10, x5, x9
ldr w11, [sp, #88] // type
adr x13, L(fgy_loop_tbl)
movrel x13, fgy_loop_tbl
add x4, x12, #32*2 // grain_lut += FG_BLOCK_SIZE * bx
add x6, x14, x9, lsl #5 // grain_lut += grain_stride * FG_BLOCK_SIZE * by
tst w11, #1
ldrh w11, [x13, w11, uxtw #1]
ldrsw x11, [x13, w11, uxtw #2]
add x8, x16, x9, lsl #5 // grain_lut += grain_stride * FG_BLOCK_SIZE * by
add x8, x8, #32*2 // grain_lut += FG_BLOCK_SIZE * bx
sub x11, x13, w11, uxtw
add x11, x13, x11
b.eq 1f
// y overlap
@@ -1480,14 +1482,15 @@ L(loop_\ox\oy):
fgy 0, 1
fgy 1, 0
fgy 1, 1
L(fgy_loop_tbl):
.hword L(fgy_loop_tbl) - L(loop_00)
.hword L(fgy_loop_tbl) - L(loop_01)
.hword L(fgy_loop_tbl) - L(loop_10)
.hword L(fgy_loop_tbl) - L(loop_11)
endfunc
jumptable fgy_loop_tbl
.word L(loop_00) - fgy_loop_tbl
.word L(loop_01) - fgy_loop_tbl
.word L(loop_10) - fgy_loop_tbl
.word L(loop_11) - fgy_loop_tbl
endjumptable
// void dav1d_fguv_32x32_420_16bpc_neon(pixel *const dst,
// const pixel *const src,
// const ptrdiff_t stride,
@@ -1589,11 +1592,11 @@ function fguv_32x32_\layout\()_16bpc_neon, export=1
ldr w13, [sp, #112] // type
movrel x16, overlap_coeffs_\sx
adr x14, L(fguv_loop_sx\sx\()_tbl)
movrel x14, fguv_loop_sx\sx\()_tbl
ld1 {v27.4h, v28.4h}, [x16] // overlap_coeffs
tst w13, #1
ldrh w13, [x14, w13, uxtw #1]
ldrsw x13, [x14, w13, uxtw #2]
b.eq 1f
// y overlap
@@ -1601,7 +1604,7 @@ function fguv_32x32_\layout\()_16bpc_neon, export=1
mov w9, #(2 >> \sy)
1:
sub x13, x14, w13, uxtw
add x13, x14, x13
.if \sy
movi v25.8h, #23
@@ -1818,18 +1821,19 @@ L(fguv_loop_sx0_csfl\csfl\()_\ox\oy):
ldr x30, [sp], #80
AARCH64_VALIDATE_LINK_REGISTER
ret
L(fguv_loop_sx0_tbl):
.hword L(fguv_loop_sx0_tbl) - L(fguv_loop_sx0_csfl0_00)
.hword L(fguv_loop_sx0_tbl) - L(fguv_loop_sx0_csfl0_01)
.hword L(fguv_loop_sx0_tbl) - L(fguv_loop_sx0_csfl0_10)
.hword L(fguv_loop_sx0_tbl) - L(fguv_loop_sx0_csfl0_11)
.hword L(fguv_loop_sx0_tbl) - L(fguv_loop_sx0_csfl1_00)
.hword L(fguv_loop_sx0_tbl) - L(fguv_loop_sx0_csfl1_01)
.hword L(fguv_loop_sx0_tbl) - L(fguv_loop_sx0_csfl1_10)
.hword L(fguv_loop_sx0_tbl) - L(fguv_loop_sx0_csfl1_11)
endfunc
jumptable fguv_loop_sx0_tbl
.word L(fguv_loop_sx0_csfl0_00) - fguv_loop_sx0_tbl
.word L(fguv_loop_sx0_csfl0_01) - fguv_loop_sx0_tbl
.word L(fguv_loop_sx0_csfl0_10) - fguv_loop_sx0_tbl
.word L(fguv_loop_sx0_csfl0_11) - fguv_loop_sx0_tbl
.word L(fguv_loop_sx0_csfl1_00) - fguv_loop_sx0_tbl
.word L(fguv_loop_sx0_csfl1_01) - fguv_loop_sx0_tbl
.word L(fguv_loop_sx0_csfl1_10) - fguv_loop_sx0_tbl
.word L(fguv_loop_sx0_csfl1_11) - fguv_loop_sx0_tbl
endjumptable
function fguv_loop_sx1_neon
.macro fguv_loop_sx1 csfl, ox, oy
L(fguv_loop_sx1_csfl\csfl\()_\ox\oy):
@@ -1984,14 +1988,15 @@ L(fguv_loop_sx1_csfl\csfl\()_\ox\oy):
ldr x30, [sp], #80
AARCH64_VALIDATE_LINK_REGISTER
ret
L(fguv_loop_sx1_tbl):
.hword L(fguv_loop_sx1_tbl) - L(fguv_loop_sx1_csfl0_00)
.hword L(fguv_loop_sx1_tbl) - L(fguv_loop_sx1_csfl0_01)
.hword L(fguv_loop_sx1_tbl) - L(fguv_loop_sx1_csfl0_10)
.hword L(fguv_loop_sx1_tbl) - L(fguv_loop_sx1_csfl0_11)
.hword L(fguv_loop_sx1_tbl) - L(fguv_loop_sx1_csfl1_00)
.hword L(fguv_loop_sx1_tbl) - L(fguv_loop_sx1_csfl1_01)
.hword L(fguv_loop_sx1_tbl) - L(fguv_loop_sx1_csfl1_10)
.hword L(fguv_loop_sx1_tbl) - L(fguv_loop_sx1_csfl1_11)
endfunc
jumptable fguv_loop_sx1_tbl
.word L(fguv_loop_sx1_csfl0_00) - fguv_loop_sx1_tbl
.word L(fguv_loop_sx1_csfl0_01) - fguv_loop_sx1_tbl
.word L(fguv_loop_sx1_csfl0_10) - fguv_loop_sx1_tbl
.word L(fguv_loop_sx1_csfl0_11) - fguv_loop_sx1_tbl
.word L(fguv_loop_sx1_csfl1_00) - fguv_loop_sx1_tbl
.word L(fguv_loop_sx1_csfl1_01) - fguv_loop_sx1_tbl
.word L(fguv_loop_sx1_csfl1_10) - fguv_loop_sx1_tbl
.word L(fguv_loop_sx1_csfl1_11) - fguv_loop_sx1_tbl
endjumptable

650 third_party/dav1d/src/arm/64/ipred.S (vendored)

Diff not shown because of its large size.

626 third_party/dav1d/src/arm/64/ipred16.S (vendored)

Diff not shown because of its large size.

@@ -28,14 +28,77 @@
#include "src/arm/asm.S"
#include "util.S"
// Series of LUTs for efficiently computing sgr's 1 - x/(x+1) table.
// In the comments, let RefTable denote the original, reference table.
const x_by_x_tables
// RangeMins
//
// Min(RefTable[i*8:i*8+8])
// First two values are zeroed.
//
// Lookup using RangeMins[(x >> 3)]
.byte 0, 0, 11, 8, 6, 5, 5, 4, 4, 3, 3, 3, 2, 2, 2, 2
.byte 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0
// DiffMasks
//
// This contains a bit pattern, indicating at which index positions the value of RefTable changes. For each range
// in the RangeMins table (covering 8 RefTable entries), we have one byte; each bit indicates whether the value of
// RefTable changes at that particular index.
// Using popcount, we can integrate the diff bit field. By shifting away bits in a byte, we can refine the range of
// the integral. Finally, adding the integral to RangeMins[(x>>3)] reconstructs RefTable (for x > 15).
//
// Lookup using DiffMasks[(x >> 3)]
.byte 0x00, 0x00, 0xD4, 0x44
.byte 0x42, 0x04, 0x00, 0x00
.byte 0x00, 0x80, 0x00, 0x00
.byte 0x04, 0x00, 0x00, 0x00
.byte 0x00, 0x00, 0x00, 0x00
.byte 0x00, 0x40, 0x00, 0x00
.byte 0x00, 0x00, 0x00, 0x00
.byte 0x00, 0x00, 0x00, 0x02
// Binary form:
// 0b00000000, 0b00000000, 0b11010100, 0b01000100
// 0b01000010, 0b00000100, 0b00000000, 0b00000000
// 0b00000000, 0b10000000, 0b00000000, 0b00000000
// 0b00000100, 0b00000000, 0b00000000, 0b00000000
// 0b00000000, 0b00000000, 0b00000000, 0b00000000
// 0b00000000, 0b01000000, 0b00000000, 0b00000000
// 0b00000000, 0b00000000, 0b00000000, 0b00000000
// 0b00000000, 0b00000000, 0b00000000, 0b00000010
// RefLo
//
// RefTable[0:16]
// i.e. the first 16 elements of the original table.
// Add to the sum obtained from the rest of the LUT logic to include the first 16 bytes of RefTable.
//
// Lookup using RefLo[x] (tbl will replace x > 15 with 0)
.byte 255, 128, 85, 64, 51, 43, 37, 32, 28, 26, 23, 21, 20, 18, 17, 16
// Pseudo assembly
//
// hi_bits = x >> 3
// tbl ref, {RefLo}, x
// tbl diffs, {DiffMasks[0:16], DiffMasks[16:32]}, hi_bits
// tbl min, {RangeMins[0:16], RangeMins[16:32]}, hi_bits
// lo_bits = x & 0x7
// diffs = diffs << lo_bits
// ref = ref + min
// integral = popcnt(diffs)
// ref = ref + integral
// return ref
endconst
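A scalar model of the lookup described by the comments and pseudo assembly above, with the tables transcribed from the .byte data (__builtin_popcount is the GCC/Clang intrinsic standing in for the cnt instruction):

#include <stdint.h>

static const uint8_t RangeMins[32] = {
    0, 0, 11, 8, 6, 5, 5, 4, 4, 3, 3, 3, 2, 2, 2, 2,
    2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0,
};
static const uint8_t DiffMasks[32] = {
    0x00, 0x00, 0xD4, 0x44, 0x42, 0x04, 0x00, 0x00,
    0x00, 0x80, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00,
    0x00, 0x00, 0x00, 0x00, 0x00, 0x40, 0x00, 0x00,
    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x02,
};
static const uint8_t RefLo[16] = {
    255, 128, 85, 64, 51, 43, 37, 32, 28, 26, 23, 21, 20, 18, 17, 16,
};

/* RefTable[x] for x > 15 is the minimum of its 8-entry range plus the
 * number of value changes after index x, counted by popcount of the
 * left-shifted diff mask; for x < 16 RefLo supplies the value directly
 * (the first two RangeMins/DiffMasks entries are zero). */
static uint8_t x_by_x(uint8_t x) {
    const uint8_t ref  = (x < 16) ? RefLo[x] : 0; /* tbl yields 0 out of range */
    const uint8_t min  = RangeMins[x >> 3];
    const uint8_t diff = (uint8_t)(DiffMasks[x >> 3] << (x & 7));
    return ref + min + (uint8_t)__builtin_popcount(diff);
}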
// void dav1d_sgr_box3_vert_neon(int32_t **sumsq, int16_t **sum,
// int32_t *AA, int16_t *BB,
// const int w, const int s,
// const int bitdepth_max);
function sgr_box3_vert_neon, export=1
stp d8, d9, [sp, #-0x30]!
stp d8, d9, [sp, #-0x40]!
stp d10, d11, [sp, #0x10]
stp d12, d13, [sp, #0x20]
stp d14, d15, [sp, #0x30]
add w4, w4, #2
clz w9, w6 // bitdepth_max
@@ -49,41 +112,176 @@ function sgr_box3_vert_neon, export=1
movi v31.4s, #9 // n
sub w9, w9, #24 // -bitdepth_min_8
movrel x12, X(sgr_x_by_x)
movrel x12, x_by_x_tables
mov w13, #455 // one_by_x
ld1 {v16.16b, v17.16b, v18.16b}, [x12]
ld1 {v24.16b, v25.16b, v26.16b, v27.16b}, [x12] // RangeMins, DiffMasks
movi v22.16b, #0x7
ldr q23, [x12, #64] //RefLo
dup v6.8h, w9 // -bitdepth_min_8
movi v19.16b, #5
movi v20.8b, #55 // idx of last 5
movi v21.8b, #72 // idx of last 4
movi v22.8b, #101 // idx of last 3
movi v23.8b, #169 // idx of last 2
movi v24.8b, #254 // idx of last 1
saddl v7.4s, v6.4h, v6.4h // -2*bitdepth_min_8
movi v29.8h, #1, lsl #8
dup v30.4s, w13 // one_by_x
sub v16.16b, v16.16b, v19.16b
sub v17.16b, v17.16b, v19.16b
sub v18.16b, v18.16b, v19.16b
ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [x5], #64
ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [x6], #64
ld1 {v16.4s, v17.4s, v18.4s, v19.4s}, [x0], #64
ld1 {v20.8h, v21.8h}, [x8], #32
ld1 {v0.8h, v1.8h}, [x7], #32
1:
ld1 {v2.8h, v3.8h}, [x1], #32
add v8.4s, v8.4s, v12.4s
add v9.4s, v9.4s, v13.4s
add v10.4s, v10.4s, v14.4s
add v11.4s, v11.4s, v15.4s
add v0.8h, v0.8h, v20.8h
add v1.8h, v1.8h, v21.8h
add v16.4s, v16.4s, v8.4s
add v17.4s, v17.4s, v9.4s
add v18.4s, v18.4s, v10.4s
add v19.4s, v19.4s, v11.4s
add v4.8h, v2.8h, v0.8h
add v5.8h, v3.8h, v1.8h
srshl v16.4s, v16.4s, v7.4s
srshl v17.4s, v17.4s, v7.4s
srshl v18.4s, v18.4s, v7.4s
srshl v19.4s, v19.4s, v7.4s
srshl v9.8h, v4.8h, v6.8h
srshl v13.8h, v5.8h, v6.8h
mul v16.4s, v16.4s, v31.4s // a * n
mul v17.4s, v17.4s, v31.4s // a * n
mul v18.4s, v18.4s, v31.4s // a * n
mul v19.4s, v19.4s, v31.4s // a * n
umull v8.4s, v9.4h, v9.4h // b * b
umull2 v9.4s, v9.8h, v9.8h // b * b
umull v12.4s, v13.4h, v13.4h // b * b
umull2 v13.4s, v13.8h, v13.8h // b * b
uqsub v16.4s, v16.4s, v8.4s // imax(a * n - b * b, 0)
uqsub v17.4s, v17.4s, v9.4s // imax(a * n - b * b, 0)
uqsub v18.4s, v18.4s, v12.4s // imax(a * n - b * b, 0)
uqsub v19.4s, v19.4s, v13.4s // imax(a * n - b * b, 0)
mul v16.4s, v16.4s, v28.4s // p * s
mul v17.4s, v17.4s, v28.4s // p * s
mul v18.4s, v18.4s, v28.4s // p * s
mul v19.4s, v19.4s, v28.4s // p * s
uqshrn v16.4h, v16.4s, #16
uqshrn2 v16.8h, v17.4s, #16
uqshrn v18.4h, v18.4s, #16
uqshrn2 v18.8h, v19.4s, #16
uqrshrn v1.8b, v16.8h, #4 // imin(z, 255)
uqrshrn2 v1.16b, v18.8h, #4 // imin(z, 255)
ld1 {v16.4s, v17.4s}, [x0], #32
subs w4, w4, #16
ushr v0.16b, v1.16b, #3
ld1 {v8.4s, v9.4s}, [x5], #32
tbl v2.16b, {v26.16b, v27.16b}, v0.16b // RangeMins
tbl v0.16b, {v24.16b, v25.16b}, v0.16b // DiffMasks
tbl v3.16b, {v23.16b}, v1.16b // RefLo
and v1.16b, v1.16b, v22.16b
ld1 {v12.4s, v13.4s}, [x6], #32
ushl v1.16b, v2.16b, v1.16b
ld1 {v20.8h, v21.8h}, [x8], #32
add v3.16b, v3.16b, v0.16b
cnt v1.16b, v1.16b
ld1 {v18.4s, v19.4s}, [x0], #32
add v3.16b, v3.16b, v1.16b
ld1 {v10.4s, v11.4s}, [x5], #32
uxtl v0.8h, v3.8b // x
uxtl2 v1.8h, v3.16b // x
ld1 {v14.4s, v15.4s}, [x6], #32
umull v2.4s, v0.4h, v4.4h // x * BB[i]
umull2 v3.4s, v0.8h, v4.8h // x * BB[i]
umull v4.4s, v1.4h, v5.4h // x * BB[i]
umull2 v5.4s, v1.8h, v5.8h // x * BB[i]
sub v0.8h, v29.8h, v0.8h // 256 - x
sub v1.8h, v29.8h, v1.8h // 256 - x
mul v2.4s, v2.4s, v30.4s // x * BB[i] * sgr_one_by_x
mul v3.4s, v3.4s, v30.4s // x * BB[i] * sgr_one_by_x
mul v4.4s, v4.4s, v30.4s // x * BB[i] * sgr_one_by_x
mul v5.4s, v5.4s, v30.4s // x * BB[i] * sgr_one_by_x
st1 {v0.8h, v1.8h}, [x3], #32
ld1 {v0.8h, v1.8h}, [x7], #32
srshr v2.4s, v2.4s, #12 // AA[i]
srshr v3.4s, v3.4s, #12 // AA[i]
srshr v4.4s, v4.4s, #12 // AA[i]
srshr v5.4s, v5.4s, #12 // AA[i]
st1 {v2.4s, v3.4s, v4.4s, v5.4s}, [x2], #64
b.gt 1b
ldp d14, d15, [sp, #0x30]
ldp d12, d13, [sp, #0x20]
ldp d10, d11, [sp, #0x10]
ldp d8, d9, [sp], 0x40
ret
endfunc
// void dav1d_sgr_box5_vert_neon(int32_t **sumsq, int16_t **sum,
// int32_t *AA, int16_t *BB,
// const int w, const int s,
// const int bitdepth_max);
function sgr_box5_vert_neon, export=1
stp d8, d9, [sp, #-0x30]!
stp d10, d11, [sp, #0x10]
stp d12, d13, [sp, #0x20]
add w4, w4, #2
clz w15, w6 // bitdepth_max
dup v28.4s, w5 // strength
ldp x5, x6, [x0]
ldp x7, x8, [x0, #16]
ldr x0, [x0, #32]
ldp x9, x10, [x1]
ldp x11, x12, [x1, #16]
ldr x1, [x1, #32]
movi v31.4s, #25 // n
sub w15, w15, #24 // -bitdepth_min_8
movrel x13, x_by_x_tables
movi v30.4s, #164
ld1 {v24.16b, v25.16b, v26.16b, v27.16b}, [x13] // RangeMins, DiffMasks
dup v6.8h, w15 // -bitdepth_min_8
movi v19.8b, #0x7
ldr q18, [x13, #64] // RefLo
saddl v7.4s, v6.4h, v6.4h // -2*bitdepth_min_8
movi v29.8h, #1, lsl #8
ld1 {v8.4s, v9.4s}, [x5], #32
ld1 {v10.4s, v11.4s}, [x6], #32
ld1 {v12.8h}, [x7], #16
ld1 {v13.8h}, [x8], #16
ld1 {v0.4s, v1.4s}, [x0], #32
ld1 {v12.4s, v13.4s}, [x7], #32
ld1 {v16.4s, v17.4s}, [x8], #32
ld1 {v20.8h}, [x9], #16
ld1 {v21.8h}, [x10], #16
ld1 {v22.8h}, [x11], #16
ld1 {v23.8h}, [x12], #16
ld1 {v0.4s, v1.4s}, [x0], #32
ld1 {v2.8h}, [x1], #16
1:
1:
add v8.4s, v8.4s, v10.4s
add v9.4s, v9.4s, v11.4s
add v12.4s, v12.4s, v16.4s
add v13.4s, v13.4s, v17.4s
add v12.8h, v12.8h, v13.8h
add v20.8h, v20.8h, v21.8h
add v22.8h, v22.8h, v23.8h
subs w4, w4, #8
add v0.4s, v0.4s, v8.4s
add v1.4s, v1.4s, v9.4s
add v2.8h, v2.8h, v12.8h
add v2.8h, v2.8h, v20.8h
add v0.4s, v0.4s, v12.4s
add v1.4s, v1.4s, v13.4s
add v2.8h, v2.8h, v22.8h
subs w4, w4, #8
srshl v0.4s, v0.4s, v7.4s
srshl v1.4s, v1.4s, v7.4s
@@ -102,24 +300,25 @@ function sgr_box3_vert_neon, export=1
ld1 {v10.4s, v11.4s}, [x6], #32
uqrshrn v0.8b, v0.8h, #4 // imin(z, 255)
ld1 {v12.8h}, [x7], #16
ld1 {v12.4s, v13.4s}, [x7], #32
cmhi v25.8b, v0.8b, v20.8b // = -1 if sgr_x_by_x[v0] < 5
cmhi v26.8b, v0.8b, v21.8b // = -1 if sgr_x_by_x[v0] < 4
tbl v1.8b, {v16.16b, v17.16b, v18.16b}, v0.8b
cmhi v27.8b, v0.8b, v22.8b // = -1 if sgr_x_by_x[v0] < 3
cmhi v4.8b, v0.8b, v23.8b // = -1 if sgr_x_by_x[v0] < 2
add v25.8b, v25.8b, v26.8b
cmhi v5.8b, v0.8b, v24.8b // = -1 if sgr_x_by_x[v0] < 1
add v27.8b, v27.8b, v4.8b
add v5.8b, v5.8b, v19.8b
add v25.8b, v25.8b, v27.8b
add v5.8b, v1.8b, v5.8b
ld1 {v13.8h}, [x8], #16
add v5.8b, v5.8b, v25.8b
ld1 {v0.4s, v1.4s}, [x0], #32
ushr v1.8b, v0.8b, #3
ld1 {v16.4s, v17.4s}, [x8], #32
tbl v5.8b, {v26.16b, v27.16b}, v1.8b // RangeMins
tbl v1.8b, {v24.16b, v25.16b}, v1.8b // DiffMasks
tbl v4.8b, {v18.16b}, v0.8b // RefLo
and v0.8b, v0.8b, v19.8b
ld1 {v20.8h}, [x9], #16
ushl v5.8b, v5.8b, v0.8b
add v4.8b, v4.8b, v1.8b
ld1 {v21.8h}, [x10], #16
cnt v5.8b, v5.8b
ld1 {v22.8h}, [x11], #16
add v5.8b, v4.8b, v5.8b
ld1 {v23.8h}, [x12], #16
uxtl v5.8h, v5.8b // x
ld1 {v0.4s, v1.4s}, [x0], #32
umull v3.4s, v5.4h, v2.4h // x * BB[i]
umull2 v4.4s, v5.8h, v2.8h // x * BB[i]
mul v3.4s, v3.4s, v30.4s // x * BB[i] * sgr_one_by_x
@@ -138,135 +337,3 @@ function sgr_box3_vert_neon, export=1
ldp d8, d9, [sp], 0x30
ret
endfunc
// void dav1d_sgr_box5_vert_neon(int32_t **sumsq, int16_t **sum,
// int32_t *AA, int16_t *BB,
// const int w, const int s,
// const int bitdepth_max);
function sgr_box5_vert_neon, export=1
stp d8, d9, [sp, #-0x40]!
stp d10, d11, [sp, #0x10]
stp d12, d13, [sp, #0x20]
stp d14, d15, [sp, #0x30]
add w4, w4, #2
clz w15, w6 // bitdepth_max
dup v28.4s, w5 // strength
ldp x5, x6, [x0]
ldp x7, x8, [x0, #16]
ldr x0, [x0, #32]
ldp x9, x10, [x1]
ldp x11, x12, [x1, #16]
ldr x1, [x1, #32]
movi v31.4s, #25 // n
sub w15, w15, #24 // -bitdepth_min_8
movrel x13, X(sgr_x_by_x)
mov w14, #164 // one_by_x
ld1 {v16.16b, v17.16b, v18.16b}, [x13]
dup v6.8h, w15 // -bitdepth_min_8
movi v19.16b, #5
movi v24.8b, #254 // idx of last 1
saddl v7.4s, v6.4h, v6.4h // -2*bitdepth_min_8
movi v29.8h, #1, lsl #8
dup v30.4s, w14 // one_by_x
sub v16.16b, v16.16b, v19.16b
sub v17.16b, v17.16b, v19.16b
sub v18.16b, v18.16b, v19.16b
ld1 {v8.4s, v9.4s}, [x5], #32
ld1 {v10.4s, v11.4s}, [x6], #32
ld1 {v12.4s, v13.4s}, [x7], #32
ld1 {v14.4s, v15.4s}, [x8], #32
ld1 {v20.8h}, [x9], #16
ld1 {v21.8h}, [x10], #16
ld1 {v22.8h}, [x11], #16
ld1 {v23.8h}, [x12], #16
ld1 {v0.4s, v1.4s}, [x0], #32
ld1 {v2.8h}, [x1], #16
1:
add v8.4s, v8.4s, v10.4s
add v9.4s, v9.4s, v11.4s
add v12.4s, v12.4s, v14.4s
add v13.4s, v13.4s, v15.4s
add v20.8h, v20.8h, v21.8h
add v22.8h, v22.8h, v23.8h
add v0.4s, v0.4s, v8.4s
add v1.4s, v1.4s, v9.4s
add v2.8h, v2.8h, v20.8h
add v0.4s, v0.4s, v12.4s
add v1.4s, v1.4s, v13.4s
add v2.8h, v2.8h, v22.8h
subs w4, w4, #8
movi v20.8b, #55 // idx of last 5
movi v21.8b, #72 // idx of last 4
movi v22.8b, #101 // idx of last 3
movi v23.8b, #169 // idx of last 2
srshl v0.4s, v0.4s, v7.4s
srshl v1.4s, v1.4s, v7.4s
srshl v4.8h, v2.8h, v6.8h
mul v0.4s, v0.4s, v31.4s // a * n
mul v1.4s, v1.4s, v31.4s // a * n
umull v3.4s, v4.4h, v4.4h // b * b
umull2 v4.4s, v4.8h, v4.8h // b * b
uqsub v0.4s, v0.4s, v3.4s // imax(a * n - b * b, 0)
uqsub v1.4s, v1.4s, v4.4s // imax(a * n - b * b, 0)
mul v0.4s, v0.4s, v28.4s // p * s
mul v1.4s, v1.4s, v28.4s // p * s
ld1 {v8.4s, v9.4s}, [x5], #32
uqshrn v0.4h, v0.4s, #16
uqshrn2 v0.8h, v1.4s, #16
ld1 {v10.4s, v11.4s}, [x6], #32
uqrshrn v0.8b, v0.8h, #4 // imin(z, 255)
ld1 {v12.4s, v13.4s}, [x7], #32
cmhi v25.8b, v0.8b, v20.8b // = -1 if sgr_x_by_x[v0] < 5
cmhi v26.8b, v0.8b, v21.8b // = -1 if sgr_x_by_x[v0] < 4
tbl v1.8b, {v16.16b, v17.16b, v18.16b}, v0.8b
cmhi v27.8b, v0.8b, v22.8b // = -1 if sgr_x_by_x[v0] < 3
cmhi v4.8b, v0.8b, v23.8b // = -1 if sgr_x_by_x[v0] < 2
ld1 {v14.4s, v15.4s}, [x8], #32
add v25.8b, v25.8b, v26.8b
cmhi v5.8b, v0.8b, v24.8b // = -1 if sgr_x_by_x[v0] < 1
add v27.8b, v27.8b, v4.8b
ld1 {v20.8h}, [x9], #16
add v5.8b, v5.8b, v19.8b
add v25.8b, v25.8b, v27.8b
ld1 {v21.8h}, [x10], #16
add v5.8b, v1.8b, v5.8b
ld1 {v22.8h}, [x11], #16
add v5.8b, v5.8b, v25.8b
ld1 {v23.8h}, [x12], #16
uxtl v5.8h, v5.8b // x
ld1 {v0.4s, v1.4s}, [x0], #32
umull v3.4s, v5.4h, v2.4h // x * BB[i]
umull2 v4.4s, v5.8h, v2.8h // x * BB[i]
mul v3.4s, v3.4s, v30.4s // x * BB[i] * sgr_one_by_x
mul v4.4s, v4.4s, v30.4s // x * BB[i] * sgr_one_by_x
srshr v3.4s, v3.4s, #12 // AA[i]
srshr v4.4s, v4.4s, #12 // AA[i]
sub v5.8h, v29.8h, v5.8h // 256 - x
ld1 {v2.8h}, [x1], #16
st1 {v3.4s, v4.4s}, [x2], #32
st1 {v5.8h}, [x3], #16
b.gt 1b
ldp d14, d15, [sp, #0x30]
ldp d12, d13, [sp, #0x20]
ldp d10, d11, [sp, #0x10]
ldp d8, d9, [sp], 0x40
ret
endfunc
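Taken together, the inline comments in both functions describe the following per-element computation; this scalar sketch follows those comments (a * n, imax(a * n - b * b, 0), p * s, x * BB[i] * sgr_one_by_x, 256 - x), with n/one_by_x being 9/455 for box3 and 25/164 for box5. The rounding term approximates the uqshrn/uqrshrn/srshr sequence and the bitdepth normalization is omitted, so treat it as a model, not a bit-exact reference:

#include <stdint.h>

uint8_t x_by_x(uint8_t x); /* the LUT model sketched above */

static void sgr_ab_element(int64_t a /* summed sumsq */,
                           int64_t b /* summed sum */,
                           int n, int s, int one_by_x,
                           int32_t *AA, int16_t *BB) {
    int64_t p = a * n - b * b;             /* a * n, b * b */
    if (p < 0) p = 0;                      /* imax(a * n - b * b, 0) */
    int64_t z = (p * s + (1 << 19)) >> 20; /* p * s, narrowed with rounding */
    if (z > 255) z = 255;                  /* imin(z, 255) */
    const int x = x_by_x((uint8_t)z);
    *AA = (int32_t)(((int64_t)x * b * one_by_x + (1 << 11)) >> 12);
    *BB = (int16_t)(256 - x);
}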

601 third_party/dav1d/src/arm/64/mc.S (vendored)

Diff not shown because of its large size.

658 third_party/dav1d/src/arm/64/mc16.S (vendored)

Diff not shown because of its large size.

1649 third_party/dav1d/src/arm/64/mc16_sve.S (vendored, new file)

Diff not shown because of its large size.

331 third_party/dav1d/src/arm/64/mc_dotprod.S (vendored)

@@ -54,8 +54,14 @@ const h_tbl_neon_dotprod, align=4
.byte 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10
.byte 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14
// Shuffle indices to permute horizontal samples in preparation for
// input to USMMLA instructions.
#define OFFSET_USMMLA 48
.byte 0, 1, 2, 3, 4, 5, 6, 7, 2, 3, 4, 5, 6, 7, 8, 9
.byte 4, 5, 6, 7, 8, 9, 10, 11, 6, 7, 8, 9, 10, 11, 12, 13
// Lookup table used to help conversion of shifted 32-bit values to 8-bit.
#define OFFSET_CVT_32_8 48
#define OFFSET_CVT_32_8 80
.byte 1, 2, 5, 6, 9, 10, 13, 14, 17, 18, 21, 22, 25, 26, 29, 30
endconst
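The new shuffle pair at OFFSET_USMMLA feeds the usmmla instruction (FEAT_I8MM), which treats each 128-bit operand as a 2x8 byte matrix and accumulates their product into a 2x2 int32 matrix. In the 6-tap paths below, the filter is duplicated into both rows of one operand with a one-byte stagger (the ext/ins pair), so a single instruction produces dot products for adjacent output pixels. A scalar model of what one usmmla computes:

#include <stdint.h>

/* acc (2x2, int32) += U (2x8, unsigned bytes: shuffled source pixels)
 * times the transpose of S (2x8, signed bytes: two staggered filter
 * copies), per the Armv8.6 USMMLA definition. */
static void usmmla_model(int32_t acc[2][2],
                         const uint8_t u[2][8], const int8_t s[2][8]) {
    for (int r = 0; r < 2; r++)
        for (int c = 0; c < 2; c++)
            for (int k = 0; k < 8; k++)
                acc[r][c] += (int32_t)u[r][k] * (int32_t)s[c][k];
}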
@@ -114,10 +120,10 @@ L(\type\()_8tap_v_\isa):
sub \src, \src, \s_strd
.ifc \isa, neon_dotprod
.ifc \type, prep
mov w8, 0x2002 // FILTER_WEIGHT * 128 + rounding
mov w8, #0x2002 // FILTER_WEIGHT * 128 + rounding
dup v4.4s, w8
.else
movi v4.4s, #32, lsl 8 // FILTER_WEIGHT * 128, bias for SDOT
movi v4.4s, #32, lsl #8 // FILTER_WEIGHT * 128, bias for SDOT
.endif
.endif
ubfx w11, \my, #7, #7
@@ -677,18 +683,18 @@ L(\type\()_8tap_h_hv_\isa):
madd \mx, \mx, w11, w9
madd w14, \my, w11, w10 // for HV
.ifc \isa, neon_dotprod
mov w13, 0x2002 // FILTER_WEIGHT * 128 + rounding
mov w13, #0x2002 // FILTER_WEIGHT * 128 + rounding
dup v27.4s, w13 // put H overrides this
.endif
movrel x13, h_tbl_neon_dotprod
sub \src, \src, #3 // src - 3
ldr q28, [x13]
ubfx w9, \mx, #7, #7
ldr q28, [x13] // for 4-tap & 8-tap H filters
ubfx w15, \mx, #7, #7
and \mx, \mx, #0x7F
ubfx w11, w14, #7, #7 // for HV
and w14, w14, #0x7F // for HV
cmp \w, #4
csel \mx, \mx, w9, le
csel \mx, \mx, w15, le
add \xmx, x12, \xmx, lsl #3 // subpel H filter address
.ifc \isa, neon_dotprod
movi v24.16b, #128
@@ -706,7 +712,7 @@ L(\type\()_8tap_h_hv_\isa):
ldr q25, [x13, #(OFFSET_CVT_32_8)] // LUT to help conversion
.endif // of 32b values to 8b
sxtl v7.8h, v7.8b
cmp w10, SHARP1
cmp w10, #SHARP1
b.ne L(\type\()_6tap_hv_\isa) // vertical != SHARP1
// HV 8-tap cases
@@ -1005,11 +1011,92 @@ L(\type\()_6tap_hv_\isa):
// .align JUMP_ALIGN // fallthrough
80: // HV6 - 8xN+
ldp q29, q30, [x13, #16]
ldr d26, [\xmx]
.ifc \type, prep
add \wd_strd, \w, \w
.endif
.ifc \isa, neon_i8mm
cmp w9, #SHARP1
b.eq 88f // horizontal == SHARP1
ldp q29, q30, [x13, #(OFFSET_USMMLA)]
ext v0.8b, v26.8b, v26.8b, #7
ins v26.d[1], v0.d[0]
.align LOOP_ALIGN
81:
mov \lsrc, \src
mov \ldst, \dst
mov w8, \h
bl L(\type\()_hv_filter6_neon_i8mm)
srshr v16.8h, v22.8h, #2
bl L(\type\()_hv_filter6_neon_i8mm)
srshr v17.8h, v22.8h, #2
bl L(\type\()_hv_filter6_neon_i8mm)
srshr v18.8h, v22.8h, #2
bl L(\type\()_hv_filter6_neon_i8mm)
srshr v19.8h, v22.8h, #2
bl L(\type\()_hv_filter6_neon_i8mm)
srshr v20.8h, v22.8h, #2
.align LOOP_ALIGN
8:
ld1 {v23.16b}, [\lsrc], \s_strd
smull v0.4s, v16.4h, v7.h[1]
smull2 v1.4s, v16.8h, v7.h[1]
mov v16.16b, v17.16b
movi v5.4s, #0
movi v6.4s, #0
tbl v2.16b, {v23.16b}, v29.16b
tbl v3.16b, {v23.16b}, v30.16b
smlal v0.4s, v17.4h, v7.h[2]
smlal2 v1.4s, v17.8h, v7.h[2]
mov v17.16b, v18.16b
usmmla v5.4s, v2.16b, v26.16b
usmmla v6.4s, v3.16b, v26.16b
smlal v0.4s, v18.4h, v7.h[3]
smlal2 v1.4s, v18.8h, v7.h[3]
mov v18.16b, v19.16b
subs w8, w8, #1
smlal v0.4s, v19.4h, v7.h[4]
smlal2 v1.4s, v19.8h, v7.h[4]
uzp1 v23.8h, v5.8h, v6.8h
mov v19.16b, v20.16b
smlal v0.4s, v20.4h, v7.h[5]
smlal2 v1.4s, v20.8h, v7.h[5]
srshr v20.8h, v23.8h, #2
smlal v0.4s, v20.4h, v7.h[6]
smlal2 v1.4s, v20.8h, v7.h[6]
.ifc \type, prep
rshrn v0.4h, v0.4s, #6
rshrn2 v0.8h, v1.4s, #6
st1 {v0.8h}, [\ldst], \d_strd
b.gt 8b
add \dst, \dst, #16
.else
tbl v0.16b, {v0.16b, v1.16b}, v25.16b
sqrshrun v0.8b, v0.8h, #2
st1 {v0.8b}, [\ldst], \d_strd
b.gt 8b
add \dst, \dst, #8
.endif
add \src, \src, #8
subs \w, \w, #8
b.gt 81b
ret x15
.align JUMP_ALIGN
88:
.endif // neon_i8mm
ldp q29, q30, [x13, #16]
.align LOOP_ALIGN
81:
mov \lsrc, \src
@@ -1040,8 +1127,8 @@ L(\type\()_6tap_hv_\isa):
.endif
.align LOOP_ALIGN
8:
ldr q23, [\xmy]
add \xmy, \xmy, \s_strd
ldr q23, [\lsrc]
add \lsrc, \lsrc, \s_strd
smull v0.4s, v16.4h, v7.h[1]
smull2 v1.4s, v16.8h, v7.h[1]
@@ -1128,6 +1215,20 @@ L(\type\()_hv_filter8_\isa):
uzp1 v22.8h, v22.8h, v23.8h
ret
.ifc \isa, neon_i8mm
.align FUNC_ALIGN
L(\type\()_hv_filter6_neon_i8mm):
ld1 {v4.16b}, [\lsrc], \s_strd
movi v22.4s, #0
movi v23.4s, #0
tbl v2.16b, {v4.16b}, v29.16b
tbl v3.16b, {v4.16b}, v30.16b
usmmla v22.4s, v2.16b, v26.16b
usmmla v23.4s, v3.16b, v26.16b
uzp1 v22.8h, v22.8h, v23.8h
ret
.endif
.align FUNC_ALIGN
L(\type\()_hv_filter4_\isa):
ld1 {v4.8b}, [\src], \s_strd
@@ -1264,8 +1365,8 @@ L(\type\()_hv_filter4_\isa):
.align JUMP_ALIGN
L(\type\()_8tap_h_\isa):
adr x9, L(\type\()_8tap_h_\isa\()_tbl)
ldrh w8, [x9, x8, lsl #1]
movrel x11, \type\()_8tap_h_\isa\()_tbl
ldrsw x8, [x11, x8, lsl #2]
.ifc \type, put
.ifc \isa, neon_i8mm
movi v27.4s, #34 // special rounding
@@ -1274,8 +1375,8 @@ L(\type\()_8tap_h_\isa):
dup v27.4s, w10
.endif
.endif
sub x9, x9, x8
br x9
add x11, x11, x8
br x11
.ifc \type, put
.align JUMP_ALIGN
@@ -1368,8 +1469,63 @@ L(\type\()_8tap_h_\isa):
.align JUMP_ALIGN
80: // H - 8xN
AARCH64_VALID_JUMP_TARGET
ldp q29, q30, [x13, #16]
ldr d26, [\xmx]
.ifc \isa, neon_i8mm
cmp w9, #SHARP1
b.eq 88f // horizontal == SHARP1
ldp q29, q30, [x13, #(OFFSET_USMMLA)]
ext v0.8b, v26.8b, v26.8b, #7
ins v26.d[1], v0.d[0]
.align LOOP_ALIGN
8:
ldr q0, [\src]
ldr q16, [\src, \s_strd]
add \src, \src, \s_strd, lsl #1
.ifc \type, prep
movi v4.4s, #0
movi v5.4s, #0
movi v20.4s, #0
movi v21.4s, #0
.else
mov v4.16b, v27.16b
mov v5.16b, v27.16b
mov v20.16b, v27.16b
mov v21.16b, v27.16b
.endif
tbl v1.16b, {v0.16b}, v29.16b
tbl v2.16b, {v0.16b}, v30.16b
tbl v17.16b, {v16.16b}, v29.16b
tbl v18.16b, {v16.16b}, v30.16b
usmmla v4.4s, v1.16b, v26.16b
usmmla v5.4s, v2.16b, v26.16b
usmmla v20.4s, v17.16b, v26.16b
usmmla v21.4s, v18.16b, v26.16b
uzp1 v4.8h, v4.8h, v5.8h
uzp1 v20.8h, v20.8h, v21.8h
.ifc \type, prep
srshr v4.8h, v4.8h, #2
srshr v20.8h, v20.8h, #2
subs \h, \h, #2
stp q4, q20, [\dst], #32
.else // put
sqshrun v4.8b, v4.8h, #6
sqshrun v20.8b, v20.8h, #6
subs \h, \h, #2
str d4, [\dst]
str d20, [\dst, \d_strd]
add \dst, \dst, \d_strd, lsl #1
.endif
b.gt 8b
ret
.align JUMP_ALIGN
88:
.endif // neon_i8mm
ldp q29, q30, [x13, #16]
.align LOOP_ALIGN
8:
@@ -1433,8 +1589,61 @@ L(\type\()_8tap_h_\isa):
.align JUMP_ALIGN
160: // H - 16xN
AARCH64_VALID_JUMP_TARGET
ldp q29, q30, [x13, #16]
ldr d26, [\xmx]
.ifc \isa, neon_i8mm
cmp w9, #SHARP1
b.eq 168f // horizontal == SHARP1
ldp q29, q30, [x13, #(OFFSET_USMMLA)]
ext v0.8b, v26.8b, v26.8b, #7
ins v26.d[1], v0.d[0]
.align LOOP_ALIGN
16:
ldr q16, [\src]
ldur q17, [\src, #8] // avoid 2 register TBL for small cores
add \src, \src, \s_strd
.ifc \type, prep
movi v6.4s, #0
movi v7.4s, #0
movi v22.4s, #0
movi v23.4s, #0
.else
mov v6.16b, v27.16b
mov v7.16b, v27.16b
mov v22.16b, v27.16b
mov v23.16b, v27.16b
.endif
tbl v0.16b, {v16.16b}, v29.16b
tbl v1.16b, {v16.16b}, v30.16b
tbl v2.16b, {v17.16b}, v29.16b
tbl v3.16b, {v17.16b}, v30.16b
usmmla v6.4s, v0.16b, v26.16b
usmmla v7.4s, v1.16b, v26.16b
usmmla v22.4s, v2.16b, v26.16b
usmmla v23.4s, v3.16b, v26.16b
uzp1 v6.8h, v6.8h, v7.8h
uzp1 v22.8h, v22.8h, v23.8h
.ifc \type, prep
srshr v6.8h, v6.8h, #2
srshr v22.8h, v22.8h, #2
subs \h, \h, #1
stp q6, q22, [\dst], #32
.else // put
sqshrun v6.8b, v6.8h, #6
sqshrun2 v6.16b, v22.8h, #6
subs \h, \h, #1
st1 {v6.16b}, [\dst], \d_strd
.endif
b.gt 16b
ret
.align JUMP_ALIGN
168:
.endif // neon_i8mm
ldp q29, q30, [x13, #16]
.align LOOP_ALIGN
16:
@@ -1497,7 +1706,6 @@ L(\type\()_8tap_h_\isa):
640:
1280:
AARCH64_VALID_JUMP_TARGET
ldp q29, q30, [x13, #16]
ldr d26, [\xmx]
.ifc \type, put
sub \d_strd, \d_strd, \w, uxtw
@@ -1505,6 +1713,69 @@ L(\type\()_8tap_h_\isa):
sub \s_strd, \s_strd, \w, uxtw
mov w8, \w
.ifc \isa, neon_i8mm
cmp w9, #SHARP1
b.eq 328f // horizontal == SHARP1
ldp q29, q30, [x13, #(OFFSET_USMMLA)]
ext v0.8b, v26.8b, v26.8b, #7
ins v26.d[1], v0.d[0]
.align LOOP_ALIGN
32:
ldr q16, [\src]
ldur q17, [\src, #8] // avoid 2 register TBL for small cores
add \src, \src, #16
.ifc \type, prep
movi v6.4s, #0
movi v7.4s, #0
movi v22.4s, #0
movi v23.4s, #0
.else
mov v6.16b, v27.16b
mov v7.16b, v27.16b
mov v22.16b, v27.16b
mov v23.16b, v27.16b
.endif
tbl v0.16b, {v16.16b}, v29.16b
tbl v1.16b, {v16.16b}, v30.16b
tbl v2.16b, {v17.16b}, v29.16b
tbl v3.16b, {v17.16b}, v30.16b
usmmla v6.4s, v0.16b, v26.16b
usmmla v7.4s, v1.16b, v26.16b
usmmla v22.4s, v2.16b, v26.16b
usmmla v23.4s, v3.16b, v26.16b
uzp1 v6.8h, v6.8h, v7.8h
uzp1 v22.8h, v22.8h, v23.8h
.ifc \type, prep
srshr v6.8h, v6.8h, #2
srshr v22.8h, v22.8h, #2
subs w8, w8, #16
stp q6, q22, [\dst], #32
.else // put
sqshrun v6.8b, v6.8h, #6
sqshrun2 v6.16b, v22.8h, #6
subs w8, w8, #16
str q6, [\dst], #16
.endif
b.gt 32b
add \src, \src, \s_strd
.ifc \type, put
add \dst, \dst, \d_strd
.endif
mov w8, \w
subs \h, \h, #1
b.gt 32b
ret
.align JUMP_ALIGN
328:
.endif // neon_i8mm
ldp q29, q30, [x13, #16]
.align LOOP_ALIGN
32:
ldr q16, [\src]
@@ -1568,19 +1839,19 @@ L(\type\()_8tap_h_\isa):
subs \h, \h, #1
b.gt 32b
ret
L(\type\()_8tap_h_\isa\()_tbl):
.hword (L(\type\()_8tap_h_\isa\()_tbl) - 1280b)
.hword (L(\type\()_8tap_h_\isa\()_tbl) - 640b)
.hword (L(\type\()_8tap_h_\isa\()_tbl) - 320b)
.hword (L(\type\()_8tap_h_\isa\()_tbl) - 160b)
.hword (L(\type\()_8tap_h_\isa\()_tbl) - 80b)
.hword (L(\type\()_8tap_h_\isa\()_tbl) - 40b)
.ifc \type, put
.hword (L(\type\()_8tap_h_\isa\()_tbl) - 20b)
.hword 0
.endif
endfunc
jumptable \type\()_8tap_h_\isa\()_tbl
.word 1280b - \type\()_8tap_h_\isa\()_tbl
.word 640b - \type\()_8tap_h_\isa\()_tbl
.word 320b - \type\()_8tap_h_\isa\()_tbl
.word 160b - \type\()_8tap_h_\isa\()_tbl
.word 80b - \type\()_8tap_h_\isa\()_tbl
.word 40b - \type\()_8tap_h_\isa\()_tbl
.ifc \type, put
.word 20b - \type\()_8tap_h_\isa\()_tbl
.endif
endjumptable
.endm
// dst(x0), d_strd(x7), src(x1), s_strd(x2), w(w3), h(w4), mx(w5), my(w6)

134 third_party/dav1d/src/arm/64/refmvs.S (vendored)

@@ -34,13 +34,13 @@
function splat_mv_neon, export=1
ld1 {v3.16b}, [x1]
clz w3, w3
adr x5, L(splat_tbl)
movrel x5, splat_tbl
sub w3, w3, #26
ext v2.16b, v3.16b, v3.16b, #12
ldrh w3, [x5, w3, uxtw #1]
ldrsw x3, [x5, w3, uxtw #2]
add w2, w2, w2, lsl #1
ext v0.16b, v2.16b, v3.16b, #4
sub x3, x5, w3, uxtw
add x3, x5, x3
ext v1.16b, v2.16b, v3.16b, #8
lsl w2, w2, #2
ext v2.16b, v2.16b, v3.16b, #12
@@ -80,16 +80,17 @@ function splat_mv_neon, export=1
st1 {v0.16b, v1.16b, v2.16b}, [x1]
b.gt 1b
ret
L(splat_tbl):
.hword L(splat_tbl) - 320b
.hword L(splat_tbl) - 160b
.hword L(splat_tbl) - 80b
.hword L(splat_tbl) - 40b
.hword L(splat_tbl) - 20b
.hword L(splat_tbl) - 10b
endfunc
jumptable splat_tbl
.word 320b - splat_tbl
.word 160b - splat_tbl
.word 80b - splat_tbl
.word 40b - splat_tbl
.word 20b - splat_tbl
.word 10b - splat_tbl
endjumptable
const mv_tbls, align=4
.byte 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255
.byte 0, 1, 2, 3, 8, 0, 1, 2, 3, 8, 0, 1, 2, 3, 8, 0
@@ -112,7 +113,7 @@ function save_tmvs_neon, export=1
movi v30.8b, #0
ld1 {v31.8b}, [x3]
adr x8, L(save_tmvs_tbl)
movrel x8, save_tmvs_tbl
movrel x16, mask_mult
movrel x13, mv_tbls
ld1 {v29.8b}, [x16]
@@ -137,9 +138,9 @@ function save_tmvs_neon, export=1
2:
ldrb w11, [x9, #10] // cand_b->bs
ld1 {v0.16b}, [x9] // cand_b->mv
add x11, x8, w11, uxtw #2
add x11, x8, w11, uxtw #3
ldr h1, [x9, #8] // cand_b->ref
ldrh w12, [x11] // bw8
ldr w12, [x11] // bw8
mov x15, x8
add x9, x9, w12, uxtw #1 // cand_b += bw8*2
cmp x9, x10
@@ -149,9 +150,9 @@ function save_tmvs_neon, export=1
ldrb w15, [x9, #10] // cand_b->bs
add x16, x9, #8
ld1 {v4.16b}, [x9] // cand_b->mv
add x15, x8, w15, uxtw #2
add x15, x8, w15, uxtw #3
ld1 {v1.h}[1], [x16] // cand_b->ref
ldrh w12, [x15] // bw8
ldr w12, [x15] // bw8
add x9, x9, w12, uxtw #1 // cand_b += bw8*2
trn1 v2.2d, v0.2d, v4.2d
@@ -166,12 +167,12 @@ function save_tmvs_neon, export=1
addp v1.4h, v1.4h, v1.4h // Combine condition for [1] and [0]
umov w16, v1.h[0] // Extract case for first block
umov w17, v1.h[1]
ldrh w11, [x11, #2] // Fetch jump table entry
ldrh w15, [x15, #2]
ldrsw x11, [x11, #4] // Fetch jump table entry
ldrsw x15, [x15, #4]
ldr q1, [x13, w16, uxtw #4] // Load permutation table base on case
ldr q5, [x13, w17, uxtw #4]
sub x11, x8, w11, uxtw // Find jump table target
sub x15, x8, w15, uxtw
add x11, x8, x11 // Find jump table target
add x15, x8, x15
tbl v0.16b, {v0.16b}, v1.16b // Permute cand_b to output refmvs_temporal_block
tbl v4.16b, {v4.16b}, v5.16b
@@ -243,50 +244,51 @@ function save_tmvs_neon, export=1
str q2, [x3, #(16*5-16)]
add x3, x3, #16*5
ret
L(save_tmvs_tbl):
.hword 16 * 12
.hword L(save_tmvs_tbl) - 160b
.hword 16 * 12
.hword L(save_tmvs_tbl) - 160b
.hword 8 * 12
.hword L(save_tmvs_tbl) - 80b
.hword 8 * 12
.hword L(save_tmvs_tbl) - 80b
.hword 8 * 12
.hword L(save_tmvs_tbl) - 80b
.hword 8 * 12
.hword L(save_tmvs_tbl) - 80b
.hword 4 * 12
.hword L(save_tmvs_tbl) - 40b
.hword 4 * 12
.hword L(save_tmvs_tbl) - 40b
.hword 4 * 12
.hword L(save_tmvs_tbl) - 40b
.hword 4 * 12
.hword L(save_tmvs_tbl) - 40b
.hword 2 * 12
.hword L(save_tmvs_tbl) - 20b
.hword 2 * 12
.hword L(save_tmvs_tbl) - 20b
.hword 2 * 12
.hword L(save_tmvs_tbl) - 20b
.hword 2 * 12
.hword L(save_tmvs_tbl) - 20b
.hword 2 * 12
.hword L(save_tmvs_tbl) - 20b
.hword 1 * 12
.hword L(save_tmvs_tbl) - 10b
.hword 1 * 12
.hword L(save_tmvs_tbl) - 10b
.hword 1 * 12
.hword L(save_tmvs_tbl) - 10b
.hword 1 * 12
.hword L(save_tmvs_tbl) - 10b
.hword 1 * 12
.hword L(save_tmvs_tbl) - 10b
.hword 1 * 12
.hword L(save_tmvs_tbl) - 10b
.hword 1 * 12
.hword L(save_tmvs_tbl) - 10b
endfunc
jumptable save_tmvs_tbl
.word 16 * 12
.word 160b - save_tmvs_tbl
.word 16 * 12
.word 160b - save_tmvs_tbl
.word 8 * 12
.word 80b - save_tmvs_tbl
.word 8 * 12
.word 80b - save_tmvs_tbl
.word 8 * 12
.word 80b - save_tmvs_tbl
.word 8 * 12
.word 80b - save_tmvs_tbl
.word 4 * 12
.word 40b - save_tmvs_tbl
.word 4 * 12
.word 40b - save_tmvs_tbl
.word 4 * 12
.word 40b - save_tmvs_tbl
.word 4 * 12
.word 40b - save_tmvs_tbl
.word 2 * 12
.word 20b - save_tmvs_tbl
.word 2 * 12
.word 20b - save_tmvs_tbl
.word 2 * 12
.word 20b - save_tmvs_tbl
.word 2 * 12
.word 20b - save_tmvs_tbl
.word 2 * 12
.word 20b - save_tmvs_tbl
.word 1 * 12
.word 10b - save_tmvs_tbl
.word 1 * 12
.word 10b - save_tmvs_tbl
.word 1 * 12
.word 10b - save_tmvs_tbl
.word 1 * 12
.word 10b - save_tmvs_tbl
.word 1 * 12
.word 10b - save_tmvs_tbl
.word 1 * 12
.word 10b - save_tmvs_tbl
.word 1 * 12
.word 10b - save_tmvs_tbl
endjumptable
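Unlike the plain jump tables, save_tmvs_tbl interleaves a data word with each branch offset, which is why the indexing stride grows from 4 bytes (uxtw #2 over .hword pairs) to 8 (uxtw #3 over .word pairs). Roughly, each entry looks like this (hypothetical struct, for illustration only):

#include <stdint.h>

typedef struct {
    int32_t step;          /* bw8 * 12, read with "ldr w12, [x11]" */
    int32_t target_offset; /* label - save_tmvs_tbl, read with "ldrsw x11, [x11, #4]" */
} save_tmvs_entry;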

26 third_party/dav1d/src/arm/asm.S (vendored)

@@ -323,6 +323,32 @@ EXTERN\name:
\name:
.endm
.macro jumptable name
#ifdef _WIN32
// MS armasm64 doesn't seem to be able to create relocations for subtraction
// of labels in different sections; for armasm64 (and all of Windows for
// simplicity), write the jump table in the text section, to allow calculating
// differences at assembly time. See
// https://developercommunity.visualstudio.com/t/armasm64-unable-to-create-cross-section/10722340
// for reference. (LLVM can create such relocations, but checking for _WIN32
// for simplicity, as execute-only memory isn't relevant on Windows at the
// moment.)
function \name
#else
// For other platforms, write jump tables in a const data section, to allow
// working in environments where executable memory isn't readable.
const \name
#endif
.endm
.macro endjumptable
#ifdef _WIN32
endfunc
#else
endconst
#endif
.endm
#ifdef __APPLE__
#define L(x) L ## x
#else

45 third_party/dav1d/src/arm/cpu.c (vendored)

@@ -29,6 +29,7 @@
#include "common/attributes.h"
#include "src/cpu.h"
#include "src/arm/cpu.h"
#if defined(HAVE_GETAUXVAL) || defined(HAVE_ELF_AUX_INFO)
@@ -52,7 +53,7 @@ COLD unsigned dav1d_get_cpu_flags_arm(void) {
elf_aux_info(AT_HWCAP2, &hw_cap2, sizeof(hw_cap2));
#endif
unsigned flags = DAV1D_ARM_CPU_FLAG_NEON;
unsigned flags = dav1d_get_default_cpu_flags();
flags |= (hw_cap & HWCAP_AARCH64_ASIMDDP) ? DAV1D_ARM_CPU_FLAG_DOTPROD : 0;
flags |= (hw_cap2 & HWCAP2_AARCH64_I8MM) ? DAV1D_ARM_CPU_FLAG_I8MM : 0;
flags |= (hw_cap & HWCAP_AARCH64_SVE) ? DAV1D_ARM_CPU_FLAG_SVE : 0;
@@ -75,7 +76,8 @@ COLD unsigned dav1d_get_cpu_flags_arm(void) {
elf_aux_info(AT_HWCAP, &hw_cap, sizeof(hw_cap));
#endif
unsigned flags = (hw_cap & HWCAP_ARM_NEON) ? DAV1D_ARM_CPU_FLAG_NEON : 0;
unsigned flags = dav1d_get_default_cpu_flags();
flags |= (hw_cap & HWCAP_ARM_NEON) ? DAV1D_ARM_CPU_FLAG_NEON : 0;
flags |= (hw_cap & HWCAP_ARM_ASIMDDP) ? DAV1D_ARM_CPU_FLAG_DOTPROD : 0;
flags |= (hw_cap & HWCAP_ARM_I8MM) ? DAV1D_ARM_CPU_FLAG_I8MM : 0;
return flags;
@@ -95,7 +97,7 @@ static int have_feature(const char *feature) {
}
COLD unsigned dav1d_get_cpu_flags_arm(void) {
unsigned flags = DAV1D_ARM_CPU_FLAG_NEON;
unsigned flags = dav1d_get_default_cpu_flags();
if (have_feature("hw.optional.arm.FEAT_DotProd"))
flags |= DAV1D_ARM_CPU_FLAG_DOTPROD;
if (have_feature("hw.optional.arm.FEAT_I8MM"))
@@ -104,16 +106,14 @@ COLD unsigned dav1d_get_cpu_flags_arm(void) {
return flags;
}
#elif defined(__OpenBSD__)
#if ARCH_AARCH64
#elif defined(__OpenBSD__) && ARCH_AARCH64
#include <machine/armreg.h>
#include <machine/cpu.h>
#include <sys/types.h>
#include <sys/sysctl.h>
COLD unsigned dav1d_get_cpu_flags_arm(void) {
unsigned flags = DAV1D_ARM_CPU_FLAG_NEON;
unsigned flags = dav1d_get_default_cpu_flags();
#ifdef CPU_ID_AA64ISAR0
int mib[2];
@@ -142,25 +142,31 @@ COLD unsigned dav1d_get_cpu_flags_arm(void) {
return flags;
}
#else /* !ARCH_AARCH64 */
COLD unsigned dav1d_get_cpu_flags_arm(void) {
unsigned flags = DAV1D_ARM_CPU_FLAG_NEON;
return flags;
}
#endif /* ARCH_AARCH64 */
#elif defined(_WIN32)
#include <windows.h>
COLD unsigned dav1d_get_cpu_flags_arm(void) {
unsigned flags = DAV1D_ARM_CPU_FLAG_NEON;
unsigned flags = dav1d_get_default_cpu_flags();
#ifdef PF_ARM_V82_DP_INSTRUCTIONS_AVAILABLE
if (IsProcessorFeaturePresent(PF_ARM_V82_DP_INSTRUCTIONS_AVAILABLE))
flags |= DAV1D_ARM_CPU_FLAG_DOTPROD;
#endif
/* No I8MM or SVE feature detection available on Windows at the time of
* writing. */
#ifdef PF_ARM_SVE_INSTRUCTIONS_AVAILABLE
if (IsProcessorFeaturePresent(PF_ARM_SVE_INSTRUCTIONS_AVAILABLE))
flags |= DAV1D_ARM_CPU_FLAG_SVE;
#endif
#ifdef PF_ARM_SVE2_INSTRUCTIONS_AVAILABLE
if (IsProcessorFeaturePresent(PF_ARM_SVE2_INSTRUCTIONS_AVAILABLE))
flags |= DAV1D_ARM_CPU_FLAG_SVE2;
#endif
#ifdef PF_ARM_SVE_I8MM_INSTRUCTIONS_AVAILABLE
/* There's no PF_* flag that indicates whether plain I8MM is available
* or not. But if SVE_I8MM is available, that also implies that
* regular I8MM is available. */
if (IsProcessorFeaturePresent(PF_ARM_SVE_I8MM_INSTRUCTIONS_AVAILABLE))
flags |= DAV1D_ARM_CPU_FLAG_I8MM;
#endif
return flags;
}
@@ -206,7 +212,8 @@ static unsigned parse_proc_cpuinfo(const char *flag) {
}
COLD unsigned dav1d_get_cpu_flags_arm(void) {
unsigned flags = parse_proc_cpuinfo("neon") ? DAV1D_ARM_CPU_FLAG_NEON : 0;
unsigned flags = dav1d_get_default_cpu_flags();
flags |= parse_proc_cpuinfo("neon") ? DAV1D_ARM_CPU_FLAG_NEON : 0;
flags |= parse_proc_cpuinfo("asimd") ? DAV1D_ARM_CPU_FLAG_NEON : 0;
flags |= parse_proc_cpuinfo("asimddp") ? DAV1D_ARM_CPU_FLAG_DOTPROD : 0;
flags |= parse_proc_cpuinfo("i8mm") ? DAV1D_ARM_CPU_FLAG_I8MM : 0;
@@ -220,7 +227,7 @@ COLD unsigned dav1d_get_cpu_flags_arm(void) {
#else /* Unsupported OS */
COLD unsigned dav1d_get_cpu_flags_arm(void) {
return 0;
return dav1d_get_default_cpu_flags();
}
#endif

27 third_party/dav1d/src/arm/mc.h (vendored)

@@ -63,6 +63,7 @@
decl_8tap_fns(neon);
decl_8tap_fns(neon_dotprod);
decl_8tap_fns(neon_i8mm);
decl_8tap_fns(sve2);
decl_mc_fn(BF(dav1d_put_bilin, neon));
decl_mct_fn(BF(dav1d_prep_bilin, neon));
@@ -110,17 +111,27 @@ static ALWAYS_INLINE void mc_dsp_init_arm(Dav1dMCDSPContext *const c) {
c->warp8x8t = BF(dav1d_warp_affine_8x8t, neon);
c->emu_edge = BF(dav1d_emu_edge, neon);
#if ARCH_AARCH64 && BITDEPTH == 8
#if ARCH_AARCH64
#if BITDEPTH == 8
#if HAVE_DOTPROD
if (!(flags & DAV1D_ARM_CPU_FLAG_DOTPROD)) return;
init_8tap_fns(neon_dotprod);
if (flags & DAV1D_ARM_CPU_FLAG_DOTPROD) {
init_8tap_fns(neon_dotprod);
}
#endif // HAVE_DOTPROD
#if HAVE_I8MM
if (!(flags & DAV1D_ARM_CPU_FLAG_I8MM)) return;
init_8tap_fns(neon_i8mm);
if (flags & DAV1D_ARM_CPU_FLAG_I8MM) {
init_8tap_fns(neon_i8mm);
}
#endif // HAVE_I8MM
#endif // ARCH_AARCH64 && BITDEPTH == 8
#endif // BITDEPTH == 8
#if BITDEPTH == 16
#if HAVE_SVE2
if (flags & DAV1D_ARM_CPU_FLAG_SVE2) {
init_8tap_fns(sve2);
}
#endif // HAVE_SVE2
#endif // BITDEPTH == 16
#endif // ARCH_AARCH64
}

10 third_party/dav1d/src/cpu.c (vendored)

@@ -33,20 +33,24 @@
#ifdef _WIN32
#include <windows.h>
#elif defined(__APPLE__)
#endif
#ifdef __APPLE__
#include <sys/sysctl.h>
#include <sys/types.h>
#else
#include <pthread.h>
#endif
#ifdef HAVE_UNISTD_H
#include <unistd.h>
#endif
#ifdef HAVE_PTHREAD_GETAFFINITY_NP
#include <pthread.h>
#ifdef HAVE_PTHREAD_NP_H
#include <pthread_np.h>
#endif
#if defined(__FreeBSD__)
#define cpu_set_t cpuset_t
#endif
#endif
unsigned dav1d_cpu_flags = 0U;
unsigned dav1d_cpu_flags_mask = ~0U;

18 third_party/dav1d/src/cpu.h (vendored)

@@ -54,12 +54,9 @@ void dav1d_init_cpu(void);
DAV1D_API void dav1d_set_cpu_flags_mask(unsigned mask);
int dav1d_num_logical_processors(Dav1dContext *c);
static ALWAYS_INLINE unsigned dav1d_get_cpu_flags(void) {
unsigned flags = dav1d_cpu_flags & dav1d_cpu_flags_mask;
static ALWAYS_INLINE unsigned dav1d_get_default_cpu_flags(void) {
unsigned flags = 0;
#if TRIM_DSP_FUNCTIONS
/* Since this function is inlined, unconditionally setting a flag here will
* enable dead code elimination in the calling function. */
#if ARCH_AARCH64 || ARCH_ARM
#if defined(__ARM_NEON) || defined(__APPLE__) || defined(_WIN32) || ARCH_AARCH64
flags |= DAV1D_ARM_CPU_FLAG_NEON;
@@ -119,6 +116,17 @@ static ALWAYS_INLINE unsigned dav1d_get_cpu_flags(void) {
flags |= DAV1D_X86_CPU_FLAG_SSE2;
#endif
#endif
return flags;
}
static ALWAYS_INLINE unsigned dav1d_get_cpu_flags(void) {
unsigned flags = dav1d_cpu_flags & dav1d_cpu_flags_mask;
#if TRIM_DSP_FUNCTIONS
/* Since this function is inlined, unconditionally setting a flag here will
* enable dead code elimination in the calling function. */
flags |= dav1d_get_default_cpu_flags();
#endif
return flags;

4 third_party/dav1d/src/loongarch/cpu.c (vendored)

@@ -26,6 +26,8 @@
#include "config.h"
#include "common/attributes.h"
#include "src/cpu.h"
#include "src/loongarch/cpu.h"
#if defined(HAVE_GETAUXVAL)
@@ -36,7 +38,7 @@
#endif
COLD unsigned dav1d_get_cpu_flags_loongarch(void) {
unsigned flags = 0;
unsigned flags = dav1d_get_default_cpu_flags();
#if defined(HAVE_GETAUXVAL)
unsigned long hw_cap = getauxval(AT_HWCAP);
flags |= (hw_cap & LA_HWCAP_LSX) ? DAV1D_LOONGARCH_CPU_FLAG_LSX : 0;

18 third_party/dav1d/src/mem.c (vendored)

@@ -109,16 +109,7 @@ void *dav1d_malloc(const enum AllocationType type, const size_t sz) {
void *dav1d_alloc_aligned(const enum AllocationType type,
const size_t sz, const size_t align)
{
assert(!(align & (align - 1)));
void *ptr;
#ifdef _WIN32
ptr = _aligned_malloc(sz + align, align);
#elif defined(HAVE_POSIX_MEMALIGN)
if (posix_memalign(&ptr, align, sz + align)) return NULL;
#else
ptr = memalign(align, sz + align);
#endif
void *const ptr = dav1d_alloc_aligned_internal(sz + align, align);
return track_alloc(type, ptr, sz, align);
}
@@ -140,12 +131,7 @@ void dav1d_free(void *ptr) {
void dav1d_free_aligned(void *ptr) {
if (ptr) {
ptr = track_free(ptr);
#ifdef _WIN32
_aligned_free(ptr);
#else
free(ptr);
#endif
dav1d_free_aligned_internal(track_free(ptr));
}
}

64 third_party/dav1d/src/mem.h (vendored)

@@ -32,7 +32,7 @@
#include <stdlib.h>
#if defined(_WIN32) || !defined(HAVE_POSIX_MEMALIGN)
#if defined(_WIN32) || defined(HAVE_MEMALIGN)
#include <malloc.h>
#endif
@@ -79,6 +79,39 @@ typedef struct Dav1dMemPool {
#endif
} Dav1dMemPool;
// TODO: Move this to a common location?
#define ROUND_UP(x,a) (((x)+((a)-1)) & ~((a)-1))
/*
* Allocate align-byte aligned memory. The return value can be released
* by calling the dav1d_free_aligned() function.
*/
static inline void *dav1d_alloc_aligned_internal(const size_t sz, const size_t align) {
assert(!(align & (align - 1)));
#ifdef _WIN32
return _aligned_malloc(sz, align);
#elif defined(HAVE_POSIX_MEMALIGN)
void *ptr;
if (posix_memalign(&ptr, align, sz)) return NULL;
return ptr;
#elif defined(HAVE_MEMALIGN)
return memalign(align, sz);
#elif defined(HAVE_ALIGNED_ALLOC)
// The C11 standard specifies that the size parameter
// must be an integral multiple of alignment.
return aligned_alloc(align, ROUND_UP(sz, align));
#else
#error No aligned allocation functions are available
#endif
}
static inline void dav1d_free_aligned_internal(void *ptr) {
#ifdef _WIN32
_aligned_free(ptr);
#else
free(ptr);
#endif
}
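The ROUND_UP in the C11 branch is load-bearing: aligned_alloc requires its size argument to be an integral multiple of the alignment, so an unpadded request would be undefined behavior. A small illustration with arbitrary values:

#include <stdlib.h>

#define ROUND_UP(x,a) (((x)+((a)-1)) & ~((a)-1))

int main(void) {
    /* aligned_alloc(64, 100) is invalid under C11 since 100 % 64 != 0;
     * ROUND_UP(100, 64) == 128 satisfies the contract. */
    void *p = aligned_alloc(64, ROUND_UP(100, 64));
    free(p);
    return 0;
}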
#if TRACK_HEAP_ALLOCATIONS
void *dav1d_malloc(enum AllocationType type, size_t sz);
@@ -91,34 +124,9 @@ void dav1d_log_alloc_stats(Dav1dContext *c);
#define dav1d_mem_pool_init(type, pool) dav1d_mem_pool_init(pool)
#define dav1d_malloc(type, sz) malloc(sz)
#define dav1d_realloc(type, ptr, sz) realloc(ptr, sz)
#define dav1d_alloc_aligned(type, sz, align) dav1d_alloc_aligned_internal(sz, align)
#define dav1d_free(ptr) free(ptr)
/*
* Allocate align-byte aligned memory. The return value can be released
* by calling the dav1d_free_aligned() function.
*/
static inline void *dav1d_alloc_aligned(const size_t sz, const size_t align) {
assert(!(align & (align - 1)));
#ifdef _WIN32
return _aligned_malloc(sz, align);
#elif defined(HAVE_POSIX_MEMALIGN)
void *ptr;
if (posix_memalign(&ptr, align, sz)) return NULL;
return ptr;
#else
return memalign(align, sz);
#endif
}
#define dav1d_alloc_aligned(type, sz, align) dav1d_alloc_aligned(sz, align)
static inline void dav1d_free_aligned(void *ptr) {
#ifdef _WIN32
_aligned_free(ptr);
#else
free(ptr);
#endif
}
#define dav1d_free_aligned(ptr) dav1d_free_aligned_internal(ptr)
#endif /* TRACK_HEAP_ALLOCATIONS */
void dav1d_mem_pool_push(Dav1dMemPool *pool, Dav1dMemPoolBuffer *buf);

3 third_party/dav1d/src/meson.build (vendored)

@@ -119,6 +119,7 @@ if is_asm_enabled
'arm/64/loopfilter16.S',
'arm/64/looprestoration16.S',
'arm/64/mc16.S',
'arm/64/mc16_sve.S',
)
endif
elif host_machine.cpu_family().startswith('arm')
@@ -370,7 +371,7 @@ libdav1d = library('dav1d',
)
dav1d_dep = declare_dependency(link_with: libdav1d,
include_directories : include_directories('../include/dav1d')
include_directories : include_directories('../include')
)
#

26 third_party/dav1d/src/picture.c (vendored)

@@ -201,16 +201,6 @@ int dav1d_thread_picture_alloc(Dav1dContext *const c, Dav1dFrameContext *const f
(void **) &p->progress);
if (res) return res;
dav1d_picture_copy_props(&p->p, c->content_light, c->content_light_ref,
c->mastering_display, c->mastering_display_ref,
c->itut_t35, c->itut_t35_ref, c->n_itut_t35,
&f->tile[0].data.m);
// Must be removed from the context after being attached to the frame
dav1d_ref_dec(&c->itut_t35_ref);
c->itut_t35 = NULL;
c->n_itut_t35 = 0;
// Don't clear these flags from c->frame_flags if the frame is not going to be output.
// This way they will be added to the next visible frame too.
const int flags_mask = ((f->frame_hdr->show_frame || c->output_invisible_frames) &&
@@ -221,6 +211,22 @@
p->visible = f->frame_hdr->show_frame;
p->showable = f->frame_hdr->showable_frame;
if (p->visible) {
// Only add HDR10+ and T35 metadata when show frame flag is enabled
dav1d_picture_copy_props(&p->p, c->content_light, c->content_light_ref,
c->mastering_display, c->mastering_display_ref,
c->itut_t35, c->itut_t35_ref, c->n_itut_t35,
&f->tile[0].data.m);
// Must be removed from the context after being attached to the frame
dav1d_ref_dec(&c->itut_t35_ref);
c->itut_t35 = NULL;
c->n_itut_t35 = 0;
} else {
dav1d_data_props_copy(&p->p.m, &f->tile[0].data.m);
}
if (c->n_fc > 1) {
atomic_init(&p->progress[0], 0);
atomic_init(&p->progress[1], 0);

3 third_party/dav1d/src/ppc/cpu.c (vendored)

@@ -29,6 +29,7 @@
#include "common/attributes.h"
#include "src/cpu.h"
#include "src/ppc/cpu.h"
#if (defined(HAVE_GETAUXVAL) || defined(HAVE_ELF_AUX_INFO)) && ARCH_PPC64LE
@@ -37,7 +38,7 @@
#endif
COLD unsigned dav1d_get_cpu_flags_ppc(void) {
unsigned flags = 0;
unsigned flags = dav1d_get_default_cpu_flags();
#if defined(HAVE_GETAUXVAL) && ARCH_PPC64LE
unsigned long hw_cap = getauxval(AT_HWCAP);
unsigned long hw_cap2 = getauxval(AT_HWCAP2);

8 third_party/dav1d/src/refmvs.h (vendored)

@@ -43,22 +43,26 @@ PACKED(typedef struct refmvs_temporal_block {
mv mv;
int8_t ref;
}) refmvs_temporal_block;
CHECK_SIZE(refmvs_temporal_block, 5);
typedef union refmvs_refpair {
PACKED(typedef union refmvs_refpair {
int8_t ref[2]; // [0] = 0: intra=1, [1] = -1: comp=0
uint16_t pair;
} refmvs_refpair;
}) ALIGN(refmvs_refpair, 2);
CHECK_SIZE(refmvs_refpair, 2);
typedef union refmvs_mvpair {
mv mv[2];
uint64_t n;
} refmvs_mvpair;
CHECK_SIZE(refmvs_mvpair, 8);
PACKED(typedef struct refmvs_block {
refmvs_mvpair mv;
refmvs_refpair ref;
uint8_t bs, mf; // 1 = globalmv+affine, 2 = newmv
}) ALIGN(refmvs_block, 4);
CHECK_SIZE(refmvs_block, 12);
typedef struct refmvs_frame {
const Dav1dFrameHeader *frm_hdr;

3 third_party/dav1d/src/riscv/cpu.c (vendored)

@@ -29,6 +29,7 @@
#include "common/attributes.h"
#include "src/cpu.h"
#include "src/riscv/cpu.h"
#if defined(HAVE_GETAUXVAL)
@@ -41,7 +42,7 @@
int dav1d_has_compliant_rvv(void);
COLD unsigned dav1d_get_cpu_flags_riscv(void) {
unsigned flags = 0;
unsigned flags = dav1d_get_default_cpu_flags();
#if defined(HAVE_GETAUXVAL)
unsigned long hw_cap = getauxval(AT_HWCAP);
flags |= (hw_cap & HWCAP_RVV) && dav1d_has_compliant_rvv() ? DAV1D_RISCV_CPU_FLAG_V : 0;

37 third_party/dav1d/src/thread.h (vendored)

@@ -132,6 +132,14 @@ static inline int pthread_cond_broadcast(pthread_cond_t *const cond) {
#else
#include <pthread.h>
#if defined(__FreeBSD__)
/* ALIGN from <sys/param.h> conflicts with ALIGN from "common/attributes.h" */
#define _SYS_PARAM_H_
#include <sys/types.h>
#endif
#ifdef HAVE_PTHREAD_NP_H
#include <pthread_np.h>
#endif
#define dav1d_init_thread() do {} while (0)
@@ -145,31 +153,30 @@
prctl(PR_SET_NAME, name);
}
#elif defined(__APPLE__)
#elif defined(HAVE_PTHREAD_SETNAME_NP) && defined(__APPLE__)
static inline void dav1d_set_thread_name(const char *const name) {
pthread_setname_np(name);
}
#elif defined(__DragonFly__) || defined(__FreeBSD__) || defined(__OpenBSD__)
#if defined(__FreeBSD__)
/* ALIGN from <sys/param.h> conflicts with ALIGN from "common/attributes.h" */
#define _SYS_PARAM_H_
#include <sys/types.h>
#endif
#include <pthread_np.h>
static inline void dav1d_set_thread_name(const char *const name) {
pthread_set_name_np(pthread_self(), name);
}
#elif defined(__NetBSD__)
#elif defined(HAVE_PTHREAD_SETNAME_NP) && defined(__NetBSD__)
static inline void dav1d_set_thread_name(const char *const name) {
pthread_setname_np(pthread_self(), "%s", (void*)name);
}
#elif defined(HAVE_PTHREAD_SETNAME_NP)
static inline void dav1d_set_thread_name(const char *const name) {
pthread_setname_np(pthread_self(), name);
}
#elif defined(HAVE_PTHREAD_SET_NAME_NP)
static inline void dav1d_set_thread_name(const char *const name) {
pthread_set_name_np(pthread_self(), name);
}
#elif defined(__HAIKU__)
#include <os/kernel/OS.h>

3 third_party/dav1d/src/x86/cpu.c (vendored)

@@ -32,6 +32,7 @@
#include "common/attributes.h"
#include "src/cpu.h"
#include "src/x86/cpu.h"
typedef struct {
@@ -52,7 +53,7 @@ COLD unsigned dav1d_get_cpu_flags_x86(void) {
};
} cpu;
dav1d_cpu_cpuid(&cpu.r, 0, 0);
unsigned flags = 0;
unsigned flags = dav1d_get_default_cpu_flags();
if (cpu.max_leaf >= 1) {
CpuidRegisters r;