Mirror of https://github.com/mozilla/gecko-dev.git
Bug 1916282 - Update libdav1d to 79db1624878fa0f37841ddc2caf86f06738ae275 r=media-playback-reviewers,padenot
This patch updates the libdav1d source by running `./mach vendor media/libdav1d/moz.yaml`
Differential Revision: https://phabricator.services.mozilla.com/D221340
This commit is contained in:
Parent ac8a2e1e1b
Commit 1498592440
@ -20,11 +20,11 @@ origin:
|
|||
|
||||
# Human-readable identifier for this version/release
|
||||
# Generally "version NNN", "tag SSS", "bookmark SSS"
|
||||
release: 5ef6b241f05a2b9058b58136da4b25842aefba96 (2024-08-04T17:55:20.000-04:00).
|
||||
release: 79db1624878fa0f37841ddc2caf86f06738ae275 (2024-09-06T09:04:24.000+00:00).
|
||||
|
||||
# Revision to pull in
|
||||
# Must be a long or short commit SHA (long preferred)
|
||||
revision: 5ef6b241f05a2b9058b58136da4b25842aefba96
|
||||
revision: 79db1624878fa0f37841ddc2caf86f06738ae275
|
||||
|
||||
# The package's license, where possible using the mnemonic from
|
||||
# https://spdx.org/licenses/
|
||||
|
|
|
@ -1,2 +1,2 @@
|
|||
/* auto-generated, do not edit */
|
||||
#define DAV1D_VERSION "5ef6b241f05a2b9058b58136da4b25842aefba96"
|
||||
#define DAV1D_VERSION "79db1624878fa0f37841ddc2caf86f06738ae275"
|
||||
|
|
|
@ -189,9 +189,13 @@ static inline int clzll(const unsigned long long mask) {
|
|||
#ifndef static_assert
|
||||
#define CHECK_OFFSET(type, field, name) \
|
||||
struct check_##type##_##field { int x[(name == offsetof(type, field)) ? 1 : -1]; }
|
||||
#define CHECK_SIZE(type, size) \
|
||||
struct check_##type##_size { int x[(size == sizeof(type)) ? 1 : -1]; }
|
||||
#else
|
||||
#define CHECK_OFFSET(type, field, name) \
|
||||
static_assert(name == offsetof(type, field), #field)
|
||||
#define CHECK_SIZE(type, size) \
|
||||
static_assert(size == sizeof(type), #type)
|
||||
#endif
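As a readability aid only, here is a minimal, self-contained C sketch of how such layout checks are typically used; the Demo struct and its expected offsets are hypothetical and assume a typical LP64 ABI, and are not taken from dav1d. With C11 static_assert the check is a real assertion; without it, the negative array size in the fallback above turns a wrong assumption into a compile error.

#include <assert.h>   /* may provide static_assert (C11) */
#include <stddef.h>   /* offsetof */
#include <stdint.h>

#ifndef static_assert
#define CHECK_OFFSET(type, field, name) \
    struct check_##type##_##field { int x[(name == offsetof(type, field)) ? 1 : -1]; }
#define CHECK_SIZE(type, size) \
    struct check_##type##_size { int x[(size == sizeof(type)) ? 1 : -1]; }
#else
#define CHECK_OFFSET(type, field, name) \
    static_assert(name == offsetof(type, field), #field)
#define CHECK_SIZE(type, size) \
    static_assert(size == sizeof(type), #type)
#endif

/* Hypothetical struct used only for illustration. */
typedef struct Demo {
    int32_t a;   /* expected at offset 0 */
    int64_t b;   /* expected at offset 8, after padding */
} Demo;

CHECK_OFFSET(Demo, a, 0);
CHECK_OFFSET(Demo, b, 8);
CHECK_SIZE(Demo, 16);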
|
||||
|
||||
#ifdef _MSC_VER
|
||||
|
|
|
@ -13,7 +13,9 @@
|
|||
#define __GETOPT_H__
|
||||
|
||||
/* All the headers include this file. */
|
||||
#ifdef _WIN32
|
||||
#include <crtdefs.h>
|
||||
#endif
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
|
|
|
@ -31,10 +31,10 @@
|
|||
#include <errno.h>
|
||||
#include <stdarg.h>
|
||||
|
||||
#include "common.h"
|
||||
#include "picture.h"
|
||||
#include "data.h"
|
||||
#include "version.h"
|
||||
#include "dav1d/common.h"
|
||||
#include "dav1d/picture.h"
|
||||
#include "dav1d/data.h"
|
||||
#include "dav1d/version.h"
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
|
|
|
@ -157,6 +157,12 @@ else
|
|||
if cc.has_function('posix_memalign', prefix : '#include <stdlib.h>', args : test_args)
|
||||
cdata.set('HAVE_POSIX_MEMALIGN', 1)
|
||||
endif
|
||||
if cc.has_function('memalign', prefix : '#include <malloc.h>', args : test_args)
|
||||
cdata.set('HAVE_MEMALIGN', 1)
|
||||
endif
|
||||
if cc.has_function('aligned_alloc', prefix : '#include <stdlib.h>', args : test_args)
|
||||
cdata.set('HAVE_ALIGNED_ALLOC', 1)
|
||||
endif
|
||||
endif
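The checks above only record HAVE_* defines; as a hedged illustration (not dav1d's actual allocator), the detected function is typically consumed through a fallback chain along these lines, assuming align is a power of two:

#include <stdlib.h>
#ifdef HAVE_MEMALIGN
#include <malloc.h>
#endif

/* Illustrative fallback chain over the allocators probed above. */
static void *alloc_aligned_sketch(const size_t sz, const size_t align) {
#if defined(HAVE_POSIX_MEMALIGN)
    void *ptr;
    if (posix_memalign(&ptr, align, sz)) return NULL;
    return ptr;
#elif defined(HAVE_ALIGNED_ALLOC)
    /* C11 aligned_alloc wants a size that is a multiple of align. */
    return aligned_alloc(align, (sz + align - 1) & ~(align - 1));
#elif defined(HAVE_MEMALIGN)
    return memalign(align, sz);
#else
    return malloc(sz);  /* no extra alignment guarantee */
#endif
}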
|
||||
|
||||
# check for fseeko on android. It is not always available if _FILE_OFFSET_BITS is defined to 64
|
||||
|
@ -209,6 +215,10 @@ if host_machine.cpu_family().startswith('wasm')
|
|||
stdatomic_dependencies += thread_dependency.partial_dependency(compile_args: true)
|
||||
endif
|
||||
|
||||
if cc.check_header('sys/types.h')
|
||||
cdata.set('HAVE_SYS_TYPES_H', 1)
|
||||
endif
|
||||
|
||||
if cc.check_header('unistd.h')
|
||||
cdata.set('HAVE_UNISTD_H', 1)
|
||||
endif
|
||||
|
@ -259,6 +269,12 @@ endif
|
|||
if cc.has_function('pthread_setaffinity_np', prefix : pthread_np_prefix, args : test_args, dependencies : thread_dependency)
|
||||
cdata.set('HAVE_PTHREAD_SETAFFINITY_NP', 1)
|
||||
endif
|
||||
if cc.has_function('pthread_setname_np', prefix : pthread_np_prefix, args : test_args, dependencies : thread_dependency)
|
||||
cdata.set('HAVE_PTHREAD_SETNAME_NP', 1)
|
||||
endif
|
||||
if cc.has_function('pthread_set_name_np', prefix : pthread_np_prefix, args : test_args, dependencies : thread_dependency)
|
||||
cdata.set('HAVE_PTHREAD_SET_NAME_NP', 1)
|
||||
endif
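For context, a hedged sketch of how a HAVE_PTHREAD_SETNAME_NP result is commonly consumed; this is not dav1d's actual worker-thread code, and the non-Linux variants differ (macOS takes only a name, the BSDs use pthread_set_name_np() from <pthread_np.h>, which is what the pthread_np_prefix probe accounts for):

#define _GNU_SOURCE          /* pthread_setname_np() declaration on glibc */
#include <pthread.h>

static void name_this_thread(const char *const name) {
#if defined(HAVE_PTHREAD_SETNAME_NP) && defined(__linux__)
    /* glibc/bionic: int pthread_setname_np(pthread_t, const char *) */
    pthread_setname_np(pthread_self(), name);
#else
    (void)name;   /* other platforms omitted in this sketch */
#endif
}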
|
||||
|
||||
if cc.compiles('int x = _Generic(0, default: 0);', name: '_Generic', args: test_args)
|
||||
cdata.set('HAVE_C11_GENERIC', 1)
|
||||
|
|
|
@ -884,12 +884,12 @@ function generate_grain_\type\()_8bpc_neon, export=1
|
|||
.else
|
||||
add x4, x1, #FGD_AR_COEFFS_UV
|
||||
.endif
|
||||
adr x16, L(gen_grain_\type\()_tbl)
|
||||
movrel x16, gen_grain_\type\()_tbl
|
||||
ldr w17, [x1, #FGD_AR_COEFF_LAG]
|
||||
add w9, w9, #4
|
||||
ldrh w17, [x16, w17, uxtw #1]
|
||||
ldrsw x17, [x16, w17, uxtw #2]
|
||||
dup v31.8h, w9 // 4 + data->grain_scale_shift
|
||||
sub x16, x16, w17, uxtw
|
||||
add x16, x16, x17
|
||||
neg v31.8h, v31.8h
|
||||
|
||||
.ifc \type, uv_444
|
||||
|
@ -1075,13 +1075,14 @@ L(generate_grain_\type\()_lag3):
|
|||
ldp x30, x19, [sp], #96
|
||||
AARCH64_VALIDATE_LINK_REGISTER
|
||||
ret
|
||||
|
||||
L(gen_grain_\type\()_tbl):
|
||||
.hword L(gen_grain_\type\()_tbl) - L(generate_grain_\type\()_lag0)
|
||||
.hword L(gen_grain_\type\()_tbl) - L(generate_grain_\type\()_lag1)
|
||||
.hword L(gen_grain_\type\()_tbl) - L(generate_grain_\type\()_lag2)
|
||||
.hword L(gen_grain_\type\()_tbl) - L(generate_grain_\type\()_lag3)
|
||||
endfunc
|
||||
|
||||
jumptable gen_grain_\type\()_tbl
|
||||
.word L(generate_grain_\type\()_lag0) - gen_grain_\type\()_tbl
|
||||
.word L(generate_grain_\type\()_lag1) - gen_grain_\type\()_tbl
|
||||
.word L(generate_grain_\type\()_lag2) - gen_grain_\type\()_tbl
|
||||
.word L(generate_grain_\type\()_lag3) - gen_grain_\type\()_tbl
|
||||
endjumptable
|
||||
.endm
|
||||
|
||||
gen_grain_82 y
|
||||
|
@ -1118,12 +1119,12 @@ function generate_grain_\type\()_8bpc_neon, export=1
|
|||
ldr w2, [x1, #FGD_SEED]
|
||||
ldr w9, [x1, #FGD_GRAIN_SCALE_SHIFT]
|
||||
add x4, x1, #FGD_AR_COEFFS_UV
|
||||
adr x16, L(gen_grain_\type\()_tbl)
|
||||
movrel x16, gen_grain_\type\()_tbl
|
||||
ldr w17, [x1, #FGD_AR_COEFF_LAG]
|
||||
add w9, w9, #4
|
||||
ldrh w17, [x16, w17, uxtw #1]
|
||||
ldrsw x17, [x16, w17, uxtw #2]
|
||||
dup v31.8h, w9 // 4 + data->grain_scale_shift
|
||||
sub x16, x16, w17, uxtw
|
||||
add x16, x16, x17
|
||||
neg v31.8h, v31.8h
|
||||
|
||||
cmp w13, #0
|
||||
|
@ -1272,13 +1273,14 @@ L(generate_grain_\type\()_lag3):
|
|||
ldp x30, x19, [sp], #96
|
||||
AARCH64_VALIDATE_LINK_REGISTER
|
||||
ret
|
||||
|
||||
L(gen_grain_\type\()_tbl):
|
||||
.hword L(gen_grain_\type\()_tbl) - L(generate_grain_\type\()_lag0)
|
||||
.hword L(gen_grain_\type\()_tbl) - L(generate_grain_\type\()_lag1)
|
||||
.hword L(gen_grain_\type\()_tbl) - L(generate_grain_\type\()_lag2)
|
||||
.hword L(gen_grain_\type\()_tbl) - L(generate_grain_\type\()_lag3)
|
||||
endfunc
|
||||
|
||||
jumptable gen_grain_\type\()_tbl
|
||||
.word L(generate_grain_\type\()_lag0) - gen_grain_\type\()_tbl
|
||||
.word L(generate_grain_\type\()_lag1) - gen_grain_\type\()_tbl
|
||||
.word L(generate_grain_\type\()_lag2) - gen_grain_\type\()_tbl
|
||||
.word L(generate_grain_\type\()_lag3) - gen_grain_\type\()_tbl
|
||||
endjumptable
|
||||
.endm
|
||||
|
||||
gen_grain_44 uv_420
|
||||
|
@ -1407,18 +1409,18 @@ function fgy_32x32_8bpc_neon, export=1
|
|||
add_offset x5, w6, x10, x5, x9
|
||||
|
||||
ldr w11, [sp, #24] // type
|
||||
adr x13, L(fgy_loop_tbl)
|
||||
movrel x13, fgy_loop_tbl
|
||||
|
||||
add x4, x12, #32 // grain_lut += FG_BLOCK_SIZE * bx
|
||||
add x6, x14, x9, lsl #5 // grain_lut += grain_stride * FG_BLOCK_SIZE * by
|
||||
|
||||
tst w11, #1
|
||||
ldrh w11, [x13, w11, uxtw #1]
|
||||
ldrsw x11, [x13, w11, uxtw #2]
|
||||
|
||||
add x8, x16, x9, lsl #5 // grain_lut += grain_stride * FG_BLOCK_SIZE * by
|
||||
add x8, x8, #32 // grain_lut += FG_BLOCK_SIZE * bx
|
||||
|
||||
sub x11, x13, w11, uxtw
|
||||
add x11, x13, x11
|
||||
|
||||
b.eq 1f
|
||||
// y overlap
|
||||
|
@ -1555,14 +1557,15 @@ L(loop_\ox\oy):
|
|||
fgy 0, 1
|
||||
fgy 1, 0
|
||||
fgy 1, 1
|
||||
|
||||
L(fgy_loop_tbl):
|
||||
.hword L(fgy_loop_tbl) - L(loop_00)
|
||||
.hword L(fgy_loop_tbl) - L(loop_01)
|
||||
.hword L(fgy_loop_tbl) - L(loop_10)
|
||||
.hword L(fgy_loop_tbl) - L(loop_11)
|
||||
endfunc
|
||||
|
||||
jumptable fgy_loop_tbl
|
||||
.word L(loop_00) - fgy_loop_tbl
|
||||
.word L(loop_01) - fgy_loop_tbl
|
||||
.word L(loop_10) - fgy_loop_tbl
|
||||
.word L(loop_11) - fgy_loop_tbl
|
||||
endjumptable
|
||||
|
||||
// void dav1d_fguv_32x32_420_8bpc_neon(pixel *const dst,
|
||||
// const pixel *const src,
|
||||
// const ptrdiff_t stride,
|
||||
|
@ -1646,11 +1649,11 @@ function fguv_32x32_\layout\()_8bpc_neon, export=1
|
|||
ldr w13, [sp, #64] // type
|
||||
|
||||
movrel x16, overlap_coeffs_\sx
|
||||
adr x14, L(fguv_loop_sx\sx\()_tbl)
|
||||
movrel x14, fguv_loop_sx\sx\()_tbl
|
||||
|
||||
ld1 {v27.8b, v28.8b}, [x16] // overlap_coeffs
|
||||
tst w13, #1
|
||||
ldrh w13, [x14, w13, uxtw #1]
|
||||
ldrsw x13, [x14, w13, uxtw #2]
|
||||
|
||||
b.eq 1f
|
||||
// y overlap
|
||||
|
@ -1658,7 +1661,7 @@ function fguv_32x32_\layout\()_8bpc_neon, export=1
|
|||
mov w9, #(2 >> \sy)
|
||||
|
||||
1:
|
||||
sub x13, x14, w13, uxtw
|
||||
add x13, x14, x13
|
||||
|
||||
.if \sy
|
||||
movi v25.16b, #23
|
||||
|
@ -1848,18 +1851,19 @@ L(fguv_loop_sx0_csfl\csfl\()_\ox\oy):
|
|||
ldr x30, [sp], #32
|
||||
AARCH64_VALIDATE_LINK_REGISTER
|
||||
ret
|
||||
|
||||
L(fguv_loop_sx0_tbl):
|
||||
.hword L(fguv_loop_sx0_tbl) - L(fguv_loop_sx0_csfl0_00)
|
||||
.hword L(fguv_loop_sx0_tbl) - L(fguv_loop_sx0_csfl0_01)
|
||||
.hword L(fguv_loop_sx0_tbl) - L(fguv_loop_sx0_csfl0_10)
|
||||
.hword L(fguv_loop_sx0_tbl) - L(fguv_loop_sx0_csfl0_11)
|
||||
.hword L(fguv_loop_sx0_tbl) - L(fguv_loop_sx0_csfl1_00)
|
||||
.hword L(fguv_loop_sx0_tbl) - L(fguv_loop_sx0_csfl1_01)
|
||||
.hword L(fguv_loop_sx0_tbl) - L(fguv_loop_sx0_csfl1_10)
|
||||
.hword L(fguv_loop_sx0_tbl) - L(fguv_loop_sx0_csfl1_11)
|
||||
endfunc
|
||||
|
||||
jumptable fguv_loop_sx0_tbl
|
||||
.word L(fguv_loop_sx0_csfl0_00) - fguv_loop_sx0_tbl
|
||||
.word L(fguv_loop_sx0_csfl0_01) - fguv_loop_sx0_tbl
|
||||
.word L(fguv_loop_sx0_csfl0_10) - fguv_loop_sx0_tbl
|
||||
.word L(fguv_loop_sx0_csfl0_11) - fguv_loop_sx0_tbl
|
||||
.word L(fguv_loop_sx0_csfl1_00) - fguv_loop_sx0_tbl
|
||||
.word L(fguv_loop_sx0_csfl1_01) - fguv_loop_sx0_tbl
|
||||
.word L(fguv_loop_sx0_csfl1_10) - fguv_loop_sx0_tbl
|
||||
.word L(fguv_loop_sx0_csfl1_11) - fguv_loop_sx0_tbl
|
||||
endjumptable
|
||||
|
||||
function fguv_loop_sx1_neon
|
||||
.macro fguv_loop_sx1 csfl, ox, oy
|
||||
L(fguv_loop_sx1_csfl\csfl\()_\ox\oy):
|
||||
|
@ -1997,14 +2001,15 @@ L(fguv_loop_sx1_csfl\csfl\()_\ox\oy):
|
|||
ldr x30, [sp], #32
|
||||
AARCH64_VALIDATE_LINK_REGISTER
|
||||
ret
|
||||
|
||||
L(fguv_loop_sx1_tbl):
|
||||
.hword L(fguv_loop_sx1_tbl) - L(fguv_loop_sx1_csfl0_00)
|
||||
.hword L(fguv_loop_sx1_tbl) - L(fguv_loop_sx1_csfl0_01)
|
||||
.hword L(fguv_loop_sx1_tbl) - L(fguv_loop_sx1_csfl0_10)
|
||||
.hword L(fguv_loop_sx1_tbl) - L(fguv_loop_sx1_csfl0_11)
|
||||
.hword L(fguv_loop_sx1_tbl) - L(fguv_loop_sx1_csfl1_00)
|
||||
.hword L(fguv_loop_sx1_tbl) - L(fguv_loop_sx1_csfl1_01)
|
||||
.hword L(fguv_loop_sx1_tbl) - L(fguv_loop_sx1_csfl1_10)
|
||||
.hword L(fguv_loop_sx1_tbl) - L(fguv_loop_sx1_csfl1_11)
|
||||
endfunc
|
||||
|
||||
jumptable fguv_loop_sx1_tbl
|
||||
.word L(fguv_loop_sx1_csfl0_00) - fguv_loop_sx1_tbl
|
||||
.word L(fguv_loop_sx1_csfl0_01) - fguv_loop_sx1_tbl
|
||||
.word L(fguv_loop_sx1_csfl0_10) - fguv_loop_sx1_tbl
|
||||
.word L(fguv_loop_sx1_csfl0_11) - fguv_loop_sx1_tbl
|
||||
.word L(fguv_loop_sx1_csfl1_00) - fguv_loop_sx1_tbl
|
||||
.word L(fguv_loop_sx1_csfl1_01) - fguv_loop_sx1_tbl
|
||||
.word L(fguv_loop_sx1_csfl1_10) - fguv_loop_sx1_tbl
|
||||
.word L(fguv_loop_sx1_csfl1_11) - fguv_loop_sx1_tbl
|
||||
endjumptable
|
||||
|
|
|
@ -740,12 +740,12 @@ function generate_grain_\type\()_16bpc_neon, export=1
|
|||
add x4, x1, #FGD_AR_COEFFS_UV
|
||||
.endif
|
||||
add w9, w9, w15 // grain_scale_shift - bitdepth_min_8
|
||||
adr x16, L(gen_grain_\type\()_tbl)
|
||||
movrel x16, gen_grain_\type\()_tbl
|
||||
ldr w17, [x1, #FGD_AR_COEFF_LAG]
|
||||
add w9, w9, #4
|
||||
ldrh w17, [x16, w17, uxtw #1]
|
||||
ldrsw x17, [x16, w17, uxtw #2]
|
||||
dup v31.8h, w9 // 4 - bitdepth_min_8 + data->grain_scale_shift
|
||||
sub x16, x16, w17, uxtw
|
||||
add x16, x16, x17
|
||||
neg v31.8h, v31.8h
|
||||
|
||||
.ifc \type, uv_444
|
||||
|
@ -945,13 +945,14 @@ L(generate_grain_\type\()_lag3):
|
|||
ldp x30, x19, [sp], #96
|
||||
AARCH64_VALIDATE_LINK_REGISTER
|
||||
ret
|
||||
|
||||
L(gen_grain_\type\()_tbl):
|
||||
.hword L(gen_grain_\type\()_tbl) - L(generate_grain_\type\()_lag0)
|
||||
.hword L(gen_grain_\type\()_tbl) - L(generate_grain_\type\()_lag1)
|
||||
.hword L(gen_grain_\type\()_tbl) - L(generate_grain_\type\()_lag2)
|
||||
.hword L(gen_grain_\type\()_tbl) - L(generate_grain_\type\()_lag3)
|
||||
endfunc
|
||||
|
||||
jumptable gen_grain_\type\()_tbl
|
||||
.word L(generate_grain_\type\()_lag0) - gen_grain_\type\()_tbl
|
||||
.word L(generate_grain_\type\()_lag1) - gen_grain_\type\()_tbl
|
||||
.word L(generate_grain_\type\()_lag2) - gen_grain_\type\()_tbl
|
||||
.word L(generate_grain_\type\()_lag3) - gen_grain_\type\()_tbl
|
||||
endjumptable
|
||||
.endm
|
||||
|
||||
gen_grain_82 y
|
||||
|
@ -991,12 +992,12 @@ function generate_grain_\type\()_16bpc_neon, export=1
|
|||
ldr w9, [x1, #FGD_GRAIN_SCALE_SHIFT]
|
||||
add x4, x1, #FGD_AR_COEFFS_UV
|
||||
add w9, w9, w15 // grain_scale_shift - bitdepth_min_8
|
||||
adr x16, L(gen_grain_\type\()_tbl)
|
||||
movrel x16, gen_grain_\type\()_tbl
|
||||
ldr w17, [x1, #FGD_AR_COEFF_LAG]
|
||||
add w9, w9, #4
|
||||
ldrh w17, [x16, w17, uxtw #1]
|
||||
ldrsw x17, [x16, w17, uxtw #2]
|
||||
dup v31.8h, w9 // 4 - bitdepth_min_8 + data->grain_scale_shift
|
||||
sub x16, x16, w17, uxtw
|
||||
add x16, x16, x17
|
||||
neg v31.8h, v31.8h
|
||||
|
||||
cmp w13, #0
|
||||
|
@ -1155,13 +1156,14 @@ L(generate_grain_\type\()_lag3):
|
|||
ldp x30, x19, [sp], #96
|
||||
AARCH64_VALIDATE_LINK_REGISTER
|
||||
ret
|
||||
|
||||
L(gen_grain_\type\()_tbl):
|
||||
.hword L(gen_grain_\type\()_tbl) - L(generate_grain_\type\()_lag0)
|
||||
.hword L(gen_grain_\type\()_tbl) - L(generate_grain_\type\()_lag1)
|
||||
.hword L(gen_grain_\type\()_tbl) - L(generate_grain_\type\()_lag2)
|
||||
.hword L(gen_grain_\type\()_tbl) - L(generate_grain_\type\()_lag3)
|
||||
endfunc
|
||||
|
||||
jumptable gen_grain_\type\()_tbl
|
||||
.word L(generate_grain_\type\()_lag0) - gen_grain_\type\()_tbl
|
||||
.word L(generate_grain_\type\()_lag1) - gen_grain_\type\()_tbl
|
||||
.word L(generate_grain_\type\()_lag2) - gen_grain_\type\()_tbl
|
||||
.word L(generate_grain_\type\()_lag3) - gen_grain_\type\()_tbl
|
||||
endjumptable
|
||||
.endm
|
||||
|
||||
gen_grain_44 uv_420
|
||||
|
@ -1306,18 +1308,18 @@ function fgy_32x32_16bpc_neon, export=1
|
|||
add_offset x5, w6, x10, x5, x9
|
||||
|
||||
ldr w11, [sp, #88] // type
|
||||
adr x13, L(fgy_loop_tbl)
|
||||
movrel x13, fgy_loop_tbl
|
||||
|
||||
add x4, x12, #32*2 // grain_lut += FG_BLOCK_SIZE * bx
|
||||
add x6, x14, x9, lsl #5 // grain_lut += grain_stride * FG_BLOCK_SIZE * by
|
||||
|
||||
tst w11, #1
|
||||
ldrh w11, [x13, w11, uxtw #1]
|
||||
ldrsw x11, [x13, w11, uxtw #2]
|
||||
|
||||
add x8, x16, x9, lsl #5 // grain_lut += grain_stride * FG_BLOCK_SIZE * by
|
||||
add x8, x8, #32*2 // grain_lut += FG_BLOCK_SIZE * bx
|
||||
|
||||
sub x11, x13, w11, uxtw
|
||||
add x11, x13, x11
|
||||
|
||||
b.eq 1f
|
||||
// y overlap
|
||||
|
@ -1480,14 +1482,15 @@ L(loop_\ox\oy):
|
|||
fgy 0, 1
|
||||
fgy 1, 0
|
||||
fgy 1, 1
|
||||
|
||||
L(fgy_loop_tbl):
|
||||
.hword L(fgy_loop_tbl) - L(loop_00)
|
||||
.hword L(fgy_loop_tbl) - L(loop_01)
|
||||
.hword L(fgy_loop_tbl) - L(loop_10)
|
||||
.hword L(fgy_loop_tbl) - L(loop_11)
|
||||
endfunc
|
||||
|
||||
jumptable fgy_loop_tbl
|
||||
.word L(loop_00) - fgy_loop_tbl
|
||||
.word L(loop_01) - fgy_loop_tbl
|
||||
.word L(loop_10) - fgy_loop_tbl
|
||||
.word L(loop_11) - fgy_loop_tbl
|
||||
endjumptable
|
||||
|
||||
// void dav1d_fguv_32x32_420_16bpc_neon(pixel *const dst,
|
||||
// const pixel *const src,
|
||||
// const ptrdiff_t stride,
|
||||
|
@ -1589,11 +1592,11 @@ function fguv_32x32_\layout\()_16bpc_neon, export=1
|
|||
ldr w13, [sp, #112] // type
|
||||
|
||||
movrel x16, overlap_coeffs_\sx
|
||||
adr x14, L(fguv_loop_sx\sx\()_tbl)
|
||||
movrel x14, fguv_loop_sx\sx\()_tbl
|
||||
|
||||
ld1 {v27.4h, v28.4h}, [x16] // overlap_coeffs
|
||||
tst w13, #1
|
||||
ldrh w13, [x14, w13, uxtw #1]
|
||||
ldrsw x13, [x14, w13, uxtw #2]
|
||||
|
||||
b.eq 1f
|
||||
// y overlap
|
||||
|
@ -1601,7 +1604,7 @@ function fguv_32x32_\layout\()_16bpc_neon, export=1
|
|||
mov w9, #(2 >> \sy)
|
||||
|
||||
1:
|
||||
sub x13, x14, w13, uxtw
|
||||
add x13, x14, x13
|
||||
|
||||
.if \sy
|
||||
movi v25.8h, #23
|
||||
|
@ -1818,18 +1821,19 @@ L(fguv_loop_sx0_csfl\csfl\()_\ox\oy):
|
|||
ldr x30, [sp], #80
|
||||
AARCH64_VALIDATE_LINK_REGISTER
|
||||
ret
|
||||
|
||||
L(fguv_loop_sx0_tbl):
|
||||
.hword L(fguv_loop_sx0_tbl) - L(fguv_loop_sx0_csfl0_00)
|
||||
.hword L(fguv_loop_sx0_tbl) - L(fguv_loop_sx0_csfl0_01)
|
||||
.hword L(fguv_loop_sx0_tbl) - L(fguv_loop_sx0_csfl0_10)
|
||||
.hword L(fguv_loop_sx0_tbl) - L(fguv_loop_sx0_csfl0_11)
|
||||
.hword L(fguv_loop_sx0_tbl) - L(fguv_loop_sx0_csfl1_00)
|
||||
.hword L(fguv_loop_sx0_tbl) - L(fguv_loop_sx0_csfl1_01)
|
||||
.hword L(fguv_loop_sx0_tbl) - L(fguv_loop_sx0_csfl1_10)
|
||||
.hword L(fguv_loop_sx0_tbl) - L(fguv_loop_sx0_csfl1_11)
|
||||
endfunc
|
||||
|
||||
jumptable fguv_loop_sx0_tbl
|
||||
.word L(fguv_loop_sx0_csfl0_00) - fguv_loop_sx0_tbl
|
||||
.word L(fguv_loop_sx0_csfl0_01) - fguv_loop_sx0_tbl
|
||||
.word L(fguv_loop_sx0_csfl0_10) - fguv_loop_sx0_tbl
|
||||
.word L(fguv_loop_sx0_csfl0_11) - fguv_loop_sx0_tbl
|
||||
.word L(fguv_loop_sx0_csfl1_00) - fguv_loop_sx0_tbl
|
||||
.word L(fguv_loop_sx0_csfl1_01) - fguv_loop_sx0_tbl
|
||||
.word L(fguv_loop_sx0_csfl1_10) - fguv_loop_sx0_tbl
|
||||
.word L(fguv_loop_sx0_csfl1_11) - fguv_loop_sx0_tbl
|
||||
endjumptable
|
||||
|
||||
function fguv_loop_sx1_neon
|
||||
.macro fguv_loop_sx1 csfl, ox, oy
|
||||
L(fguv_loop_sx1_csfl\csfl\()_\ox\oy):
|
||||
|
@ -1984,14 +1988,15 @@ L(fguv_loop_sx1_csfl\csfl\()_\ox\oy):
|
|||
ldr x30, [sp], #80
|
||||
AARCH64_VALIDATE_LINK_REGISTER
|
||||
ret
|
||||
|
||||
L(fguv_loop_sx1_tbl):
|
||||
.hword L(fguv_loop_sx1_tbl) - L(fguv_loop_sx1_csfl0_00)
|
||||
.hword L(fguv_loop_sx1_tbl) - L(fguv_loop_sx1_csfl0_01)
|
||||
.hword L(fguv_loop_sx1_tbl) - L(fguv_loop_sx1_csfl0_10)
|
||||
.hword L(fguv_loop_sx1_tbl) - L(fguv_loop_sx1_csfl0_11)
|
||||
.hword L(fguv_loop_sx1_tbl) - L(fguv_loop_sx1_csfl1_00)
|
||||
.hword L(fguv_loop_sx1_tbl) - L(fguv_loop_sx1_csfl1_01)
|
||||
.hword L(fguv_loop_sx1_tbl) - L(fguv_loop_sx1_csfl1_10)
|
||||
.hword L(fguv_loop_sx1_tbl) - L(fguv_loop_sx1_csfl1_11)
|
||||
endfunc
|
||||
|
||||
jumptable fguv_loop_sx1_tbl
|
||||
.word L(fguv_loop_sx1_csfl0_00) - fguv_loop_sx1_tbl
|
||||
.word L(fguv_loop_sx1_csfl0_01) - fguv_loop_sx1_tbl
|
||||
.word L(fguv_loop_sx1_csfl0_10) - fguv_loop_sx1_tbl
|
||||
.word L(fguv_loop_sx1_csfl0_11) - fguv_loop_sx1_tbl
|
||||
.word L(fguv_loop_sx1_csfl1_00) - fguv_loop_sx1_tbl
|
||||
.word L(fguv_loop_sx1_csfl1_01) - fguv_loop_sx1_tbl
|
||||
.word L(fguv_loop_sx1_csfl1_10) - fguv_loop_sx1_tbl
|
||||
.word L(fguv_loop_sx1_csfl1_11) - fguv_loop_sx1_tbl
|
||||
endjumptable
|
||||
|
|
The diffs for two files are not shown because of their large size.
|
@ -28,14 +28,77 @@
|
|||
#include "src/arm/asm.S"
|
||||
#include "util.S"
|
||||
|
||||
// Series of LUTs for efficiently computing sgr's 1 - x/(x+1) table.
|
||||
// In the comments, let RefTable denote the original, reference table.
|
||||
const x_by_x_tables
|
||||
// RangeMins
|
||||
//
|
||||
// Min(RefTable[i*8:i*8+8])
|
||||
// First two values are zeroed.
|
||||
//
|
||||
// Lookup using RangeMins[(x >> 3)]
|
||||
.byte 0, 0, 11, 8, 6, 5, 5, 4, 4, 3, 3, 3, 2, 2, 2, 2
|
||||
.byte 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0
|
||||
|
||||
// DiffMasks
|
||||
//
|
||||
// This contains a bit pattern, indicating at which index positions the value of RefTable changes. For each range
|
||||
// in the RangeMins table (covering 8 RefTable entries), we have one byte; each bit indicates whether the value of
|
||||
// RefTable changes at that particular index.
|
||||
// Using popcount, we can integrate the diff bit field. By shifting away bits in a byte, we can refine the range of
|
||||
// the integral. Finally, adding the integral to RangeMins[(x>>3)] reconstructs RefTable (for x > 15).
|
||||
//
|
||||
// Lookup using DiffMasks[(x >> 3)]
|
||||
.byte 0x00, 0x00, 0xD4, 0x44
|
||||
.byte 0x42, 0x04, 0x00, 0x00
|
||||
.byte 0x00, 0x80, 0x00, 0x00
|
||||
.byte 0x04, 0x00, 0x00, 0x00
|
||||
.byte 0x00, 0x00, 0x00, 0x00
|
||||
.byte 0x00, 0x40, 0x00, 0x00
|
||||
.byte 0x00, 0x00, 0x00, 0x00
|
||||
.byte 0x00, 0x00, 0x00, 0x02
|
||||
// Binary form:
|
||||
// 0b00000000, 0b00000000, 0b11010100, 0b01000100
|
||||
// 0b01000010, 0b00000100, 0b00000000, 0b00000000
|
||||
// 0b00000000, 0b10000000, 0b00000000, 0b00000000
|
||||
// 0b00000100, 0b00000000, 0b00000000, 0b00000000
|
||||
// 0b00000000, 0b00000000, 0b00000000, 0b00000000
|
||||
// 0b00000000, 0b01000000, 0b00000000, 0b00000000
|
||||
// 0b00000000, 0b00000000, 0b00000000, 0b00000000
|
||||
// 0b00000000, 0b00000000, 0b00000000, 0b00000010
|
||||
|
||||
// RefLo
|
||||
//
|
||||
// RefTable[0:16]
|
||||
// i.e. First 16 elements of the original table.
|
||||
// Add to the sum obtained in the rest of the other lut logic to include the first 16 bytes of RefTable.
|
||||
//
|
||||
// Lookup using RangeMins[x] (tbl will replace x > 15 with 0)
|
||||
.byte 255, 128, 85, 64, 51, 43, 37, 32, 28, 26, 23, 21, 20, 18, 17, 16
|
||||
|
||||
// Pseudo assembly
|
||||
//
|
||||
// hi_bits = x >> 3
|
||||
// tbl ref, {RefLo}, x
|
||||
// tbl diffs, {DiffMasks[0:16], DiffMasks[16:32]}, hi_bits
|
||||
// tbl min, {RangeMins[0:16], RangeMins[16:32]}, hi_bits
|
||||
// lo_bits = x & 0x7
|
||||
// diffs = diffs << lo_bits
|
||||
// ref = ref + min
|
||||
// integral = popcnt(diffs)
|
||||
// ref = ref + integral
|
||||
// return ref
|
||||
endconst
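As a readability aid, a scalar C transcription of the pseudo assembly above; the function name is descriptive only, and the three byte arrays are the RangeMins, DiffMasks and RefLo data from this const block:

#include <stdint.h>

static uint8_t x_by_x_lookup(const uint8_t range_mins[32],
                             const uint8_t diff_masks[32],
                             const uint8_t ref_lo[16], const uint8_t x)
{
    const unsigned hi_bits = x >> 3;
    const unsigned lo_bits = x & 0x7;
    /* tbl yields 0 for out-of-range indices, so RefLo only contributes
     * for x < 16. */
    unsigned ref = (x < 16) ? ref_lo[x] : 0;
    uint8_t diffs = diff_masks[hi_bits];
    ref += range_mins[hi_bits];
    /* A byte-wise left shift drops the bits that leave the byte, like ushl;
     * the popcount of what remains integrates the "value changes here"
     * flags that still apply to index x. (__builtin_popcount: GCC/Clang.) */
    diffs = (uint8_t)(diffs << lo_bits);
    ref += (unsigned)__builtin_popcount(diffs);
    return (uint8_t)ref;
}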
|
||||
|
||||
// void dav1d_sgr_box3_vert_neon(int32_t **sumsq, int16_t **sum,
|
||||
// int32_t *AA, int16_t *BB,
|
||||
// const int w, const int s,
|
||||
// const int bitdepth_max);
|
||||
function sgr_box3_vert_neon, export=1
|
||||
stp d8, d9, [sp, #-0x30]!
|
||||
stp d8, d9, [sp, #-0x40]!
|
||||
stp d10, d11, [sp, #0x10]
|
||||
stp d12, d13, [sp, #0x20]
|
||||
stp d14, d15, [sp, #0x30]
|
||||
|
||||
add w4, w4, #2
|
||||
clz w9, w6 // bitdepth_max
|
||||
|
@ -49,41 +112,176 @@ function sgr_box3_vert_neon, export=1
|
|||
movi v31.4s, #9 // n
|
||||
|
||||
sub w9, w9, #24 // -bitdepth_min_8
|
||||
movrel x12, X(sgr_x_by_x)
|
||||
movrel x12, x_by_x_tables
|
||||
mov w13, #455 // one_by_x
|
||||
ld1 {v16.16b, v17.16b, v18.16b}, [x12]
|
||||
ld1 {v24.16b, v25.16b, v26.16b, v27.16b}, [x12] // RangeMins, DiffMasks
|
||||
movi v22.16b, #0x7
|
||||
ldr q23, [x12, #64] //RefLo
|
||||
dup v6.8h, w9 // -bitdepth_min_8
|
||||
movi v19.16b, #5
|
||||
movi v20.8b, #55 // idx of last 5
|
||||
movi v21.8b, #72 // idx of last 4
|
||||
movi v22.8b, #101 // idx of last 3
|
||||
movi v23.8b, #169 // idx of last 2
|
||||
movi v24.8b, #254 // idx of last 1
|
||||
saddl v7.4s, v6.4h, v6.4h // -2*bitdepth_min_8
|
||||
movi v29.8h, #1, lsl #8
|
||||
dup v30.4s, w13 // one_by_x
|
||||
|
||||
sub v16.16b, v16.16b, v19.16b
|
||||
sub v17.16b, v17.16b, v19.16b
|
||||
sub v18.16b, v18.16b, v19.16b
|
||||
ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [x5], #64
|
||||
ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [x6], #64
|
||||
ld1 {v16.4s, v17.4s, v18.4s, v19.4s}, [x0], #64
|
||||
ld1 {v20.8h, v21.8h}, [x8], #32
|
||||
ld1 {v0.8h, v1.8h}, [x7], #32
|
||||
1:
|
||||
ld1 {v2.8h, v3.8h}, [x1], #32
|
||||
add v8.4s, v8.4s, v12.4s
|
||||
add v9.4s, v9.4s, v13.4s
|
||||
add v10.4s, v10.4s, v14.4s
|
||||
add v11.4s, v11.4s, v15.4s
|
||||
add v0.8h, v0.8h, v20.8h
|
||||
add v1.8h, v1.8h, v21.8h
|
||||
|
||||
add v16.4s, v16.4s, v8.4s
|
||||
add v17.4s, v17.4s, v9.4s
|
||||
add v18.4s, v18.4s, v10.4s
|
||||
add v19.4s, v19.4s, v11.4s
|
||||
add v4.8h, v2.8h, v0.8h
|
||||
add v5.8h, v3.8h, v1.8h
|
||||
|
||||
srshl v16.4s, v16.4s, v7.4s
|
||||
srshl v17.4s, v17.4s, v7.4s
|
||||
srshl v18.4s, v18.4s, v7.4s
|
||||
srshl v19.4s, v19.4s, v7.4s
|
||||
srshl v9.8h, v4.8h, v6.8h
|
||||
srshl v13.8h, v5.8h, v6.8h
|
||||
mul v16.4s, v16.4s, v31.4s // a * n
|
||||
mul v17.4s, v17.4s, v31.4s // a * n
|
||||
mul v18.4s, v18.4s, v31.4s // a * n
|
||||
mul v19.4s, v19.4s, v31.4s // a * n
|
||||
umull v8.4s, v9.4h, v9.4h // b * b
|
||||
umull2 v9.4s, v9.8h, v9.8h // b * b
|
||||
umull v12.4s, v13.4h, v13.4h // b * b
|
||||
umull2 v13.4s, v13.8h, v13.8h // b * b
|
||||
uqsub v16.4s, v16.4s, v8.4s // imax(a * n - b * b, 0)
|
||||
uqsub v17.4s, v17.4s, v9.4s // imax(a * n - b * b, 0)
|
||||
uqsub v18.4s, v18.4s, v12.4s // imax(a * n - b * b, 0)
|
||||
uqsub v19.4s, v19.4s, v13.4s // imax(a * n - b * b, 0)
|
||||
mul v16.4s, v16.4s, v28.4s // p * s
|
||||
mul v17.4s, v17.4s, v28.4s // p * s
|
||||
mul v18.4s, v18.4s, v28.4s // p * s
|
||||
mul v19.4s, v19.4s, v28.4s // p * s
|
||||
uqshrn v16.4h, v16.4s, #16
|
||||
uqshrn2 v16.8h, v17.4s, #16
|
||||
uqshrn v18.4h, v18.4s, #16
|
||||
uqshrn2 v18.8h, v19.4s, #16
|
||||
uqrshrn v1.8b, v16.8h, #4 // imin(z, 255)
|
||||
uqrshrn2 v1.16b, v18.8h, #4 // imin(z, 255)
|
||||
|
||||
ld1 {v16.4s, v17.4s}, [x0], #32
|
||||
subs w4, w4, #16
|
||||
|
||||
ushr v0.16b, v1.16b, #3
|
||||
ld1 {v8.4s, v9.4s}, [x5], #32
|
||||
tbl v2.16b, {v26.16b, v27.16b}, v0.16b // RangeMins
|
||||
tbl v0.16b, {v24.16b, v25.16b}, v0.16b // DiffMasks
|
||||
tbl v3.16b, {v23.16b}, v1.16b // RefLo
|
||||
and v1.16b, v1.16b, v22.16b
|
||||
ld1 {v12.4s, v13.4s}, [x6], #32
|
||||
ushl v1.16b, v2.16b, v1.16b
|
||||
ld1 {v20.8h, v21.8h}, [x8], #32
|
||||
add v3.16b, v3.16b, v0.16b
|
||||
cnt v1.16b, v1.16b
|
||||
ld1 {v18.4s, v19.4s}, [x0], #32
|
||||
add v3.16b, v3.16b, v1.16b
|
||||
ld1 {v10.4s, v11.4s}, [x5], #32
|
||||
uxtl v0.8h, v3.8b // x
|
||||
uxtl2 v1.8h, v3.16b // x
|
||||
|
||||
ld1 {v14.4s, v15.4s}, [x6], #32
|
||||
|
||||
umull v2.4s, v0.4h, v4.4h // x * BB[i]
|
||||
umull2 v3.4s, v0.8h, v4.8h // x * BB[i]
|
||||
umull v4.4s, v1.4h, v5.4h // x * BB[i]
|
||||
umull2 v5.4s, v1.8h, v5.8h // x * BB[i]
|
||||
sub v0.8h, v29.8h, v0.8h // 256 - x
|
||||
sub v1.8h, v29.8h, v1.8h // 256 - x
|
||||
mul v2.4s, v2.4s, v30.4s // x * BB[i] * sgr_one_by_x
|
||||
mul v3.4s, v3.4s, v30.4s // x * BB[i] * sgr_one_by_x
|
||||
mul v4.4s, v4.4s, v30.4s // x * BB[i] * sgr_one_by_x
|
||||
mul v5.4s, v5.4s, v30.4s // x * BB[i] * sgr_one_by_x
|
||||
st1 {v0.8h, v1.8h}, [x3], #32
|
||||
ld1 {v0.8h, v1.8h}, [x7], #32
|
||||
srshr v2.4s, v2.4s, #12 // AA[i]
|
||||
srshr v3.4s, v3.4s, #12 // AA[i]
|
||||
srshr v4.4s, v4.4s, #12 // AA[i]
|
||||
srshr v5.4s, v5.4s, #12 // AA[i]
|
||||
|
||||
st1 {v2.4s, v3.4s, v4.4s, v5.4s}, [x2], #64
|
||||
b.gt 1b
|
||||
|
||||
ldp d14, d15, [sp, #0x30]
|
||||
ldp d12, d13, [sp, #0x20]
|
||||
ldp d10, d11, [sp, #0x10]
|
||||
ldp d8, d9, [sp], 0x40
|
||||
ret
|
||||
endfunc
|
||||
|
||||
// void dav1d_sgr_box5_vert_neon(int32_t **sumsq, int16_t **sum,
|
||||
// int32_t *AA, int16_t *BB,
|
||||
// const int w, const int s,
|
||||
// const int bitdepth_max);
|
||||
function sgr_box5_vert_neon, export=1
|
||||
stp d8, d9, [sp, #-0x30]!
|
||||
stp d10, d11, [sp, #0x10]
|
||||
stp d12, d13, [sp, #0x20]
|
||||
|
||||
add w4, w4, #2
|
||||
clz w15, w6 // bitdepth_max
|
||||
dup v28.4s, w5 // strength
|
||||
|
||||
ldp x5, x6, [x0]
|
||||
ldp x7, x8, [x0, #16]
|
||||
ldr x0, [x0, #32]
|
||||
ldp x9, x10, [x1]
|
||||
ldp x11, x12, [x1, #16]
|
||||
ldr x1, [x1, #32]
|
||||
|
||||
movi v31.4s, #25 // n
|
||||
|
||||
sub w15, w15, #24 // -bitdepth_min_8
|
||||
movrel x13, x_by_x_tables
|
||||
movi v30.4s, #164
|
||||
ld1 {v24.16b, v25.16b, v26.16b, v27.16b}, [x13] // RangeMins, DiffMasks
|
||||
dup v6.8h, w15 // -bitdepth_min_8
|
||||
movi v19.8b, #0x7
|
||||
ldr q18, [x13, #64] // RefLo
|
||||
saddl v7.4s, v6.4h, v6.4h // -2*bitdepth_min_8
|
||||
movi v29.8h, #1, lsl #8
|
||||
|
||||
ld1 {v8.4s, v9.4s}, [x5], #32
|
||||
ld1 {v10.4s, v11.4s}, [x6], #32
|
||||
ld1 {v12.8h}, [x7], #16
|
||||
ld1 {v13.8h}, [x8], #16
|
||||
ld1 {v0.4s, v1.4s}, [x0], #32
|
||||
ld1 {v12.4s, v13.4s}, [x7], #32
|
||||
ld1 {v16.4s, v17.4s}, [x8], #32
|
||||
ld1 {v20.8h}, [x9], #16
|
||||
ld1 {v21.8h}, [x10], #16
|
||||
ld1 {v22.8h}, [x11], #16
|
||||
ld1 {v23.8h}, [x12], #16
|
||||
ld1 {v0.4s, v1.4s}, [x0], #32
|
||||
ld1 {v2.8h}, [x1], #16
|
||||
1:
|
||||
|
||||
1:
|
||||
add v8.4s, v8.4s, v10.4s
|
||||
add v9.4s, v9.4s, v11.4s
|
||||
add v12.4s, v12.4s, v16.4s
|
||||
add v13.4s, v13.4s, v17.4s
|
||||
|
||||
add v12.8h, v12.8h, v13.8h
|
||||
add v20.8h, v20.8h, v21.8h
|
||||
add v22.8h, v22.8h, v23.8h
|
||||
|
||||
subs w4, w4, #8
|
||||
add v0.4s, v0.4s, v8.4s
|
||||
add v1.4s, v1.4s, v9.4s
|
||||
add v2.8h, v2.8h, v12.8h
|
||||
add v2.8h, v2.8h, v20.8h
|
||||
|
||||
add v0.4s, v0.4s, v12.4s
|
||||
add v1.4s, v1.4s, v13.4s
|
||||
add v2.8h, v2.8h, v22.8h
|
||||
|
||||
subs w4, w4, #8
|
||||
|
||||
srshl v0.4s, v0.4s, v7.4s
|
||||
srshl v1.4s, v1.4s, v7.4s
|
||||
|
@ -102,24 +300,25 @@ function sgr_box3_vert_neon, export=1
|
|||
ld1 {v10.4s, v11.4s}, [x6], #32
|
||||
uqrshrn v0.8b, v0.8h, #4 // imin(z, 255)
|
||||
|
||||
ld1 {v12.8h}, [x7], #16
|
||||
ld1 {v12.4s, v13.4s}, [x7], #32
|
||||
|
||||
cmhi v25.8b, v0.8b, v20.8b // = -1 if sgr_x_by_x[v0] < 5
|
||||
cmhi v26.8b, v0.8b, v21.8b // = -1 if sgr_x_by_x[v0] < 4
|
||||
tbl v1.8b, {v16.16b, v17.16b, v18.16b}, v0.8b
|
||||
cmhi v27.8b, v0.8b, v22.8b // = -1 if sgr_x_by_x[v0] < 3
|
||||
cmhi v4.8b, v0.8b, v23.8b // = -1 if sgr_x_by_x[v0] < 2
|
||||
add v25.8b, v25.8b, v26.8b
|
||||
cmhi v5.8b, v0.8b, v24.8b // = -1 if sgr_x_by_x[v0] < 1
|
||||
add v27.8b, v27.8b, v4.8b
|
||||
add v5.8b, v5.8b, v19.8b
|
||||
add v25.8b, v25.8b, v27.8b
|
||||
add v5.8b, v1.8b, v5.8b
|
||||
ld1 {v13.8h}, [x8], #16
|
||||
add v5.8b, v5.8b, v25.8b
|
||||
ld1 {v0.4s, v1.4s}, [x0], #32
|
||||
ushr v1.8b, v0.8b, #3
|
||||
ld1 {v16.4s, v17.4s}, [x8], #32
|
||||
tbl v5.8b, {v26.16b, v27.16b}, v1.8b // RangeMins
|
||||
tbl v1.8b, {v24.16b, v25.16b}, v1.8b // DiffMasks
|
||||
tbl v4.8b, {v18.16b}, v0.8b // RefLo
|
||||
and v0.8b, v0.8b, v19.8b
|
||||
ld1 {v20.8h}, [x9], #16
|
||||
ushl v5.8b, v5.8b, v0.8b
|
||||
add v4.8b, v4.8b, v1.8b
|
||||
ld1 {v21.8h}, [x10], #16
|
||||
cnt v5.8b, v5.8b
|
||||
ld1 {v22.8h}, [x11], #16
|
||||
add v5.8b, v4.8b, v5.8b
|
||||
ld1 {v23.8h}, [x12], #16
|
||||
uxtl v5.8h, v5.8b // x
|
||||
|
||||
ld1 {v0.4s, v1.4s}, [x0], #32
|
||||
umull v3.4s, v5.4h, v2.4h // x * BB[i]
|
||||
umull2 v4.4s, v5.8h, v2.8h // x * BB[i]
|
||||
mul v3.4s, v3.4s, v30.4s // x * BB[i] * sgr_one_by_x
|
||||
|
@ -138,135 +337,3 @@ function sgr_box3_vert_neon, export=1
|
|||
ldp d8, d9, [sp], 0x30
|
||||
ret
|
||||
endfunc
|
||||
|
||||
// void dav1d_sgr_box5_vert_neon(int32_t **sumsq, int16_t **sum,
|
||||
// int32_t *AA, int16_t *BB,
|
||||
// const int w, const int s,
|
||||
// const int bitdepth_max);
|
||||
function sgr_box5_vert_neon, export=1
|
||||
stp d8, d9, [sp, #-0x40]!
|
||||
stp d10, d11, [sp, #0x10]
|
||||
stp d12, d13, [sp, #0x20]
|
||||
stp d14, d15, [sp, #0x30]
|
||||
|
||||
add w4, w4, #2
|
||||
clz w15, w6 // bitdepth_max
|
||||
dup v28.4s, w5 // strength
|
||||
|
||||
ldp x5, x6, [x0]
|
||||
ldp x7, x8, [x0, #16]
|
||||
ldr x0, [x0, #32]
|
||||
ldp x9, x10, [x1]
|
||||
ldp x11, x12, [x1, #16]
|
||||
ldr x1, [x1, #32]
|
||||
|
||||
movi v31.4s, #25 // n
|
||||
|
||||
sub w15, w15, #24 // -bitdepth_min_8
|
||||
movrel x13, X(sgr_x_by_x)
|
||||
mov w14, #164 // one_by_x
|
||||
ld1 {v16.16b, v17.16b, v18.16b}, [x13]
|
||||
dup v6.8h, w15 // -bitdepth_min_8
|
||||
movi v19.16b, #5
|
||||
movi v24.8b, #254 // idx of last 1
|
||||
saddl v7.4s, v6.4h, v6.4h // -2*bitdepth_min_8
|
||||
movi v29.8h, #1, lsl #8
|
||||
dup v30.4s, w14 // one_by_x
|
||||
|
||||
sub v16.16b, v16.16b, v19.16b
|
||||
sub v17.16b, v17.16b, v19.16b
|
||||
sub v18.16b, v18.16b, v19.16b
|
||||
|
||||
ld1 {v8.4s, v9.4s}, [x5], #32
|
||||
ld1 {v10.4s, v11.4s}, [x6], #32
|
||||
ld1 {v12.4s, v13.4s}, [x7], #32
|
||||
ld1 {v14.4s, v15.4s}, [x8], #32
|
||||
ld1 {v20.8h}, [x9], #16
|
||||
ld1 {v21.8h}, [x10], #16
|
||||
ld1 {v22.8h}, [x11], #16
|
||||
ld1 {v23.8h}, [x12], #16
|
||||
ld1 {v0.4s, v1.4s}, [x0], #32
|
||||
ld1 {v2.8h}, [x1], #16
|
||||
|
||||
1:
|
||||
add v8.4s, v8.4s, v10.4s
|
||||
add v9.4s, v9.4s, v11.4s
|
||||
add v12.4s, v12.4s, v14.4s
|
||||
add v13.4s, v13.4s, v15.4s
|
||||
|
||||
add v20.8h, v20.8h, v21.8h
|
||||
add v22.8h, v22.8h, v23.8h
|
||||
|
||||
add v0.4s, v0.4s, v8.4s
|
||||
add v1.4s, v1.4s, v9.4s
|
||||
add v2.8h, v2.8h, v20.8h
|
||||
|
||||
add v0.4s, v0.4s, v12.4s
|
||||
add v1.4s, v1.4s, v13.4s
|
||||
add v2.8h, v2.8h, v22.8h
|
||||
|
||||
subs w4, w4, #8
|
||||
|
||||
movi v20.8b, #55 // idx of last 5
|
||||
movi v21.8b, #72 // idx of last 4
|
||||
movi v22.8b, #101 // idx of last 3
|
||||
movi v23.8b, #169 // idx of last 2
|
||||
|
||||
srshl v0.4s, v0.4s, v7.4s
|
||||
srshl v1.4s, v1.4s, v7.4s
|
||||
srshl v4.8h, v2.8h, v6.8h
|
||||
mul v0.4s, v0.4s, v31.4s // a * n
|
||||
mul v1.4s, v1.4s, v31.4s // a * n
|
||||
umull v3.4s, v4.4h, v4.4h // b * b
|
||||
umull2 v4.4s, v4.8h, v4.8h // b * b
|
||||
uqsub v0.4s, v0.4s, v3.4s // imax(a * n - b * b, 0)
|
||||
uqsub v1.4s, v1.4s, v4.4s // imax(a * n - b * b, 0)
|
||||
mul v0.4s, v0.4s, v28.4s // p * s
|
||||
mul v1.4s, v1.4s, v28.4s // p * s
|
||||
ld1 {v8.4s, v9.4s}, [x5], #32
|
||||
uqshrn v0.4h, v0.4s, #16
|
||||
uqshrn2 v0.8h, v1.4s, #16
|
||||
ld1 {v10.4s, v11.4s}, [x6], #32
|
||||
uqrshrn v0.8b, v0.8h, #4 // imin(z, 255)
|
||||
|
||||
ld1 {v12.4s, v13.4s}, [x7], #32
|
||||
|
||||
cmhi v25.8b, v0.8b, v20.8b // = -1 if sgr_x_by_x[v0] < 5
|
||||
cmhi v26.8b, v0.8b, v21.8b // = -1 if sgr_x_by_x[v0] < 4
|
||||
tbl v1.8b, {v16.16b, v17.16b, v18.16b}, v0.8b
|
||||
cmhi v27.8b, v0.8b, v22.8b // = -1 if sgr_x_by_x[v0] < 3
|
||||
cmhi v4.8b, v0.8b, v23.8b // = -1 if sgr_x_by_x[v0] < 2
|
||||
ld1 {v14.4s, v15.4s}, [x8], #32
|
||||
add v25.8b, v25.8b, v26.8b
|
||||
cmhi v5.8b, v0.8b, v24.8b // = -1 if sgr_x_by_x[v0] < 1
|
||||
add v27.8b, v27.8b, v4.8b
|
||||
ld1 {v20.8h}, [x9], #16
|
||||
add v5.8b, v5.8b, v19.8b
|
||||
add v25.8b, v25.8b, v27.8b
|
||||
ld1 {v21.8h}, [x10], #16
|
||||
add v5.8b, v1.8b, v5.8b
|
||||
ld1 {v22.8h}, [x11], #16
|
||||
add v5.8b, v5.8b, v25.8b
|
||||
ld1 {v23.8h}, [x12], #16
|
||||
uxtl v5.8h, v5.8b // x
|
||||
|
||||
ld1 {v0.4s, v1.4s}, [x0], #32
|
||||
umull v3.4s, v5.4h, v2.4h // x * BB[i]
|
||||
umull2 v4.4s, v5.8h, v2.8h // x * BB[i]
|
||||
mul v3.4s, v3.4s, v30.4s // x * BB[i] * sgr_one_by_x
|
||||
mul v4.4s, v4.4s, v30.4s // x * BB[i] * sgr_one_by_x
|
||||
srshr v3.4s, v3.4s, #12 // AA[i]
|
||||
srshr v4.4s, v4.4s, #12 // AA[i]
|
||||
sub v5.8h, v29.8h, v5.8h // 256 - x
|
||||
ld1 {v2.8h}, [x1], #16
|
||||
|
||||
st1 {v3.4s, v4.4s}, [x2], #32
|
||||
st1 {v5.8h}, [x3], #16
|
||||
b.gt 1b
|
||||
|
||||
ldp d14, d15, [sp, #0x30]
|
||||
ldp d12, d13, [sp, #0x20]
|
||||
ldp d10, d11, [sp, #0x10]
|
||||
ldp d8, d9, [sp], 0x40
|
||||
ret
|
||||
endfunc
|
||||
|
|
The diffs for three files are not shown because of their large size.
|
@ -54,8 +54,14 @@ const h_tbl_neon_dotprod, align=4
|
|||
.byte 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10
|
||||
.byte 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14
|
||||
|
||||
// Shuffle indices to permute horizontal samples in preparation for
|
||||
// input to USMMLA instructions.
|
||||
#define OFFSET_USMMLA 48
|
||||
.byte 0, 1, 2, 3, 4, 5, 6, 7, 2, 3, 4, 5, 6, 7, 8, 9
|
||||
.byte 4, 5, 6, 7, 8, 9, 10, 11, 6, 7, 8, 9, 10, 11, 12, 13
|
||||
|
||||
// Lookup table used to help conversion of shifted 32-bit values to 8-bit.
|
||||
#define OFFSET_CVT_32_8 48
|
||||
#define OFFSET_CVT_32_8 80
|
||||
.byte 1, 2, 5, 6, 9, 10, 13, 14, 17, 18, 21, 22, 25, 26, 29, 30
|
||||
endconst
|
||||
|
||||
|
@ -114,10 +120,10 @@ L(\type\()_8tap_v_\isa):
|
|||
sub \src, \src, \s_strd
|
||||
.ifc \isa, neon_dotprod
|
||||
.ifc \type, prep
|
||||
mov w8, 0x2002 // FILTER_WEIGHT * 128 + rounding
|
||||
mov w8, #0x2002 // FILTER_WEIGHT * 128 + rounding
|
||||
dup v4.4s, w8
|
||||
.else
|
||||
movi v4.4s, #32, lsl 8 // FILTER_WEIGHT * 128, bias for SDOT
|
||||
movi v4.4s, #32, lsl #8 // FILTER_WEIGHT * 128, bias for SDOT
|
||||
.endif
|
||||
.endif
|
||||
ubfx w11, \my, #7, #7
|
||||
|
@ -677,18 +683,18 @@ L(\type\()_8tap_h_hv_\isa):
|
|||
madd \mx, \mx, w11, w9
|
||||
madd w14, \my, w11, w10 // for HV
|
||||
.ifc \isa, neon_dotprod
|
||||
mov w13, 0x2002 // FILTER_WEIGHT * 128 + rounding
|
||||
mov w13, #0x2002 // FILTER_WEIGHT * 128 + rounding
|
||||
dup v27.4s, w13 // put H overrides this
|
||||
.endif
|
||||
movrel x13, h_tbl_neon_dotprod
|
||||
sub \src, \src, #3 // src - 3
|
||||
ldr q28, [x13]
|
||||
ubfx w9, \mx, #7, #7
|
||||
ldr q28, [x13] // for 4-tap & 8-tap H filters
|
||||
ubfx w15, \mx, #7, #7
|
||||
and \mx, \mx, #0x7F
|
||||
ubfx w11, w14, #7, #7 // for HV
|
||||
and w14, w14, #0x7F // for HV
|
||||
cmp \w, #4
|
||||
csel \mx, \mx, w9, le
|
||||
csel \mx, \mx, w15, le
|
||||
add \xmx, x12, \xmx, lsl #3 // subpel H filter address
|
||||
.ifc \isa, neon_dotprod
|
||||
movi v24.16b, #128
|
||||
|
@ -706,7 +712,7 @@ L(\type\()_8tap_h_hv_\isa):
|
|||
ldr q25, [x13, #(OFFSET_CVT_32_8)] // LUT to help conversion
|
||||
.endif // of 32b values to 8b
|
||||
sxtl v7.8h, v7.8b
|
||||
cmp w10, SHARP1
|
||||
cmp w10, #SHARP1
|
||||
b.ne L(\type\()_6tap_hv_\isa) // vertical != SHARP1
|
||||
|
||||
// HV 8-tap cases
|
||||
|
@ -1005,11 +1011,92 @@ L(\type\()_6tap_hv_\isa):
|
|||
|
||||
// .align JUMP_ALIGN // fallthrough
|
||||
80: // HV6 - 8xN+
|
||||
ldp q29, q30, [x13, #16]
|
||||
ldr d26, [\xmx]
|
||||
.ifc \type, prep
|
||||
add \wd_strd, \w, \w
|
||||
.endif
|
||||
.ifc \isa, neon_i8mm
|
||||
cmp w9, #SHARP1
|
||||
b.eq 88f // horizontal == SHARP1
|
||||
|
||||
ldp q29, q30, [x13, #(OFFSET_USMMLA)]
|
||||
ext v0.8b, v26.8b, v26.8b, #7
|
||||
ins v26.d[1], v0.d[0]
|
||||
|
||||
.align LOOP_ALIGN
|
||||
81:
|
||||
mov \lsrc, \src
|
||||
mov \ldst, \dst
|
||||
mov w8, \h
|
||||
|
||||
bl L(\type\()_hv_filter6_neon_i8mm)
|
||||
srshr v16.8h, v22.8h, #2
|
||||
bl L(\type\()_hv_filter6_neon_i8mm)
|
||||
srshr v17.8h, v22.8h, #2
|
||||
bl L(\type\()_hv_filter6_neon_i8mm)
|
||||
srshr v18.8h, v22.8h, #2
|
||||
bl L(\type\()_hv_filter6_neon_i8mm)
|
||||
srshr v19.8h, v22.8h, #2
|
||||
bl L(\type\()_hv_filter6_neon_i8mm)
|
||||
srshr v20.8h, v22.8h, #2
|
||||
|
||||
.align LOOP_ALIGN
|
||||
8:
|
||||
ld1 {v23.16b}, [\lsrc], \s_strd
|
||||
|
||||
smull v0.4s, v16.4h, v7.h[1]
|
||||
smull2 v1.4s, v16.8h, v7.h[1]
|
||||
mov v16.16b, v17.16b
|
||||
movi v5.4s, #0
|
||||
movi v6.4s, #0
|
||||
tbl v2.16b, {v23.16b}, v29.16b
|
||||
tbl v3.16b, {v23.16b}, v30.16b
|
||||
|
||||
smlal v0.4s, v17.4h, v7.h[2]
|
||||
smlal2 v1.4s, v17.8h, v7.h[2]
|
||||
mov v17.16b, v18.16b
|
||||
|
||||
usmmla v5.4s, v2.16b, v26.16b
|
||||
usmmla v6.4s, v3.16b, v26.16b
|
||||
|
||||
smlal v0.4s, v18.4h, v7.h[3]
|
||||
smlal2 v1.4s, v18.8h, v7.h[3]
|
||||
mov v18.16b, v19.16b
|
||||
subs w8, w8, #1
|
||||
|
||||
smlal v0.4s, v19.4h, v7.h[4]
|
||||
smlal2 v1.4s, v19.8h, v7.h[4]
|
||||
uzp1 v23.8h, v5.8h, v6.8h
|
||||
mov v19.16b, v20.16b
|
||||
|
||||
smlal v0.4s, v20.4h, v7.h[5]
|
||||
smlal2 v1.4s, v20.8h, v7.h[5]
|
||||
srshr v20.8h, v23.8h, #2
|
||||
smlal v0.4s, v20.4h, v7.h[6]
|
||||
smlal2 v1.4s, v20.8h, v7.h[6]
|
||||
.ifc \type, prep
|
||||
rshrn v0.4h, v0.4s, #6
|
||||
rshrn2 v0.8h, v1.4s, #6
|
||||
st1 {v0.8h}, [\ldst], \d_strd
|
||||
b.gt 8b
|
||||
add \dst, \dst, #16
|
||||
.else
|
||||
tbl v0.16b, {v0.16b, v1.16b}, v25.16b
|
||||
sqrshrun v0.8b, v0.8h, #2
|
||||
st1 {v0.8b}, [\ldst], \d_strd
|
||||
b.gt 8b
|
||||
add \dst, \dst, #8
|
||||
.endif
|
||||
add \src, \src, #8
|
||||
subs \w, \w, #8
|
||||
b.gt 81b
|
||||
ret x15
|
||||
|
||||
.align JUMP_ALIGN
|
||||
88:
|
||||
.endif // neon_i8mm
|
||||
ldp q29, q30, [x13, #16]
|
||||
|
||||
.align LOOP_ALIGN
|
||||
81:
|
||||
mov \lsrc, \src
|
||||
|
@ -1040,8 +1127,8 @@ L(\type\()_6tap_hv_\isa):
|
|||
.endif
|
||||
.align LOOP_ALIGN
|
||||
8:
|
||||
ldr q23, [\xmy]
|
||||
add \xmy, \xmy, \s_strd
|
||||
ldr q23, [\lsrc]
|
||||
add \lsrc, \lsrc, \s_strd
|
||||
|
||||
smull v0.4s, v16.4h, v7.h[1]
|
||||
smull2 v1.4s, v16.8h, v7.h[1]
|
||||
|
@ -1128,6 +1215,20 @@ L(\type\()_hv_filter8_\isa):
|
|||
uzp1 v22.8h, v22.8h, v23.8h
|
||||
ret
|
||||
|
||||
.ifc \isa, neon_i8mm
|
||||
.align FUNC_ALIGN
|
||||
L(\type\()_hv_filter6_neon_i8mm):
|
||||
ld1 {v4.16b}, [\lsrc], \s_strd
|
||||
movi v22.4s, #0
|
||||
movi v23.4s, #0
|
||||
tbl v2.16b, {v4.16b}, v29.16b
|
||||
tbl v3.16b, {v4.16b}, v30.16b
|
||||
usmmla v22.4s, v2.16b, v26.16b
|
||||
usmmla v23.4s, v3.16b, v26.16b
|
||||
uzp1 v22.8h, v22.8h, v23.8h
|
||||
ret
|
||||
.endif
|
||||
|
||||
.align FUNC_ALIGN
|
||||
L(\type\()_hv_filter4_\isa):
|
||||
ld1 {v4.8b}, [\src], \s_strd
|
||||
|
@ -1264,8 +1365,8 @@ L(\type\()_hv_filter4_\isa):
|
|||
|
||||
.align JUMP_ALIGN
|
||||
L(\type\()_8tap_h_\isa):
|
||||
adr x9, L(\type\()_8tap_h_\isa\()_tbl)
|
||||
ldrh w8, [x9, x8, lsl #1]
|
||||
movrel x11, \type\()_8tap_h_\isa\()_tbl
|
||||
ldrsw x8, [x11, x8, lsl #2]
|
||||
.ifc \type, put
|
||||
.ifc \isa, neon_i8mm
|
||||
movi v27.4s, #34 // special rounding
|
||||
|
@ -1274,8 +1375,8 @@ L(\type\()_8tap_h_\isa):
|
|||
dup v27.4s, w10
|
||||
.endif
|
||||
.endif
|
||||
sub x9, x9, x8
|
||||
br x9
|
||||
add x11, x11, x8
|
||||
br x11
|
||||
|
||||
.ifc \type, put
|
||||
.align JUMP_ALIGN
|
||||
|
@ -1368,8 +1469,63 @@ L(\type\()_8tap_h_\isa):
|
|||
.align JUMP_ALIGN
|
||||
80: // H - 8xN
|
||||
AARCH64_VALID_JUMP_TARGET
|
||||
ldp q29, q30, [x13, #16]
|
||||
ldr d26, [\xmx]
|
||||
.ifc \isa, neon_i8mm
|
||||
cmp w9, #SHARP1
|
||||
b.eq 88f // horizontal == SHARP1
|
||||
|
||||
ldp q29, q30, [x13, #(OFFSET_USMMLA)]
|
||||
ext v0.8b, v26.8b, v26.8b, #7
|
||||
ins v26.d[1], v0.d[0]
|
||||
|
||||
.align LOOP_ALIGN
|
||||
8:
|
||||
ldr q0, [\src]
|
||||
ldr q16, [\src, \s_strd]
|
||||
add \src, \src, \s_strd, lsl #1
|
||||
.ifc \type, prep
|
||||
movi v4.4s, #0
|
||||
movi v5.4s, #0
|
||||
movi v20.4s, #0
|
||||
movi v21.4s, #0
|
||||
.else
|
||||
mov v4.16b, v27.16b
|
||||
mov v5.16b, v27.16b
|
||||
mov v20.16b, v27.16b
|
||||
mov v21.16b, v27.16b
|
||||
.endif
|
||||
tbl v1.16b, {v0.16b}, v29.16b
|
||||
tbl v2.16b, {v0.16b}, v30.16b
|
||||
tbl v17.16b, {v16.16b}, v29.16b
|
||||
tbl v18.16b, {v16.16b}, v30.16b
|
||||
|
||||
usmmla v4.4s, v1.16b, v26.16b
|
||||
usmmla v5.4s, v2.16b, v26.16b
|
||||
usmmla v20.4s, v17.16b, v26.16b
|
||||
usmmla v21.4s, v18.16b, v26.16b
|
||||
|
||||
uzp1 v4.8h, v4.8h, v5.8h
|
||||
uzp1 v20.8h, v20.8h, v21.8h
|
||||
.ifc \type, prep
|
||||
srshr v4.8h, v4.8h, #2
|
||||
srshr v20.8h, v20.8h, #2
|
||||
subs \h, \h, #2
|
||||
stp q4, q20, [\dst], #32
|
||||
.else // put
|
||||
sqshrun v4.8b, v4.8h, #6
|
||||
sqshrun v20.8b, v20.8h, #6
|
||||
subs \h, \h, #2
|
||||
str d4, [\dst]
|
||||
str d20, [\dst, \d_strd]
|
||||
add \dst, \dst, \d_strd, lsl #1
|
||||
.endif
|
||||
b.gt 8b
|
||||
ret
|
||||
|
||||
.align JUMP_ALIGN
|
||||
88:
|
||||
.endif // neon_i8mm
|
||||
ldp q29, q30, [x13, #16]
|
||||
|
||||
.align LOOP_ALIGN
|
||||
8:
|
||||
|
@ -1433,8 +1589,61 @@ L(\type\()_8tap_h_\isa):
|
|||
.align JUMP_ALIGN
|
||||
160: // H - 16xN
|
||||
AARCH64_VALID_JUMP_TARGET
|
||||
ldp q29, q30, [x13, #16]
|
||||
ldr d26, [\xmx]
|
||||
.ifc \isa, neon_i8mm
|
||||
cmp w9, #SHARP1
|
||||
b.eq 168f // horizontal == SHARP1
|
||||
|
||||
ldp q29, q30, [x13, #(OFFSET_USMMLA)]
|
||||
ext v0.8b, v26.8b, v26.8b, #7
|
||||
ins v26.d[1], v0.d[0]
|
||||
|
||||
.align LOOP_ALIGN
|
||||
16:
|
||||
ldr q16, [\src]
|
||||
ldur q17, [\src, #8] // avoid 2 register TBL for small cores
|
||||
add \src, \src, \s_strd
|
||||
.ifc \type, prep
|
||||
movi v6.4s, #0
|
||||
movi v7.4s, #0
|
||||
movi v22.4s, #0
|
||||
movi v23.4s, #0
|
||||
.else
|
||||
mov v6.16b, v27.16b
|
||||
mov v7.16b, v27.16b
|
||||
mov v22.16b, v27.16b
|
||||
mov v23.16b, v27.16b
|
||||
.endif
|
||||
tbl v0.16b, {v16.16b}, v29.16b
|
||||
tbl v1.16b, {v16.16b}, v30.16b
|
||||
tbl v2.16b, {v17.16b}, v29.16b
|
||||
tbl v3.16b, {v17.16b}, v30.16b
|
||||
|
||||
usmmla v6.4s, v0.16b, v26.16b
|
||||
usmmla v7.4s, v1.16b, v26.16b
|
||||
usmmla v22.4s, v2.16b, v26.16b
|
||||
usmmla v23.4s, v3.16b, v26.16b
|
||||
|
||||
uzp1 v6.8h, v6.8h, v7.8h
|
||||
uzp1 v22.8h, v22.8h, v23.8h
|
||||
.ifc \type, prep
|
||||
srshr v6.8h, v6.8h, #2
|
||||
srshr v22.8h, v22.8h, #2
|
||||
subs \h, \h, #1
|
||||
stp q6, q22, [\dst], #32
|
||||
.else // put
|
||||
sqshrun v6.8b, v6.8h, #6
|
||||
sqshrun2 v6.16b, v22.8h, #6
|
||||
subs \h, \h, #1
|
||||
st1 {v6.16b}, [\dst], \d_strd
|
||||
.endif
|
||||
b.gt 16b
|
||||
ret
|
||||
|
||||
.align JUMP_ALIGN
|
||||
168:
|
||||
.endif // neon_i8mm
|
||||
ldp q29, q30, [x13, #16]
|
||||
|
||||
.align LOOP_ALIGN
|
||||
16:
|
||||
|
@ -1497,7 +1706,6 @@ L(\type\()_8tap_h_\isa):
|
|||
640:
|
||||
1280:
|
||||
AARCH64_VALID_JUMP_TARGET
|
||||
ldp q29, q30, [x13, #16]
|
||||
ldr d26, [\xmx]
|
||||
.ifc \type, put
|
||||
sub \d_strd, \d_strd, \w, uxtw
|
||||
|
@ -1505,6 +1713,69 @@ L(\type\()_8tap_h_\isa):
|
|||
sub \s_strd, \s_strd, \w, uxtw
|
||||
mov w8, \w
|
||||
|
||||
.ifc \isa, neon_i8mm
|
||||
cmp w9, #SHARP1
|
||||
b.eq 328f // horizontal == SHARP1
|
||||
|
||||
ldp q29, q30, [x13, #(OFFSET_USMMLA)]
|
||||
ext v0.8b, v26.8b, v26.8b, #7
|
||||
ins v26.d[1], v0.d[0]
|
||||
|
||||
.align LOOP_ALIGN
|
||||
32:
|
||||
ldr q16, [\src]
|
||||
ldur q17, [\src, #8] // avoid 2 register TBL for small cores
|
||||
add \src, \src, #16
|
||||
.ifc \type, prep
|
||||
movi v6.4s, #0
|
||||
movi v7.4s, #0
|
||||
movi v22.4s, #0
|
||||
movi v23.4s, #0
|
||||
.else
|
||||
mov v6.16b, v27.16b
|
||||
mov v7.16b, v27.16b
|
||||
mov v22.16b, v27.16b
|
||||
mov v23.16b, v27.16b
|
||||
.endif
|
||||
tbl v0.16b, {v16.16b}, v29.16b
|
||||
tbl v1.16b, {v16.16b}, v30.16b
|
||||
tbl v2.16b, {v17.16b}, v29.16b
|
||||
tbl v3.16b, {v17.16b}, v30.16b
|
||||
|
||||
usmmla v6.4s, v0.16b, v26.16b
|
||||
usmmla v7.4s, v1.16b, v26.16b
|
||||
usmmla v22.4s, v2.16b, v26.16b
|
||||
usmmla v23.4s, v3.16b, v26.16b
|
||||
|
||||
uzp1 v6.8h, v6.8h, v7.8h
|
||||
uzp1 v22.8h, v22.8h, v23.8h
|
||||
.ifc \type, prep
|
||||
srshr v6.8h, v6.8h, #2
|
||||
srshr v22.8h, v22.8h, #2
|
||||
subs w8, w8, #16
|
||||
stp q6, q22, [\dst], #32
|
||||
.else // put
|
||||
sqshrun v6.8b, v6.8h, #6
|
||||
sqshrun2 v6.16b, v22.8h, #6
|
||||
subs w8, w8, #16
|
||||
str q6, [\dst], #16
|
||||
.endif
|
||||
b.gt 32b
|
||||
|
||||
add \src, \src, \s_strd
|
||||
.ifc \type, put
|
||||
add \dst, \dst, \d_strd
|
||||
.endif
|
||||
mov w8, \w
|
||||
subs \h, \h, #1
|
||||
b.gt 32b
|
||||
ret
|
||||
|
||||
.align JUMP_ALIGN
|
||||
328:
|
||||
.endif // neon_i8mm
|
||||
ldp q29, q30, [x13, #16]
|
||||
|
||||
.align LOOP_ALIGN
|
||||
32:
|
||||
ldr q16, [\src]
|
||||
|
@ -1568,19 +1839,19 @@ L(\type\()_8tap_h_\isa):
|
|||
subs \h, \h, #1
|
||||
b.gt 32b
|
||||
ret
|
||||
|
||||
L(\type\()_8tap_h_\isa\()_tbl):
|
||||
.hword (L(\type\()_8tap_h_\isa\()_tbl) - 1280b)
|
||||
.hword (L(\type\()_8tap_h_\isa\()_tbl) - 640b)
|
||||
.hword (L(\type\()_8tap_h_\isa\()_tbl) - 320b)
|
||||
.hword (L(\type\()_8tap_h_\isa\()_tbl) - 160b)
|
||||
.hword (L(\type\()_8tap_h_\isa\()_tbl) - 80b)
|
||||
.hword (L(\type\()_8tap_h_\isa\()_tbl) - 40b)
|
||||
.ifc \type, put
|
||||
.hword (L(\type\()_8tap_h_\isa\()_tbl) - 20b)
|
||||
.hword 0
|
||||
.endif
|
||||
endfunc
|
||||
|
||||
jumptable \type\()_8tap_h_\isa\()_tbl
|
||||
.word 1280b - \type\()_8tap_h_\isa\()_tbl
|
||||
.word 640b - \type\()_8tap_h_\isa\()_tbl
|
||||
.word 320b - \type\()_8tap_h_\isa\()_tbl
|
||||
.word 160b - \type\()_8tap_h_\isa\()_tbl
|
||||
.word 80b - \type\()_8tap_h_\isa\()_tbl
|
||||
.word 40b - \type\()_8tap_h_\isa\()_tbl
|
||||
.ifc \type, put
|
||||
.word 20b - \type\()_8tap_h_\isa\()_tbl
|
||||
.endif
|
||||
endjumptable
|
||||
.endm
|
||||
|
||||
// dst(x0), d_strd(x7), src(x1), s_strd(x2), w(w3), h(w4), mx(w5), my(w6)
|
||||
|
|
|
@ -34,13 +34,13 @@
|
|||
function splat_mv_neon, export=1
|
||||
ld1 {v3.16b}, [x1]
|
||||
clz w3, w3
|
||||
adr x5, L(splat_tbl)
|
||||
movrel x5, splat_tbl
|
||||
sub w3, w3, #26
|
||||
ext v2.16b, v3.16b, v3.16b, #12
|
||||
ldrh w3, [x5, w3, uxtw #1]
|
||||
ldrsw x3, [x5, w3, uxtw #2]
|
||||
add w2, w2, w2, lsl #1
|
||||
ext v0.16b, v2.16b, v3.16b, #4
|
||||
sub x3, x5, w3, uxtw
|
||||
add x3, x5, x3
|
||||
ext v1.16b, v2.16b, v3.16b, #8
|
||||
lsl w2, w2, #2
|
||||
ext v2.16b, v2.16b, v3.16b, #12
|
||||
|
@ -80,16 +80,17 @@ function splat_mv_neon, export=1
|
|||
st1 {v0.16b, v1.16b, v2.16b}, [x1]
|
||||
b.gt 1b
|
||||
ret
|
||||
|
||||
L(splat_tbl):
|
||||
.hword L(splat_tbl) - 320b
|
||||
.hword L(splat_tbl) - 160b
|
||||
.hword L(splat_tbl) - 80b
|
||||
.hword L(splat_tbl) - 40b
|
||||
.hword L(splat_tbl) - 20b
|
||||
.hword L(splat_tbl) - 10b
|
||||
endfunc
|
||||
|
||||
jumptable splat_tbl
|
||||
.word 320b - splat_tbl
|
||||
.word 160b - splat_tbl
|
||||
.word 80b - splat_tbl
|
||||
.word 40b - splat_tbl
|
||||
.word 20b - splat_tbl
|
||||
.word 10b - splat_tbl
|
||||
endjumptable
|
||||
|
||||
const mv_tbls, align=4
|
||||
.byte 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255
|
||||
.byte 0, 1, 2, 3, 8, 0, 1, 2, 3, 8, 0, 1, 2, 3, 8, 0
|
||||
|
@ -112,7 +113,7 @@ function save_tmvs_neon, export=1
|
|||
|
||||
movi v30.8b, #0
|
||||
ld1 {v31.8b}, [x3]
|
||||
adr x8, L(save_tmvs_tbl)
|
||||
movrel x8, save_tmvs_tbl
|
||||
movrel x16, mask_mult
|
||||
movrel x13, mv_tbls
|
||||
ld1 {v29.8b}, [x16]
|
||||
|
@ -137,9 +138,9 @@ function save_tmvs_neon, export=1
|
|||
2:
|
||||
ldrb w11, [x9, #10] // cand_b->bs
|
||||
ld1 {v0.16b}, [x9] // cand_b->mv
|
||||
add x11, x8, w11, uxtw #2
|
||||
add x11, x8, w11, uxtw #3
|
||||
ldr h1, [x9, #8] // cand_b->ref
|
||||
ldrh w12, [x11] // bw8
|
||||
ldr w12, [x11] // bw8
|
||||
mov x15, x8
|
||||
add x9, x9, w12, uxtw #1 // cand_b += bw8*2
|
||||
cmp x9, x10
|
||||
|
@ -149,9 +150,9 @@ function save_tmvs_neon, export=1
|
|||
ldrb w15, [x9, #10] // cand_b->bs
|
||||
add x16, x9, #8
|
||||
ld1 {v4.16b}, [x9] // cand_b->mv
|
||||
add x15, x8, w15, uxtw #2
|
||||
add x15, x8, w15, uxtw #3
|
||||
ld1 {v1.h}[1], [x16] // cand_b->ref
|
||||
ldrh w12, [x15] // bw8
|
||||
ldr w12, [x15] // bw8
|
||||
add x9, x9, w12, uxtw #1 // cand_b += bw8*2
|
||||
trn1 v2.2d, v0.2d, v4.2d
|
||||
|
||||
|
@ -166,12 +167,12 @@ function save_tmvs_neon, export=1
|
|||
addp v1.4h, v1.4h, v1.4h // Combine condition for [1] and [0]
|
||||
umov w16, v1.h[0] // Extract case for first block
|
||||
umov w17, v1.h[1]
|
||||
ldrh w11, [x11, #2] // Fetch jump table entry
|
||||
ldrh w15, [x15, #2]
|
||||
ldrsw x11, [x11, #4] // Fetch jump table entry
|
||||
ldrsw x15, [x15, #4]
|
||||
ldr q1, [x13, w16, uxtw #4] // Load permutation table base on case
|
||||
ldr q5, [x13, w17, uxtw #4]
|
||||
sub x11, x8, w11, uxtw // Find jump table target
|
||||
sub x15, x8, w15, uxtw
|
||||
add x11, x8, x11 // Find jump table target
|
||||
add x15, x8, x15
|
||||
tbl v0.16b, {v0.16b}, v1.16b // Permute cand_b to output refmvs_temporal_block
|
||||
tbl v4.16b, {v4.16b}, v5.16b
|
||||
|
||||
|
@ -243,50 +244,51 @@ function save_tmvs_neon, export=1
|
|||
str q2, [x3, #(16*5-16)]
|
||||
add x3, x3, #16*5
|
||||
ret
|
||||
|
||||
L(save_tmvs_tbl):
|
||||
.hword 16 * 12
|
||||
.hword L(save_tmvs_tbl) - 160b
|
||||
.hword 16 * 12
|
||||
.hword L(save_tmvs_tbl) - 160b
|
||||
.hword 8 * 12
|
||||
.hword L(save_tmvs_tbl) - 80b
|
||||
.hword 8 * 12
|
||||
.hword L(save_tmvs_tbl) - 80b
|
||||
.hword 8 * 12
|
||||
.hword L(save_tmvs_tbl) - 80b
|
||||
.hword 8 * 12
|
||||
.hword L(save_tmvs_tbl) - 80b
|
||||
.hword 4 * 12
|
||||
.hword L(save_tmvs_tbl) - 40b
|
||||
.hword 4 * 12
|
||||
.hword L(save_tmvs_tbl) - 40b
|
||||
.hword 4 * 12
|
||||
.hword L(save_tmvs_tbl) - 40b
|
||||
.hword 4 * 12
|
||||
.hword L(save_tmvs_tbl) - 40b
|
||||
.hword 2 * 12
|
||||
.hword L(save_tmvs_tbl) - 20b
|
||||
.hword 2 * 12
|
||||
.hword L(save_tmvs_tbl) - 20b
|
||||
.hword 2 * 12
|
||||
.hword L(save_tmvs_tbl) - 20b
|
||||
.hword 2 * 12
|
||||
.hword L(save_tmvs_tbl) - 20b
|
||||
.hword 2 * 12
|
||||
.hword L(save_tmvs_tbl) - 20b
|
||||
.hword 1 * 12
|
||||
.hword L(save_tmvs_tbl) - 10b
|
||||
.hword 1 * 12
|
||||
.hword L(save_tmvs_tbl) - 10b
|
||||
.hword 1 * 12
|
||||
.hword L(save_tmvs_tbl) - 10b
|
||||
.hword 1 * 12
|
||||
.hword L(save_tmvs_tbl) - 10b
|
||||
.hword 1 * 12
|
||||
.hword L(save_tmvs_tbl) - 10b
|
||||
.hword 1 * 12
|
||||
.hword L(save_tmvs_tbl) - 10b
|
||||
.hword 1 * 12
|
||||
.hword L(save_tmvs_tbl) - 10b
|
||||
endfunc
|
||||
|
||||
jumptable save_tmvs_tbl
|
||||
.word 16 * 12
|
||||
.word 160b - save_tmvs_tbl
|
||||
.word 16 * 12
|
||||
.word 160b - save_tmvs_tbl
|
||||
.word 8 * 12
|
||||
.word 80b - save_tmvs_tbl
|
||||
.word 8 * 12
|
||||
.word 80b - save_tmvs_tbl
|
||||
.word 8 * 12
|
||||
.word 80b - save_tmvs_tbl
|
||||
.word 8 * 12
|
||||
.word 80b - save_tmvs_tbl
|
||||
.word 4 * 12
|
||||
.word 40b - save_tmvs_tbl
|
||||
.word 4 * 12
|
||||
.word 40b - save_tmvs_tbl
|
||||
.word 4 * 12
|
||||
.word 40b - save_tmvs_tbl
|
||||
.word 4 * 12
|
||||
.word 40b - save_tmvs_tbl
|
||||
.word 2 * 12
|
||||
.word 20b - save_tmvs_tbl
|
||||
.word 2 * 12
|
||||
.word 20b - save_tmvs_tbl
|
||||
.word 2 * 12
|
||||
.word 20b - save_tmvs_tbl
|
||||
.word 2 * 12
|
||||
.word 20b - save_tmvs_tbl
|
||||
.word 2 * 12
|
||||
.word 20b - save_tmvs_tbl
|
||||
.word 1 * 12
|
||||
.word 10b - save_tmvs_tbl
|
||||
.word 1 * 12
|
||||
.word 10b - save_tmvs_tbl
|
||||
.word 1 * 12
|
||||
.word 10b - save_tmvs_tbl
|
||||
.word 1 * 12
|
||||
.word 10b - save_tmvs_tbl
|
||||
.word 1 * 12
|
||||
.word 10b - save_tmvs_tbl
|
||||
.word 1 * 12
|
||||
.word 10b - save_tmvs_tbl
|
||||
.word 1 * 12
|
||||
.word 10b - save_tmvs_tbl
|
||||
endjumptable
|
||||
|
|
|
@ -323,6 +323,32 @@ EXTERN\name:
|
|||
\name:
|
||||
.endm
|
||||
|
||||
.macro jumptable name
|
||||
#ifdef _WIN32
|
||||
// MS armasm64 doesn't seem to be able to create relocations for subtraction
|
||||
// of labels in different sections; for armasm64 (and all of Windows for
|
||||
// simplicity), write the jump table in the text section, to allow calculating
|
||||
// differences at assembly time. See
|
||||
// https://developercommunity.visualstudio.com/t/armasm64-unable-to-create-cross-section/10722340
|
||||
// for reference. (LLVM can create such relocations, but checking for _WIN32
|
||||
// for simplicity, as execute-only memory isn't relevant on Windows at the
|
||||
// moment.)
|
||||
function \name
|
||||
#else
|
||||
// For other platforms, write jump tables in a const data section, to allow
|
||||
// working in environments where executable memory isn't readable.
|
||||
const \name
|
||||
#endif
|
||||
.endm
|
||||
|
||||
.macro endjumptable
|
||||
#ifdef _WIN32
|
||||
endfunc
|
||||
#else
|
||||
endconst
|
||||
#endif
|
||||
.endm
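Conceptually, the emitted tables store each target's signed 32-bit offset from the table base and add it back at dispatch time. As an illustration only (using the GNU C labels-as-values extension, not anything from dav1d), the same idea in C looks like this:

#include <stdio.h>

/* GNU C computed-goto analogue of ".word target - table_base" followed by
 * the ldrsw/add/br dispatch sequence used by the callers of these tables. */
static int dispatch(const int idx) {
    static const int offsets[] = {
        &&case0 - &&table_base,
        &&case1 - &&table_base,
        &&case2 - &&table_base,
    };
    goto *(&&table_base + offsets[idx]);
table_base:
case0:
    return 0;
case1:
    return 1;
case2:
    return 2;
}

int main(void) {
    printf("%d\n", dispatch(2));   /* prints 2 */
    return 0;
}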
|
||||
|
||||
#ifdef __APPLE__
|
||||
#define L(x) L ## x
|
||||
#else
|
||||
|
|
|
@ -29,6 +29,7 @@

#include "common/attributes.h"

#include "src/cpu.h"
#include "src/arm/cpu.h"

#if defined(HAVE_GETAUXVAL) || defined(HAVE_ELF_AUX_INFO)

@ -52,7 +53,7 @@ COLD unsigned dav1d_get_cpu_flags_arm(void) {
elf_aux_info(AT_HWCAP2, &hw_cap2, sizeof(hw_cap2));
#endif

unsigned flags = DAV1D_ARM_CPU_FLAG_NEON;
unsigned flags = dav1d_get_default_cpu_flags();
flags |= (hw_cap & HWCAP_AARCH64_ASIMDDP) ? DAV1D_ARM_CPU_FLAG_DOTPROD : 0;
flags |= (hw_cap2 & HWCAP2_AARCH64_I8MM) ? DAV1D_ARM_CPU_FLAG_I8MM : 0;
flags |= (hw_cap & HWCAP_AARCH64_SVE) ? DAV1D_ARM_CPU_FLAG_SVE : 0;

@ -75,7 +76,8 @@ COLD unsigned dav1d_get_cpu_flags_arm(void) {
elf_aux_info(AT_HWCAP, &hw_cap, sizeof(hw_cap));
#endif

unsigned flags = (hw_cap & HWCAP_ARM_NEON) ? DAV1D_ARM_CPU_FLAG_NEON : 0;
unsigned flags = dav1d_get_default_cpu_flags();
flags |= (hw_cap & HWCAP_ARM_NEON) ? DAV1D_ARM_CPU_FLAG_NEON : 0;
flags |= (hw_cap & HWCAP_ARM_ASIMDDP) ? DAV1D_ARM_CPU_FLAG_DOTPROD : 0;
flags |= (hw_cap & HWCAP_ARM_I8MM) ? DAV1D_ARM_CPU_FLAG_I8MM : 0;
return flags;

@ -95,7 +97,7 @@ static int have_feature(const char *feature) {
}

COLD unsigned dav1d_get_cpu_flags_arm(void) {
unsigned flags = DAV1D_ARM_CPU_FLAG_NEON;
unsigned flags = dav1d_get_default_cpu_flags();
if (have_feature("hw.optional.arm.FEAT_DotProd"))
flags |= DAV1D_ARM_CPU_FLAG_DOTPROD;
if (have_feature("hw.optional.arm.FEAT_I8MM"))

@ -104,16 +106,14 @@ COLD unsigned dav1d_get_cpu_flags_arm(void) {
return flags;
}

#elif defined(__OpenBSD__)

#if ARCH_AARCH64
#elif defined(__OpenBSD__) && ARCH_AARCH64
#include <machine/armreg.h>
#include <machine/cpu.h>
#include <sys/types.h>
#include <sys/sysctl.h>

COLD unsigned dav1d_get_cpu_flags_arm(void) {
unsigned flags = DAV1D_ARM_CPU_FLAG_NEON;
unsigned flags = dav1d_get_default_cpu_flags();

#ifdef CPU_ID_AA64ISAR0
int mib[2];

@ -142,25 +142,31 @@ COLD unsigned dav1d_get_cpu_flags_arm(void) {

return flags;
}
#else /* !ARCH_AARCH64 */

COLD unsigned dav1d_get_cpu_flags_arm(void) {
unsigned flags = DAV1D_ARM_CPU_FLAG_NEON;
return flags;
}
#endif /* ARCH_AARCH64 */

#elif defined(_WIN32)
#include <windows.h>

COLD unsigned dav1d_get_cpu_flags_arm(void) {
unsigned flags = DAV1D_ARM_CPU_FLAG_NEON;
unsigned flags = dav1d_get_default_cpu_flags();
#ifdef PF_ARM_V82_DP_INSTRUCTIONS_AVAILABLE
if (IsProcessorFeaturePresent(PF_ARM_V82_DP_INSTRUCTIONS_AVAILABLE))
flags |= DAV1D_ARM_CPU_FLAG_DOTPROD;
#endif
/* No I8MM or SVE feature detection available on Windows at the time of
 * writing. */
#ifdef PF_ARM_SVE_INSTRUCTIONS_AVAILABLE
if (IsProcessorFeaturePresent(PF_ARM_SVE_INSTRUCTIONS_AVAILABLE))
flags |= DAV1D_ARM_CPU_FLAG_SVE;
#endif
#ifdef PF_ARM_SVE2_INSTRUCTIONS_AVAILABLE
if (IsProcessorFeaturePresent(PF_ARM_SVE2_INSTRUCTIONS_AVAILABLE))
flags |= DAV1D_ARM_CPU_FLAG_SVE2;
#endif
#ifdef PF_ARM_SVE_I8MM_INSTRUCTIONS_AVAILABLE
/* There's no PF_* flag that indicates whether plain I8MM is available
 * or not. But if SVE_I8MM is available, that also implies that
 * regular I8MM is available. */
if (IsProcessorFeaturePresent(PF_ARM_SVE_I8MM_INSTRUCTIONS_AVAILABLE))
flags |= DAV1D_ARM_CPU_FLAG_I8MM;
#endif
return flags;
}

@ -206,7 +212,8 @@ static unsigned parse_proc_cpuinfo(const char *flag) {
}

COLD unsigned dav1d_get_cpu_flags_arm(void) {
unsigned flags = parse_proc_cpuinfo("neon") ? DAV1D_ARM_CPU_FLAG_NEON : 0;
unsigned flags = dav1d_get_default_cpu_flags();
flags |= parse_proc_cpuinfo("neon") ? DAV1D_ARM_CPU_FLAG_NEON : 0;
flags |= parse_proc_cpuinfo("asimd") ? DAV1D_ARM_CPU_FLAG_NEON : 0;
flags |= parse_proc_cpuinfo("asimddp") ? DAV1D_ARM_CPU_FLAG_DOTPROD : 0;
flags |= parse_proc_cpuinfo("i8mm") ? DAV1D_ARM_CPU_FLAG_I8MM : 0;

@ -220,7 +227,7 @@ COLD unsigned dav1d_get_cpu_flags_arm(void) {
#else /* Unsupported OS */

COLD unsigned dav1d_get_cpu_flags_arm(void) {
return 0;
return dav1d_get_default_cpu_flags();
}

#endif
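
For illustration only, not part of this patch: every per-OS branch above now follows the same shape, seeding the flag set from dav1d_get_default_cpu_flags() (the compile-time baseline) and OR-ing in whatever the runtime reports. A self-contained sketch of that shape for a hypothetical new port is shown below; all HWCAP_FOO_*/CPU_FLAG_*/newos names are placeholders, not dav1d identifiers.

/* Placeholder bits standing in for DAV1D_ARM_CPU_FLAG_* and for whatever
 * hwcap-style bits the hypothetical OS exposes. */
#define CPU_FLAG_NEON     (1u << 0)
#define CPU_FLAG_DOTPROD  (1u << 1)
#define HWCAP_FOO_NEON    0x01u
#define HWCAP_FOO_DOTPROD 0x02u

static unsigned get_default_cpu_flags(void) {
    unsigned flags = 0;
#if defined(__ARM_NEON)
    flags |= CPU_FLAG_NEON;  /* guaranteed by the compile target, no runtime check needed */
#endif
    return flags;
}

/* Hypothetical OS query; a real port would call getauxval(), elf_aux_info(),
 * sysctl(), IsProcessorFeaturePresent(), or similar. */
static unsigned query_os_hwcaps(void) { return HWCAP_FOO_NEON; }

unsigned get_cpu_flags_newos(void) {
    unsigned flags = get_default_cpu_flags();   /* compile-time baseline */
    const unsigned hw_cap = query_os_hwcaps();  /* runtime detection */
    flags |= (hw_cap & HWCAP_FOO_NEON)    ? CPU_FLAG_NEON    : 0;
    flags |= (hw_cap & HWCAP_FOO_DOTPROD) ? CPU_FLAG_DOTPROD : 0;
    return flags;
}
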
@ -63,6 +63,7 @@
decl_8tap_fns(neon);
decl_8tap_fns(neon_dotprod);
decl_8tap_fns(neon_i8mm);
decl_8tap_fns(sve2);

decl_mc_fn(BF(dav1d_put_bilin, neon));
decl_mct_fn(BF(dav1d_prep_bilin, neon));

@ -110,17 +111,27 @@ static ALWAYS_INLINE void mc_dsp_init_arm(Dav1dMCDSPContext *const c) {
c->warp8x8t = BF(dav1d_warp_affine_8x8t, neon);
c->emu_edge = BF(dav1d_emu_edge, neon);

#if ARCH_AARCH64 && BITDEPTH == 8
#if ARCH_AARCH64
#if BITDEPTH == 8
#if HAVE_DOTPROD
if (!(flags & DAV1D_ARM_CPU_FLAG_DOTPROD)) return;

init_8tap_fns(neon_dotprod);
if (flags & DAV1D_ARM_CPU_FLAG_DOTPROD) {
init_8tap_fns(neon_dotprod);
}
#endif // HAVE_DOTPROD

#if HAVE_I8MM
if (!(flags & DAV1D_ARM_CPU_FLAG_I8MM)) return;

init_8tap_fns(neon_i8mm);
if (flags & DAV1D_ARM_CPU_FLAG_I8MM) {
init_8tap_fns(neon_i8mm);
}
#endif // HAVE_I8MM
#endif // ARCH_AARCH64 && BITDEPTH == 8
#endif // BITDEPTH == 8

#if BITDEPTH == 16
#if HAVE_SVE2
if (flags & DAV1D_ARM_CPU_FLAG_SVE2) {
init_8tap_fns(sve2);
}
#endif // HAVE_SVE2
#endif // BITDEPTH == 16
#endif // ARCH_AARCH64
}
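
For illustration only, not part of this patch: the hunk above replaces early returns with independently guarded blocks, so a CPU lacking one optional extension (say DOTPROD) can still have function pointers for an unrelated one (the 16-bit SVE2 paths) installed. The difference between the two styles, reduced to placeholder names, is sketched below.

#include <stdio.h>

#define FLAG_A (1u << 0)
#define FLAG_B (1u << 1)

static void init_a(void) { puts("init A"); }
static void init_b(void) { puts("init B"); }

/* Early-return style: once one feature is missing, everything after it is
 * skipped, even features that are detected independently. */
static void init_early_return(unsigned flags) {
    if (!(flags & FLAG_A)) return;
    init_a();
    if (!(flags & FLAG_B)) return;
    init_b();
}

/* Guarded-block style (what the hunk switches to): each feature is
 * considered on its own, so B still gets set up when only A is missing. */
static void init_guarded(unsigned flags) {
    if (flags & FLAG_A) init_a();
    if (flags & FLAG_B) init_b();
}

int main(void) {
    init_early_return(FLAG_B); /* prints nothing */
    init_guarded(FLAG_B);      /* prints "init B" */
    return 0;
}
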
@ -33,20 +33,24 @@

#ifdef _WIN32
#include <windows.h>
#elif defined(__APPLE__)
#endif
#ifdef __APPLE__
#include <sys/sysctl.h>
#include <sys/types.h>
#else
#include <pthread.h>
#endif
#ifdef HAVE_UNISTD_H
#include <unistd.h>
#endif

#ifdef HAVE_PTHREAD_GETAFFINITY_NP
#include <pthread.h>
#ifdef HAVE_PTHREAD_NP_H
#include <pthread_np.h>
#endif
#if defined(__FreeBSD__)
#define cpu_set_t cpuset_t
#endif
#endif

unsigned dav1d_cpu_flags = 0U;
unsigned dav1d_cpu_flags_mask = ~0U;
@ -54,12 +54,9 @@ void dav1d_init_cpu(void);
DAV1D_API void dav1d_set_cpu_flags_mask(unsigned mask);
int dav1d_num_logical_processors(Dav1dContext *c);

static ALWAYS_INLINE unsigned dav1d_get_cpu_flags(void) {
unsigned flags = dav1d_cpu_flags & dav1d_cpu_flags_mask;
static ALWAYS_INLINE unsigned dav1d_get_default_cpu_flags(void) {
unsigned flags = 0;

#if TRIM_DSP_FUNCTIONS
/* Since this function is inlined, unconditionally setting a flag here will
 * enable dead code elimination in the calling function. */
#if ARCH_AARCH64 || ARCH_ARM
#if defined(__ARM_NEON) || defined(__APPLE__) || defined(_WIN32) || ARCH_AARCH64
flags |= DAV1D_ARM_CPU_FLAG_NEON;

@ -119,6 +116,17 @@ static ALWAYS_INLINE unsigned dav1d_get_cpu_flags(void) {
flags |= DAV1D_X86_CPU_FLAG_SSE2;
#endif
#endif

return flags;
}

static ALWAYS_INLINE unsigned dav1d_get_cpu_flags(void) {
unsigned flags = dav1d_cpu_flags & dav1d_cpu_flags_mask;

#if TRIM_DSP_FUNCTIONS
/* Since this function is inlined, unconditionally setting a flag here will
 * enable dead code elimination in the calling function. */
flags |= dav1d_get_default_cpu_flags();
#endif

return flags;
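
For illustration only, not part of this patch: the TRIM_DSP_FUNCTIONS comment is about constant folding. Because dav1d_get_cpu_flags() is inlined, any flag that dav1d_get_default_cpu_flags() sets unconditionally becomes a compile-time constant in the caller, and the compiler can typically drop the other branches and the objects only they reference. A simplified stand-alone illustration is sketched below; the names are placeholders, not dav1d's.

/* Build with e.g. -DKNOWN_BASELINE -O2 and compare the generated code: when
 * the baseline flag is a compile-time constant, the fallback assignment is a
 * dead store and filter_c can be discarded entirely. */
#include <stdio.h>

#define FLAG_SIMD (1u << 0)

static inline unsigned get_cpu_flags(void) {
    unsigned flags = 0;
#ifdef KNOWN_BASELINE
    flags |= FLAG_SIMD;        /* constant-folded into every caller */
#endif
    return flags;
}

static void filter_c(void)    { puts("C fallback"); }
static void filter_simd(void) { puts("SIMD path"); }

void init_filters(void (**fn)(void)) {
    *fn = filter_c;                      /* dead store when FLAG_SIMD is constant */
    if (get_cpu_flags() & FLAG_SIMD)
        *fn = filter_simd;
}

int main(void) {
    void (*fn)(void);
    init_filters(&fn);
    fn();
    return 0;
}
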
@ -26,6 +26,8 @@

#include "config.h"
#include "common/attributes.h"

#include "src/cpu.h"
#include "src/loongarch/cpu.h"

#if defined(HAVE_GETAUXVAL)

@ -36,7 +38,7 @@
#endif

COLD unsigned dav1d_get_cpu_flags_loongarch(void) {
unsigned flags = 0;
unsigned flags = dav1d_get_default_cpu_flags();
#if defined(HAVE_GETAUXVAL)
unsigned long hw_cap = getauxval(AT_HWCAP);
flags |= (hw_cap & LA_HWCAP_LSX) ? DAV1D_LOONGARCH_CPU_FLAG_LSX : 0;
@ -109,16 +109,7 @@ void *dav1d_malloc(const enum AllocationType type, const size_t sz) {
void *dav1d_alloc_aligned(const enum AllocationType type,
const size_t sz, const size_t align)
{
assert(!(align & (align - 1)));
void *ptr;
#ifdef _WIN32
ptr = _aligned_malloc(sz + align, align);
#elif defined(HAVE_POSIX_MEMALIGN)
if (posix_memalign(&ptr, align, sz + align)) return NULL;
#else
ptr = memalign(align, sz + align);
#endif

void *const ptr = dav1d_alloc_aligned_internal(sz + align, align);
return track_alloc(type, ptr, sz, align);
}

@ -140,12 +131,7 @@ void dav1d_free(void *ptr) {

void dav1d_free_aligned(void *ptr) {
if (ptr) {
ptr = track_free(ptr);
#ifdef _WIN32
_aligned_free(ptr);
#else
free(ptr);
#endif
dav1d_free_aligned_internal(track_free(ptr));
}
}

@ -32,7 +32,7 @@

#include <stdlib.h>

#if defined(_WIN32) || !defined(HAVE_POSIX_MEMALIGN)
#if defined(_WIN32) || defined(HAVE_MEMALIGN)
#include <malloc.h>
#endif

@ -79,6 +79,39 @@ typedef struct Dav1dMemPool {
#endif
} Dav1dMemPool;

// TODO: Move this to a common location?
#define ROUND_UP(x,a) (((x)+((a)-1)) & ~((a)-1))

/*
 * Allocate align-byte aligned memory. The return value can be released
 * by calling the dav1d_free_aligned() function.
 */
static inline void *dav1d_alloc_aligned_internal(const size_t sz, const size_t align) {
assert(!(align & (align - 1)));
#ifdef _WIN32
return _aligned_malloc(sz, align);
#elif defined(HAVE_POSIX_MEMALIGN)
void *ptr;
if (posix_memalign(&ptr, align, sz)) return NULL;
return ptr;
#elif defined(HAVE_MEMALIGN)
return memalign(align, sz);
#elif defined(HAVE_ALIGNED_ALLOC)
// The C11 standard specifies that the size parameter
// must be an integral multiple of alignment.
return aligned_alloc(align, ROUND_UP(sz, align));
#else
#error No aligned allocation functions are available
#endif
}

static inline void dav1d_free_aligned_internal(void *ptr) {
#ifdef _WIN32
_aligned_free(ptr);
#else
free(ptr);
#endif
}

#if TRACK_HEAP_ALLOCATIONS
void *dav1d_malloc(enum AllocationType type, size_t sz);

@ -91,34 +124,9 @@ void dav1d_log_alloc_stats(Dav1dContext *c);
#define dav1d_mem_pool_init(type, pool) dav1d_mem_pool_init(pool)
#define dav1d_malloc(type, sz) malloc(sz)
#define dav1d_realloc(type, ptr, sz) realloc(ptr, sz)
#define dav1d_alloc_aligned(type, sz, align) dav1d_alloc_aligned_internal(sz, align)
#define dav1d_free(ptr) free(ptr)

/*
 * Allocate align-byte aligned memory. The return value can be released
 * by calling the dav1d_free_aligned() function.
 */
static inline void *dav1d_alloc_aligned(const size_t sz, const size_t align) {
assert(!(align & (align - 1)));
#ifdef _WIN32
return _aligned_malloc(sz, align);
#elif defined(HAVE_POSIX_MEMALIGN)
void *ptr;
if (posix_memalign(&ptr, align, sz)) return NULL;
return ptr;
#else
return memalign(align, sz);
#endif
}
#define dav1d_alloc_aligned(type, sz, align) dav1d_alloc_aligned(sz, align)

static inline void dav1d_free_aligned(void *ptr) {
#ifdef _WIN32
_aligned_free(ptr);
#else
free(ptr);
#endif
}

#define dav1d_free_aligned(ptr) dav1d_free_aligned_internal(ptr)
#endif /* TRACK_HEAP_ALLOCATIONS */

void dav1d_mem_pool_push(Dav1dMemPool *pool, Dav1dMemPoolBuffer *buf);
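
For illustration only, not part of this patch: whichever backend dav1d_alloc_aligned_internal() ends up using, the returned pointer must go back through the matching aligned free, since _aligned_malloc() memory cannot be handed to plain free() on Windows, and aligned_alloc() additionally requires the requested size to be a multiple of the alignment (hence the ROUND_UP above). A minimal stand-alone sketch of the same portability pattern, assuming a C11 libc outside Windows, is shown below.

#include <assert.h>
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#ifdef _WIN32
#include <malloc.h>
#endif

#define ROUND_UP(x,a) (((x)+((a)-1)) & ~((a)-1))

static void *alloc_aligned(const size_t sz, const size_t align) {
    assert(!(align & (align - 1)));   /* alignment must be a power of two */
#ifdef _WIN32
    return _aligned_malloc(sz, align);
#else
    /* C11: the size passed to aligned_alloc() must be a multiple of align. */
    return aligned_alloc(align, ROUND_UP(sz, align));
#endif
}

static void free_aligned(void *ptr) {
#ifdef _WIN32
    _aligned_free(ptr);               /* plain free() is not valid here */
#else
    free(ptr);
#endif
}

int main(void) {
    uint8_t *const buf = alloc_aligned(1000, 64);
    if (!buf) return 1;
    assert(((uintptr_t)buf & 63) == 0);
    memset(buf, 0, 1000);
    free_aligned(buf);
    return 0;
}
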
@ -119,6 +119,7 @@ if is_asm_enabled
'arm/64/loopfilter16.S',
'arm/64/looprestoration16.S',
'arm/64/mc16.S',
'arm/64/mc16_sve.S',
)
endif
elif host_machine.cpu_family().startswith('arm')

@ -370,7 +371,7 @@ libdav1d = library('dav1d',
)

dav1d_dep = declare_dependency(link_with: libdav1d,
include_directories : include_directories('../include/dav1d')
include_directories : include_directories('../include')
)

#
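
For illustration only, not part of this patch: moving dav1d_dep's include directory up one level matches the earlier switch of the public headers to dav1d/-prefixed includes, so downstream code reaches the API as <dav1d/dav1d.h>. A hedged consumer-side sketch using only documented public entry points is shown below (link with -ldav1d).

/* With the dependency exposing ../include rather than ../include/dav1d,
 * the public headers are included through their dav1d/ prefix, matching
 * how dav1d.h itself now pulls in dav1d/common.h and friends. */
#include <dav1d/dav1d.h>
#include <stdio.h>

int main(void) {
    Dav1dSettings settings;
    dav1d_default_settings(&settings);  /* public API, unchanged by this patch */
    printf("libdav1d version: %s\n", dav1d_version());
    return 0;
}
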
@ -201,16 +201,6 @@ int dav1d_thread_picture_alloc(Dav1dContext *const c, Dav1dFrameContext *const f
(void **) &p->progress);
if (res) return res;

dav1d_picture_copy_props(&p->p, c->content_light, c->content_light_ref,
c->mastering_display, c->mastering_display_ref,
c->itut_t35, c->itut_t35_ref, c->n_itut_t35,
&f->tile[0].data.m);

// Must be removed from the context after being attached to the frame
dav1d_ref_dec(&c->itut_t35_ref);
c->itut_t35 = NULL;
c->n_itut_t35 = 0;

// Don't clear these flags from c->frame_flags if the frame is not going to be output.
// This way they will be added to the next visible frame too.
const int flags_mask = ((f->frame_hdr->show_frame || c->output_invisible_frames) &&

@ -221,6 +211,22 @@ int dav1d_thread_picture_alloc(Dav1dContext *const c, Dav1dFrameContext *const f

p->visible = f->frame_hdr->show_frame;
p->showable = f->frame_hdr->showable_frame;

if (p->visible) {
// Only add HDR10+ and T35 metadata when show frame flag is enabled
dav1d_picture_copy_props(&p->p, c->content_light, c->content_light_ref,
c->mastering_display, c->mastering_display_ref,
c->itut_t35, c->itut_t35_ref, c->n_itut_t35,
&f->tile[0].data.m);

// Must be removed from the context after being attached to the frame
dav1d_ref_dec(&c->itut_t35_ref);
c->itut_t35 = NULL;
c->n_itut_t35 = 0;
} else {
dav1d_data_props_copy(&p->p.m, &f->tile[0].data.m);
}

if (c->n_fc > 1) {
atomic_init(&p->progress[0], 0);
atomic_init(&p->progress[1], 0);
@ -29,6 +29,7 @@

#include "common/attributes.h"

#include "src/cpu.h"
#include "src/ppc/cpu.h"

#if (defined(HAVE_GETAUXVAL) || defined(HAVE_ELF_AUX_INFO)) && ARCH_PPC64LE

@ -37,7 +38,7 @@
#endif

COLD unsigned dav1d_get_cpu_flags_ppc(void) {
unsigned flags = 0;
unsigned flags = dav1d_get_default_cpu_flags();
#if defined(HAVE_GETAUXVAL) && ARCH_PPC64LE
unsigned long hw_cap = getauxval(AT_HWCAP);
unsigned long hw_cap2 = getauxval(AT_HWCAP2);
@ -43,22 +43,26 @@ PACKED(typedef struct refmvs_temporal_block {
mv mv;
int8_t ref;
}) refmvs_temporal_block;
CHECK_SIZE(refmvs_temporal_block, 5);

typedef union refmvs_refpair {
PACKED(typedef union refmvs_refpair {
int8_t ref[2]; // [0] = 0: intra=1, [1] = -1: comp=0
uint16_t pair;
} refmvs_refpair;
}) ALIGN(refmvs_refpair, 2);
CHECK_SIZE(refmvs_refpair, 2);

typedef union refmvs_mvpair {
mv mv[2];
uint64_t n;
} refmvs_mvpair;
CHECK_SIZE(refmvs_mvpair, 8);

PACKED(typedef struct refmvs_block {
refmvs_mvpair mv;
refmvs_refpair ref;
uint8_t bs, mf; // 1 = globalmv+affine, 2 = newmv
}) ALIGN(refmvs_block, 4);
CHECK_SIZE(refmvs_block, 12);

typedef struct refmvs_frame {
const Dav1dFrameHeader *frm_hdr;
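
For illustration only, not part of this patch: these CHECK_SIZE() lines pin down the exact byte layout that the hand-written assembly elsewhere in this diff (for example the 12-byte stride used by the save_tmvs table) relies on. The same kind of guard can be written with plain C11 static_assert and GCC/Clang packing attributes, as in the stand-alone sketch below; the my_* types are illustrative only and use different sizes than dav1d's structs.

#include <assert.h>   /* static_assert (C11) */
#include <stdint.h>

/* Packed so no padding is inserted and sizeof() is exactly what any
 * hand-written assembly would expect. */
typedef struct __attribute__((packed)) my_mv {
    int8_t x, y;
} my_mv;

typedef struct __attribute__((packed, aligned(4))) my_block {
    my_mv   mv[2];   /* 4 bytes */
    int8_t  ref[2];  /* 2 bytes */
    uint8_t bs, mf;  /* 2 bytes */
} my_block;

/* If a field is added or the layout changes, the build fails here instead
 * of silently desyncing from code that hard-codes these sizes. */
static_assert(sizeof(my_mv) == 2, "my_mv layout changed");
static_assert(sizeof(my_block) == 8, "my_block layout changed");

int main(void) { return 0; }
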
@ -29,6 +29,7 @@

#include "common/attributes.h"

#include "src/cpu.h"
#include "src/riscv/cpu.h"

#if defined(HAVE_GETAUXVAL)

@ -41,7 +42,7 @@
int dav1d_has_compliant_rvv(void);

COLD unsigned dav1d_get_cpu_flags_riscv(void) {
unsigned flags = 0;
unsigned flags = dav1d_get_default_cpu_flags();
#if defined(HAVE_GETAUXVAL)
unsigned long hw_cap = getauxval(AT_HWCAP);
flags |= (hw_cap & HWCAP_RVV) && dav1d_has_compliant_rvv() ? DAV1D_RISCV_CPU_FLAG_V : 0;
@ -132,6 +132,14 @@ static inline int pthread_cond_broadcast(pthread_cond_t *const cond) {
#else

#include <pthread.h>
#if defined(__FreeBSD__)
/* ALIGN from <sys/param.h> conflicts with ALIGN from "common/attributes.h" */
#define _SYS_PARAM_H_
#include <sys/types.h>
#endif
#ifdef HAVE_PTHREAD_NP_H
#include <pthread_np.h>
#endif

#define dav1d_init_thread() do {} while (0)

@ -145,31 +153,30 @@ static inline void dav1d_set_thread_name(const char *const name) {
prctl(PR_SET_NAME, name);
}

#elif defined(__APPLE__)
#elif defined(HAVE_PTHREAD_SETNAME_NP) && defined(__APPLE__)

static inline void dav1d_set_thread_name(const char *const name) {
pthread_setname_np(name);
}

#elif defined(__DragonFly__) || defined(__FreeBSD__) || defined(__OpenBSD__)

#if defined(__FreeBSD__)
/* ALIGN from <sys/param.h> conflicts with ALIGN from "common/attributes.h" */
#define _SYS_PARAM_H_
#include <sys/types.h>
#endif
#include <pthread_np.h>

static inline void dav1d_set_thread_name(const char *const name) {
pthread_set_name_np(pthread_self(), name);
}

#elif defined(__NetBSD__)
#elif defined(HAVE_PTHREAD_SETNAME_NP) && defined(__NetBSD__)

static inline void dav1d_set_thread_name(const char *const name) {
pthread_setname_np(pthread_self(), "%s", (void*)name);
}

#elif defined(HAVE_PTHREAD_SETNAME_NP)

static inline void dav1d_set_thread_name(const char *const name) {
pthread_setname_np(pthread_self(), name);
}

#elif defined(HAVE_PTHREAD_SET_NAME_NP)

static inline void dav1d_set_thread_name(const char *const name) {
pthread_set_name_np(pthread_self(), name);
}

#elif defined(__HAIKU__)

#include <os/kernel/OS.h>
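
For illustration only, not part of this patch: dav1d_set_thread_name() is purely a debugging aid, and the hunk above only reshuffles which platform-specific variant gets picked based on configure-time HAVE_* checks. A minimal POSIX sketch of the underlying idea, assuming glibc's two-argument pthread_setname_np(), is shown below (build with -pthread).

#define _GNU_SOURCE        /* for pthread_setname_np on glibc */
#include <pthread.h>
#include <stdio.h>
#include <unistd.h>

static void *worker(void *arg) {
    (void)arg;
    /* Name the current thread so it shows up in debuggers and `top -H`.
     * glibc limits the name to 15 characters plus the terminator. */
    pthread_setname_np(pthread_self(), "dav1d-worker");
    sleep(1);
    return NULL;
}

int main(void) {
    pthread_t t;
    if (pthread_create(&t, NULL, worker, NULL)) {
        perror("pthread_create");
        return 1;
    }
    pthread_join(t, NULL);
    return 0;
}
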
@ -32,6 +32,7 @@

#include "common/attributes.h"

#include "src/cpu.h"
#include "src/x86/cpu.h"

typedef struct {

@ -52,7 +53,7 @@ COLD unsigned dav1d_get_cpu_flags_x86(void) {
};
} cpu;
dav1d_cpu_cpuid(&cpu.r, 0, 0);
unsigned flags = 0;
unsigned flags = dav1d_get_default_cpu_flags();

if (cpu.max_leaf >= 1) {
CpuidRegisters r;