Bug 1540830 - Update dav1d from upstream to 1f7a7e8. r=TD-Linux

Differential Revision: https://phabricator.services.mozilla.com/D28200

--HG--
extra : moz-landing-system : lando
Alex Chronopoulos 2019-04-19 20:36:10 +00:00
Parent 931da4b767
Commit d1bd6b015b
38 changed files: 8762 additions and 945 deletions


@ -20,7 +20,7 @@ origin:
# Human-readable identifier for this version/release
# Generally "version NNN", "tag SSS", "bookmark SSS"
release: commit 7350c59e7894cb7e487a0add9942d2b1b39f7161 (2019-03-16T23:17:05.000Z).
release: commit 1f7a7e8a6af739a05b320151d04f0f7509ae7579 (2019-04-19T07:16:39.000Z).
# The package's license, where possible using the mnemonic from
# https://spdx.org/licenses/


@ -1,2 +1,2 @@
/* auto-generated, do not edit */
#define DAV1D_VERSION "0.2.2"
#define DAV1D_VERSION "1f7a7e8a6af739a05b320151d04f0f7509ae7579"

third_party/dav1d/.gitlab-ci.yml (vendored): 6 lines changed

@ -12,6 +12,12 @@ style-check:
script:
- git grep -n -P "\t|\r| $" -- . ':(exclude)*/compat/*' && exit 1
- git grep -n -i "david" -- . ':(exclude)THANKS.md' ':(exclude).gitlab-ci.yml' && exit 1
- for i in $(git ls-files -- . ':(exclude)*/compat/*'); do
if [ -n "$(tail -c 1 "$i")" ]; then
echo "No newline at end of $i";
exit 1;
fi;
done
- git remote rm upstream 2> /dev/null || true
- git remote add upstream https://code.videolan.org/videolan/dav1d.git
- git fetch -q upstream master

third_party/dav1d/NEWS (vendored): 8 lines changed

@ -1,6 +1,14 @@
Changes for 0.2.2 'Antelope':
----------------------------
- Large improvement on MSAC decoding with SSE, bringing 4-6% speed increase
The impact is important on SSSE3, SSE4 and AVX-2 cpus
- SSSE3 optimizations for all blocks size in itx
- SSSE3 optimizations for ipred_paeth and ipred_cfl (420, 422 and 444)
- Speed improvements on CDEF for SSE4 CPUs
- NEON optimizations for SGR and loop filter
- Minor crashes, improvements and build changes
Changes for 0.2.1 'Antelope':
----------------------------

third_party/dav1d/include/dav1d/common.h (vendored): 4 lines changed

@ -33,7 +33,11 @@
#ifndef DAV1D_API
#if defined _WIN32
#if defined DAV1D_BUILDING_DLL
#define DAV1D_API __declspec(dllexport)
#else
#define DAV1D_API
#endif
#else
#if __GNUC__ >= 4
#define DAV1D_API __attribute__ ((visibility ("default")))
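
For reference, a minimal sketch of how the revised DAV1D_API macro resolves on Windows (the -DDAV1D_BUILDING_DLL define is added by the meson.build change later in this patch; the expansion itself just restates the #if block above):

    /* when building libdav1d itself as a DLL (meson passes -DDAV1D_BUILDING_DLL): */
    #define DAV1D_API __declspec(dllexport)
    /* when an application merely includes the public headers: */
    #define DAV1D_API    /* empty: plain default linkage, no dllimport */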

third_party/dav1d/src/arm/32/mc.S (vendored): 18 lines changed

@ -217,8 +217,8 @@ bidir_fn mask
// This has got the same signature as the put_8tap functions,
// assumes that the caller has loaded the h argument into r5,
// and assumes that r8 is set to (24-clz(w)).
function put
// and assumes that r8 is set to (clz(w)-24).
function put_neon
adr r9, L(put_tbl)
ldr r8, [r9, r8, lsl #2]
add r9, r9, r8
@ -307,9 +307,9 @@ endfunc
// This has got the same signature as the put_8tap functions,
// assumes that the caller has loaded the h argument into r5,
// and assumes that r8 is set to (24-clz(w)), and r7 to w*2.
function prep
// assumes that the caller has loaded the h argument into r4,
// and assumes that r8 is set to (clz(w)-24), and r7 to w*2.
function prep_neon
adr r9, L(prep_tbl)
ldr r8, [r9, r8, lsl #2]
add r9, r9, r8
@ -660,7 +660,7 @@ function \op\()_8tap_\type\()_8bpc_neon, export=1
push {r4-r11,lr}
movw r8, \type_h
movw r9, \type_v
b \op\()_8tap
b \op\()_8tap_neon
endfunc
.endm
@ -680,7 +680,7 @@ make_8tap_fn \type, sharp, SHARP, SHARP
make_8tap_fn \type, sharp_regular, SHARP, REGULAR
make_8tap_fn \type, sharp_smooth, SHARP, SMOOTH
function \type\()_8tap
function \type\()_8tap_neon
ldrd r4, r5, [sp, #36]
ldrd r6, r7, [sp, #44]
movw r10, #0x4081 // (1 << 14) | (1 << 7) | (1 << 0)
@ -699,7 +699,7 @@ function \type\()_8tap
bne L(\type\()_8tap_h)
tst \my, #(0x7f << 14)
bne L(\type\()_8tap_v)
b \type
b \type\()_neon
L(\type\()_8tap_h):
cmp \w, #4
@ -1831,7 +1831,7 @@ function \type\()_bilin_8bpc_neon, export=1
bne L(\type\()_bilin_h)
cmp \my, #0
bne L(\type\()_bilin_v)
b \type
b \type\()_neon
L(\type\()_bilin_h):
cmp \my, #0

third_party/dav1d/src/arm/64/loopfilter.S (vendored, new file): 1124 lines changed
Diff not shown due to its large size.

third_party/dav1d/src/arm/64/looprestoration.S (vendored): 1372 lines changed
Diff not shown due to its large size.

third_party/dav1d/src/arm/64/mc.S (vendored): 34 lines changed

@ -235,8 +235,8 @@ bidir_fn mask
// This has got the same signature as the put_8tap functions,
// and assumes that x8 is set to (24-clz(w)).
function put
// and assumes that x8 is set to (clz(w)-24).
function put_neon
adr x9, L(put_tbl)
ldrh w8, [x9, x8, lsl #1]
sub x9, x9, w8, uxtw
@ -330,8 +330,8 @@ endfunc
// This has got the same signature as the prep_8tap functions,
// and assumes that x8 is set to (24-clz(w)), and x7 to w*2.
function prep
// and assumes that x8 is set to (clz(w)-24), and x7 to w*2.
function prep_neon
adr x9, L(prep_tbl)
ldrh w8, [x9, x8, lsl #1]
sub x9, x9, w8, uxtw
@ -703,7 +703,7 @@ endfunc
function \op\()_8tap_\type\()_8bpc_neon, export=1
mov x8, \type_h
mov x9, \type_v
b \op\()_8tap
b \op\()_8tap\()_neon
endfunc
.endm
@ -723,7 +723,7 @@ make_8tap_fn \type, sharp, SHARP, SHARP
make_8tap_fn \type, sharp_regular, SHARP, REGULAR
make_8tap_fn \type, sharp_smooth, SHARP, SMOOTH
function \type\()_8tap
function \type\()_8tap_neon
mov w10, #0x4081 // (1 << 14) | (1 << 7) | (1 << 0)
mul \mx, \mx, w10
mul \my, \my, w10
@ -741,7 +741,7 @@ function \type\()_8tap
b.ne L(\type\()_8tap_h)
tst \my, #(0x7f << 14)
b.ne L(\type\()_8tap_v)
b \type
b \type\()_neon
L(\type\()_8tap_h):
cmp \w, #4
@ -1826,7 +1826,7 @@ function \type\()_bilin_8bpc_neon, export=1
sub w8, w8, #24
cbnz \mx, L(\type\()_bilin_h)
cbnz \my, L(\type\()_bilin_v)
b \type
b \type\()_neon
L(\type\()_bilin_h):
cbnz \my, L(\type\()_bilin_hv)
@ -2335,7 +2335,7 @@ filter_fn prep, x0, x7, x1, x2, w3, w4, w5, x5, w6, x6, x8, x9, 6
add \src, \src, \inc
.endm
function warp_filter_horz
function warp_filter_horz_neon
add w12, w5, #512
ld1 {v16.8b, v17.8b}, [x2], x3
@ -2431,24 +2431,24 @@ function warp_affine_8x8\t\()_8bpc_neon, export=1
lsl x1, x1, #1
.endif
bl warp_filter_horz
bl warp_filter_horz_neon
mov v24.16b, v16.16b
bl warp_filter_horz
bl warp_filter_horz_neon
mov v25.16b, v16.16b
bl warp_filter_horz
bl warp_filter_horz_neon
mov v26.16b, v16.16b
bl warp_filter_horz
bl warp_filter_horz_neon
mov v27.16b, v16.16b
bl warp_filter_horz
bl warp_filter_horz_neon
mov v28.16b, v16.16b
bl warp_filter_horz
bl warp_filter_horz_neon
mov v29.16b, v16.16b
bl warp_filter_horz
bl warp_filter_horz_neon
mov v30.16b, v16.16b
1:
add w14, w6, #512
bl warp_filter_horz
bl warp_filter_horz_neon
mov v31.16b, v16.16b
load_filter_row d0, w14, w9

third_party/dav1d/src/arm/64/util.S (vendored): 41 lines changed

@ -88,4 +88,45 @@
trn2 \r7\().2s, \r9\().2s, \r7\().2s
.endm
.macro transpose_8x16b r0, r1, r2, r3, r4, r5, r6, r7, r8, r9
trn1 \r8\().16b, \r0\().16b, \r1\().16b
trn2 \r9\().16b, \r0\().16b, \r1\().16b
trn1 \r1\().16b, \r2\().16b, \r3\().16b
trn2 \r3\().16b, \r2\().16b, \r3\().16b
trn1 \r0\().16b, \r4\().16b, \r5\().16b
trn2 \r5\().16b, \r4\().16b, \r5\().16b
trn1 \r2\().16b, \r6\().16b, \r7\().16b
trn2 \r7\().16b, \r6\().16b, \r7\().16b
trn1 \r4\().8h, \r0\().8h, \r2\().8h
trn2 \r2\().8h, \r0\().8h, \r2\().8h
trn1 \r6\().8h, \r5\().8h, \r7\().8h
trn2 \r7\().8h, \r5\().8h, \r7\().8h
trn1 \r5\().8h, \r9\().8h, \r3\().8h
trn2 \r9\().8h, \r9\().8h, \r3\().8h
trn1 \r3\().8h, \r8\().8h, \r1\().8h
trn2 \r8\().8h, \r8\().8h, \r1\().8h
trn1 \r0\().4s, \r3\().4s, \r4\().4s
trn2 \r4\().4s, \r3\().4s, \r4\().4s
trn1 \r1\().4s, \r5\().4s, \r6\().4s
trn2 \r5\().4s, \r5\().4s, \r6\().4s
trn2 \r6\().4s, \r8\().4s, \r2\().4s
trn1 \r2\().4s, \r8\().4s, \r2\().4s
trn1 \r3\().4s, \r9\().4s, \r7\().4s
trn2 \r7\().4s, \r9\().4s, \r7\().4s
.endm
.macro transpose_4x16b r0, r1, r2, r3, t4, t5, t6, t7
trn1 \t4\().16b, \r0\().16b, \r1\().16b
trn2 \t5\().16b, \r0\().16b, \r1\().16b
trn1 \t6\().16b, \r2\().16b, \r3\().16b
trn2 \t7\().16b, \r2\().16b, \r3\().16b
trn1 \r0\().8h, \t4\().8h, \t6\().8h
trn2 \r2\().8h, \t4\().8h, \t6\().8h
trn1 \r1\().8h, \t5\().8h, \t7\().8h
trn2 \r3\().8h, \t5\().8h, \t7\().8h
.endm
#endif /* DAV1D_SRC_ARM_64_UTIL_S */

third_party/dav1d/src/arm/loopfilter_init_tmpl.c (vendored, new file): 47 lines changed

@ -0,0 +1,47 @@
/*
* Copyright © 2018, VideoLAN and dav1d authors
* Copyright © 2018, Two Orioles, LLC
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
* ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "src/cpu.h"
#include "src/loopfilter.h"
decl_loopfilter_sb_fn(dav1d_lpf_h_sb_y_neon);
decl_loopfilter_sb_fn(dav1d_lpf_v_sb_y_neon);
decl_loopfilter_sb_fn(dav1d_lpf_h_sb_uv_neon);
decl_loopfilter_sb_fn(dav1d_lpf_v_sb_uv_neon);
void bitfn(dav1d_loop_filter_dsp_init_arm)(Dav1dLoopFilterDSPContext *const c) {
const unsigned flags = dav1d_get_cpu_flags();
if (!(flags & DAV1D_ARM_CPU_FLAG_NEON)) return;
#if BITDEPTH == 8 && ARCH_AARCH64
c->loop_filter_sb[0][0] = dav1d_lpf_h_sb_y_neon;
c->loop_filter_sb[0][1] = dav1d_lpf_v_sb_y_neon;
c->loop_filter_sb[1][0] = dav1d_lpf_h_sb_uv_neon;
c->loop_filter_sb[1][1] = dav1d_lpf_v_sb_uv_neon;
#endif
}


@ -29,6 +29,7 @@
#include "src/looprestoration.h"
#include "common/attributes.h"
#include "src/tables.h"
#if BITDEPTH == 8
// This calculates things slightly differently than the reference C version.
@ -91,7 +92,171 @@ static void wiener_filter_neon(pixel *const dst, const ptrdiff_t dst_stride,
dav1d_copy_narrow_neon(dst + (w & ~7), dst_stride, tmp, w & 7, h);
}
}
#endif
#if ARCH_AARCH64
void dav1d_sgr_box3_h_neon(int32_t *sumsq, int16_t *sum,
const pixel (*left)[4],
const pixel *src, const ptrdiff_t stride,
const int w, const int h,
const enum LrEdgeFlags edges);
void dav1d_sgr_box3_v_neon(int32_t *sumsq, int16_t *sum,
const int w, const int h,
const enum LrEdgeFlags edges);
void dav1d_sgr_calc_ab1_neon(int32_t *a, int16_t *b,
const int w, const int h, const int strength);
void dav1d_sgr_finish_filter1_neon(coef *tmp,
const pixel *src, const ptrdiff_t stride,
const int32_t *a, const int16_t *b,
const int w, const int h);
/* filter with a 3x3 box (radius=1) */
static void dav1d_sgr_filter1_neon(coef *tmp,
const pixel *src, const ptrdiff_t stride,
const pixel (*left)[4],
const pixel *lpf, const ptrdiff_t lpf_stride,
const int w, const int h, const int strength,
const enum LrEdgeFlags edges)
{
ALIGN_STK_16(int32_t, sumsq_mem, (384 + 16) * 68 + 8,);
int32_t *const sumsq = &sumsq_mem[(384 + 16) * 2 + 8], *const a = sumsq;
ALIGN_STK_16(int16_t, sum_mem, (384 + 16) * 68 + 16,);
int16_t *const sum = &sum_mem[(384 + 16) * 2 + 16], *const b = sum;
dav1d_sgr_box3_h_neon(sumsq, sum, left, src, stride, w, h, edges);
if (edges & LR_HAVE_TOP)
dav1d_sgr_box3_h_neon(&sumsq[-2 * (384 + 16)], &sum[-2 * (384 + 16)],
NULL, lpf, lpf_stride, w, 1, edges);
if (edges & LR_HAVE_BOTTOM)
dav1d_sgr_box3_h_neon(&sumsq[h * (384 + 16)], &sum[h * (384 + 16)],
NULL, lpf + 6 * PXSTRIDE(lpf_stride),
lpf_stride, w, 2, edges);
dav1d_sgr_box3_v_neon(sumsq, sum, w, h, edges);
dav1d_sgr_calc_ab1_neon(a, b, w, h, strength);
dav1d_sgr_finish_filter1_neon(tmp, src, stride, a, b, w, h);
}
void dav1d_sgr_box5_h_neon(int32_t *sumsq, int16_t *sum,
const pixel (*left)[4],
const pixel *src, const ptrdiff_t stride,
const int w, const int h,
const enum LrEdgeFlags edges);
void dav1d_sgr_box5_v_neon(int32_t *sumsq, int16_t *sum,
const int w, const int h,
const enum LrEdgeFlags edges);
void dav1d_sgr_calc_ab2_neon(int32_t *a, int16_t *b,
const int w, const int h, const int strength);
void dav1d_sgr_finish_filter2_neon(coef *tmp,
const pixel *src, const ptrdiff_t stride,
const int32_t *a, const int16_t *b,
const int w, const int h);
/* filter with a 5x5 box (radius=2) */
static void dav1d_sgr_filter2_neon(coef *tmp,
const pixel *src, const ptrdiff_t stride,
const pixel (*left)[4],
const pixel *lpf, const ptrdiff_t lpf_stride,
const int w, const int h, const int strength,
const enum LrEdgeFlags edges)
{
ALIGN_STK_16(int32_t, sumsq_mem, (384 + 16) * 68 + 8,);
int32_t *const sumsq = &sumsq_mem[(384 + 16) * 2 + 8], *const a = sumsq;
ALIGN_STK_16(int16_t, sum_mem, (384 + 16) * 68 + 16,);
int16_t *const sum = &sum_mem[(384 + 16) * 2 + 16], *const b = sum;
dav1d_sgr_box5_h_neon(sumsq, sum, left, src, stride, w, h, edges);
if (edges & LR_HAVE_TOP)
dav1d_sgr_box5_h_neon(&sumsq[-2 * (384 + 16)], &sum[-2 * (384 + 16)],
NULL, lpf, lpf_stride, w, 2, edges);
if (edges & LR_HAVE_BOTTOM)
dav1d_sgr_box5_h_neon(&sumsq[h * (384 + 16)], &sum[h * (384 + 16)],
NULL, lpf + 6 * PXSTRIDE(lpf_stride),
lpf_stride, w, 2, edges);
dav1d_sgr_box5_v_neon(sumsq, sum, w, h, edges);
dav1d_sgr_calc_ab2_neon(a, b, w, h, strength);
dav1d_sgr_finish_filter2_neon(tmp, src, stride, a, b, w, h);
}
void dav1d_sgr_weighted1_neon(pixel *dst, const ptrdiff_t dst_stride,
const pixel *src, const ptrdiff_t src_stride,
const coef *t1, const int w, const int h,
const int wt);
void dav1d_sgr_weighted2_neon(pixel *dst, const ptrdiff_t dst_stride,
const pixel *src, const ptrdiff_t src_stride,
const coef *t1, const coef *t2,
const int w, const int h,
const int16_t wt[2]);
static void sgr_filter_neon(pixel *const dst, const ptrdiff_t dst_stride,
const pixel (*const left)[4],
const pixel *lpf, const ptrdiff_t lpf_stride,
const int w, const int h, const int sgr_idx,
const int16_t sgr_wt[7], const enum LrEdgeFlags edges)
{
if (!dav1d_sgr_params[sgr_idx][0]) {
ALIGN_STK_16(coef, tmp, 64 * 384,);
dav1d_sgr_filter1_neon(tmp, dst, dst_stride, left, lpf, lpf_stride,
w, h, dav1d_sgr_params[sgr_idx][3], edges);
if (w >= 8)
dav1d_sgr_weighted1_neon(dst, dst_stride, dst, dst_stride,
tmp, w & ~7, h, (1 << 7) - sgr_wt[1]);
if (w & 7) {
// For uneven widths, do a full 8 pixel wide filtering into a temp
// buffer and copy out the narrow slice of pixels separately into
// dest.
ALIGN_STK_16(pixel, stripe, 64 * 8,);
dav1d_sgr_weighted1_neon(stripe, w & 7, dst + (w & ~7), dst_stride,
tmp + (w & ~7), w & 7, h,
(1 << 7) - sgr_wt[1]);
dav1d_copy_narrow_neon(dst + (w & ~7), dst_stride, stripe,
w & 7, h);
}
} else if (!dav1d_sgr_params[sgr_idx][1]) {
ALIGN_STK_16(coef, tmp, 64 * 384,);
dav1d_sgr_filter2_neon(tmp, dst, dst_stride, left, lpf, lpf_stride,
w, h, dav1d_sgr_params[sgr_idx][2], edges);
if (w >= 8)
dav1d_sgr_weighted1_neon(dst, dst_stride, dst, dst_stride,
tmp, w & ~7, h, sgr_wt[0]);
if (w & 7) {
// For uneven widths, do a full 8 pixel wide filtering into a temp
// buffer and copy out the narrow slice of pixels separately into
// dest.
ALIGN_STK_16(pixel, stripe, 64 * 8,);
dav1d_sgr_weighted1_neon(stripe, w & 7, dst + (w & ~7), dst_stride,
tmp + (w & ~7), w & 7, h, sgr_wt[0]);
dav1d_copy_narrow_neon(dst + (w & ~7), dst_stride, stripe,
w & 7, h);
}
} else {
ALIGN_STK_16(coef, tmp1, 64 * 384,);
ALIGN_STK_16(coef, tmp2, 64 * 384,);
dav1d_sgr_filter2_neon(tmp1, dst, dst_stride, left, lpf, lpf_stride,
w, h, dav1d_sgr_params[sgr_idx][2], edges);
dav1d_sgr_filter1_neon(tmp2, dst, dst_stride, left, lpf, lpf_stride,
w, h, dav1d_sgr_params[sgr_idx][3], edges);
const int16_t wt[2] = { sgr_wt[0], 128 - sgr_wt[0] - sgr_wt[1] };
if (w >= 8)
dav1d_sgr_weighted2_neon(dst, dst_stride, dst, dst_stride,
tmp1, tmp2, w & ~7, h, wt);
if (w & 7) {
// For uneven widths, do a full 8 pixel wide filtering into a temp
// buffer and copy out the narrow slice of pixels separately into
// dest.
ALIGN_STK_16(pixel, stripe, 64 * 8,);
dav1d_sgr_weighted2_neon(stripe, w & 7, dst + (w & ~7), dst_stride,
tmp1 + (w & ~7), tmp2 + (w & ~7),
w & 7, h, wt);
dav1d_copy_narrow_neon(dst + (w & ~7), dst_stride, stripe,
w & 7, h);
}
}
}
#endif // ARCH_AARCH64
#endif // BITDEPTH == 8
void bitfn(dav1d_loop_restoration_dsp_init_arm)(Dav1dLoopRestorationDSPContext *const c) {
const unsigned flags = dav1d_get_cpu_flags();
@ -100,5 +265,8 @@ void bitfn(dav1d_loop_restoration_dsp_init_arm)(Dav1dLoopRestorationDSPContext *
#if BITDEPTH == 8
c->wiener = wiener_filter_neon;
#if ARCH_AARCH64
c->selfguided = sgr_filter_neon;
#endif
#endif
}

third_party/dav1d/src/cdf.c (vendored): 2 lines changed

@ -813,7 +813,7 @@ static const uint16_t default_mv_joint_cdf[N_MV_JOINTS + 1] = {
AOM_CDF4(4096, 11264, 19328)
};
static const uint16_t default_kf_y_mode_cdf[5][5][N_INTRA_PRED_MODES + 1] = {
static const uint16_t default_kf_y_mode_cdf[5][5][N_INTRA_PRED_MODES + 1 + 2] = {
{
{ AOM_CDF13(15588, 17027, 19338, 20218, 20682, 21110, 21825, 23244,
24189, 28165, 29093, 30466) },

third_party/dav1d/src/cdf.h (vendored): 20 lines changed

@ -34,11 +34,13 @@
#include "src/ref.h"
#include "src/thread_data.h"
/* Buffers padded to [8] or [16] for SIMD where needed. */
typedef struct CdfModeContext {
uint16_t y_mode[4][N_INTRA_PRED_MODES + 1];
uint16_t y_mode[4][N_INTRA_PRED_MODES + 1 + 2];
uint16_t use_filter_intra[N_BS_SIZES][2];
uint16_t filter_intra[5 + 1];
uint16_t uv_mode[2][N_INTRA_PRED_MODES][N_UV_INTRA_PRED_MODES + 1];
uint16_t uv_mode[2][N_INTRA_PRED_MODES][N_UV_INTRA_PRED_MODES + 1 + 1];
uint16_t angle_delta[8][8];
uint16_t filter[2][8][DAV1D_N_SWITCHABLE_FILTERS + 1];
uint16_t newmv_mode[6][2];
@ -66,7 +68,7 @@ typedef struct CdfModeContext {
uint16_t txtp_intra[3][N_TX_SIZES][N_INTRA_PRED_MODES][N_TX_TYPES + 1];
uint16_t skip[3][2];
uint16_t skip_mode[3][2];
uint16_t partition[N_BL_LEVELS][4][N_PARTITIONS + 1];
uint16_t partition[N_BL_LEVELS][4][N_PARTITIONS + 1 + 5];
uint16_t seg_pred[3][2];
uint16_t seg_id[3][DAV1D_MAX_SEGMENTS + 1];
uint16_t cfl_sign[8 + 1];
@ -88,12 +90,12 @@ typedef struct CdfModeContext {
typedef struct CdfCoefContext {
uint16_t skip[N_TX_SIZES][13][2];
uint16_t eob_bin_16[2][2][6];
uint16_t eob_bin_32[2][2][7];
uint16_t eob_bin_32[2][2][7 + 1];
uint16_t eob_bin_64[2][2][8];
uint16_t eob_bin_128[2][2][9];
uint16_t eob_bin_256[2][2][10];
uint16_t eob_bin_512[2][2][11];
uint16_t eob_bin_1024[2][2][12];
uint16_t eob_bin_256[2][2][10 + 6];
uint16_t eob_bin_512[2][2][11 + 5];
uint16_t eob_bin_1024[2][2][12 + 4];
uint16_t eob_hi_bit[N_TX_SIZES][2][11 /*22*/][2];
uint16_t eob_base_tok[N_TX_SIZES][2][4][4];
uint16_t base_tok[N_TX_SIZES][2][41][5];
@ -102,7 +104,7 @@ typedef struct CdfCoefContext {
} CdfCoefContext;
typedef struct CdfMvComponent {
uint16_t classes[11 + 1];
uint16_t classes[11 + 1 + 4];
uint16_t class0[2];
uint16_t classN[10][2];
uint16_t class0_fp[2][4 + 1];
@ -119,7 +121,7 @@ typedef struct CdfMvContext {
typedef struct CdfContext {
CdfModeContext m;
uint16_t kfym[5][5][N_INTRA_PRED_MODES + 1];
uint16_t kfym[5][5][N_INTRA_PRED_MODES + 1 + 2];
CdfCoefContext coef;
CdfMvContext mv, dmv;
} CdfContext;
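
A quick check of the padding arithmetic above: the numeric sizes are all rounded up to the 8- or 16-entry widths that the SIMD symbol decoders load, e.g. eob_bin_32 grows from 7 to 7 + 1 = 8 entries, while eob_bin_256/512/1024 grow from 10/11/12 to 10 + 6 = 11 + 5 = 12 + 4 = 16; the enum-based additions (+ 2, + 1, + 5, + 4) presumably round their tables up to the same 8/16 boundaries.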

third_party/dav1d/src/decode.c (vendored): 60 lines changed

@ -80,14 +80,14 @@ static int read_mv_component_diff(Dav1dTileContext *const t,
const Dav1dFrameContext *const f = t->f;
const int have_hp = f->frame_hdr->hp;
const int sign = dav1d_msac_decode_bool_adapt(&ts->msac, mv_comp->sign);
const int cl = dav1d_msac_decode_symbol_adapt(&ts->msac,
const int cl = dav1d_msac_decode_symbol_adapt16(&ts->msac,
mv_comp->classes, 11);
int up, fp, hp;
if (!cl) {
up = dav1d_msac_decode_bool_adapt(&ts->msac, mv_comp->class0);
if (have_fp) {
fp = dav1d_msac_decode_symbol_adapt(&ts->msac,
fp = dav1d_msac_decode_symbol_adapt4(&ts->msac,
mv_comp->class0_fp[up], 4);
hp = have_hp ? dav1d_msac_decode_bool_adapt(&ts->msac,
mv_comp->class0_hp) : 1;
@ -101,7 +101,7 @@ static int read_mv_component_diff(Dav1dTileContext *const t,
up |= dav1d_msac_decode_bool_adapt(&ts->msac,
mv_comp->classN[n]) << n;
if (have_fp) {
fp = dav1d_msac_decode_symbol_adapt(&ts->msac,
fp = dav1d_msac_decode_symbol_adapt4(&ts->msac,
mv_comp->classN_fp, 4);
hp = have_hp ? dav1d_msac_decode_bool_adapt(&ts->msac,
mv_comp->classN_hp) : 1;
@ -119,7 +119,7 @@ static int read_mv_component_diff(Dav1dTileContext *const t,
static void read_mv_residual(Dav1dTileContext *const t, mv *const ref_mv,
CdfMvContext *const mv_cdf, const int have_fp)
{
switch (dav1d_msac_decode_symbol_adapt(&t->ts->msac, t->ts->cdf.mv.joint,
switch (dav1d_msac_decode_symbol_adapt4(&t->ts->msac, t->ts->cdf.mv.joint,
N_MV_JOINTS))
{
case MV_JOINT_HV:
@ -379,7 +379,7 @@ static void read_pal_plane(Dav1dTileContext *const t, Av1Block *const b,
{
Dav1dTileState *const ts = t->ts;
const Dav1dFrameContext *const f = t->f;
const int pal_sz = b->pal_sz[pl] = dav1d_msac_decode_symbol_adapt(&ts->msac,
const int pal_sz = b->pal_sz[pl] = dav1d_msac_decode_symbol_adapt8(&ts->msac,
ts->cdf.m.pal_sz[pl][sz_ctx], 7) + 2;
uint16_t cache[16], used_cache[8];
int l_cache = pl ? t->pal_sz_uv[1][by4] : t->l.pal_sz[by4];
@ -595,7 +595,7 @@ static void read_pal_indices(Dav1dTileContext *const t,
const int last = imax(0, i - h4 * 4 + 1);
order_palette(pal_idx, stride, i, first, last, order, ctx);
for (int j = first, m = 0; j >= last; j--, m++) {
const int color_idx = dav1d_msac_decode_symbol_adapt(&ts->msac,
const int color_idx = dav1d_msac_decode_symbol_adapt8(&ts->msac,
color_map_cdf[ctx[m]], b->pal_sz[pl]);
pal_idx[(i - j) * stride + j] = order[m][color_idx];
}
@ -811,7 +811,7 @@ static int decode_b(Dav1dTileContext *const t,
const unsigned pred_seg_id =
get_cur_frame_segid(t->by, t->bx, have_top, have_left,
&seg_ctx, f->cur_segmap, f->b4_stride);
const unsigned diff = dav1d_msac_decode_symbol_adapt(&ts->msac,
const unsigned diff = dav1d_msac_decode_symbol_adapt8(&ts->msac,
ts->cdf.m.seg_id[seg_ctx],
DAV1D_MAX_SEGMENTS);
const unsigned last_active_seg_id =
@ -883,7 +883,7 @@ static int decode_b(Dav1dTileContext *const t,
if (b->skip) {
b->seg_id = pred_seg_id;
} else {
const unsigned diff = dav1d_msac_decode_symbol_adapt(&ts->msac,
const unsigned diff = dav1d_msac_decode_symbol_adapt8(&ts->msac,
ts->cdf.m.seg_id[seg_ctx],
DAV1D_MAX_SEGMENTS);
const unsigned last_active_seg_id =
@ -932,7 +932,7 @@ static int decode_b(Dav1dTileContext *const t,
memcpy(prev_delta_lf, ts->last_delta_lf, 4);
if (have_delta_q) {
int delta_q = dav1d_msac_decode_symbol_adapt(&ts->msac,
int delta_q = dav1d_msac_decode_symbol_adapt4(&ts->msac,
ts->cdf.m.delta_q, 4);
if (delta_q == 3) {
const int n_bits = 1 + dav1d_msac_decode_bools(&ts->msac, 3);
@ -953,7 +953,7 @@ static int decode_b(Dav1dTileContext *const t,
f->cur.p.layout != DAV1D_PIXEL_LAYOUT_I400 ? 4 : 2 : 1;
for (int i = 0; i < n_lfs; i++) {
int delta_lf = dav1d_msac_decode_symbol_adapt(&ts->msac,
int delta_lf = dav1d_msac_decode_symbol_adapt4(&ts->msac,
ts->cdf.m.delta_lf[i + f->frame_hdr->delta.lf.multi], 4);
if (delta_lf == 3) {
const int n_bits = 1 + dav1d_msac_decode_bools(&ts->msac, 3);
@ -1018,7 +1018,7 @@ static int decode_b(Dav1dTileContext *const t,
ts->cdf.m.y_mode[dav1d_ymode_size_context[bs]] :
ts->cdf.kfym[dav1d_intra_mode_context[t->a->mode[bx4]]]
[dav1d_intra_mode_context[t->l.mode[by4]]];
b->y_mode = dav1d_msac_decode_symbol_adapt(&ts->msac, ymode_cdf,
b->y_mode = dav1d_msac_decode_symbol_adapt16(&ts->msac, ymode_cdf,
N_INTRA_PRED_MODES);
if (DEBUG_BLOCK_INFO)
printf("Post-ymode[%d]: r=%d\n", b->y_mode, ts->msac.rng);
@ -1028,7 +1028,7 @@ static int decode_b(Dav1dTileContext *const t,
b->y_mode <= VERT_LEFT_PRED)
{
uint16_t *const acdf = ts->cdf.m.angle_delta[b->y_mode - VERT_PRED];
const int angle = dav1d_msac_decode_symbol_adapt(&ts->msac, acdf, 7);
const int angle = dav1d_msac_decode_symbol_adapt8(&ts->msac, acdf, 7);
b->y_angle = angle - 3;
} else {
b->y_angle = 0;
@ -1038,20 +1038,20 @@ static int decode_b(Dav1dTileContext *const t,
const int cfl_allowed = f->frame_hdr->segmentation.lossless[b->seg_id] ?
cbw4 == 1 && cbh4 == 1 : !!(cfl_allowed_mask & (1 << bs));
uint16_t *const uvmode_cdf = ts->cdf.m.uv_mode[cfl_allowed][b->y_mode];
b->uv_mode = dav1d_msac_decode_symbol_adapt(&ts->msac, uvmode_cdf,
b->uv_mode = dav1d_msac_decode_symbol_adapt16(&ts->msac, uvmode_cdf,
N_UV_INTRA_PRED_MODES - !cfl_allowed);
if (DEBUG_BLOCK_INFO)
printf("Post-uvmode[%d]: r=%d\n", b->uv_mode, ts->msac.rng);
if (b->uv_mode == CFL_PRED) {
#define SIGN(a) (!!(a) + ((a) > 0))
const int sign = dav1d_msac_decode_symbol_adapt(&ts->msac,
const int sign = dav1d_msac_decode_symbol_adapt8(&ts->msac,
ts->cdf.m.cfl_sign, 8) + 1;
const int sign_u = sign * 0x56 >> 8, sign_v = sign - sign_u * 3;
assert(sign_u == sign / 3);
if (sign_u) {
const int ctx = (sign_u == 2) * 3 + sign_v;
b->cfl_alpha[0] = dav1d_msac_decode_symbol_adapt(&ts->msac,
b->cfl_alpha[0] = dav1d_msac_decode_symbol_adapt16(&ts->msac,
ts->cdf.m.cfl_alpha[ctx], 16) + 1;
if (sign_u == 1) b->cfl_alpha[0] = -b->cfl_alpha[0];
} else {
@ -1059,7 +1059,7 @@ static int decode_b(Dav1dTileContext *const t,
}
if (sign_v) {
const int ctx = (sign_v == 2) * 3 + sign_u;
b->cfl_alpha[1] = dav1d_msac_decode_symbol_adapt(&ts->msac,
b->cfl_alpha[1] = dav1d_msac_decode_symbol_adapt16(&ts->msac,
ts->cdf.m.cfl_alpha[ctx], 16) + 1;
if (sign_v == 1) b->cfl_alpha[1] = -b->cfl_alpha[1];
} else {
@ -1073,7 +1073,7 @@ static int decode_b(Dav1dTileContext *const t,
b->uv_mode <= VERT_LEFT_PRED)
{
uint16_t *const acdf = ts->cdf.m.angle_delta[b->uv_mode - VERT_PRED];
const int angle = dav1d_msac_decode_symbol_adapt(&ts->msac, acdf, 7);
const int angle = dav1d_msac_decode_symbol_adapt8(&ts->msac, acdf, 7);
b->uv_angle = angle - 3;
} else {
b->uv_angle = 0;
@ -1113,7 +1113,7 @@ static int decode_b(Dav1dTileContext *const t,
ts->cdf.m.use_filter_intra[bs]);
if (is_filter) {
b->y_mode = FILTER_PRED;
b->y_angle = dav1d_msac_decode_symbol_adapt(&ts->msac,
b->y_angle = dav1d_msac_decode_symbol_adapt4(&ts->msac,
ts->cdf.m.filter_intra, 5);
}
if (DEBUG_BLOCK_INFO)
@ -1156,7 +1156,7 @@ static int decode_b(Dav1dTileContext *const t,
if (f->frame_hdr->txfm_mode == DAV1D_TX_SWITCHABLE && t_dim->max > TX_4X4) {
const int tctx = get_tx_ctx(t->a, &t->l, t_dim, by4, bx4);
uint16_t *const tx_cdf = ts->cdf.m.txsz[t_dim->max - 1][tctx];
int depth = dav1d_msac_decode_symbol_adapt(&ts->msac, tx_cdf,
int depth = dav1d_msac_decode_symbol_adapt4(&ts->msac, tx_cdf,
imin(t_dim->max + 1, 3));
while (depth--) {
@ -1474,7 +1474,7 @@ static int decode_b(Dav1dTileContext *const t,
ts->tiling.col_end, ts->tiling.row_start,
ts->tiling.row_end, f->libaom_cm);
b->inter_mode = dav1d_msac_decode_symbol_adapt(&ts->msac,
b->inter_mode = dav1d_msac_decode_symbol_adapt8(&ts->msac,
ts->cdf.m.comp_inter_mode[ctx],
N_COMP_INTER_PRED_MODES);
if (DEBUG_BLOCK_INFO)
@ -1583,7 +1583,7 @@ static int decode_b(Dav1dTileContext *const t,
dav1d_msac_decode_bool_adapt(&ts->msac,
ts->cdf.m.wedge_comp[ctx]);
if (b->comp_type == COMP_INTER_WEDGE)
b->wedge_idx = dav1d_msac_decode_symbol_adapt(&ts->msac,
b->wedge_idx = dav1d_msac_decode_symbol_adapt16(&ts->msac,
ts->cdf.m.wedge_idx[ctx], 16);
} else {
b->comp_type = COMP_INTER_SEG;
@ -1737,7 +1737,7 @@ static int decode_b(Dav1dTileContext *const t,
dav1d_msac_decode_bool_adapt(&ts->msac,
ts->cdf.m.interintra[ii_sz_grp]))
{
b->interintra_mode = dav1d_msac_decode_symbol_adapt(&ts->msac,
b->interintra_mode = dav1d_msac_decode_symbol_adapt4(&ts->msac,
ts->cdf.m.interintra_mode[ii_sz_grp],
N_INTER_INTRA_PRED_MODES);
const int wedge_ctx = dav1d_wedge_ctx_lut[bs];
@ -1745,7 +1745,7 @@ static int decode_b(Dav1dTileContext *const t,
dav1d_msac_decode_bool_adapt(&ts->msac,
ts->cdf.m.interintra_wedge[wedge_ctx]);
if (b->interintra_type == INTER_INTRA_WEDGE)
b->wedge_idx = dav1d_msac_decode_symbol_adapt(&ts->msac,
b->wedge_idx = dav1d_msac_decode_symbol_adapt16(&ts->msac,
ts->cdf.m.wedge_idx[wedge_ctx], 16);
} else {
b->interintra_type = INTER_INTRA_NONE;
@ -1778,7 +1778,7 @@ static int decode_b(Dav1dTileContext *const t,
f->frame_hdr->warp_motion && (mask[0] | mask[1]);
b->motion_mode = allow_warp ?
dav1d_msac_decode_symbol_adapt(&ts->msac,
dav1d_msac_decode_symbol_adapt4(&ts->msac,
ts->cdf.m.motion_mode[bs], 3) :
dav1d_msac_decode_bool_adapt(&ts->msac, ts->cdf.m.obmc[bs]);
if (b->motion_mode == MM_WARP) {
@ -1817,7 +1817,7 @@ static int decode_b(Dav1dTileContext *const t,
const int comp = b->comp_type != COMP_INTER_NONE;
const int ctx1 = get_filter_ctx(t->a, &t->l, comp, 0, b->ref[0],
by4, bx4);
filter[0] = dav1d_msac_decode_symbol_adapt(&ts->msac,
filter[0] = dav1d_msac_decode_symbol_adapt4(&ts->msac,
ts->cdf.m.filter[0][ctx1],
DAV1D_N_SWITCHABLE_FILTERS);
if (f->seq_hdr->dual_filter) {
@ -1826,7 +1826,7 @@ static int decode_b(Dav1dTileContext *const t,
if (DEBUG_BLOCK_INFO)
printf("Post-subpel_filter1[%d,ctx=%d]: r=%d\n",
filter[0], ctx1, ts->msac.rng);
filter[1] = dav1d_msac_decode_symbol_adapt(&ts->msac,
filter[1] = dav1d_msac_decode_symbol_adapt4(&ts->msac,
ts->cdf.m.filter[1][ctx2],
DAV1D_N_SWITCHABLE_FILTERS);
if (DEBUG_BLOCK_INFO)
@ -2021,7 +2021,7 @@ static int decode_sb(Dav1dTileContext *const t, const enum BlockLevel bl,
} else {
const unsigned n_part = bl == BL_8X8 ? N_SUB8X8_PARTITIONS :
bl == BL_128X128 ? N_PARTITIONS - 2 : N_PARTITIONS;
bp = dav1d_msac_decode_symbol_adapt(&t->ts->msac, pc, n_part);
bp = dav1d_msac_decode_symbol_adapt16(&t->ts->msac, pc, n_part);
if (f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I422 &&
(bp == PARTITION_V || bp == PARTITION_V4 ||
bp == PARTITION_T_LEFT_SPLIT || bp == PARTITION_T_RIGHT_SPLIT))
@ -2365,7 +2365,7 @@ static void read_restoration_info(Dav1dTileContext *const t,
Dav1dTileState *const ts = t->ts;
if (frame_type == DAV1D_RESTORATION_SWITCHABLE) {
const int filter = dav1d_msac_decode_symbol_adapt(&ts->msac,
const int filter = dav1d_msac_decode_symbol_adapt4(&ts->msac,
ts->cdf.m.restore_switchable, 3);
lr->type = filter ? filter == 2 ? DAV1D_RESTORATION_SGRPROJ :
DAV1D_RESTORATION_WIENER :
@ -2692,7 +2692,9 @@ int dav1d_decode_frame(Dav1dFrameContext *const f) {
freep(&f->lf.level);
freep(&f->frame_thread.b);
f->lf.mask = malloc(f->sb128w * f->sb128h * sizeof(*f->lf.mask));
f->lf.level = malloc(f->sb128w * f->sb128h * 32 * 32 *
// over-allocate by 3 bytes since some of the SIMD implementations
// index this from the level type and can thus over-read by up to 3
f->lf.level = malloc(3 + f->sb128w * f->sb128h * 32 * 32 *
sizeof(*f->lf.level));
if (!f->lf.mask || !f->lf.level) goto error;
if (c->n_fc > 1) {

third_party/dav1d/src/itx_tmpl.c (vendored): 103 lines changed

@ -45,7 +45,7 @@ typedef void (*itx_1d_fn)(const coef *in, ptrdiff_t in_s,
static void NOINLINE
inv_txfm_add_c(pixel *dst, const ptrdiff_t stride,
coef *const coeff, const int eob,
const int w, const int h, const int shift1, const int shift2,
const int w, const int h, const int shift,
const itx_1d_fn first_1d_fn, const itx_1d_fn second_1d_fn,
const int has_dconly HIGHBD_DECL_SUFFIX)
{
@ -53,8 +53,7 @@ inv_txfm_add_c(pixel *dst, const ptrdiff_t stride,
assert((h >= 4 || h <= 64) && (w >= 4 || w <= 64));
const int is_rect2 = w * 2 == h || h * 2 == w;
const int bitdepth = bitdepth_from_max(bitdepth_max);
const int rnd1 = (1 << shift1) >> 1;
const int rnd2 = (1 << shift2) >> 1;
const int rnd = (1 << shift) >> 1;
if (has_dconly && eob == 0) {
int dc = coeff[0];
@ -62,9 +61,9 @@ inv_txfm_add_c(pixel *dst, const ptrdiff_t stride,
if (is_rect2)
dc = (dc * 2896 + 2048) >> 12;
dc = (dc * 2896 + 2048) >> 12;
dc = (dc + rnd1) >> shift1;
dc = (dc + rnd) >> shift;
dc = (dc * 2896 + 2048) >> 12;
dc = (dc + rnd2) >> shift2;
dc = (dc + 8) >> 4;
for (j = 0; j < h; j++)
for (i = 0; i < w; i++)
dst[i + j * PXSTRIDE(stride)] =
@ -93,9 +92,9 @@ inv_txfm_add_c(pixel *dst, const ptrdiff_t stride,
}
for (j = 0; j < w; j++)
#if BITDEPTH == 8
tmp[i * w + j] = (tmp[i * w + j] + (rnd1)) >> shift1;
tmp[i * w + j] = (tmp[i * w + j] + rnd) >> shift;
#else
tmp[i * w + j] = iclip((tmp[i * w + j] + (rnd1)) >> shift1,
tmp[i * w + j] = iclip((tmp[i * w + j] + rnd) >> shift,
-col_clip_max - 1, col_clip_max);
#endif
}
@ -106,12 +105,12 @@ inv_txfm_add_c(pixel *dst, const ptrdiff_t stride,
for (j = 0; j < h; j++)
dst[i + j * PXSTRIDE(stride)] =
iclip_pixel(dst[i + j * PXSTRIDE(stride)] +
((out[j] + (rnd2)) >> shift2));
((out[j] + 8) >> 4));
}
memset(coeff, 0, sizeof(*coeff) * sh * sw);
}
#define inv_txfm_fn(type1, type2, w, h, shift1, shift2, has_dconly) \
#define inv_txfm_fn(type1, type2, w, h, shift, has_dconly) \
static void \
inv_txfm_add_##type1##_##type2##_##w##x##h##_c(pixel *dst, \
const ptrdiff_t stride, \
@ -119,57 +118,57 @@ inv_txfm_add_##type1##_##type2##_##w##x##h##_c(pixel *dst, \
const int eob \
HIGHBD_DECL_SUFFIX) \
{ \
inv_txfm_add_c(dst, stride, coeff, eob, w, h, shift1, shift2, \
inv_txfm_add_c(dst, stride, coeff, eob, w, h, shift, \
inv_##type1##w##_1d, inv_##type2##h##_1d, has_dconly \
HIGHBD_TAIL_SUFFIX); \
}
#define inv_txfm_fn64(w, h, shift1, shift2) \
inv_txfm_fn(dct, dct, w, h, shift1, shift2, 1)
#define inv_txfm_fn64(w, h, shift) \
inv_txfm_fn(dct, dct, w, h, shift, 1)
#define inv_txfm_fn32(w, h, shift1, shift2) \
inv_txfm_fn64(w, h, shift1, shift2) \
inv_txfm_fn(identity, identity, w, h, shift1, shift2, 0)
#define inv_txfm_fn32(w, h, shift) \
inv_txfm_fn64(w, h, shift) \
inv_txfm_fn(identity, identity, w, h, shift, 0)
#define inv_txfm_fn16(w, h, shift1, shift2) \
inv_txfm_fn32(w, h, shift1, shift2) \
inv_txfm_fn(adst, dct, w, h, shift1, shift2, 0) \
inv_txfm_fn(dct, adst, w, h, shift1, shift2, 0) \
inv_txfm_fn(adst, adst, w, h, shift1, shift2, 0) \
inv_txfm_fn(dct, flipadst, w, h, shift1, shift2, 0) \
inv_txfm_fn(flipadst, dct, w, h, shift1, shift2, 0) \
inv_txfm_fn(adst, flipadst, w, h, shift1, shift2, 0) \
inv_txfm_fn(flipadst, adst, w, h, shift1, shift2, 0) \
inv_txfm_fn(flipadst, flipadst, w, h, shift1, shift2, 0) \
inv_txfm_fn(identity, dct, w, h, shift1, shift2, 0) \
inv_txfm_fn(dct, identity, w, h, shift1, shift2, 0) \
#define inv_txfm_fn16(w, h, shift) \
inv_txfm_fn32(w, h, shift) \
inv_txfm_fn(adst, dct, w, h, shift, 0) \
inv_txfm_fn(dct, adst, w, h, shift, 0) \
inv_txfm_fn(adst, adst, w, h, shift, 0) \
inv_txfm_fn(dct, flipadst, w, h, shift, 0) \
inv_txfm_fn(flipadst, dct, w, h, shift, 0) \
inv_txfm_fn(adst, flipadst, w, h, shift, 0) \
inv_txfm_fn(flipadst, adst, w, h, shift, 0) \
inv_txfm_fn(flipadst, flipadst, w, h, shift, 0) \
inv_txfm_fn(identity, dct, w, h, shift, 0) \
inv_txfm_fn(dct, identity, w, h, shift, 0) \
#define inv_txfm_fn84(w, h, shift1, shift2) \
inv_txfm_fn16(w, h, shift1, shift2) \
inv_txfm_fn(identity, flipadst, w, h, shift1, shift2, 0) \
inv_txfm_fn(flipadst, identity, w, h, shift1, shift2, 0) \
inv_txfm_fn(identity, adst, w, h, shift1, shift2, 0) \
inv_txfm_fn(adst, identity, w, h, shift1, shift2, 0) \
#define inv_txfm_fn84(w, h, shift) \
inv_txfm_fn16(w, h, shift) \
inv_txfm_fn(identity, flipadst, w, h, shift, 0) \
inv_txfm_fn(flipadst, identity, w, h, shift, 0) \
inv_txfm_fn(identity, adst, w, h, shift, 0) \
inv_txfm_fn(adst, identity, w, h, shift, 0) \
inv_txfm_fn84( 4, 4, 0, 4)
inv_txfm_fn84( 4, 8, 0, 4)
inv_txfm_fn84( 4, 16, 1, 4)
inv_txfm_fn84( 8, 4, 0, 4)
inv_txfm_fn84( 8, 8, 1, 4)
inv_txfm_fn84( 8, 16, 1, 4)
inv_txfm_fn32( 8, 32, 2, 4)
inv_txfm_fn84(16, 4, 1, 4)
inv_txfm_fn84(16, 8, 1, 4)
inv_txfm_fn16(16, 16, 2, 4)
inv_txfm_fn32(16, 32, 1, 4)
inv_txfm_fn64(16, 64, 2, 4)
inv_txfm_fn32(32, 8, 2, 4)
inv_txfm_fn32(32, 16, 1, 4)
inv_txfm_fn32(32, 32, 2, 4)
inv_txfm_fn64(32, 64, 1, 4)
inv_txfm_fn64(64, 16, 2, 4)
inv_txfm_fn64(64, 32, 1, 4)
inv_txfm_fn64(64, 64, 2, 4)
inv_txfm_fn84( 4, 4, 0)
inv_txfm_fn84( 4, 8, 0)
inv_txfm_fn84( 4, 16, 1)
inv_txfm_fn84( 8, 4, 0)
inv_txfm_fn84( 8, 8, 1)
inv_txfm_fn84( 8, 16, 1)
inv_txfm_fn32( 8, 32, 2)
inv_txfm_fn84(16, 4, 1)
inv_txfm_fn84(16, 8, 1)
inv_txfm_fn16(16, 16, 2)
inv_txfm_fn32(16, 32, 1)
inv_txfm_fn64(16, 64, 2)
inv_txfm_fn32(32, 8, 2)
inv_txfm_fn32(32, 16, 1)
inv_txfm_fn32(32, 32, 2)
inv_txfm_fn64(32, 64, 1)
inv_txfm_fn64(64, 16, 2)
inv_txfm_fn64(64, 32, 1)
inv_txfm_fn64(64, 64, 2)
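
/* Note on the dropped parameter: every instantiation above previously passed
 * shift2 == 4, so hard-coding the second-stage rounding is equivalent:
 * rnd2 = (1 << 4) >> 1 = 8, and (x + rnd2) >> shift2 becomes the literal
 * (x + 8) >> 4 now used in inv_txfm_add_c. */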
static void inv_txfm_add_wht_wht_4x4_c(pixel *dst, const ptrdiff_t stride,
coef *const coeff, const int eob

third_party/dav1d/src/loopfilter.h (vendored): 1 line changed

@ -53,6 +53,7 @@ typedef struct Dav1dLoopFilterDSPContext {
} Dav1dLoopFilterDSPContext;
bitfn_decls(void dav1d_loop_filter_dsp_init, Dav1dLoopFilterDSPContext *c);
bitfn_decls(void dav1d_loop_filter_dsp_init_arm, Dav1dLoopFilterDSPContext *c);
bitfn_decls(void dav1d_loop_filter_dsp_init_x86, Dav1dLoopFilterDSPContext *c);
#endif /* DAV1D_SRC_LOOPFILTER_H */

third_party/dav1d/src/loopfilter_tmpl.c (vendored): 6 lines changed

@ -250,7 +250,11 @@ void bitfn(dav1d_loop_filter_dsp_init)(Dav1dLoopFilterDSPContext *const c) {
c->loop_filter_sb[1][0] = loop_filter_h_sb128uv_c;
c->loop_filter_sb[1][1] = loop_filter_v_sb128uv_c;
#if HAVE_ASM && ARCH_X86
#if HAVE_ASM
#if ARCH_AARCH64 || ARCH_ARM
bitfn(dav1d_loop_filter_dsp_init_arm)(c);
#elif ARCH_X86
bitfn(dav1d_loop_filter_dsp_init_x86)(c);
#endif
#endif
}

third_party/dav1d/src/meson.build (vendored): 31 lines changed

@ -86,12 +86,14 @@ if is_asm_enabled
)
libdav1d_tmpl_sources += files(
'arm/cdef_init_tmpl.c',
'arm/loopfilter_init_tmpl.c',
'arm/looprestoration_init_tmpl.c',
'arm/mc_init_tmpl.c',
)
if host_machine.cpu_family() == 'aarch64'
libdav1d_sources += files(
'arm/64/cdef.S',
'arm/64/loopfilter.S',
'arm/64/looprestoration.S',
'arm/64/mc.S',
)
@ -118,19 +120,30 @@ if is_asm_enabled
# NASM source files
libdav1d_sources_asm = files(
'x86/cdef.asm',
'x86/cdef_ssse3.asm',
'x86/cpuid.asm',
'x86/msac.asm',
)
if dav1d_bitdepths.contains('8')
libdav1d_sources_asm += files(
'x86/cdef.asm',
'x86/cdef_sse.asm',
'x86/ipred.asm',
'x86/ipred_ssse3.asm',
'x86/itx.asm',
'x86/itx_ssse3.asm',
'x86/loopfilter.asm',
'x86/looprestoration.asm',
'x86/looprestoration_ssse3.asm',
'x86/mc.asm',
'x86/mc_ssse3.asm',
'x86/itx_ssse3.asm',
'x86/ipred_ssse3.asm',
)
endif
if dav1d_bitdepths.contains('16')
libdav1d_sources_asm += files(
)
endif
# Compile the ASM sources with NASM
libdav1d_nasm_objs = nasm_gen.process(libdav1d_sources_asm)
@ -139,8 +152,10 @@ endif
api_export_flags = []
#
# Windows .rc file
# Windows .rc file and API export flags
#
if host_machine.system() == 'windows' and get_option('default_library') != 'static'
@ -162,6 +177,8 @@ if host_machine.system() == 'windows' and get_option('default_library') != 'stat
)
libdav1d_rc_obj = winmod.compile_resources(rc_file)
api_export_flags = ['-DDAV1D_BUILDING_DLL']
else
libdav1d_rc_obj = []
endif
@ -180,7 +197,7 @@ libdav1d_entrypoints_objs = static_library('dav1d_entrypoint',
include_directories : dav1d_inc_dirs,
dependencies: [stdatomic_dependency],
c_args : [stackalign_flag, stackrealign_flag],
c_args : [stackalign_flag, stackrealign_flag, api_export_flags],
install : false,
build_by_default : false,
).extract_all_objects()
@ -222,7 +239,7 @@ libdav1d = library('dav1d',
thread_dependency,
thread_compat_dep,
],
c_args : [stackalign_flag],
c_args : [stackalign_flag, api_export_flags],
version : dav1d_soname_version,
soversion : dav1d_soversion,
install : true,

third_party/dav1d/src/msac.c (vendored): 76 lines changed

@ -58,8 +58,8 @@ static inline void ctx_refill(MsacContext *s) {
* necessary), and stores them back in the decoder context.
* dif: The new value of dif.
* rng: The new value of the range. */
static inline void ctx_norm(MsacContext *s, ec_win dif, uint32_t rng) {
const uint16_t d = 15 - (31 ^ clz(rng));
static inline void ctx_norm(MsacContext *s, ec_win dif, unsigned rng) {
const int d = 15 ^ (31 ^ clz(rng));
assert(rng <= 65535U);
s->cnt -= d;
s->dif = ((dif + 1) << d) - 1; /* Shift in 1s in the LSBs */
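
/* Worked note on the new shift computation (for nonzero rng <= 65535, as the
 * assert above guarantees): clz(rng) >= 16, so 31 ^ clz(rng) == 31 - clz(rng)
 * is the index of the top set bit and lies in 0..15, and for x in that range
 * 15 ^ x == 15 - x, so d matches the value computed by the old expression. */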
@ -69,18 +69,17 @@ static inline void ctx_norm(MsacContext *s, ec_win dif, uint32_t rng) {
}
unsigned dav1d_msac_decode_bool_equi(MsacContext *const s) {
ec_win v, vw, dif = s->dif;
uint16_t r = s->rng;
unsigned ret;
ec_win vw, dif = s->dif;
unsigned ret, v, r = s->rng;
assert((dif >> (EC_WIN_SIZE - 16)) < r);
// When the probability is 1/2, f = 16384 >> EC_PROB_SHIFT = 256 and we can
// replace the multiply with a simple shift.
v = ((r >> 8) << 7) + EC_MIN_PROB;
vw = v << (EC_WIN_SIZE - 16);
vw = (ec_win)v << (EC_WIN_SIZE - 16);
ret = dif >= vw;
dif -= ret*vw;
v += ret*(r - 2*v);
ctx_norm(s, dif, (unsigned) v);
ctx_norm(s, dif, v);
return !ret;
}
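
/* Spelling out the comment above (EC_PROB_SHIFT taken to be 6, consistent with
 * 16384 >> EC_PROB_SHIFT == 256): the general path in dav1d_msac_decode_bool()
 * computes v = ((r >> 8) * (f >> EC_PROB_SHIFT) >> (7 - EC_PROB_SHIFT)) + EC_MIN_PROB,
 * and with f = 16384 that is ((r >> 8) * 256 >> 1) + EC_MIN_PROB
 * == ((r >> 8) << 7) + EC_MIN_PROB, i.e. the shift used here. */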
@ -88,59 +87,57 @@ unsigned dav1d_msac_decode_bool_equi(MsacContext *const s) {
* f: The probability that the bit is one
* Return: The value decoded (0 or 1). */
unsigned dav1d_msac_decode_bool(MsacContext *const s, const unsigned f) {
ec_win v, vw, dif = s->dif;
uint16_t r = s->rng;
unsigned ret;
ec_win vw, dif = s->dif;
unsigned ret, v, r = s->rng;
assert((dif >> (EC_WIN_SIZE - 16)) < r);
v = ((r >> 8) * (f >> EC_PROB_SHIFT) >> (7 - EC_PROB_SHIFT)) + EC_MIN_PROB;
vw = v << (EC_WIN_SIZE - 16);
vw = (ec_win)v << (EC_WIN_SIZE - 16);
ret = dif >= vw;
dif -= ret*vw;
v += ret*(r - 2*v);
ctx_norm(s, dif, (unsigned) v);
ctx_norm(s, dif, v);
return !ret;
}
unsigned dav1d_msac_decode_bools(MsacContext *const c, const unsigned l) {
int v = 0;
for (int n = (int) l - 1; n >= 0; n--)
v = (v << 1) | dav1d_msac_decode_bool_equi(c);
unsigned dav1d_msac_decode_bools(MsacContext *const s, unsigned n) {
unsigned v = 0;
while (n--)
v = (v << 1) | dav1d_msac_decode_bool_equi(s);
return v;
}
int dav1d_msac_decode_subexp(MsacContext *const c, const int ref,
int dav1d_msac_decode_subexp(MsacContext *const s, const int ref,
const int n, const unsigned k)
{
int i = 0;
int a = 0;
int b = k;
while ((2 << b) < n) {
if (!dav1d_msac_decode_bool_equi(c)) break;
if (!dav1d_msac_decode_bool_equi(s)) break;
b = k + i++;
a = (1 << b);
}
const unsigned v = dav1d_msac_decode_bools(c, b) + a;
const unsigned v = dav1d_msac_decode_bools(s, b) + a;
return ref * 2 <= n ? inv_recenter(ref, v) :
n - 1 - inv_recenter(n - 1 - ref, v);
}
int dav1d_msac_decode_uniform(MsacContext *const c, const unsigned n) {
int dav1d_msac_decode_uniform(MsacContext *const s, const unsigned n) {
assert(n > 0);
const int l = ulog2(n) + 1;
assert(l > 1);
const unsigned m = (1 << l) - n;
const unsigned v = dav1d_msac_decode_bools(c, l - 1);
return v < m ? v : (v << 1) - m + dav1d_msac_decode_bool_equi(c);
const unsigned v = dav1d_msac_decode_bools(s, l - 1);
return v < m ? v : (v << 1) - m + dav1d_msac_decode_bool_equi(s);
}
/* Decodes a symbol given an inverse cumulative distribution function (CDF)
* table in Q15. */
static unsigned decode_symbol(MsacContext *const s, const uint16_t *const cdf,
const unsigned n_symbols)
const size_t n_symbols)
{
ec_win u, v = s->rng, r = s->rng >> 8;
const ec_win c = s->dif >> (EC_WIN_SIZE - 16);
unsigned ret = 0;
const unsigned c = s->dif >> (EC_WIN_SIZE - 16);
unsigned u, v = s->rng, r = s->rng >> 8, ret = 0;
assert(!cdf[n_symbols - 1]);
@ -148,18 +145,21 @@ static unsigned decode_symbol(MsacContext *const s, const uint16_t *const cdf,
u = v;
v = r * (cdf[ret++] >> EC_PROB_SHIFT);
v >>= 7 - EC_PROB_SHIFT;
v += EC_MIN_PROB * (n_symbols - ret);
v += EC_MIN_PROB * (int) (n_symbols - ret);
} while (c < v);
assert(u <= s->rng);
ctx_norm(s, s->dif - (v << (EC_WIN_SIZE - 16)), (unsigned) (u - v));
ctx_norm(s, s->dif - ((ec_win)v << (EC_WIN_SIZE - 16)), u - v);
return ret - 1;
}
static void update_cdf(uint16_t *const cdf, const unsigned val,
const unsigned n_symbols)
unsigned dav1d_msac_decode_symbol_adapt_c(MsacContext *const s,
uint16_t *const cdf,
const size_t n_symbols)
{
const unsigned val = decode_symbol(s, cdf, n_symbols);
if (s->allow_update_cdf) {
const unsigned count = cdf[n_symbols];
const int rate = ((count >> 4) | 4) + (n_symbols > 3);
unsigned i;
@ -168,24 +168,16 @@ static void update_cdf(uint16_t *const cdf, const unsigned val,
for (; i < n_symbols - 1; i++)
cdf[i] -= cdf[i] >> rate;
cdf[n_symbols] = count + (count < 32);
}
unsigned dav1d_msac_decode_symbol_adapt(MsacContext *const c,
uint16_t *const cdf,
const unsigned n_symbols)
{
const unsigned val = decode_symbol(c, cdf, n_symbols);
if(c->allow_update_cdf)
update_cdf(cdf, val, n_symbols);
}
return val;
}
unsigned dav1d_msac_decode_bool_adapt(MsacContext *const c,
unsigned dav1d_msac_decode_bool_adapt(MsacContext *const s,
uint16_t *const cdf)
{
const unsigned bit = dav1d_msac_decode_bool(c, *cdf);
const unsigned bit = dav1d_msac_decode_bool(s, *cdf);
if(c->allow_update_cdf){
if (s->allow_update_cdf) {
// update_cdf() specialized for boolean CDFs
const unsigned count = cdf[1];
const int rate = (count >> 4) | 4;

third_party/dav1d/src/msac.h (vendored): 33 lines changed

@ -38,20 +38,37 @@ typedef struct MsacContext {
const uint8_t *buf_pos;
const uint8_t *buf_end;
ec_win dif;
uint16_t rng;
unsigned rng;
int cnt;
int allow_update_cdf;
} MsacContext;
void dav1d_msac_init(MsacContext *c, const uint8_t *data, size_t sz,
void dav1d_msac_init(MsacContext *s, const uint8_t *data, size_t sz,
int disable_cdf_update_flag);
unsigned dav1d_msac_decode_symbol_adapt(MsacContext *s, uint16_t *cdf,
const unsigned n_symbols);
unsigned dav1d_msac_decode_bool_equi(MsacContext *const s);
unsigned dav1d_msac_decode_symbol_adapt_c(MsacContext *s, uint16_t *cdf,
size_t n_symbols);
unsigned dav1d_msac_decode_bool_equi(MsacContext *s);
unsigned dav1d_msac_decode_bool(MsacContext *s, unsigned f);
unsigned dav1d_msac_decode_bool_adapt(MsacContext *s, uint16_t *cdf);
unsigned dav1d_msac_decode_bools(MsacContext *c, unsigned l);
int dav1d_msac_decode_subexp(MsacContext *c, int ref, int n, unsigned k);
int dav1d_msac_decode_uniform(MsacContext *c, unsigned n);
unsigned dav1d_msac_decode_bools(MsacContext *s, unsigned n);
int dav1d_msac_decode_subexp(MsacContext *s, int ref, int n, unsigned k);
int dav1d_msac_decode_uniform(MsacContext *s, unsigned n);
/* Supported n_symbols ranges: adapt4: 1-5, adapt8: 1-8, adapt16: 4-16 */
#if ARCH_X86_64 && HAVE_ASM
unsigned dav1d_msac_decode_symbol_adapt4_sse2(MsacContext *s, uint16_t *cdf,
size_t n_symbols);
unsigned dav1d_msac_decode_symbol_adapt8_sse2(MsacContext *s, uint16_t *cdf,
size_t n_symbols);
unsigned dav1d_msac_decode_symbol_adapt16_sse2(MsacContext *s, uint16_t *cdf,
size_t n_symbols);
#define dav1d_msac_decode_symbol_adapt4 dav1d_msac_decode_symbol_adapt4_sse2
#define dav1d_msac_decode_symbol_adapt8 dav1d_msac_decode_symbol_adapt8_sse2
#define dav1d_msac_decode_symbol_adapt16 dav1d_msac_decode_symbol_adapt16_sse2
#else
#define dav1d_msac_decode_symbol_adapt4 dav1d_msac_decode_symbol_adapt_c
#define dav1d_msac_decode_symbol_adapt8 dav1d_msac_decode_symbol_adapt_c
#define dav1d_msac_decode_symbol_adapt16 dav1d_msac_decode_symbol_adapt_c
#endif
#endif /* DAV1D_SRC_MSAC_H */
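
A minimal usage sketch of the size-specific entry points above (hypothetical buffer and CDF values; the CDF is an inverse CDF in Q15 whose last real entry is 0 and which is followed by an adaptation counter, matching the assert in decode_symbol()):

    #include <stddef.h>
    #include <stdint.h>
    #include "src/msac.h"

    static unsigned decode_one_symbol(const uint8_t *buf, size_t sz) {
        MsacContext msac;
        dav1d_msac_init(&msac, buf, sz, 0 /* 0 = CDF adaptation enabled */);
        /* 4 symbols, roughly uniform; cdf[3] must be 0, cdf[4] is the counter */
        uint16_t cdf[4 + 1] = { 24576, 16384, 8192, 0, 0 };
        /* resolves to the SSE2 routine on x86-64 asm builds, the C fallback otherwise */
        return dav1d_msac_decode_symbol_adapt4(&msac, cdf, 4);
    }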

third_party/dav1d/src/recon_tmpl.c (vendored): 26 lines changed

@ -107,7 +107,9 @@ static int decode_coefs(Dav1dTileContext *const t,
uint16_t *const txtp_cdf = intra ?
ts->cdf.m.txtp_intra[set_idx][t_dim->min][y_mode_nofilt] :
ts->cdf.m.txtp_inter[set_idx][t_dim->min];
idx = dav1d_msac_decode_symbol_adapt(&ts->msac, txtp_cdf, set_cnt);
idx = (set_cnt <= 8 ? dav1d_msac_decode_symbol_adapt8 :
dav1d_msac_decode_symbol_adapt16)(&ts->msac, txtp_cdf, set_cnt);
if (dbg)
printf("Post-txtp[%d->%d][%d->%d][%d][%d->%d]: r=%d\n",
set, set_idx, tx, t_dim->min, intra ? (int)y_mode_nofilt : -1,
@ -122,19 +124,19 @@ static int decode_coefs(Dav1dTileContext *const t,
const enum TxClass tx_class = dav1d_tx_type_class[*txtp];
const int is_1d = tx_class != TX_CLASS_2D;
switch (tx2dszctx) {
#define case_sz(sz, bin) \
#define case_sz(sz, bin, ns) \
case sz: { \
uint16_t *const eob_bin_cdf = ts->cdf.coef.eob_bin_##bin[chroma][is_1d]; \
eob_bin = dav1d_msac_decode_symbol_adapt(&ts->msac, eob_bin_cdf, 5 + sz); \
eob_bin = dav1d_msac_decode_symbol_adapt##ns(&ts->msac, eob_bin_cdf, 5 + sz); \
break; \
}
case_sz(0, 16);
case_sz(1, 32);
case_sz(2, 64);
case_sz(3, 128);
case_sz(4, 256);
case_sz(5, 512);
case_sz(6, 1024);
case_sz(0, 16, 4);
case_sz(1, 32, 8);
case_sz(2, 64, 8);
case_sz(3, 128, 8);
case_sz(4, 256, 16);
case_sz(5, 512, 16);
case_sz(6, 1024, 16);
#undef case_sz
}
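            /* For reference, the selector column added to case_sz() follows from
             * the 5 + sz symbol counts and the ranges documented in src/msac.h
             * (adapt4: 1-5, adapt8: 1-8, adapt16: 4-16): sz 0 decodes 5 symbols
             * (adapt4), sz 1-3 decode 6-8 (adapt8), sz 4-6 decode 9-11 (adapt16). */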
if (dbg)
@ -179,7 +181,7 @@ static int decode_coefs(Dav1dTileContext *const t,
uint16_t *const lo_cdf = is_last ?
ts->cdf.coef.eob_base_tok[t_dim->ctx][chroma][ctx] :
ts->cdf.coef.base_tok[t_dim->ctx][chroma][ctx];
int tok = dav1d_msac_decode_symbol_adapt(&ts->msac, lo_cdf,
int tok = dav1d_msac_decode_symbol_adapt4(&ts->msac, lo_cdf,
4 - is_last) + is_last;
if (dbg)
printf("Post-lo_tok[%d][%d][%d][%d=%d=%d]: r=%d\n",
@ -190,7 +192,7 @@ static int decode_coefs(Dav1dTileContext *const t,
if (tok == 3) {
const int br_ctx = get_br_ctx(levels, rc, tx, tx_class);
do {
const int tok_br = dav1d_msac_decode_symbol_adapt(&ts->msac,
const int tok_br = dav1d_msac_decode_symbol_adapt4(&ts->msac,
br_cdf[br_ctx], 4);
if (dbg)
printf("Post-hi_tok[%d][%d][%d][%d=%d=%d->%d]: r=%d\n",

third_party/dav1d/src/x86/cdef.asm (vendored): 98 lines changed

@ -113,7 +113,7 @@ SECTION .text
paddw m15, m5
%endmacro
%macro cdef_filter_fn 3 ; w, h, stride
%macro CDEF_FILTER 3 ; w, h, stride
INIT_YMM avx2
%if %1 != 4 || %2 != 8
cglobal cdef_filter_%1x%2, 4, 9, 16, 2 * 16 + (%2+4)*%3, \
@ -135,7 +135,7 @@ cglobal cdef_filter_%1x%2, 4, 10, 16, 2 * 16 + (%2+4)*%3, \
lea dst4q, [dstq+strideq*4]
%endif
lea stride3q, [strideq*3]
test edged, 2 ; have_right
test edgeb, 2 ; have_right
jz .no_right
pmovzxbw m1, [dstq+strideq*0]
pmovzxbw m2, [dstq+strideq*1]
@ -217,13 +217,13 @@ cglobal cdef_filter_%1x%2, 4, 10, 16, 2 * 16 + (%2+4)*%3, \
; top
DEFINE_ARGS dst, stride, left, top2, pri, sec, stride3, top1, edge
test edged, 4 ; have_top
test edgeb, 4 ; have_top
jz .no_top
mov top1q, [top2q+0*gprsize]
mov top2q, [top2q+1*gprsize]
test edged, 1 ; have_left
test edgeb, 1 ; have_left
jz .top_no_left
test edged, 2 ; have_right
test edgeb, 2 ; have_right
jz .top_no_right
pmovzxbw m1, [top1q-(%1/2)]
pmovzxbw m2, [top2q-(%1/2)]
@ -239,7 +239,7 @@ cglobal cdef_filter_%1x%2, 4, 10, 16, 2 * 16 + (%2+4)*%3, \
movd [px-1*%3+%1*2], xm14
jmp .top_done
.top_no_left:
test edged, 2 ; have_right
test edgeb, 2 ; have_right
jz .top_no_left_right
pmovzxbw m1, [top1q]
pmovzxbw m2, [top2q]
@ -272,7 +272,7 @@ cglobal cdef_filter_%1x%2, 4, 10, 16, 2 * 16 + (%2+4)*%3, \
.top_done:
; left
test edged, 1 ; have_left
test edgeb, 1 ; have_left
jz .no_left
pmovzxbw xm1, [leftq+ 0]
%if %2 == 8
@ -304,12 +304,12 @@ cglobal cdef_filter_%1x%2, 4, 10, 16, 2 * 16 + (%2+4)*%3, \
; bottom
DEFINE_ARGS dst, stride, dst8, dummy1, pri, sec, stride3, dummy3, edge
test edged, 8 ; have_bottom
test edgeb, 8 ; have_bottom
jz .no_bottom
lea dst8q, [dstq+%2*strideq]
test edged, 1 ; have_left
test edgeb, 1 ; have_left
jz .bottom_no_left
test edged, 2 ; have_right
test edgeb, 2 ; have_right
jz .bottom_no_right
pmovzxbw m1, [dst8q-(%1/2)]
pmovzxbw m2, [dst8q+strideq-(%1/2)]
@ -328,7 +328,7 @@ cglobal cdef_filter_%1x%2, 4, 10, 16, 2 * 16 + (%2+4)*%3, \
movd [px+(%2+1)*%3+%1*2], xm14
jmp .bottom_done
.bottom_no_left:
test edged, 2 ; have_right
test edgeb, 2 ; have_right
jz .bottom_no_left_right
pmovzxbw m1, [dst8q]
pmovzxbw m2, [dst8q+strideq]
@ -362,50 +362,49 @@ cglobal cdef_filter_%1x%2, 4, 10, 16, 2 * 16 + (%2+4)*%3, \
; actual filter
INIT_YMM avx2
DEFINE_ARGS dst, stride, pridmp, damping, pri, sec, stride3, secdmp
DEFINE_ARGS dst, stride, pridmp, damping, pri, secdmp, stride3, zero
%undef edged
; register to shuffle values into after packing
vbroadcasti128 m12, [shufb_lohi]
movifnidn prid, prim
movifnidn secd, secm
mov dampingd, r7m
mov pridmpd, prid
mov secdmpd, secd
or pridmpd, 1
or secdmpd, 1
lzcnt pridmpd, pridmpd
lzcnt secdmpd, secdmpd
lea pridmpd, [pridmpd+dampingd-31]
lea secdmpd, [secdmpd+dampingd-31]
xor dampingd, dampingd
test pridmpd, pridmpd
cmovl pridmpd, dampingd
test secdmpd, secdmpd
cmovl secdmpd, dampingd
lzcnt pridmpd, prid
%if UNIX64
movd xm0, prid
movd xm1, secdmpd
%endif
lzcnt secdmpd, secdmpm
sub dampingd, 31
xor zerod, zerod
add pridmpd, dampingd
cmovl pridmpd, zerod
add secdmpd, dampingd
cmovl secdmpd, zerod
mov [rsp+0], pridmpq ; pri_shift
mov [rsp+8], secdmpq ; sec_shift
DEFINE_ARGS dst, stride, pridmp, table, pri, sec, stride3, secdmp
DEFINE_ARGS dst, stride, pridmp, table, pri, secdmp, stride3
lea tableq, [tap_table]
vpbroadcastb m13, [tableq+pridmpq] ; pri_shift_mask
vpbroadcastb m14, [tableq+secdmpq] ; sec_shift_mask
; pri/sec_taps[k] [4 total]
DEFINE_ARGS dst, stride, dummy, table, pri, sec, stride3
movd xm0, prid
movd xm1, secd
DEFINE_ARGS dst, stride, dir, table, pri, sec, stride3
%if UNIX64
vpbroadcastb m0, xm0 ; pri_strength
vpbroadcastb m1, xm1 ; sec_strength
%else
vpbroadcastb m0, prim
vpbroadcastb m1, secm
%endif
and prid, 1
lea priq, [tableq+priq*2+8] ; pri_taps
lea secq, [tableq+12] ; sec_taps
; off1/2/3[k] [6 total] from [tapq+12+(dir+0/2/6)*2+k]
DEFINE_ARGS dst, stride, dir, tap, pri, sec, stride3
mov dird, r6m
lea dirq, [tapq+dirq*2+14]
lea dirq, [tableq+dirq*2+14]
%if %1*%2*2/mmsize > 1
%if %1 == 4
DEFINE_ARGS dst, stride, dir, stk, pri, sec, stride3, h, off, k
@ -476,9 +475,9 @@ cglobal cdef_filter_%1x%2, 4, 10, 16, 2 * 16 + (%2+4)*%3, \
RET
%endmacro
cdef_filter_fn 8, 8, 32
cdef_filter_fn 4, 8, 32
cdef_filter_fn 4, 4, 32
CDEF_FILTER 8, 8, 32
CDEF_FILTER 4, 8, 32
CDEF_FILTER 4, 4, 32
INIT_YMM avx2
cglobal cdef_dir, 3, 4, 15, src, stride, var, stride3
@ -614,9 +613,9 @@ cglobal cdef_dir, 3, 4, 15, src, stride, var, stride3
paddw m11, m13 ; partial_sum_alt[3/2] right
vbroadcasti128 m13, [div_table+32]
paddw m4, m5 ; partial_sum_alt[3/2] left
pshuflw m11, m11, q3012
punpckhwd m6, m4, m11
punpcklwd m4, m11
pshuflw m5, m11, q3012
punpckhwd m6, m11, m4
punpcklwd m4, m5
pmaddwd m6, m6
pmaddwd m4, m4
pmulld m6, m12
@ -642,14 +641,14 @@ cglobal cdef_dir, 3, 4, 15, src, stride, var, stride3
paddw m6, m7
paddw m1, m3 ; partial_sum_alt[0/1] right
paddw m5, m6 ; partial_sum_alt[0/1] left
pshuflw m1, m1, q3012
punpckhwd m6, m5, m1
punpcklwd m5, m1
pmaddwd m6, m6
pshuflw m0, m1, q3012
punpckhwd m1, m5
punpcklwd m5, m0
pmaddwd m1, m1
pmaddwd m5, m5
pmulld m6, m12
pmulld m1, m12
pmulld m5, m13
paddd m5, m6 ; cost1[a-d] | cost3[a-d]
paddd m5, m1 ; cost1[a-d] | cost3[a-d]
mova xm0, [pd_47130256+ 16]
mova m1, [pd_47130256]
@ -661,11 +660,10 @@ cglobal cdef_dir, 3, 4, 15, src, stride, var, stride3
; now find the best cost
pmaxsd xm2, xm0, xm1
pshufd xm3, xm2, q3232
pshufd xm3, xm2, q1032
pmaxsd xm2, xm3
pshufd xm3, xm2, q1111
pmaxsd xm2, xm3
pshufd xm2, xm2, q0000 ; best cost
pshufd xm3, xm2, q2301
pmaxsd xm2, xm3 ; best cost
; find the idx using minpos
; make everything other than the best cost negative via subtraction
@ -676,7 +674,7 @@ cglobal cdef_dir, 3, 4, 15, src, stride, var, stride3
phminposuw xm3, xm3
; convert idx to 32-bits
psrldq xm3, 2
psrld xm3, 16
movd eax, xm3
; get idx^4 complement

third_party/dav1d/src/x86/cdef_init_tmpl.c (vendored): 15 lines changed

@ -29,15 +29,19 @@
#include "src/cdef.h"
decl_cdef_fn(dav1d_cdef_filter_8x8_avx2);
decl_cdef_fn(dav1d_cdef_filter_8x8_sse4);
decl_cdef_fn(dav1d_cdef_filter_8x8_ssse3);
decl_cdef_fn(dav1d_cdef_filter_4x8_avx2);
decl_cdef_fn(dav1d_cdef_filter_4x8_sse4);
decl_cdef_fn(dav1d_cdef_filter_4x8_ssse3);
decl_cdef_fn(dav1d_cdef_filter_4x4_avx2);
decl_cdef_fn(dav1d_cdef_filter_4x4_sse4);
decl_cdef_fn(dav1d_cdef_filter_4x4_ssse3);
decl_cdef_dir_fn(dav1d_cdef_dir_avx2);
decl_cdef_dir_fn(dav1d_cdef_dir_sse4);
decl_cdef_dir_fn(dav1d_cdef_dir_ssse3);
void bitfn(dav1d_cdef_dsp_init_x86)(Dav1dCdefDSPContext *const c) {
@ -45,13 +49,22 @@ void bitfn(dav1d_cdef_dsp_init_x86)(Dav1dCdefDSPContext *const c) {
if (!(flags & DAV1D_X86_CPU_FLAG_SSSE3)) return;
#if BITDEPTH ==8
#if BITDEPTH == 8
c->dir = dav1d_cdef_dir_ssse3;
c->fb[0] = dav1d_cdef_filter_8x8_ssse3;
c->fb[1] = dav1d_cdef_filter_4x8_ssse3;
c->fb[2] = dav1d_cdef_filter_4x4_ssse3;
#endif
if (!(flags & DAV1D_X86_CPU_FLAG_SSE41)) return;
#if BITDEPTH == 8
c->dir = dav1d_cdef_dir_sse4;
c->fb[0] = dav1d_cdef_filter_8x8_sse4;
c->fb[1] = dav1d_cdef_filter_4x8_sse4;
c->fb[2] = dav1d_cdef_filter_4x4_sse4;
#endif
if (!(flags & DAV1D_X86_CPU_FLAG_AVX2)) return;
#if BITDEPTH == 8 && ARCH_X86_64


@ -31,16 +31,26 @@ SECTION_RODATA 16
%if ARCH_X86_32
pb_0: times 16 db 0
pb_0xFF: times 16 db 0xFF
%endif
pw_128: times 8 dw 128
pw_256: times 8 dw 256
pw_2048: times 8 dw 2048
%if ARCH_X86_32
pw_0x7FFF: times 8 dw 0x7FFF
pd_0to7: dd 0, 4, 2, 6, 1, 5, 3, 7
div_table: dw 840, 840, 420, 420, 280, 280, 210, 210, 168, 168, 140, 140, 120, 120, 105, 105
pw_0x8000: times 8 dw 0x8000
%endif
div_table_sse4: dd 840, 420, 280, 210, 168, 140, 120, 105
dd 420, 210, 140, 105, 105, 105, 105, 105
div_table_ssse3: dw 840, 840, 420, 420, 280, 280, 210, 210, 168, 168, 140, 140, 120, 120, 105, 105
dw 420, 420, 210, 210, 140, 140, 105, 105, 105, 105, 105, 105, 105, 105, 105, 105
shufb_lohi: db 0, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15
shufw_6543210x: db 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1, 14, 15
tap_table: dw 4, 2, 3, 3, 2, 1
tap_table: ; masks for 8-bit shift emulation
db 0xFF, 0x7F, 0x3F, 0x1F, 0x0F, 0x07, 0x03, 0x01
; weights
db 4, 2, 3, 3, 2, 1
; taps indices
db -1 * 16 + 1, -2 * 16 + 2
db 0 * 16 + 1, -1 * 16 + 2
db 0 * 16 + 1, 0 * 16 + 2
@ -59,8 +69,6 @@ tap_table: dw 4, 2, 3, 3, 2, 1
SECTION .text
INIT_XMM ssse3
%macro movif32 2
%if ARCH_X86_32
mov %1, %2
@ -111,23 +119,32 @@ INIT_XMM ssse3
%endif
%endmacro
%macro ACCUMULATE_TAP 6 ; tap_offset, shift, strength, mul_tap, w, stride
%if ARCH_X86_64
%macro ACCUMULATE_TAP 7 ; tap_offset, shift, shift_mask, strength, mul_tap, w, stride
; load p0/p1
movsx offq, byte [dirq+kq+%1] ; off1
%if %5 == 4
movq m5, [stkq+offq*2+%6*0] ; p0
movhps m5, [stkq+offq*2+%6*1]
%if %6 == 4
movq m5, [stkq+offq*2+%7*0] ; p0
movhps m5, [stkq+offq*2+%7*1]
%else
movu m5, [stkq+offq*2+%6*0] ; p0
movu m5, [stkq+offq*2+%7*0] ; p0
%endif
neg offq ; -off1
%if %5 == 4
movq m6, [stkq+offq*2+%6*0] ; p1
movhps m6, [stkq+offq*2+%6*1]
%if %6 == 4
movq m6, [stkq+offq*2+%7*0] ; p1
movhps m6, [stkq+offq*2+%7*1]
%else
movu m6, [stkq+offq*2+%6*0] ; p1
movu m6, [stkq+offq*2+%7*0] ; p1
%endif
%if cpuflag(sse4)
; out of bounds values are set to a value that is both a large unsigned
; value and a negative signed value.
; use signed max and unsigned min to remove them
pmaxsw m7, m5
pminuw m8, m5
pmaxsw m7, m6
pminuw m8, m6
%else
%if ARCH_X86_64
pcmpeqw m9, m14, m5
pcmpeqw m10, m14, m6
pandn m9, m5
@ -136,77 +153,42 @@ INIT_XMM ssse3
pminsw m8, m5 ; min after p0
pmaxsw m7, m10 ; max after p1
pminsw m8, m6 ; min after p1
%else
pcmpeqw m3, m5, OUT_OF_BOUNDS_MEM
pandn m3, m5
pmaxsw m7, m3 ; max after p0
pminsw m8, m5 ; min after p0
pcmpeqw m3, m6, OUT_OF_BOUNDS_MEM
pandn m3, m6
pmaxsw m7, m3 ; max after p1
pminsw m8, m6 ; min after p1
%endif
%endif
; accumulate sum[m13] over p0/p1
psubw m5, m4 ; diff_p0(p0 - px)
psubw m6, m4 ; diff_p1(p1 - px)
pabsw m9, m5
pabsw m10, m6
mova m12, m9
psrlw m9, %2
psignw m11, %4, m5
psubusw m5, %3, m9
mova m9, m10
pminsw m5, m12 ; constrain(diff_p0)
psrlw m10, %2
psignw m12, %4, m6
psubusw m6, %3, m10
pmullw m5, m11 ; constrain(diff_p0) * taps
pminsw m6, m9 ; constrain(diff_p1)
pmullw m6, m12 ; constrain(diff_p1) * taps
paddw m13, m5
paddw m13, m6
packsswb m5, m6 ; convert pixel diff to 8-bit
%if ARCH_X86_64 && cpuflag(sse4)
pshufb m5, m14 ; group diffs p0 and p1 into pairs
%else
; load p0
movsx offq, byte [dirq+kq+%1] ; off1
%if %5 == 4
movq m5, [stkq+offq*2+%6*0] ; p0
movhps m5, [stkq+offq*2+%6*1]
pshufb m5, [PIC_sym(shufb_lohi)]
%endif
pabsb m6, m5
psignb m9, %5, m5
%if ARCH_X86_64
psrlw m10, m6, %2 ; emulate 8-bit shift
pand m10, %3
psubusb m5, %4, m10
%else
movu m5, [stkq+offq*2+%6*0] ; p0
psrlw m5, m6, %2 ; emulate 8-bit shift
pand m5, %3
paddusb m5, %4
pxor m5, [PIC_sym(pb_0xFF)]
%endif
pcmpeqw m3, m5, [PIC_sym(pw_0x7FFF)]
pandn m3, m5
pmaxsw m7, m3 ; max after p0
pminsw m8, m5 ; min after p0
; accumulate sum[m7] over p0
psubw m5, m4 ; diff_p0(p0 - px)
psignw m6, %4, m5 ; constrain(diff_p0)
pabsw m5, m5
mova m3, m5
psrlw m5, %2
paddsw m5, %3
pandn m5, [PIC_sym(pw_0x7FFF)]
pminsw m5, m3
pmullw m5, m6 ; constrain(diff_p0) * taps
pminub m5, m6 ; constrain(diff_p)
pmaddubsw m5, m9 ; constrain(diff_p) * taps
paddw m13, m5
; load p1
neg offq ; -off1
%if %5 == 4
movq m5, [stkq+offq*2+%6*0] ; p1
movhps m5, [stkq+offq*2+%6*1]
%else
movu m5, [stkq+offq*2+%6*0] ; p1
%endif
pcmpeqw m3, m5, [PIC_sym(pw_0x7FFF)]
pandn m3, m5
pmaxsw m7, m3 ; max after p1
pminsw m8, m5 ; min after p1
; accumulate sum[m7] over p1
psubw m5, m4 ; diff_p1(p1 - px)
psignw m6, %4, m5 ; constrain(diff_p1)
pabsw m5, m5
mova m3, m5
psrlw m5, %2
paddsw m5, %3
pandn m5, [PIC_sym(pw_0x7FFF)]
pminsw m5, m3
pmullw m5, m6 ; constrain(diff_p1) * taps
paddw m13, m5
%endif
%endmacro
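Two tricks in the rewritten ACCUMULATE_TAP are worth spelling out. On SSE4.1, padding pixels are stored as 0x8000, which is simultaneously the most negative int16 and a huge uint16, so a signed max (pmaxsw) and an unsigned min (pminuw) skip them without an extra compare; the SSSE3 path keeps the 0x7FFF sentinel and strips it with pcmpeqw/pandn instead. Separately, the SSSE3 path packs the pixel differences to bytes and, since x86 SIMD has no per-byte shift, emulates one with a word shift plus the 0xFF >> shift masks added at the top of tap_table. Scalar sketches of both (assumed, for illustration):

#include <stdint.h>

/* (a) 0x8000 sentinel: never survives a signed max or an unsigned min,
 *     so padding pixels drop out of the clamping range for free. */
static void minmax_skip_padding(const uint16_t *const px, const int n,
                                int *const min_out, int *const max_out)
{
    int mx = -32768, mn = 0xFFFF;
    for (int i = 0; i < n; i++) {
        const uint16_t v = px[i];              /* 0x8000 marks a missing pixel */
        if ((int16_t)v > mx) mx = (int16_t)v;  /* scalar pmaxsw */
        if (v < mn) mn = v;                    /* scalar pminuw */
    }
    *min_out = mn;
    *max_out = mx;
}

/* (b) per-byte right shift emulated on a word holding two bytes: psrlw
 *     leaks bits across the byte boundary, and the tap_table mask
 *     (0xFF >> shift) clears them again. */
static uint16_t per_byte_shift(const uint16_t two_bytes, const int shift)
{
    const uint8_t m = 0xFF >> shift;                  /* tap_table[shift]       */
    const uint16_t mask = (uint16_t)(m << 8 | m);     /* broadcast to each byte */
    return (uint16_t)((two_bytes >> shift) & mask);   /* psrlw + pand           */
}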
%macro PMOVZXBW 2-3 0 ; %3 = half
@ -250,17 +232,28 @@ INIT_XMM ssse3
%endif
%endmacro
%macro cdef_filter_fn 3 ; w, h, stride
%macro CDEF_FILTER 3 ; w, h, stride
%if cpuflag(sse4)
%define OUT_OF_BOUNDS 0x80008000
%else
%define OUT_OF_BOUNDS 0x7FFF7FFF
%endif
%if ARCH_X86_64
cglobal cdef_filter_%1x%2, 4, 9, 16, 3 * 16 + (%2+4)*%3, \
dst, stride, left, top, pri, sec, stride3, dst4, edge
pcmpeqw m14, m14
%if cpuflag(sse4)
psllw m14, 15 ; 0x8000
%else
psrlw m14, 1 ; 0x7FFF
%endif
pxor m15, m15
%define px rsp+3*16+2*%3
%else
cglobal cdef_filter_%1x%2, 2, 7, 8, - 5 * 16 - (%2+4)*%3, \
cglobal cdef_filter_%1x%2, 2, 7, 8, - 7 * 16 - (%2+4)*%3, \
dst, stride, left, top, stride3, dst4, edge
SAVE_ARG left, 2
SAVE_ARG top, 3
@ -272,9 +265,15 @@ cglobal cdef_filter_%1x%2, 2, 7, 8, - 5 * 16 - (%2+4)*%3, \
%define PIC_reg r2
LEA PIC_reg, PIC_base_offset
%if cpuflag(sse4)
%define OUT_OF_BOUNDS_MEM [PIC_sym(pw_0x8000)]
%else
%define OUT_OF_BOUNDS_MEM [PIC_sym(pw_0x7FFF)]
%endif
%define m15 [PIC_sym(pb_0)]
%define px esp+5*16+2*%3
%define px esp+7*16+2*%3
%endif
mov edged, r8m
@ -311,15 +310,15 @@ cglobal cdef_filter_%1x%2, 2, 7, 8, - 5 * 16 - (%2+4)*%3, \
mova [px+5*%3], m5
mova [px+6*%3], m6
mova [px+7*%3], m7
mov dword [px+4*%3+%1*2], 0x7FFF7FFF
mov dword [px+5*%3+%1*2], 0x7FFF7FFF
mov dword [px+6*%3+%1*2], 0x7FFF7FFF
mov dword [px+7*%3+%1*2], 0x7FFF7FFF
mov dword [px+4*%3+%1*2], OUT_OF_BOUNDS
mov dword [px+5*%3+%1*2], OUT_OF_BOUNDS
mov dword [px+6*%3+%1*2], OUT_OF_BOUNDS
mov dword [px+7*%3+%1*2], OUT_OF_BOUNDS
%endif
mov dword [px+0*%3+%1*2], 0x7FFF7FFF
mov dword [px+1*%3+%1*2], 0x7FFF7FFF
mov dword [px+2*%3+%1*2], 0x7FFF7FFF
mov dword [px+3*%3+%1*2], 0x7FFF7FFF
mov dword [px+0*%3+%1*2], OUT_OF_BOUNDS
mov dword [px+1*%3+%1*2], OUT_OF_BOUNDS
mov dword [px+2*%3+%1*2], OUT_OF_BOUNDS
mov dword [px+3*%3+%1*2], OUT_OF_BOUNDS
.body_done:
; top
@ -371,8 +370,8 @@ cglobal cdef_filter_%1x%2, 2, 7, 8, - 5 * 16 - (%2+4)*%3, \
mova [px-1*%3-8*2], m1
mova [px-1*%3-0*2], m3
%endif
mov dword [px-2*%3+%1*2], 0x7FFF7FFF
mov dword [px-1*%3+%1*2], 0x7FFF7FFF
mov dword [px-2*%3+%1*2], OUT_OF_BOUNDS
mov dword [px-1*%3+%1*2], OUT_OF_BOUNDS
jmp .top_done
.top_no_left:
test edged, 2 ; have_right
@ -392,24 +391,24 @@ cglobal cdef_filter_%1x%2, 2, 7, 8, - 5 * 16 - (%2+4)*%3, \
%endif
mova [px-2*%3], m0
mova [px-1*%3], m1
mov dword [px-2*%3-4], 0x7FFF7FFF
mov dword [px-1*%3-4], 0x7FFF7FFF
mov dword [px-2*%3-4], OUT_OF_BOUNDS
mov dword [px-1*%3-4], OUT_OF_BOUNDS
jmp .top_done
.top_no_left_right:
PMOVZXBW m0, [top1q], %1 == 4
PMOVZXBW m1, [top2q], %1 == 4
mova [px-2*%3], m0
mova [px-1*%3], m1
mov dword [px-2*%3+%1*2], 0x7FFF7FFF
mov dword [px-1*%3+%1*2], 0x7FFF7FFF
mov dword [px-2*%3-4], 0X7FFF7FFF
mov dword [px-1*%3-4], 0X7FFF7FFF
mov dword [px-2*%3+%1*2], OUT_OF_BOUNDS
mov dword [px-1*%3+%1*2], OUT_OF_BOUNDS
mov dword [px-2*%3-4], OUT_OF_BOUNDS
mov dword [px-1*%3-4], OUT_OF_BOUNDS
jmp .top_done
.no_top:
%if ARCH_X86_64
SWAP m0, m14
%else
mova m0, [PIC_sym(pw_0x7FFF)]
mova m0, OUT_OF_BOUNDS_MEM
%endif
movu [px-2*%3-4], m0
movu [px-1*%3-4], m0
@ -455,15 +454,15 @@ cglobal cdef_filter_%1x%2, 2, 7, 8, - 5 * 16 - (%2+4)*%3, \
movd [px+3*%3-4], m2
jmp .left_done
.no_left:
mov dword [px+0*%3-4], 0x7FFF7FFF
mov dword [px+1*%3-4], 0x7FFF7FFF
mov dword [px+2*%3-4], 0x7FFF7FFF
mov dword [px+3*%3-4], 0x7FFF7FFF
mov dword [px+0*%3-4], OUT_OF_BOUNDS
mov dword [px+1*%3-4], OUT_OF_BOUNDS
mov dword [px+2*%3-4], OUT_OF_BOUNDS
mov dword [px+3*%3-4], OUT_OF_BOUNDS
%if %2 == 8
mov dword [px+4*%3-4], 0x7FFF7FFF
mov dword [px+5*%3-4], 0x7FFF7FFF
mov dword [px+6*%3-4], 0x7FFF7FFF
mov dword [px+7*%3-4], 0x7FFF7FFF
mov dword [px+4*%3-4], OUT_OF_BOUNDS
mov dword [px+5*%3-4], OUT_OF_BOUNDS
mov dword [px+6*%3-4], OUT_OF_BOUNDS
mov dword [px+7*%3-4], OUT_OF_BOUNDS
%endif
.left_done:
@ -513,10 +512,10 @@ cglobal cdef_filter_%1x%2, 2, 7, 8, - 5 * 16 - (%2+4)*%3, \
mova [px+(%2+0)*%3-0*2], m2
mova [px+(%2+1)*%3-8*2], m1
mova [px+(%2+1)*%3-0*2], m3
mov dword [px+(%2-1)*%3+8*2], 0x7FFF7FFF ; overwritten by first mova
mov dword [px+(%2-1)*%3+8*2], OUT_OF_BOUNDS ; overwritten by first mova
%endif
mov dword [px+(%2+0)*%3+%1*2], 0x7FFF7FFF
mov dword [px+(%2+1)*%3+%1*2], 0x7FFF7FFF
mov dword [px+(%2+0)*%3+%1*2], OUT_OF_BOUNDS
mov dword [px+(%2+1)*%3+%1*2], OUT_OF_BOUNDS
jmp .bottom_done
.bottom_no_left:
test edged, 2 ; have_right
@ -536,24 +535,24 @@ cglobal cdef_filter_%1x%2, 2, 7, 8, - 5 * 16 - (%2+4)*%3, \
%endif
mova [px+(%2+0)*%3], m0
mova [px+(%2+1)*%3], m1
mov dword [px+(%2+0)*%3-4], 0x7FFF7FFF
mov dword [px+(%2+1)*%3-4], 0x7FFF7FFF
mov dword [px+(%2+0)*%3-4], OUT_OF_BOUNDS
mov dword [px+(%2+1)*%3-4], OUT_OF_BOUNDS
jmp .bottom_done
.bottom_no_left_right:
PMOVZXBW m0, [dst8q+strideq*0], %1 == 4
PMOVZXBW m1, [dst8q+strideq*1], %1 == 4
mova [px+(%2+0)*%3], m0
mova [px+(%2+1)*%3], m1
mov dword [px+(%2+0)*%3+%1*2], 0x7FFF7FFF
mov dword [px+(%2+1)*%3+%1*2], 0x7FFF7FFF
mov dword [px+(%2+0)*%3-4], 0x7FFF7FFF
mov dword [px+(%2+1)*%3-4], 0x7FFF7FFF
mov dword [px+(%2+0)*%3+%1*2], OUT_OF_BOUNDS
mov dword [px+(%2+1)*%3+%1*2], OUT_OF_BOUNDS
mov dword [px+(%2+0)*%3-4], OUT_OF_BOUNDS
mov dword [px+(%2+1)*%3-4], OUT_OF_BOUNDS
jmp .bottom_done
.no_bottom:
%if ARCH_X86_64
SWAP m0, m14
%else
mova m0, [PIC_sym(pw_0x7FFF)]
mova m0, OUT_OF_BOUNDS_MEM
%endif
movu [px+(%2+0)*%3-4], m0
movu [px+(%2+1)*%3-4], m0
@ -592,47 +591,74 @@ cglobal cdef_filter_%1x%2, 2, 7, 8, - 5 * 16 - (%2+4)*%3, \
cmovl pridmpd, dampingd
neg secdmpd
cmovl secdmpd, dampingd
%if ARCH_X86_64
mov [rsp+ 0], pridmpq ; pri_shift
mov [rsp+16], secdmpq ; sec_shift
%if ARCH_X86_32
mov dword [esp+ 4], 0 ; zero upper 32 bits of psraw
mov dword [esp+20], 0 ; source operand in ACCUMULATE_TAP
%define PIC_reg r6
%else
mov [esp+0x00], pridmpd
mov [esp+0x30], secdmpd
mov dword [esp+0x04], 0 ; zero upper 32 bits of psrlw
mov dword [esp+0x34], 0 ; source operand in ACCUMULATE_TAP
%define PIC_reg r4
LOAD_PIC_REG 8
%endif
; pri/sec_taps[k] [4 total]
DEFINE_ARGS dst, stride, tap, dummy, pri, sec
DEFINE_ARGS dst, stride, pridmp, table, pri, sec, secdmp
lea tableq, [PIC_sym(tap_table)]
%if ARCH_X86_64
mova m14, [pw_256]
%else
%define m14 [PIC_sym(pw_256)]
SWAP m2, m11
SWAP m3, m12
%endif
movd m2, [tableq+pridmpq]
movd m3, [tableq+secdmpq]
pshufb m2, m15 ; pri_shift_mask
pshufb m3, m15 ; sec_shift_mask
%if ARCH_X86_64
SWAP m2, m11
SWAP m3, m12
%else
%define PIC_reg r6
mov PIC_reg, r4
DEFINE_ARGS dst, stride, dir, table, pri, sec, secdmp
LOAD_ARG pri
LOAD_ARG dir, 1
mova [esp+0x10], m2
mova [esp+0x40], m3
%endif
; pri/sec_taps[k] [4 total]
DEFINE_ARGS dst, stride, dummy, tap, pri, sec
movd m0, prid
movd m1, secd
pshufb m0, m14
pshufb m1, m14
%if ARCH_X86_32
mova m2, [PIC_sym(pw_0x7FFF)]
pandn m0, m2
pandn m1, m2
%if ARCH_X86_64
pshufb m0, m15
pshufb m1, m15
%else
mova m2, m15
mova m3, [PIC_sym(pb_0xFF)]
pshufb m0, m2
pshufb m1, m2
pxor m0, m3
pxor m1, m3
mova [esp+0x20], m0
mova [esp+0x30], m1
mova [esp+0x50], m1
%endif
and prid, 1
lea tapq, [PIC_sym(tap_table)]
lea priq, [tapq+priq*4] ; pri_taps
lea secq, [tapq+8] ; sec_taps
lea priq, [tapq+8+priq*2] ; pri_taps
lea secq, [tapq+12] ; sec_taps
%if ARCH_X86_64 && cpuflag(sse4)
mova m14, [shufb_lohi]
%endif
; off1/2/3[k] [6 total] from [tapq+12+(dir+0/2/6)*2+k]
DEFINE_ARGS dst, stride, tap, dir, pri, sec
DEFINE_ARGS dst, stride, dir, tap, pri, sec
%if ARCH_X86_64
mov dird, r6m
lea tapq, [tapq+dirq*2+12]
lea dirq, [tapq+14+dirq*2]
DEFINE_ARGS dst, stride, dir, stk, pri, sec, h, off, k
%else
LOAD_ARG dir, 1
lea tapd, [tapd+dird*2+12]
lea dird, [tapd+14+dird*2]
DEFINE_ARGS dst, stride, dir, stk, pri, sec
%define hd dword [esp+8]
%define offq dstq
@ -640,9 +666,9 @@ cglobal cdef_filter_%1x%2, 2, 7, 8, - 5 * 16 - (%2+4)*%3, \
%endif
mov hd, %1*%2*2/mmsize
lea stkq, [px]
movif32 [esp+0x1C], strided
movif32 [esp+0x3C], strided
.v_loop:
movif32 [esp+0x18], dstd
movif32 [esp+0x38], dstd
mov kq, 1
%if %1 == 4
movq m4, [stkq+%3*0]
@ -652,7 +678,7 @@ cglobal cdef_filter_%1x%2, 2, 7, 8, - 5 * 16 - (%2+4)*%3, \
%endif
%if ARCH_X86_32
%xdefine m11 m6
%xdefine m9 m3
%xdefine m13 m7
%xdefine m7 m0
%xdefine m8 m1
@ -663,36 +689,41 @@ cglobal cdef_filter_%1x%2, 2, 7, 8, - 5 * 16 - (%2+4)*%3, \
mova m8, m4 ; min
.k_loop:
%if ARCH_X86_64
movd m2, [priq+kq*2] ; pri_taps
movd m3, [secq+kq*2] ; sec_taps
pshufb m2, m14
pshufb m3, m14
ACCUMULATE_TAP 0*2, [rsp+ 0], m0, m2, %1, %3
ACCUMULATE_TAP 2*2, [rsp+16], m1, m3, %1, %3
ACCUMULATE_TAP 6*2, [rsp+16], m1, m3, %1, %3
movd m2, [priq+kq] ; pri_taps
movd m3, [secq+kq] ; sec_taps
pshufb m2, m15
pshufb m3, m15
ACCUMULATE_TAP 0*2, [rsp+ 0], m11, m0, m2, %1, %3
ACCUMULATE_TAP 2*2, [rsp+16], m12, m1, m3, %1, %3
ACCUMULATE_TAP 6*2, [rsp+16], m12, m1, m3, %1, %3
%else
movd m2, [priq+kq*2] ; pri_taps
pshufb m2, m14
ACCUMULATE_TAP 0*2, [esp+0x00], [esp+0x20], m2, %1, %3
movd m2, [priq+kq] ; pri_taps
pshufb m2, m15
ACCUMULATE_TAP 0*2, [esp+0x00], [esp+0x10], [esp+0x20], m2, %1, %3
movd m2, [secq+kq*2] ; sec_taps
pshufb m2, m14
ACCUMULATE_TAP 2*2, [esp+0x10], [esp+0x30], m2, %1, %3
ACCUMULATE_TAP 6*2, [esp+0x10], [esp+0x30], m2, %1, %3
movd m2, [secq+kq] ; sec_taps
pshufb m2, m15
ACCUMULATE_TAP 2*2, [esp+0x30], [esp+0x40], [esp+0x50], m2, %1, %3
ACCUMULATE_TAP 6*2, [esp+0x30], [esp+0x40], [esp+0x50], m2, %1, %3
%endif
dec kq
jge .k_loop
pcmpgtw m11, m15, m13
paddw m13, m11
%if cpuflag(sse4)
pcmpgtw m6, m15, m13
%else
pxor m6, m6
pcmpgtw m6, m13
%endif
paddw m13, m6
pmulhrsw m13, [PIC_sym(pw_2048)]
paddw m4, m13
pminsw m4, m7
pmaxsw m4, m8
packuswb m4, m4
movif32 dstd, [esp+0x18]
movif32 strided, [esp+0x1C]
movif32 dstd, [esp+0x38]
movif32 strided, [esp+0x3C]
%if %1 == 4
movd [dstq+strideq*0], m4
psrlq m4, 32
@ -715,11 +746,10 @@ cglobal cdef_filter_%1x%2, 2, 7, 8, - 5 * 16 - (%2+4)*%3, \
RET
%endmacro
cdef_filter_fn 8, 8, 32
cdef_filter_fn 4, 8, 32
cdef_filter_fn 4, 4, 32
%macro MULLD 2
%if cpuflag(sse4)
pmulld %1, %2
%else
%if ARCH_X86_32
%define m15 m1
%endif
@ -727,10 +757,12 @@ cdef_filter_fn 4, 4, 32
pmullw %1, %2
pslld m15, 16
paddd %1, m15
%endif
%endmacro
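MULLD now uses pmulld directly on SSE4.1; the fallback shown above rebuilds the low 32 bits of each lane's product from 16-bit partial products (the high-half multiply presumably sits in the unchanged part of the macro not shown in this hunk). Per lane, the decomposition is simply:

#include <stdint.h>

/* Scalar view (assumed) of the non-SSE4 MULLD path: low and high 16-bit
 * halves are multiplied by the 16-bit constant separately, and the high
 * product is shifted back up before the add ("pslld 16" / "paddd"). */
static uint32_t mulld_sketch(const uint32_t x, const uint16_t c)
{
    const uint32_t lo = (x & 0xFFFF) * c;  /* low-half product  */
    const uint32_t hi = (x >> 16) * c;     /* high-half product */
    return lo + (hi << 16);                /* == (uint32_t)(x * c) */
}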
%if ARCH_X86_64
cglobal cdef_dir, 3, 4, 16, src, stride, var, stride3
%macro CDEF_DIR 0
%if ARCH_X86_64
cglobal cdef_dir, 3, 5, 16, 32, src, stride, var, stride3
lea stride3q, [strideq*3]
movq m1, [srcq+strideq*0]
movhps m1, [srcq+strideq*1]
@ -785,7 +817,7 @@ cglobal cdef_dir, 3, 4, 16, src, stride, var, stride3
pmaddwd m9, m9
phaddd m9, m8
SWAP m8, m9
MULLD m8, [div_table+48]
MULLD m8, [div_table%+SUFFIX+48]
pslldq m9, m1, 2
psrldq m10, m1, 14
@ -819,8 +851,8 @@ cglobal cdef_dir, 3, 4, 16, src, stride, var, stride3
punpcklwd m9, m10
pmaddwd m11, m11
pmaddwd m9, m9
MULLD m11, [div_table+16]
MULLD m9, [div_table+0]
MULLD m11, [div_table%+SUFFIX+16]
MULLD m9, [div_table%+SUFFIX+0]
paddd m9, m11 ; cost[0a-d]
pslldq m10, m0, 14
@ -855,8 +887,8 @@ cglobal cdef_dir, 3, 4, 16, src, stride, var, stride3
punpcklwd m10, m11
pmaddwd m12, m12
pmaddwd m10, m10
MULLD m12, [div_table+16]
MULLD m10, [div_table+0]
MULLD m12, [div_table%+SUFFIX+16]
MULLD m10, [div_table%+SUFFIX+0]
paddd m10, m12 ; cost[4a-d]
phaddd m9, m10 ; cost[0a/b,4a/b]
@ -881,14 +913,14 @@ cglobal cdef_dir, 3, 4, 16, src, stride, var, stride3
paddw m4, m6
paddw m5, m15 ; partial_sum_alt[3] right
paddw m4, m14 ; partial_sum_alt[3] left
pshuflw m5, m5, q3012
punpckhwd m6, m4, m5
punpcklwd m4, m5
pmaddwd m6, m6
pshuflw m6, m5, q3012
punpckhwd m5, m4
punpcklwd m4, m6
pmaddwd m5, m5
pmaddwd m4, m4
MULLD m6, [div_table+48]
MULLD m4, [div_table+32]
paddd m4, m6 ; cost[7a-d]
MULLD m5, [div_table%+SUFFIX+48]
MULLD m4, [div_table%+SUFFIX+32]
paddd m4, m5 ; cost[7a-d]
pslldq m5, m10, 6
psrldq m6, m10, 10
@ -901,14 +933,14 @@ cglobal cdef_dir, 3, 4, 16, src, stride, var, stride3
paddw m5, m11
paddw m6, m12
paddw m5, m13
pshuflw m6, m6, q3012
punpckhwd m7, m5, m6
punpcklwd m5, m6
pmaddwd m7, m7
pshuflw m7, m6, q3012
punpckhwd m6, m5
punpcklwd m5, m7
pmaddwd m6, m6
pmaddwd m5, m5
MULLD m7, [div_table+48]
MULLD m5, [div_table+32]
paddd m5, m7 ; cost[5a-d]
MULLD m6, [div_table%+SUFFIX+48]
MULLD m5, [div_table%+SUFFIX+32]
paddd m5, m6 ; cost[5a-d]
pslldq m6, m1, 2
psrldq m7, m1, 14
@ -921,14 +953,14 @@ cglobal cdef_dir, 3, 4, 16, src, stride, var, stride3
paddw m6, m10
paddw m7, m13 ; partial_sum_alt[3] right
paddw m6, m12 ; partial_sum_alt[3] left
pshuflw m7, m7, q3012
punpckhwd m10, m6, m7
punpcklwd m6, m7
pmaddwd m10, m10
pshuflw m10, m7, q3012
punpckhwd m7, m6
punpcklwd m6, m10
pmaddwd m7, m7
pmaddwd m6, m6
MULLD m10, [div_table+48]
MULLD m6, [div_table+32]
paddd m6, m10 ; cost[1a-d]
MULLD m7, [div_table%+SUFFIX+48]
MULLD m6, [div_table%+SUFFIX+32]
paddd m6, m7 ; cost[1a-d]
pshufd m0, m0, q1032
pshufd m1, m1, q1032
@ -946,61 +978,62 @@ cglobal cdef_dir, 3, 4, 16, src, stride, var, stride3
paddw m10, m14
paddw m11, m2
paddw m10, m3
pshuflw m11, m11, q3012
punpckhwd m12, m10, m11
punpcklwd m10, m11
pmaddwd m12, m12
pshuflw m12, m11, q3012
punpckhwd m11, m10
punpcklwd m10, m12
pmaddwd m11, m11
pmaddwd m10, m10
MULLD m12, [div_table+48]
MULLD m10, [div_table+32]
paddd m10, m12 ; cost[3a-d]
MULLD m11, [div_table%+SUFFIX+48]
MULLD m10, [div_table%+SUFFIX+32]
paddd m10, m11 ; cost[3a-d]
phaddd m0, m9, m8 ; cost[0,4,2,6]
phaddd m6, m5
phaddd m10, m4
phaddd m1, m6, m10 ; cost[1,5,3,7]
phaddd m9, m8 ; cost[0,4,2,6]
phaddd m6, m10
phaddd m5, m4
phaddd m6, m5 ; cost[1,3,5,7]
pshufd m4, m9, q3120
pcmpgtd m2, m1, m0 ; [1/5/3/7] > [0/4/2/6]
pand m3, m2, m1
pandn m4, m2, m0
por m3, m4 ; higher 4 values
pshufd m1, m1, q2301
pshufd m0, m0, q2301
pand m1, m2, m1
pandn m4, m2, m0
por m0, m4, m1 ; 4 values at idx^4 offset
pand m14, m2, [pd_0to7+16]
pandn m15, m2, [pd_0to7]
por m15, m14
; now find the best cost
%if cpuflag(sse4)
pmaxsd m9, m6
pshufd m0, m9, q1032
pmaxsd m0, m9
pshufd m1, m0, q2301
pmaxsd m0, m1 ; best cost
%else
pcmpgtd m0, m9, m6
pand m9, m0
pandn m0, m6
por m9, m0
pshufd m1, m9, q1032
pcmpgtd m0, m9, m1
pand m9, m0
pandn m0, m1
por m9, m0
pshufd m1, m9, q2301
pcmpgtd m0, m9, m1
pand m9, m0
pandn m0, m1
por m0, m9
%endif
punpckhqdq m4, m3, m0
punpcklqdq m3, m0
pcmpgtd m5, m4, m3 ; [2or3-6or7] > [0or1/4or5]
punpcklqdq m5, m5
pand m6, m5, m4
pandn m7, m5, m3
por m6, m7 ; { highest 2 values, complements at idx^4 }
movhlps m14, m15
pand m14, m5, m14
pandn m13, m5, m15
por m15, m13, m14
pshufd m7, m6, q3311
pcmpgtd m8, m7, m6 ; [4or5or6or7] > [0or1or2or3]
punpcklqdq m8, m8
pand m9, m8, m7
pandn m10, m8, m6
por m9, m10 ; max
movhlps m10, m9 ; complement at idx^4
psubd m9, m10
psrld m9, 10
movd [varq], m9
pshufd m14, m15, q1111
pand m14, m8, m14
pandn m13, m8, m15
por m15, m13, m14
movd eax, m15
%else
; get direction and variance
punpckhdq m1, m4, m6
punpckldq m4, m6
psubd m2, m0, m1
psubd m3, m0, m4
mova [rsp+0x00], m2 ; emulate ymm in stack
mova [rsp+0x10], m3
pcmpeqd m1, m0 ; compute best cost mask
pcmpeqd m4, m0
packssdw m4, m1
pmovmskb eax, m4 ; get byte-idx from mask
tzcnt eax, eax
mov r1d, [rsp+rax*2] ; get idx^4 complement from emulated ymm
shr eax, 1 ; get direction by converting byte-idx to word-idx
shr r1d, 10
mov [varq], r1d
%else
cglobal cdef_dir, 3, 5, 16, 96, src, stride, var, stride3
%define PIC_reg r4
LEA PIC_reg, PIC_base_offset
@ -1065,7 +1098,7 @@ cglobal cdef_dir, 3, 5, 16, 96, src, stride, var, stride3
pmaddwd m0, m0
phaddd m2, m0
MULLD m2, [PIC_sym(div_table)+48]
MULLD m2, [PIC_sym(div_table%+SUFFIX)+48]
mova [esp+0x30], m2
mova m1, [esp+0x10]
@ -1103,8 +1136,8 @@ cglobal cdef_dir, 3, 5, 16, 96, src, stride, var, stride3
punpcklwd m0, m1
pmaddwd m2, m2
pmaddwd m0, m0
MULLD m2, [PIC_sym(div_table)+16]
MULLD m0, [PIC_sym(div_table)+0]
MULLD m2, [PIC_sym(div_table%+SUFFIX)+16]
MULLD m0, [PIC_sym(div_table%+SUFFIX)+0]
paddd m0, m2 ; cost[0a-d]
mova [esp+0x40], m0
@ -1144,8 +1177,8 @@ cglobal cdef_dir, 3, 5, 16, 96, src, stride, var, stride3
punpcklwd m0, m1
pmaddwd m2, m2
pmaddwd m0, m0
MULLD m2, [PIC_sym(div_table)+16]
MULLD m0, [PIC_sym(div_table)+0]
MULLD m2, [PIC_sym(div_table%+SUFFIX)+16]
MULLD m0, [PIC_sym(div_table%+SUFFIX)+0]
paddd m0, m2 ; cost[4a-d]
phaddd m1, [esp+0x40], m0 ; cost[0a/b,4a/b]
phaddd m1, [esp+0x30] ; cost[0,4,2,6]
@ -1181,8 +1214,8 @@ cglobal cdef_dir, 3, 5, 16, 96, src, stride, var, stride3
punpcklwd m0, m1
pmaddwd m2, m2
pmaddwd m0, m0
MULLD m2, [PIC_sym(div_table)+48]
MULLD m0, [PIC_sym(div_table)+32]
MULLD m2, [PIC_sym(div_table%+SUFFIX)+48]
MULLD m0, [PIC_sym(div_table%+SUFFIX)+32]
paddd m0, m2 ; cost[7a-d]
mova [esp+0x40], m0
@ -1197,44 +1230,44 @@ cglobal cdef_dir, 3, 5, 16, 96, src, stride, var, stride3
paddw m0, m1
paddw m7, m4
paddw m0, m2
pshuflw m7, m7, q3012
punpckhwd m2, m0, m7
punpcklwd m0, m7
pmaddwd m2, m2
pshuflw m2, m7, q3012
punpckhwd m7, m0
punpcklwd m0, m2
pmaddwd m7, m7
pmaddwd m0, m0
MULLD m2, [PIC_sym(div_table)+48]
MULLD m0, [PIC_sym(div_table)+32]
paddd m0, m2 ; cost[5a-d]
MULLD m7, [PIC_sym(div_table%+SUFFIX)+48]
MULLD m0, [PIC_sym(div_table%+SUFFIX)+32]
paddd m0, m7 ; cost[5a-d]
mova [esp+0x50], m0
mova m1, [esp+0x10]
mova m7, [esp+0x10]
mova m2, [esp+0x20]
pslldq m0, m1, 2
psrldq m1, 14
pslldq m0, m7, 2
psrldq m7, 14
pslldq m4, m2, 4
psrldq m2, 12
pslldq m5, m3, 6
psrldq m6, m3, 10
paddw m0, [esp+0x00]
paddw m1, m2
paddw m7, m2
paddw m4, m5
paddw m1, m6 ; partial_sum_alt[3] right
paddw m7, m6 ; partial_sum_alt[3] right
paddw m0, m4 ; partial_sum_alt[3] left
pshuflw m1, m1, q3012
punpckhwd m2, m0, m1
punpcklwd m0, m1
pmaddwd m2, m2
pshuflw m2, m7, q3012
punpckhwd m7, m0
punpcklwd m0, m2
pmaddwd m7, m7
pmaddwd m0, m0
MULLD m2, [PIC_sym(div_table)+48]
MULLD m0, [PIC_sym(div_table)+32]
paddd m0, m2 ; cost[1a-d]
phaddd m0, [esp+0x50]
mova [esp+0x50], m0
MULLD m7, [PIC_sym(div_table%+SUFFIX)+48]
MULLD m0, [PIC_sym(div_table%+SUFFIX)+32]
paddd m0, m7 ; cost[1a-d]
SWAP m0, m4
pshufd m0, [esp+0x00], q1032
pshufd m1, [esp+0x10], q1032
pshufd m2, [esp+0x20], q1032
pshufd m3, m3, q1032
mova [esp+0x00], m4
pslldq m4, m0, 6
psrldq m0, 10
@ -1247,60 +1280,76 @@ cglobal cdef_dir, 3, 5, 16, 96, src, stride, var, stride3
paddw m5, m6
paddw m0, m2
paddw m4, m5
pshuflw m0, m0, q3012
punpckhwd m2, m4, m0
punpcklwd m4, m0
pmaddwd m2, m2
pshuflw m2, m0, q3012
punpckhwd m0, m4
punpcklwd m4, m2
pmaddwd m0, m0
pmaddwd m4, m4
MULLD m2, [PIC_sym(div_table)+48]
MULLD m4, [PIC_sym(div_table)+32]
paddd m4, m2 ; cost[3a-d]
phaddd m4, [esp+0x40]
MULLD m0, [PIC_sym(div_table%+SUFFIX)+48]
MULLD m4, [PIC_sym(div_table%+SUFFIX)+32]
paddd m4, m0 ; cost[3a-d]
mova m1, [esp+0x50]
mova m1, [esp+0x00]
mova m2, [esp+0x50]
mova m0, [esp+0x30] ; cost[0,4,2,6]
phaddd m1, m4 ; cost[1,5,3,7]
phaddd m1, m4
phaddd m2, [esp+0x40] ; cost[1,3,5,7]
phaddd m1, m2
pshufd m2, m0, q3120
pcmpgtd m2, m1, m0 ; [1/5/3/7] > [0/4/2/6]
pand m3, m2, m1
pandn m4, m2, m0
por m3, m4 ; higher 4 values
pshufd m1, m1, q2301
pshufd m0, m0, q2301
pand m1, m2, m1
pandn m4, m2, m0
por m0, m4, m1 ; 4 values at idx^4 offset
pand m5, m2, [PIC_sym(pd_0to7)+16]
pandn m6, m2, [PIC_sym(pd_0to7)]
por m6, m5
; now find the best cost
%if cpuflag(sse4)
pmaxsd m0, m1
pshufd m3, m0, q1032
pmaxsd m3, m0
pshufd m0, m3, q2301
pmaxsd m0, m3
%else
pcmpgtd m3, m0, m1
pand m0, m3
pandn m3, m1
por m0, m3
pshufd m4, m0, q1032
pcmpgtd m3, m0, m4
pand m0, m3
pandn m3, m4
por m0, m3
pshufd m4, m0, q2301
pcmpgtd m3, m0, m4
pand m0, m3
pandn m3, m4
por m0, m3
%endif
punpckhqdq m4, m3, m0
punpcklqdq m3, m0
pcmpgtd m0, m4, m3 ; [2or3-6or7] > [0or1/4or5]
punpcklqdq m0, m0
pand m1, m0, m4
pandn m7, m0, m3
por m1, m7 ; { highest 2 values, complements at idx^4 }
movhlps m5, m6
pand m5, m0, m5
pandn m3, m0, m6
por m6, m3, m5
pshufd m7, m1, q3311
pcmpgtd m2, m7, m1 ; [4or5or6or7] > [0or1or2or3]
punpcklqdq m2, m2
pand m0, m2, m7
pandn m7, m2, m1
por m0, m7 ; max
movhlps m7, m0 ; complement at idx^4
psubd m0, m7
psrld m0, 10
movd [varq], m0
pshufd m5, m6, q1111
pand m5, m2, m5
pandn m3, m2, m6
por m6, m3, m5
movd eax, m6
%endif
; get direction and variance
punpckhdq m3, m2, m1
punpckldq m2, m1
psubd m1, m0, m3
psubd m4, m0, m2
mova [esp+0x00], m1 ; emulate ymm in stack
mova [esp+0x10], m4
pcmpeqd m3, m0 ; compute best cost mask
pcmpeqd m2, m0
packssdw m2, m3
pmovmskb eax, m2 ; get byte-idx from mask
tzcnt eax, eax
mov r1d, [esp+eax*2] ; get idx^4 complement from emulated ymm
shr eax, 1 ; get direction by converting byte-idx to word-idx
shr r1d, 10
mov [vard], r1d
%endif
RET
%endmacro
INIT_XMM sse4
CDEF_FILTER 8, 8, 32
CDEF_FILTER 4, 8, 32
CDEF_FILTER 4, 4, 32
CDEF_DIR
INIT_XMM ssse3
CDEF_FILTER 8, 8, 32
CDEF_FILTER 4, 8, 32
CDEF_FILTER 4, 4, 32
CDEF_DIR

third_party/dav1d/src/x86/ipred_init_tmpl.c (vendored)

@ -58,6 +58,7 @@ decl_angular_ipred_fn(dav1d_ipred_dc_top_ssse3);
decl_angular_ipred_fn(dav1d_ipred_dc_left_ssse3);
decl_angular_ipred_fn(dav1d_ipred_h_ssse3);
decl_angular_ipred_fn(dav1d_ipred_v_ssse3);
decl_angular_ipred_fn(dav1d_ipred_paeth_ssse3);
decl_angular_ipred_fn(dav1d_ipred_smooth_ssse3);
decl_angular_ipred_fn(dav1d_ipred_smooth_v_ssse3);
decl_angular_ipred_fn(dav1d_ipred_smooth_h_ssse3);
@ -67,6 +68,10 @@ decl_cfl_pred_fn(dav1d_ipred_cfl_128_ssse3);
decl_cfl_pred_fn(dav1d_ipred_cfl_top_ssse3);
decl_cfl_pred_fn(dav1d_ipred_cfl_left_ssse3);
decl_cfl_ac_fn(dav1d_ipred_cfl_ac_420_ssse3);
decl_cfl_ac_fn(dav1d_ipred_cfl_ac_422_ssse3);
decl_cfl_ac_fn(dav1d_ipred_cfl_ac_444_ssse3);
decl_pal_pred_fn(dav1d_pal_pred_ssse3);
void bitfn(dav1d_intra_pred_dsp_init_x86)(Dav1dIntraPredDSPContext *const c) {
@ -81,6 +86,7 @@ void bitfn(dav1d_intra_pred_dsp_init_x86)(Dav1dIntraPredDSPContext *const c) {
c->intra_pred[LEFT_DC_PRED] = dav1d_ipred_dc_left_ssse3;
c->intra_pred[HOR_PRED] = dav1d_ipred_h_ssse3;
c->intra_pred[VERT_PRED] = dav1d_ipred_v_ssse3;
c->intra_pred[PAETH_PRED] = dav1d_ipred_paeth_ssse3;
c->intra_pred[SMOOTH_PRED] = dav1d_ipred_smooth_ssse3;
c->intra_pred[SMOOTH_V_PRED] = dav1d_ipred_smooth_v_ssse3;
c->intra_pred[SMOOTH_H_PRED] = dav1d_ipred_smooth_h_ssse3;
@ -90,6 +96,10 @@ void bitfn(dav1d_intra_pred_dsp_init_x86)(Dav1dIntraPredDSPContext *const c) {
c->cfl_pred[TOP_DC_PRED] = dav1d_ipred_cfl_top_ssse3;
c->cfl_pred[LEFT_DC_PRED] = dav1d_ipred_cfl_left_ssse3;
c->cfl_ac[DAV1D_PIXEL_LAYOUT_I420 - 1] = dav1d_ipred_cfl_ac_420_ssse3;
c->cfl_ac[DAV1D_PIXEL_LAYOUT_I422 - 1] = dav1d_ipred_cfl_ac_422_ssse3;
c->cfl_ac[DAV1D_PIXEL_LAYOUT_I444 - 1] = dav1d_ipred_cfl_ac_444_ssse3;
c->pal_pred = dav1d_pal_pred_ssse3;
#endif
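Of the new SSSE3 entries, ipred_paeth implements the classic Paeth rule: each pixel takes whichever of the left, top, or top-left neighbour is closest to left + top - topleft. The asm in ipred_ssse3.asm vectorizes this across a row; per pixel the rule is (scalar sketch):

#include <stdlib.h>

static int paeth_predict(const int left, const int top, const int topleft)
{
    const int base = left + top - topleft;
    const int dl   = abs(base - left);
    const int dt   = abs(base - top);
    const int dtl  = abs(base - topleft);
    if (dl <= dt && dl <= dtl) return left;  /* ties prefer left */
    return dt <= dtl ? top : topleft;        /* then top, then top-left */
}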

third_party/dav1d/src/x86/ipred_ssse3.asm (vendored, 1313 lines changed)
Diff not shown due to its large size.

third_party/dav1d/src/x86/itx_init_tmpl.c (vendored)

@ -86,6 +86,17 @@ decl_itx16_fns(16, 4, ssse3);
decl_itx16_fns( 8, 16, ssse3);
decl_itx16_fns(16, 8, ssse3);
decl_itx12_fns(16, 16, ssse3);
decl_itx2_fns ( 8, 32, ssse3);
decl_itx2_fns (32, 8, ssse3);
decl_itx2_fns (16, 32, ssse3);
decl_itx2_fns (32, 16, ssse3);
decl_itx2_fns (32, 32, ssse3);
decl_itx_fn(dav1d_inv_txfm_add_dct_dct_16x64_ssse3);
decl_itx_fn(dav1d_inv_txfm_add_dct_dct_32x64_ssse3);
decl_itx_fn(dav1d_inv_txfm_add_dct_dct_64x16_ssse3);
decl_itx_fn(dav1d_inv_txfm_add_dct_dct_64x32_ssse3);
decl_itx_fn(dav1d_inv_txfm_add_dct_dct_64x64_ssse3);
void bitfn(dav1d_itx_dsp_init_x86)(Dav1dInvTxfmDSPContext *const c) {
#define assign_itx_fn(pfx, w, h, type, type_enum, ext) \
@ -138,6 +149,16 @@ void bitfn(dav1d_itx_dsp_init_x86)(Dav1dInvTxfmDSPContext *const c) {
assign_itx16_fn(R, 8, 16, ssse3);
assign_itx16_fn(R, 16, 8, ssse3);
assign_itx12_fn(, 16, 16, ssse3);
assign_itx2_fn (R, 8, 32, ssse3);
assign_itx2_fn (R, 32, 8, ssse3);
assign_itx2_fn (R, 16, 32, ssse3);
assign_itx2_fn (R, 32, 16, ssse3);
assign_itx2_fn (, 32, 32, ssse3);
assign_itx1_fn (R, 16, 64, ssse3);
assign_itx1_fn (R, 32, 64, ssse3);
assign_itx1_fn (R, 64, 16, ssse3);
assign_itx1_fn (R, 64, 32, ssse3);
assign_itx1_fn ( , 64, 64, ssse3);
#endif
if (!(flags & DAV1D_X86_CPU_FLAG_AVX2)) return;

third_party/dav1d/src/x86/itx_ssse3.asm (vendored, 3402 lines changed)
Diff not shown due to its large size.

third_party/dav1d/src/x86/msac.asm (vendored, new file)

@ -0,0 +1,287 @@
; Copyright © 2019, VideoLAN and dav1d authors
; Copyright © 2019, Two Orioles, LLC
; All rights reserved.
;
; Redistribution and use in source and binary forms, with or without
; modification, are permitted provided that the following conditions are met:
;
; 1. Redistributions of source code must retain the above copyright notice, this
; list of conditions and the following disclaimer.
;
; 2. Redistributions in binary form must reproduce the above copyright notice,
; this list of conditions and the following disclaimer in the documentation
; and/or other materials provided with the distribution.
;
; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
%include "config.asm"
%include "ext/x86/x86inc.asm"
%if ARCH_X86_64
SECTION_RODATA 64 ; avoids cacheline splits
dw 60, 56, 52, 48, 44, 40, 36, 32, 28, 24, 20, 16, 12, 8, 4, 0
pw_0xff00: times 8 dw 0xff00
pw_32: times 8 dw 32
struc msac
.buf: resq 1
.end: resq 1
.dif: resq 1
.rng: resd 1
.cnt: resd 1
.update_cdf: resd 1
endstruc
%define m(x) mangle(private_prefix %+ _ %+ x %+ SUFFIX)
SECTION .text
%if WIN64
DECLARE_REG_TMP 3
%define buf rsp+8 ; shadow space
%else
DECLARE_REG_TMP 0
%define buf rsp-40 ; red zone
%endif
INIT_XMM sse2
cglobal msac_decode_symbol_adapt4, 3, 7, 6, s, cdf, ns
movd m2, [sq+msac.rng]
movq m1, [cdfq]
lea rax, [pw_0xff00]
movq m3, [sq+msac.dif]
mov r3d, [sq+msac.update_cdf]
mov r4d, nsd
neg nsq
pshuflw m2, m2, q0000
movd [buf+12], m2
pand m2, [rax]
mova m0, m1
psrlw m1, 6
psllw m1, 7
pmulhuw m1, m2
movq m2, [rax+nsq*2]
pshuflw m3, m3, q3333
paddw m1, m2
mova [buf+16], m1
psubusw m1, m3
pxor m2, m2
pcmpeqw m1, m2 ; c >= v
pmovmskb eax, m1
test r3d, r3d
jz .renorm ; !allow_update_cdf
; update_cdf:
movzx r3d, word [cdfq+r4*2] ; count
pcmpeqw m2, m2
mov r2d, r3d
shr r3d, 4
cmp r4d, 4
sbb r3d, -5 ; (count >> 4) + (n_symbols > 3) + 4
cmp r2d, 32
adc r2d, 0 ; count + (count < 32)
movd m3, r3d
pavgw m2, m1 ; i >= val ? -1 : 32768
psubw m2, m0 ; for (i = 0; i < val; i++)
psubw m0, m1 ; cdf[i] += (32768 - cdf[i]) >> rate;
psraw m2, m3 ; for (; i < n_symbols - 1; i++)
paddw m0, m2 ; cdf[i] += (( -1 - cdf[i]) >> rate) + 1;
movq [cdfq], m0
mov [cdfq+r4*2], r2w
.renorm:
tzcnt eax, eax
mov r4, [sq+msac.dif]
movzx r1d, word [buf+rax+16] ; v
movzx r2d, word [buf+rax+14] ; u
shr eax, 1
.renorm2:
not r4
sub r2d, r1d ; rng
shl r1, 48
add r4, r1 ; ~dif
mov r1d, [sq+msac.cnt]
movifnidn t0, sq
bsr ecx, r2d
xor ecx, 15 ; d
shl r2d, cl
shl r4, cl
mov [t0+msac.rng], r2d
not r4
sub r1d, ecx
jge .end ; no refill required
; refill:
mov r2, [t0+msac.buf]
mov rcx, [t0+msac.end]
lea r5, [r2+8]
cmp r5, rcx
jg .refill_eob
mov r2, [r2]
lea ecx, [r1+23]
add r1d, 16
shr ecx, 3 ; shift_bytes
bswap r2
sub r5, rcx
shl ecx, 3 ; shift_bits
shr r2, cl
sub ecx, r1d ; shift_bits - 16 - cnt
mov r1d, 48
shl r2, cl
mov [t0+msac.buf], r5
sub r1d, ecx ; cnt + 64 - shift_bits
xor r4, r2
.end:
mov [t0+msac.cnt], r1d
mov [t0+msac.dif], r4
RET
.refill_eob: ; avoid overreading the input buffer
mov r5, rcx
mov ecx, 40
sub ecx, r1d ; c
.refill_eob_loop:
cmp r2, r5
jge .refill_eob_end ; eob reached
movzx r1d, byte [r2]
inc r2
shl r1, cl
xor r4, r1
sub ecx, 8
jge .refill_eob_loop
.refill_eob_end:
mov r1d, 40
sub r1d, ecx
mov [t0+msac.buf], r2
mov [t0+msac.dif], r4
mov [t0+msac.cnt], r1d
RET
cglobal msac_decode_symbol_adapt8, 3, 7, 6, s, cdf, ns
movd m2, [sq+msac.rng]
movu m1, [cdfq]
lea rax, [pw_0xff00]
movq m3, [sq+msac.dif]
mov r3d, [sq+msac.update_cdf]
mov r4d, nsd
neg nsq
pshuflw m2, m2, q0000
movd [buf+12], m2
punpcklqdq m2, m2
mova m0, m1
psrlw m1, 6
pand m2, [rax]
psllw m1, 7
pmulhuw m1, m2
movu m2, [rax+nsq*2]
pshuflw m3, m3, q3333
paddw m1, m2
punpcklqdq m3, m3
mova [buf+16], m1
psubusw m1, m3
pxor m2, m2
pcmpeqw m1, m2
pmovmskb eax, m1
test r3d, r3d
jz m(msac_decode_symbol_adapt4).renorm
movzx r3d, word [cdfq+r4*2]
pcmpeqw m2, m2
mov r2d, r3d
shr r3d, 4
cmp r4d, 4 ; may be called with n_symbols < 4
sbb r3d, -5
cmp r2d, 32
adc r2d, 0
movd m3, r3d
pavgw m2, m1
psubw m2, m0
psubw m0, m1
psraw m2, m3
paddw m0, m2
movu [cdfq], m0
mov [cdfq+r4*2], r2w
jmp m(msac_decode_symbol_adapt4).renorm
cglobal msac_decode_symbol_adapt16, 3, 7, 6, s, cdf, ns
movd m4, [sq+msac.rng]
movu m2, [cdfq]
lea rax, [pw_0xff00]
movu m3, [cdfq+16]
movq m5, [sq+msac.dif]
mov r3d, [sq+msac.update_cdf]
mov r4d, nsd
neg nsq
%if WIN64
sub rsp, 48 ; need 36 bytes, shadow space is only 32
%endif
pshuflw m4, m4, q0000
movd [buf-4], m4
punpcklqdq m4, m4
mova m0, m2
psrlw m2, 6
mova m1, m3
psrlw m3, 6
pand m4, [rax]
psllw m2, 7
psllw m3, 7
pmulhuw m2, m4
pmulhuw m3, m4
movu m4, [rax+nsq*2]
pshuflw m5, m5, q3333
paddw m2, m4
psubw m4, [rax-pw_0xff00+pw_32]
punpcklqdq m5, m5
paddw m3, m4
mova [buf], m2
mova [buf+16], m3
psubusw m2, m5
psubusw m3, m5
pxor m4, m4
pcmpeqw m2, m4
pcmpeqw m3, m4
packsswb m5, m2, m3
pmovmskb eax, m5
test r3d, r3d
jz .renorm
movzx r3d, word [cdfq+r4*2]
pcmpeqw m4, m4
mova m5, m4
lea r2d, [r3+80] ; only support n_symbols >= 4
shr r2d, 4
cmp r3d, 32
adc r3d, 0
pavgw m4, m2
pavgw m5, m3
psubw m4, m0
psubw m0, m2
movd m2, r2d
psubw m5, m1
psubw m1, m3
psraw m4, m2
psraw m5, m2
paddw m0, m4
paddw m1, m5
movu [cdfq], m0
movu [cdfq+16], m1
mov [cdfq+r4*2], r3w
.renorm:
tzcnt eax, eax
mov r4, [sq+msac.dif]
movzx r1d, word [buf+rax*2]
movzx r2d, word [buf+rax*2-2]
%if WIN64
add rsp, 48
%endif
jmp m(msac_decode_symbol_adapt4).renorm2
%endif
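The CDF adaptation that all three kernels above vectorize is spelled out in the inline comments of msac_decode_symbol_adapt4; in scalar form it is roughly the following (a sketch following those comments' parameter conventions, not a copy of dav1d's C reference):

#include <stdint.h>

/* Entries below the decoded symbol move toward 32768, the rest decay toward
 * zero, and the per-CDF counter stored after the entries saturates at 32. */
static void update_cdf_sketch(uint16_t *const cdf, const unsigned val,
                              const unsigned n_symbols)
{
    const unsigned count = cdf[n_symbols];
    const unsigned rate  = (count >> 4) + (n_symbols > 3) + 4;
    unsigned i = 0;
    for (; i < val; i++)
        cdf[i] += (32768 - cdf[i]) >> rate;
    for (; i < n_symbols - 1; i++)
        cdf[i] -= cdf[i] >> rate;  /* same value as ((-1 - cdf[i]) >> rate) + 1 */
    cdf[n_symbols] = count + (count < 32);
}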

third_party/dav1d/tests/checkasm/checkasm.c (vendored)

@ -62,6 +62,7 @@ static const struct {
const char *name;
void (*func)(void);
} tests[] = {
{ "msac", checkasm_check_msac },
#if CONFIG_8BPC
{ "cdef_8bpc", checkasm_check_cdef_8bpc },
{ "ipred_8bpc", checkasm_check_ipred_8bpc },

third_party/dav1d/tests/checkasm/checkasm.h (vendored)

@ -57,6 +57,7 @@ int xor128_rand(void);
name##_8bpc(void); \
name##_16bpc(void)
void checkasm_check_msac(void);
decl_check_bitfns(void checkasm_check_cdef);
decl_check_bitfns(void checkasm_check_ipred);
decl_check_bitfns(void checkasm_check_itx);

third_party/dav1d/tests/checkasm/msac.c (vendored, new file)

@ -0,0 +1,115 @@
/*
* Copyright © 2019, VideoLAN and dav1d authors
* Copyright © 2019, Two Orioles, LLC
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
* ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "tests/checkasm/checkasm.h"
#include "src/cpu.h"
#include "src/msac.h"
#include <string.h>
/* The normal code doesn't use function pointers */
typedef unsigned (*decode_symbol_adapt_fn)(MsacContext *s, uint16_t *cdf,
size_t n_symbols);
typedef struct {
decode_symbol_adapt_fn symbol_adapt4;
decode_symbol_adapt_fn symbol_adapt8;
decode_symbol_adapt_fn symbol_adapt16;
} MsacDSPContext;
static void randomize_cdf(uint16_t *const cdf, int n) {
for (int i = 16; i > n; i--)
cdf[i] = rnd(); /* randomize padding */
cdf[n] = cdf[n-1] = 0;
while (--n > 0)
cdf[n-1] = cdf[n] + rnd() % (32768 - cdf[n] - n) + 1;
}
/* memcmp() on structs can have weird behavior due to padding etc. */
static int msac_cmp(const MsacContext *const a, const MsacContext *const b) {
return a->buf_pos != b->buf_pos || a->buf_end != b->buf_end ||
a->dif != b->dif || a->rng != b->rng || a->cnt != b->cnt ||
a->allow_update_cdf != b->allow_update_cdf;
}
#define CHECK_SYMBOL_ADAPT(n, n_min, n_max) do { \
if (check_func(c->symbol_adapt##n, "msac_decode_symbol_adapt%d", n)) { \
for (int cdf_update = 0; cdf_update <= 1; cdf_update++) { \
for (int ns = n_min; ns <= n_max; ns++) { \
dav1d_msac_init(&s_c, buf, sizeof(buf), !cdf_update); \
s_a = s_c; \
randomize_cdf(cdf[0], ns); \
memcpy(cdf[1], cdf[0], sizeof(*cdf)); \
for (int i = 0; i < 64; i++) { \
unsigned c_res = call_ref(&s_c, cdf[0], ns); \
unsigned a_res = call_new(&s_a, cdf[1], ns); \
if (c_res != a_res || msac_cmp(&s_c, &s_a) || \
memcmp(cdf[0], cdf[1], sizeof(**cdf) * (ns + 1))) \
{ \
fail(); \
} \
} \
if (cdf_update && ns == n) \
bench_new(&s_a, cdf[0], n); \
} \
} \
} \
} while (0)
static void check_decode_symbol_adapt(MsacDSPContext *const c) {
/* Use an aligned CDF buffer for more consistent benchmark
* results, and a misaligned one for checking correctness. */
ALIGN_STK_16(uint16_t, cdf, 2, [17]);
MsacContext s_c, s_a;
uint8_t buf[1024];
for (int i = 0; i < 1024; i++)
buf[i] = rnd();
declare_func(unsigned, MsacContext *s, uint16_t *cdf, size_t n_symbols);
CHECK_SYMBOL_ADAPT( 4, 1, 5);
CHECK_SYMBOL_ADAPT( 8, 1, 8);
CHECK_SYMBOL_ADAPT(16, 4, 16);
report("decode_symbol_adapt");
}
void checkasm_check_msac(void) {
MsacDSPContext c;
c.symbol_adapt4 = dav1d_msac_decode_symbol_adapt_c;
c.symbol_adapt8 = dav1d_msac_decode_symbol_adapt_c;
c.symbol_adapt16 = dav1d_msac_decode_symbol_adapt_c;
#if ARCH_X86_64 && HAVE_ASM
if (dav1d_get_cpu_flags() & DAV1D_X86_CPU_FLAG_SSE2) {
c.symbol_adapt4 = dav1d_msac_decode_symbol_adapt4_sse2;
c.symbol_adapt8 = dav1d_msac_decode_symbol_adapt8_sse2;
c.symbol_adapt16 = dav1d_msac_decode_symbol_adapt16_sse2;
}
#endif
check_decode_symbol_adapt(&c);
}
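randomize_cdf() above always hands the kernels a well-formed input: strictly decreasing CDF entries ending in a mandatory 0, followed by the adaptation counter (initialized to 0), which is the layout the SSE2 code updates in place. For ns == 4 a generated buffer might look like this (illustrative values only):

#include <stdint.h>

static const uint16_t example_cdf[5] = {
    24000, 15000, 6000, 0,  /* cdf[0..3]: strictly decreasing, cdf[3] == 0 */
    0                       /* cdf[4]: adaptation counter                  */
};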

third_party/dav1d/tests/meson.build (vendored)

@ -34,7 +34,10 @@ endif
libdav1d_nasm_objs_if_needed = []
if is_asm_enabled
checkasm_sources = files('checkasm/checkasm.c')
checkasm_sources = files(
'checkasm/checkasm.c',
'checkasm/msac.c',
)
checkasm_tmpl_sources = files(
'checkasm/cdef.c',