зеркало из https://github.com/mozilla/gecko-dev.git
Bug 1540830 - Update dav1d from upstream to 1f7a7e8. r=TD-Linux
Differential Revision: https://phabricator.services.mozilla.com/D28200 --HG-- extra : moz-landing-system : lando
This commit is contained in:
Родитель
931da4b767
Коммит
d1bd6b015b
|
@ -20,7 +20,7 @@ origin:
|
|||
|
||||
# Human-readable identifier for this version/release
|
||||
# Generally "version NNN", "tag SSS", "bookmark SSS"
|
||||
release: commit 7350c59e7894cb7e487a0add9942d2b1b39f7161 (2019-03-16T23:17:05.000Z).
|
||||
release: commit 1f7a7e8a6af739a05b320151d04f0f7509ae7579 (2019-04-19T07:16:39.000Z).
|
||||
|
||||
# The package's license, where possible using the mnemonic from
|
||||
# https://spdx.org/licenses/
|
||||
|
|
|
@ -1,2 +1,2 @@
|
|||
/* auto-generated, do not edit */
|
||||
#define DAV1D_VERSION "0.2.2"
|
||||
#define DAV1D_VERSION "1f7a7e8a6af739a05b320151d04f0f7509ae7579"
|
||||
|
|
|
@ -12,6 +12,12 @@ style-check:
|
|||
script:
|
||||
- git grep -n -P "\t|\r| $" -- . ':(exclude)*/compat/*' && exit 1
|
||||
- git grep -n -i "david" -- . ':(exclude)THANKS.md' ':(exclude).gitlab-ci.yml' && exit 1
|
||||
- for i in $(git ls-files -- . ':(exclude)*/compat/*'); do
|
||||
if [ -n "$(tail -c 1 "$i")" ]; then
|
||||
echo "No newline at end of $i";
|
||||
exit 1;
|
||||
fi;
|
||||
done
|
||||
- git remote rm upstream 2> /dev/null || true
|
||||
- git remote add upstream https://code.videolan.org/videolan/dav1d.git
|
||||
- git fetch -q upstream master
|
||||
|
|
|
@ -1,6 +1,14 @@
|
|||
Changes for 0.2.2 'Antelope':
|
||||
----------------------------
|
||||
|
||||
- Large improvement on MSAC decoding with SSE, bringing 4-6% speed increase
|
||||
The impact is significant on SSSE3, SSE4 and AVX2 CPUs
|
||||
- SSSE3 optimizations for all block sizes in itx
|
||||
- SSSE3 optimizations for ipred_paeth and ipred_cfl (420, 422 and 444)
|
||||
- Speed improvements on CDEF for SSE4 CPUs
|
||||
- NEON optimizations for SGR and loop filter
|
||||
- Minor crash fixes, improvements and build changes
|
||||
|
||||
|
||||
Changes for 0.2.1 'Antelope':
|
||||
----------------------------
|
||||
|
|
|
@ -33,7 +33,11 @@
|
|||
|
||||
#ifndef DAV1D_API
|
||||
#if defined _WIN32
|
||||
#if defined DAV1D_BUILDING_DLL
|
||||
#define DAV1D_API __declspec(dllexport)
|
||||
#else
|
||||
#define DAV1D_API
|
||||
#endif
|
||||
#else
|
||||
#if __GNUC__ >= 4
|
||||
#define DAV1D_API __attribute__ ((visibility ("default")))
|
||||
|
|
|
@ -217,8 +217,8 @@ bidir_fn mask
|
|||
|
||||
// This has got the same signature as the put_8tap functions,
|
||||
// assumes that the caller has loaded the h argument into r5,
|
||||
// and assumes that r8 is set to (24-clz(w)).
|
||||
function put
|
||||
// and assumes that r8 is set to (clz(w)-24).
|
||||
function put_neon
|
||||
adr r9, L(put_tbl)
|
||||
ldr r8, [r9, r8, lsl #2]
|
||||
add r9, r9, r8
|
||||
|
@ -307,9 +307,9 @@ endfunc
|
|||
|
||||
|
||||
// This has got the same signature as the prep_8tap functions,
|
||||
// assumes that the caller has loaded the h argument into r5,
|
||||
// and assumes that r8 is set to (24-clz(w)), and r7 to w*2.
|
||||
function prep
|
||||
// assumes that the caller has loaded the h argument into r4,
|
||||
// and assumes that r8 is set to (clz(w)-24), and r7 to w*2.
|
||||
function prep_neon
|
||||
adr r9, L(prep_tbl)
|
||||
ldr r8, [r9, r8, lsl #2]
|
||||
add r9, r9, r8
|
||||
|
@ -660,7 +660,7 @@ function \op\()_8tap_\type\()_8bpc_neon, export=1
|
|||
push {r4-r11,lr}
|
||||
movw r8, \type_h
|
||||
movw r9, \type_v
|
||||
b \op\()_8tap
|
||||
b \op\()_8tap_neon
|
||||
endfunc
|
||||
.endm
|
||||
|
||||
|
@ -680,7 +680,7 @@ make_8tap_fn \type, sharp, SHARP, SHARP
|
|||
make_8tap_fn \type, sharp_regular, SHARP, REGULAR
|
||||
make_8tap_fn \type, sharp_smooth, SHARP, SMOOTH
|
||||
|
||||
function \type\()_8tap
|
||||
function \type\()_8tap_neon
|
||||
ldrd r4, r5, [sp, #36]
|
||||
ldrd r6, r7, [sp, #44]
|
||||
movw r10, #0x4081 // (1 << 14) | (1 << 7) | (1 << 0)
|
||||
|
@ -699,7 +699,7 @@ function \type\()_8tap
|
|||
bne L(\type\()_8tap_h)
|
||||
tst \my, #(0x7f << 14)
|
||||
bne L(\type\()_8tap_v)
|
||||
b \type
|
||||
b \type\()_neon
|
||||
|
||||
L(\type\()_8tap_h):
|
||||
cmp \w, #4
|
||||
|
@ -1831,7 +1831,7 @@ function \type\()_bilin_8bpc_neon, export=1
|
|||
bne L(\type\()_bilin_h)
|
||||
cmp \my, #0
|
||||
bne L(\type\()_bilin_v)
|
||||
b \type
|
||||
b \type\()_neon
|
||||
|
||||
L(\type\()_bilin_h):
|
||||
cmp \my, #0
|
||||
|
|
Разница между файлами не показана из-за своего большого размера
Загрузить разницу
Разница между файлами не показана из-за своего большого размера
Загрузить разницу
|
@ -235,8 +235,8 @@ bidir_fn mask
|
|||
|
||||
|
||||
// This has got the same signature as the put_8tap functions,
|
||||
// and assumes that x8 is set to (24-clz(w)).
|
||||
function put
|
||||
// and assumes that x8 is set to (clz(w)-24).
|
||||
function put_neon
|
||||
adr x9, L(put_tbl)
|
||||
ldrh w8, [x9, x8, lsl #1]
|
||||
sub x9, x9, w8, uxtw
|
||||
|
@ -330,8 +330,8 @@ endfunc
|
|||
|
||||
|
||||
// This has got the same signature as the prep_8tap functions,
|
||||
// and assumes that x8 is set to (24-clz(w)), and x7 to w*2.
|
||||
function prep
|
||||
// and assumes that x8 is set to (clz(w)-24), and x7 to w*2.
|
||||
function prep_neon
|
||||
adr x9, L(prep_tbl)
|
||||
ldrh w8, [x9, x8, lsl #1]
|
||||
sub x9, x9, w8, uxtw
|
||||
|
@ -703,7 +703,7 @@ endfunc
|
|||
function \op\()_8tap_\type\()_8bpc_neon, export=1
|
||||
mov x8, \type_h
|
||||
mov x9, \type_v
|
||||
b \op\()_8tap
|
||||
b \op\()_8tap\()_neon
|
||||
endfunc
|
||||
.endm
|
||||
|
||||
|
@ -723,7 +723,7 @@ make_8tap_fn \type, sharp, SHARP, SHARP
|
|||
make_8tap_fn \type, sharp_regular, SHARP, REGULAR
|
||||
make_8tap_fn \type, sharp_smooth, SHARP, SMOOTH
|
||||
|
||||
function \type\()_8tap
|
||||
function \type\()_8tap_neon
|
||||
mov w10, #0x4081 // (1 << 14) | (1 << 7) | (1 << 0)
|
||||
mul \mx, \mx, w10
|
||||
mul \my, \my, w10
|
||||
|
@ -741,7 +741,7 @@ function \type\()_8tap
|
|||
b.ne L(\type\()_8tap_h)
|
||||
tst \my, #(0x7f << 14)
|
||||
b.ne L(\type\()_8tap_v)
|
||||
b \type
|
||||
b \type\()_neon
|
||||
|
||||
L(\type\()_8tap_h):
|
||||
cmp \w, #4
|
||||
|
@ -1826,7 +1826,7 @@ function \type\()_bilin_8bpc_neon, export=1
|
|||
sub w8, w8, #24
|
||||
cbnz \mx, L(\type\()_bilin_h)
|
||||
cbnz \my, L(\type\()_bilin_v)
|
||||
b \type
|
||||
b \type\()_neon
|
||||
|
||||
L(\type\()_bilin_h):
|
||||
cbnz \my, L(\type\()_bilin_hv)
|
||||
|
@ -2335,7 +2335,7 @@ filter_fn prep, x0, x7, x1, x2, w3, w4, w5, x5, w6, x6, x8, x9, 6
|
|||
add \src, \src, \inc
|
||||
.endm
|
||||
|
||||
function warp_filter_horz
|
||||
function warp_filter_horz_neon
|
||||
add w12, w5, #512
|
||||
|
||||
ld1 {v16.8b, v17.8b}, [x2], x3
|
||||
|
@ -2431,24 +2431,24 @@ function warp_affine_8x8\t\()_8bpc_neon, export=1
|
|||
lsl x1, x1, #1
|
||||
.endif
|
||||
|
||||
bl warp_filter_horz
|
||||
bl warp_filter_horz_neon
|
||||
mov v24.16b, v16.16b
|
||||
bl warp_filter_horz
|
||||
bl warp_filter_horz_neon
|
||||
mov v25.16b, v16.16b
|
||||
bl warp_filter_horz
|
||||
bl warp_filter_horz_neon
|
||||
mov v26.16b, v16.16b
|
||||
bl warp_filter_horz
|
||||
bl warp_filter_horz_neon
|
||||
mov v27.16b, v16.16b
|
||||
bl warp_filter_horz
|
||||
bl warp_filter_horz_neon
|
||||
mov v28.16b, v16.16b
|
||||
bl warp_filter_horz
|
||||
bl warp_filter_horz_neon
|
||||
mov v29.16b, v16.16b
|
||||
bl warp_filter_horz
|
||||
bl warp_filter_horz_neon
|
||||
mov v30.16b, v16.16b
|
||||
|
||||
1:
|
||||
add w14, w6, #512
|
||||
bl warp_filter_horz
|
||||
bl warp_filter_horz_neon
|
||||
mov v31.16b, v16.16b
|
||||
|
||||
load_filter_row d0, w14, w9
|
||||
|
|
|
@ -88,4 +88,45 @@
|
|||
trn2 \r7\().2s, \r9\().2s, \r7\().2s
|
||||
.endm
|
||||
|
||||
// Byte transpose built from three rounds of trn1/trn2, operating on
// .16b, then .8h, then .4s element sizes.
//
// \r0-\r7 are read as the inputs in the first round and receive the
// results in the last round. \r8 and \r9 are written before they are
// ever read, so they only serve as scratch registers and hold no
// meaningful value on exit.
//
// NOTE(review): since no round crosses a 64-bit boundary, this presumably
// transposes two independent 8x8 byte matrices (one per 64-bit half of
// the registers) - confirm against the callers.
.macro transpose_8x16b r0, r1, r2, r3, r4, r5, r6, r7, r8, r9
|
||||
trn1 \r8\().16b, \r0\().16b, \r1\().16b
|
||||
trn2 \r9\().16b, \r0\().16b, \r1\().16b
|
||||
trn1 \r1\().16b, \r2\().16b, \r3\().16b
|
||||
trn2 \r3\().16b, \r2\().16b, \r3\().16b
|
||||
trn1 \r0\().16b, \r4\().16b, \r5\().16b
|
||||
trn2 \r5\().16b, \r4\().16b, \r5\().16b
|
||||
trn1 \r2\().16b, \r6\().16b, \r7\().16b
|
||||
trn2 \r7\().16b, \r6\().16b, \r7\().16b
|
||||
|
||||
trn1 \r4\().8h, \r0\().8h, \r2\().8h
|
||||
trn2 \r2\().8h, \r0\().8h, \r2\().8h
|
||||
trn1 \r6\().8h, \r5\().8h, \r7\().8h
|
||||
trn2 \r7\().8h, \r5\().8h, \r7\().8h
|
||||
trn1 \r5\().8h, \r9\().8h, \r3\().8h
|
||||
trn2 \r9\().8h, \r9\().8h, \r3\().8h
|
||||
trn1 \r3\().8h, \r8\().8h, \r1\().8h
|
||||
trn2 \r8\().8h, \r8\().8h, \r1\().8h
|
||||
|
||||
trn1 \r0\().4s, \r3\().4s, \r4\().4s
|
||||
trn2 \r4\().4s, \r3\().4s, \r4\().4s
|
||||
trn1 \r1\().4s, \r5\().4s, \r6\().4s
|
||||
trn2 \r5\().4s, \r5\().4s, \r6\().4s
|
||||
trn2 \r6\().4s, \r8\().4s, \r2\().4s
|
||||
trn1 \r2\().4s, \r8\().4s, \r2\().4s
|
||||
trn1 \r3\().4s, \r9\().4s, \r7\().4s
|
||||
trn2 \r7\().4s, \r9\().4s, \r7\().4s
|
||||
.endm
|
||||
|
||||
// Byte transpose built from two rounds of trn1/trn2, operating on .16b
// and then .8h element sizes.
//
// \r0-\r3 are the inputs and are overwritten with the results in the
// second round. \t4-\t7 receive the intermediate values of the first
// round and are clobbered.
//
// NOTE(review): presumably this transposes each 4x4 byte tile held across
// the four registers - confirm against the callers.
.macro transpose_4x16b r0, r1, r2, r3, t4, t5, t6, t7
|
||||
trn1 \t4\().16b, \r0\().16b, \r1\().16b
|
||||
trn2 \t5\().16b, \r0\().16b, \r1\().16b
|
||||
trn1 \t6\().16b, \r2\().16b, \r3\().16b
|
||||
trn2 \t7\().16b, \r2\().16b, \r3\().16b
|
||||
|
||||
trn1 \r0\().8h, \t4\().8h, \t6\().8h
|
||||
trn2 \r2\().8h, \t4\().8h, \t6\().8h
|
||||
trn1 \r1\().8h, \t5\().8h, \t7\().8h
|
||||
trn2 \r3\().8h, \t5\().8h, \t7\().8h
|
||||
.endm
|
||||
|
||||
#endif /* DAV1D_SRC_ARM_64_UTIL_S */
|
||||
|
|
|
@ -0,0 +1,47 @@
|
|||
/*
|
||||
* Copyright © 2018, VideoLAN and dav1d authors
|
||||
* Copyright © 2018, Two Orioles, LLC
|
||||
* All rights reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright notice, this
|
||||
* list of conditions and the following disclaimer.
|
||||
*
|
||||
* 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||
* this list of conditions and the following disclaimer in the documentation
|
||||
* and/or other materials provided with the distribution.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
|
||||
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
|
||||
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
|
||||
* ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
|
||||
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
|
||||
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
|
||||
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
#include "src/cpu.h"
|
||||
#include "src/loopfilter.h"
|
||||
|
||||
decl_loopfilter_sb_fn(dav1d_lpf_h_sb_y_neon);
|
||||
decl_loopfilter_sb_fn(dav1d_lpf_v_sb_y_neon);
|
||||
decl_loopfilter_sb_fn(dav1d_lpf_h_sb_uv_neon);
|
||||
decl_loopfilter_sb_fn(dav1d_lpf_v_sb_uv_neon);
|
||||
|
||||
/*
 * Install the NEON loop filter entry points into the DSP context.
 *
 * The context is left untouched unless the CPU reports NEON support, and
 * the function pointers are only assigned in 8 bpc AArch64 builds.
 * loop_filter_sb is indexed as [0 = y, 1 = uv][0 = h, 1 = v].
 */
void bitfn(dav1d_loop_filter_dsp_init_arm)(Dav1dLoopFilterDSPContext *const c) {
    const unsigned cpu_flags = dav1d_get_cpu_flags();
    const int have_neon = (cpu_flags & DAV1D_ARM_CPU_FLAG_NEON) != 0;

    if (!have_neon)
        return;

#if BITDEPTH == 8 && ARCH_AARCH64
    /* luma */
    c->loop_filter_sb[0][0] = dav1d_lpf_h_sb_y_neon;
    c->loop_filter_sb[0][1] = dav1d_lpf_v_sb_y_neon;
    /* chroma */
    c->loop_filter_sb[1][0] = dav1d_lpf_h_sb_uv_neon;
    c->loop_filter_sb[1][1] = dav1d_lpf_v_sb_uv_neon;
#endif
}
|
|
@ -29,6 +29,7 @@
|
|||
#include "src/looprestoration.h"
|
||||
|
||||
#include "common/attributes.h"
|
||||
#include "src/tables.h"
|
||||
|
||||
#if BITDEPTH == 8
|
||||
// This calculates things slightly differently than the reference C version.
|
||||
|
@ -91,7 +92,171 @@ static void wiener_filter_neon(pixel *const dst, const ptrdiff_t dst_stride,
|
|||
dav1d_copy_narrow_neon(dst + (w & ~7), dst_stride, tmp, w & 7, h);
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
#if ARCH_AARCH64
|
||||
void dav1d_sgr_box3_h_neon(int32_t *sumsq, int16_t *sum,
|
||||
const pixel (*left)[4],
|
||||
const pixel *src, const ptrdiff_t stride,
|
||||
const int w, const int h,
|
||||
const enum LrEdgeFlags edges);
|
||||
void dav1d_sgr_box3_v_neon(int32_t *sumsq, int16_t *sum,
|
||||
const int w, const int h,
|
||||
const enum LrEdgeFlags edges);
|
||||
void dav1d_sgr_calc_ab1_neon(int32_t *a, int16_t *b,
|
||||
const int w, const int h, const int strength);
|
||||
void dav1d_sgr_finish_filter1_neon(coef *tmp,
|
||||
const pixel *src, const ptrdiff_t stride,
|
||||
const int32_t *a, const int16_t *b,
|
||||
const int w, const int h);
|
||||
|
||||
/* filter with a 3x3 box (radius=1) */
|
||||
static void dav1d_sgr_filter1_neon(coef *tmp,
|
||||
const pixel *src, const ptrdiff_t stride,
|
||||
const pixel (*left)[4],
|
||||
const pixel *lpf, const ptrdiff_t lpf_stride,
|
||||
const int w, const int h, const int strength,
|
||||
const enum LrEdgeFlags edges)
|
||||
{
|
||||
ALIGN_STK_16(int32_t, sumsq_mem, (384 + 16) * 68 + 8,);
|
||||
int32_t *const sumsq = &sumsq_mem[(384 + 16) * 2 + 8], *const a = sumsq;
|
||||
ALIGN_STK_16(int16_t, sum_mem, (384 + 16) * 68 + 16,);
|
||||
int16_t *const sum = &sum_mem[(384 + 16) * 2 + 16], *const b = sum;
|
||||
|
||||
dav1d_sgr_box3_h_neon(sumsq, sum, left, src, stride, w, h, edges);
|
||||
if (edges & LR_HAVE_TOP)
|
||||
dav1d_sgr_box3_h_neon(&sumsq[-2 * (384 + 16)], &sum[-2 * (384 + 16)],
|
||||
NULL, lpf, lpf_stride, w, 1, edges);
|
||||
|
||||
if (edges & LR_HAVE_BOTTOM)
|
||||
dav1d_sgr_box3_h_neon(&sumsq[h * (384 + 16)], &sum[h * (384 + 16)],
|
||||
NULL, lpf + 6 * PXSTRIDE(lpf_stride),
|
||||
lpf_stride, w, 2, edges);
|
||||
|
||||
dav1d_sgr_box3_v_neon(sumsq, sum, w, h, edges);
|
||||
dav1d_sgr_calc_ab1_neon(a, b, w, h, strength);
|
||||
dav1d_sgr_finish_filter1_neon(tmp, src, stride, a, b, w, h);
|
||||
}
|
||||
|
||||
void dav1d_sgr_box5_h_neon(int32_t *sumsq, int16_t *sum,
|
||||
const pixel (*left)[4],
|
||||
const pixel *src, const ptrdiff_t stride,
|
||||
const int w, const int h,
|
||||
const enum LrEdgeFlags edges);
|
||||
void dav1d_sgr_box5_v_neon(int32_t *sumsq, int16_t *sum,
|
||||
const int w, const int h,
|
||||
const enum LrEdgeFlags edges);
|
||||
void dav1d_sgr_calc_ab2_neon(int32_t *a, int16_t *b,
|
||||
const int w, const int h, const int strength);
|
||||
void dav1d_sgr_finish_filter2_neon(coef *tmp,
|
||||
const pixel *src, const ptrdiff_t stride,
|
||||
const int32_t *a, const int16_t *b,
|
||||
const int w, const int h);
|
||||
|
||||
/* filter with a 5x5 box (radius=2) */
|
||||
static void dav1d_sgr_filter2_neon(coef *tmp,
|
||||
const pixel *src, const ptrdiff_t stride,
|
||||
const pixel (*left)[4],
|
||||
const pixel *lpf, const ptrdiff_t lpf_stride,
|
||||
const int w, const int h, const int strength,
|
||||
const enum LrEdgeFlags edges)
|
||||
{
|
||||
ALIGN_STK_16(int32_t, sumsq_mem, (384 + 16) * 68 + 8,);
|
||||
int32_t *const sumsq = &sumsq_mem[(384 + 16) * 2 + 8], *const a = sumsq;
|
||||
ALIGN_STK_16(int16_t, sum_mem, (384 + 16) * 68 + 16,);
|
||||
int16_t *const sum = &sum_mem[(384 + 16) * 2 + 16], *const b = sum;
|
||||
|
||||
dav1d_sgr_box5_h_neon(sumsq, sum, left, src, stride, w, h, edges);
|
||||
if (edges & LR_HAVE_TOP)
|
||||
dav1d_sgr_box5_h_neon(&sumsq[-2 * (384 + 16)], &sum[-2 * (384 + 16)],
|
||||
NULL, lpf, lpf_stride, w, 2, edges);
|
||||
|
||||
if (edges & LR_HAVE_BOTTOM)
|
||||
dav1d_sgr_box5_h_neon(&sumsq[h * (384 + 16)], &sum[h * (384 + 16)],
|
||||
NULL, lpf + 6 * PXSTRIDE(lpf_stride),
|
||||
lpf_stride, w, 2, edges);
|
||||
|
||||
dav1d_sgr_box5_v_neon(sumsq, sum, w, h, edges);
|
||||
dav1d_sgr_calc_ab2_neon(a, b, w, h, strength);
|
||||
dav1d_sgr_finish_filter2_neon(tmp, src, stride, a, b, w, h);
|
||||
}
|
||||
|
||||
void dav1d_sgr_weighted1_neon(pixel *dst, const ptrdiff_t dst_stride,
|
||||
const pixel *src, const ptrdiff_t src_stride,
|
||||
const coef *t1, const int w, const int h,
|
||||
const int wt);
|
||||
void dav1d_sgr_weighted2_neon(pixel *dst, const ptrdiff_t dst_stride,
|
||||
const pixel *src, const ptrdiff_t src_stride,
|
||||
const coef *t1, const coef *t2,
|
||||
const int w, const int h,
|
||||
const int16_t wt[2]);
|
||||
|
||||
static void sgr_filter_neon(pixel *const dst, const ptrdiff_t dst_stride,
|
||||
const pixel (*const left)[4],
|
||||
const pixel *lpf, const ptrdiff_t lpf_stride,
|
||||
const int w, const int h, const int sgr_idx,
|
||||
const int16_t sgr_wt[7], const enum LrEdgeFlags edges)
|
||||
{
|
||||
if (!dav1d_sgr_params[sgr_idx][0]) {
|
||||
ALIGN_STK_16(coef, tmp, 64 * 384,);
|
||||
dav1d_sgr_filter1_neon(tmp, dst, dst_stride, left, lpf, lpf_stride,
|
||||
w, h, dav1d_sgr_params[sgr_idx][3], edges);
|
||||
if (w >= 8)
|
||||
dav1d_sgr_weighted1_neon(dst, dst_stride, dst, dst_stride,
|
||||
tmp, w & ~7, h, (1 << 7) - sgr_wt[1]);
|
||||
if (w & 7) {
|
||||
// For uneven widths, do a full 8 pixel wide filtering into a temp
|
||||
// buffer and copy out the narrow slice of pixels separately into
|
||||
// dest.
|
||||
ALIGN_STK_16(pixel, stripe, 64 * 8,);
|
||||
dav1d_sgr_weighted1_neon(stripe, w & 7, dst + (w & ~7), dst_stride,
|
||||
tmp + (w & ~7), w & 7, h,
|
||||
(1 << 7) - sgr_wt[1]);
|
||||
dav1d_copy_narrow_neon(dst + (w & ~7), dst_stride, stripe,
|
||||
w & 7, h);
|
||||
}
|
||||
} else if (!dav1d_sgr_params[sgr_idx][1]) {
|
||||
ALIGN_STK_16(coef, tmp, 64 * 384,);
|
||||
dav1d_sgr_filter2_neon(tmp, dst, dst_stride, left, lpf, lpf_stride,
|
||||
w, h, dav1d_sgr_params[sgr_idx][2], edges);
|
||||
if (w >= 8)
|
||||
dav1d_sgr_weighted1_neon(dst, dst_stride, dst, dst_stride,
|
||||
tmp, w & ~7, h, sgr_wt[0]);
|
||||
if (w & 7) {
|
||||
// For uneven widths, do a full 8 pixel wide filtering into a temp
|
||||
// buffer and copy out the narrow slice of pixels separately into
|
||||
// dest.
|
||||
ALIGN_STK_16(pixel, stripe, 64 * 8,);
|
||||
dav1d_sgr_weighted1_neon(stripe, w & 7, dst + (w & ~7), dst_stride,
|
||||
tmp + (w & ~7), w & 7, h, sgr_wt[0]);
|
||||
dav1d_copy_narrow_neon(dst + (w & ~7), dst_stride, stripe,
|
||||
w & 7, h);
|
||||
}
|
||||
} else {
|
||||
ALIGN_STK_16(coef, tmp1, 64 * 384,);
|
||||
ALIGN_STK_16(coef, tmp2, 64 * 384,);
|
||||
dav1d_sgr_filter2_neon(tmp1, dst, dst_stride, left, lpf, lpf_stride,
|
||||
w, h, dav1d_sgr_params[sgr_idx][2], edges);
|
||||
dav1d_sgr_filter1_neon(tmp2, dst, dst_stride, left, lpf, lpf_stride,
|
||||
w, h, dav1d_sgr_params[sgr_idx][3], edges);
|
||||
const int16_t wt[2] = { sgr_wt[0], 128 - sgr_wt[0] - sgr_wt[1] };
|
||||
if (w >= 8)
|
||||
dav1d_sgr_weighted2_neon(dst, dst_stride, dst, dst_stride,
|
||||
tmp1, tmp2, w & ~7, h, wt);
|
||||
if (w & 7) {
|
||||
// For uneven widths, do a full 8 pixel wide filtering into a temp
|
||||
// buffer and copy out the narrow slice of pixels separately into
|
||||
// dest.
|
||||
ALIGN_STK_16(pixel, stripe, 64 * 8,);
|
||||
dav1d_sgr_weighted2_neon(stripe, w & 7, dst + (w & ~7), dst_stride,
|
||||
tmp1 + (w & ~7), tmp2 + (w & ~7),
|
||||
w & 7, h, wt);
|
||||
dav1d_copy_narrow_neon(dst + (w & ~7), dst_stride, stripe,
|
||||
w & 7, h);
|
||||
}
|
||||
}
|
||||
}
|
||||
#endif // ARCH_AARCH64
|
||||
#endif // BITDEPTH == 8
|
||||
|
||||
void bitfn(dav1d_loop_restoration_dsp_init_arm)(Dav1dLoopRestorationDSPContext *const c) {
|
||||
const unsigned flags = dav1d_get_cpu_flags();
|
||||
|
@ -100,5 +265,8 @@ void bitfn(dav1d_loop_restoration_dsp_init_arm)(Dav1dLoopRestorationDSPContext *
|
|||
|
||||
#if BITDEPTH == 8
|
||||
c->wiener = wiener_filter_neon;
|
||||
#if ARCH_AARCH64
|
||||
c->selfguided = sgr_filter_neon;
|
||||
#endif
|
||||
#endif
|
||||
}
|
||||
|
|
|
@ -813,7 +813,7 @@ static const uint16_t default_mv_joint_cdf[N_MV_JOINTS + 1] = {
|
|||
AOM_CDF4(4096, 11264, 19328)
|
||||
};
|
||||
|
||||
static const uint16_t default_kf_y_mode_cdf[5][5][N_INTRA_PRED_MODES + 1] = {
|
||||
static const uint16_t default_kf_y_mode_cdf[5][5][N_INTRA_PRED_MODES + 1 + 2] = {
|
||||
{
|
||||
{ AOM_CDF13(15588, 17027, 19338, 20218, 20682, 21110, 21825, 23244,
|
||||
24189, 28165, 29093, 30466) },
|
||||
|
|
|
@ -34,11 +34,13 @@
|
|||
#include "src/ref.h"
|
||||
#include "src/thread_data.h"
|
||||
|
||||
/* Buffers padded to [8] or [16] for SIMD where needed. */
|
||||
|
||||
typedef struct CdfModeContext {
|
||||
uint16_t y_mode[4][N_INTRA_PRED_MODES + 1];
|
||||
uint16_t y_mode[4][N_INTRA_PRED_MODES + 1 + 2];
|
||||
uint16_t use_filter_intra[N_BS_SIZES][2];
|
||||
uint16_t filter_intra[5 + 1];
|
||||
uint16_t uv_mode[2][N_INTRA_PRED_MODES][N_UV_INTRA_PRED_MODES + 1];
|
||||
uint16_t uv_mode[2][N_INTRA_PRED_MODES][N_UV_INTRA_PRED_MODES + 1 + 1];
|
||||
uint16_t angle_delta[8][8];
|
||||
uint16_t filter[2][8][DAV1D_N_SWITCHABLE_FILTERS + 1];
|
||||
uint16_t newmv_mode[6][2];
|
||||
|
@ -66,7 +68,7 @@ typedef struct CdfModeContext {
|
|||
uint16_t txtp_intra[3][N_TX_SIZES][N_INTRA_PRED_MODES][N_TX_TYPES + 1];
|
||||
uint16_t skip[3][2];
|
||||
uint16_t skip_mode[3][2];
|
||||
uint16_t partition[N_BL_LEVELS][4][N_PARTITIONS + 1];
|
||||
uint16_t partition[N_BL_LEVELS][4][N_PARTITIONS + 1 + 5];
|
||||
uint16_t seg_pred[3][2];
|
||||
uint16_t seg_id[3][DAV1D_MAX_SEGMENTS + 1];
|
||||
uint16_t cfl_sign[8 + 1];
|
||||
|
@ -88,12 +90,12 @@ typedef struct CdfModeContext {
|
|||
typedef struct CdfCoefContext {
|
||||
uint16_t skip[N_TX_SIZES][13][2];
|
||||
uint16_t eob_bin_16[2][2][6];
|
||||
uint16_t eob_bin_32[2][2][7];
|
||||
uint16_t eob_bin_32[2][2][7 + 1];
|
||||
uint16_t eob_bin_64[2][2][8];
|
||||
uint16_t eob_bin_128[2][2][9];
|
||||
uint16_t eob_bin_256[2][2][10];
|
||||
uint16_t eob_bin_512[2][2][11];
|
||||
uint16_t eob_bin_1024[2][2][12];
|
||||
uint16_t eob_bin_256[2][2][10 + 6];
|
||||
uint16_t eob_bin_512[2][2][11 + 5];
|
||||
uint16_t eob_bin_1024[2][2][12 + 4];
|
||||
uint16_t eob_hi_bit[N_TX_SIZES][2][11 /*22*/][2];
|
||||
uint16_t eob_base_tok[N_TX_SIZES][2][4][4];
|
||||
uint16_t base_tok[N_TX_SIZES][2][41][5];
|
||||
|
@ -102,7 +104,7 @@ typedef struct CdfCoefContext {
|
|||
} CdfCoefContext;
|
||||
|
||||
typedef struct CdfMvComponent {
|
||||
uint16_t classes[11 + 1];
|
||||
uint16_t classes[11 + 1 + 4];
|
||||
uint16_t class0[2];
|
||||
uint16_t classN[10][2];
|
||||
uint16_t class0_fp[2][4 + 1];
|
||||
|
@ -119,7 +121,7 @@ typedef struct CdfMvContext {
|
|||
|
||||
typedef struct CdfContext {
|
||||
CdfModeContext m;
|
||||
uint16_t kfym[5][5][N_INTRA_PRED_MODES + 1];
|
||||
uint16_t kfym[5][5][N_INTRA_PRED_MODES + 1 + 2];
|
||||
CdfCoefContext coef;
|
||||
CdfMvContext mv, dmv;
|
||||
} CdfContext;
|
||||
|
|
|
@ -80,14 +80,14 @@ static int read_mv_component_diff(Dav1dTileContext *const t,
|
|||
const Dav1dFrameContext *const f = t->f;
|
||||
const int have_hp = f->frame_hdr->hp;
|
||||
const int sign = dav1d_msac_decode_bool_adapt(&ts->msac, mv_comp->sign);
|
||||
const int cl = dav1d_msac_decode_symbol_adapt(&ts->msac,
|
||||
const int cl = dav1d_msac_decode_symbol_adapt16(&ts->msac,
|
||||
mv_comp->classes, 11);
|
||||
int up, fp, hp;
|
||||
|
||||
if (!cl) {
|
||||
up = dav1d_msac_decode_bool_adapt(&ts->msac, mv_comp->class0);
|
||||
if (have_fp) {
|
||||
fp = dav1d_msac_decode_symbol_adapt(&ts->msac,
|
||||
fp = dav1d_msac_decode_symbol_adapt4(&ts->msac,
|
||||
mv_comp->class0_fp[up], 4);
|
||||
hp = have_hp ? dav1d_msac_decode_bool_adapt(&ts->msac,
|
||||
mv_comp->class0_hp) : 1;
|
||||
|
@ -101,7 +101,7 @@ static int read_mv_component_diff(Dav1dTileContext *const t,
|
|||
up |= dav1d_msac_decode_bool_adapt(&ts->msac,
|
||||
mv_comp->classN[n]) << n;
|
||||
if (have_fp) {
|
||||
fp = dav1d_msac_decode_symbol_adapt(&ts->msac,
|
||||
fp = dav1d_msac_decode_symbol_adapt4(&ts->msac,
|
||||
mv_comp->classN_fp, 4);
|
||||
hp = have_hp ? dav1d_msac_decode_bool_adapt(&ts->msac,
|
||||
mv_comp->classN_hp) : 1;
|
||||
|
@ -119,7 +119,7 @@ static int read_mv_component_diff(Dav1dTileContext *const t,
|
|||
static void read_mv_residual(Dav1dTileContext *const t, mv *const ref_mv,
|
||||
CdfMvContext *const mv_cdf, const int have_fp)
|
||||
{
|
||||
switch (dav1d_msac_decode_symbol_adapt(&t->ts->msac, t->ts->cdf.mv.joint,
|
||||
switch (dav1d_msac_decode_symbol_adapt4(&t->ts->msac, t->ts->cdf.mv.joint,
|
||||
N_MV_JOINTS))
|
||||
{
|
||||
case MV_JOINT_HV:
|
||||
|
@ -379,7 +379,7 @@ static void read_pal_plane(Dav1dTileContext *const t, Av1Block *const b,
|
|||
{
|
||||
Dav1dTileState *const ts = t->ts;
|
||||
const Dav1dFrameContext *const f = t->f;
|
||||
const int pal_sz = b->pal_sz[pl] = dav1d_msac_decode_symbol_adapt(&ts->msac,
|
||||
const int pal_sz = b->pal_sz[pl] = dav1d_msac_decode_symbol_adapt8(&ts->msac,
|
||||
ts->cdf.m.pal_sz[pl][sz_ctx], 7) + 2;
|
||||
uint16_t cache[16], used_cache[8];
|
||||
int l_cache = pl ? t->pal_sz_uv[1][by4] : t->l.pal_sz[by4];
|
||||
|
@ -595,7 +595,7 @@ static void read_pal_indices(Dav1dTileContext *const t,
|
|||
const int last = imax(0, i - h4 * 4 + 1);
|
||||
order_palette(pal_idx, stride, i, first, last, order, ctx);
|
||||
for (int j = first, m = 0; j >= last; j--, m++) {
|
||||
const int color_idx = dav1d_msac_decode_symbol_adapt(&ts->msac,
|
||||
const int color_idx = dav1d_msac_decode_symbol_adapt8(&ts->msac,
|
||||
color_map_cdf[ctx[m]], b->pal_sz[pl]);
|
||||
pal_idx[(i - j) * stride + j] = order[m][color_idx];
|
||||
}
|
||||
|
@ -811,7 +811,7 @@ static int decode_b(Dav1dTileContext *const t,
|
|||
const unsigned pred_seg_id =
|
||||
get_cur_frame_segid(t->by, t->bx, have_top, have_left,
|
||||
&seg_ctx, f->cur_segmap, f->b4_stride);
|
||||
const unsigned diff = dav1d_msac_decode_symbol_adapt(&ts->msac,
|
||||
const unsigned diff = dav1d_msac_decode_symbol_adapt8(&ts->msac,
|
||||
ts->cdf.m.seg_id[seg_ctx],
|
||||
DAV1D_MAX_SEGMENTS);
|
||||
const unsigned last_active_seg_id =
|
||||
|
@ -883,7 +883,7 @@ static int decode_b(Dav1dTileContext *const t,
|
|||
if (b->skip) {
|
||||
b->seg_id = pred_seg_id;
|
||||
} else {
|
||||
const unsigned diff = dav1d_msac_decode_symbol_adapt(&ts->msac,
|
||||
const unsigned diff = dav1d_msac_decode_symbol_adapt8(&ts->msac,
|
||||
ts->cdf.m.seg_id[seg_ctx],
|
||||
DAV1D_MAX_SEGMENTS);
|
||||
const unsigned last_active_seg_id =
|
||||
|
@ -932,7 +932,7 @@ static int decode_b(Dav1dTileContext *const t,
|
|||
memcpy(prev_delta_lf, ts->last_delta_lf, 4);
|
||||
|
||||
if (have_delta_q) {
|
||||
int delta_q = dav1d_msac_decode_symbol_adapt(&ts->msac,
|
||||
int delta_q = dav1d_msac_decode_symbol_adapt4(&ts->msac,
|
||||
ts->cdf.m.delta_q, 4);
|
||||
if (delta_q == 3) {
|
||||
const int n_bits = 1 + dav1d_msac_decode_bools(&ts->msac, 3);
|
||||
|
@ -953,7 +953,7 @@ static int decode_b(Dav1dTileContext *const t,
|
|||
f->cur.p.layout != DAV1D_PIXEL_LAYOUT_I400 ? 4 : 2 : 1;
|
||||
|
||||
for (int i = 0; i < n_lfs; i++) {
|
||||
int delta_lf = dav1d_msac_decode_symbol_adapt(&ts->msac,
|
||||
int delta_lf = dav1d_msac_decode_symbol_adapt4(&ts->msac,
|
||||
ts->cdf.m.delta_lf[i + f->frame_hdr->delta.lf.multi], 4);
|
||||
if (delta_lf == 3) {
|
||||
const int n_bits = 1 + dav1d_msac_decode_bools(&ts->msac, 3);
|
||||
|
@ -1018,7 +1018,7 @@ static int decode_b(Dav1dTileContext *const t,
|
|||
ts->cdf.m.y_mode[dav1d_ymode_size_context[bs]] :
|
||||
ts->cdf.kfym[dav1d_intra_mode_context[t->a->mode[bx4]]]
|
||||
[dav1d_intra_mode_context[t->l.mode[by4]]];
|
||||
b->y_mode = dav1d_msac_decode_symbol_adapt(&ts->msac, ymode_cdf,
|
||||
b->y_mode = dav1d_msac_decode_symbol_adapt16(&ts->msac, ymode_cdf,
|
||||
N_INTRA_PRED_MODES);
|
||||
if (DEBUG_BLOCK_INFO)
|
||||
printf("Post-ymode[%d]: r=%d\n", b->y_mode, ts->msac.rng);
|
||||
|
@ -1028,7 +1028,7 @@ static int decode_b(Dav1dTileContext *const t,
|
|||
b->y_mode <= VERT_LEFT_PRED)
|
||||
{
|
||||
uint16_t *const acdf = ts->cdf.m.angle_delta[b->y_mode - VERT_PRED];
|
||||
const int angle = dav1d_msac_decode_symbol_adapt(&ts->msac, acdf, 7);
|
||||
const int angle = dav1d_msac_decode_symbol_adapt8(&ts->msac, acdf, 7);
|
||||
b->y_angle = angle - 3;
|
||||
} else {
|
||||
b->y_angle = 0;
|
||||
|
@ -1038,20 +1038,20 @@ static int decode_b(Dav1dTileContext *const t,
|
|||
const int cfl_allowed = f->frame_hdr->segmentation.lossless[b->seg_id] ?
|
||||
cbw4 == 1 && cbh4 == 1 : !!(cfl_allowed_mask & (1 << bs));
|
||||
uint16_t *const uvmode_cdf = ts->cdf.m.uv_mode[cfl_allowed][b->y_mode];
|
||||
b->uv_mode = dav1d_msac_decode_symbol_adapt(&ts->msac, uvmode_cdf,
|
||||
b->uv_mode = dav1d_msac_decode_symbol_adapt16(&ts->msac, uvmode_cdf,
|
||||
N_UV_INTRA_PRED_MODES - !cfl_allowed);
|
||||
if (DEBUG_BLOCK_INFO)
|
||||
printf("Post-uvmode[%d]: r=%d\n", b->uv_mode, ts->msac.rng);
|
||||
|
||||
if (b->uv_mode == CFL_PRED) {
|
||||
#define SIGN(a) (!!(a) + ((a) > 0))
|
||||
const int sign = dav1d_msac_decode_symbol_adapt(&ts->msac,
|
||||
const int sign = dav1d_msac_decode_symbol_adapt8(&ts->msac,
|
||||
ts->cdf.m.cfl_sign, 8) + 1;
|
||||
const int sign_u = sign * 0x56 >> 8, sign_v = sign - sign_u * 3;
|
||||
assert(sign_u == sign / 3);
|
||||
if (sign_u) {
|
||||
const int ctx = (sign_u == 2) * 3 + sign_v;
|
||||
b->cfl_alpha[0] = dav1d_msac_decode_symbol_adapt(&ts->msac,
|
||||
b->cfl_alpha[0] = dav1d_msac_decode_symbol_adapt16(&ts->msac,
|
||||
ts->cdf.m.cfl_alpha[ctx], 16) + 1;
|
||||
if (sign_u == 1) b->cfl_alpha[0] = -b->cfl_alpha[0];
|
||||
} else {
|
||||
|
@ -1059,7 +1059,7 @@ static int decode_b(Dav1dTileContext *const t,
|
|||
}
|
||||
if (sign_v) {
|
||||
const int ctx = (sign_v == 2) * 3 + sign_u;
|
||||
b->cfl_alpha[1] = dav1d_msac_decode_symbol_adapt(&ts->msac,
|
||||
b->cfl_alpha[1] = dav1d_msac_decode_symbol_adapt16(&ts->msac,
|
||||
ts->cdf.m.cfl_alpha[ctx], 16) + 1;
|
||||
if (sign_v == 1) b->cfl_alpha[1] = -b->cfl_alpha[1];
|
||||
} else {
|
||||
|
@ -1073,7 +1073,7 @@ static int decode_b(Dav1dTileContext *const t,
|
|||
b->uv_mode <= VERT_LEFT_PRED)
|
||||
{
|
||||
uint16_t *const acdf = ts->cdf.m.angle_delta[b->uv_mode - VERT_PRED];
|
||||
const int angle = dav1d_msac_decode_symbol_adapt(&ts->msac, acdf, 7);
|
||||
const int angle = dav1d_msac_decode_symbol_adapt8(&ts->msac, acdf, 7);
|
||||
b->uv_angle = angle - 3;
|
||||
} else {
|
||||
b->uv_angle = 0;
|
||||
|
@ -1113,7 +1113,7 @@ static int decode_b(Dav1dTileContext *const t,
|
|||
ts->cdf.m.use_filter_intra[bs]);
|
||||
if (is_filter) {
|
||||
b->y_mode = FILTER_PRED;
|
||||
b->y_angle = dav1d_msac_decode_symbol_adapt(&ts->msac,
|
||||
b->y_angle = dav1d_msac_decode_symbol_adapt4(&ts->msac,
|
||||
ts->cdf.m.filter_intra, 5);
|
||||
}
|
||||
if (DEBUG_BLOCK_INFO)
|
||||
|
@ -1156,7 +1156,7 @@ static int decode_b(Dav1dTileContext *const t,
|
|||
if (f->frame_hdr->txfm_mode == DAV1D_TX_SWITCHABLE && t_dim->max > TX_4X4) {
|
||||
const int tctx = get_tx_ctx(t->a, &t->l, t_dim, by4, bx4);
|
||||
uint16_t *const tx_cdf = ts->cdf.m.txsz[t_dim->max - 1][tctx];
|
||||
int depth = dav1d_msac_decode_symbol_adapt(&ts->msac, tx_cdf,
|
||||
int depth = dav1d_msac_decode_symbol_adapt4(&ts->msac, tx_cdf,
|
||||
imin(t_dim->max + 1, 3));
|
||||
|
||||
while (depth--) {
|
||||
|
@ -1474,7 +1474,7 @@ static int decode_b(Dav1dTileContext *const t,
|
|||
ts->tiling.col_end, ts->tiling.row_start,
|
||||
ts->tiling.row_end, f->libaom_cm);
|
||||
|
||||
b->inter_mode = dav1d_msac_decode_symbol_adapt(&ts->msac,
|
||||
b->inter_mode = dav1d_msac_decode_symbol_adapt8(&ts->msac,
|
||||
ts->cdf.m.comp_inter_mode[ctx],
|
||||
N_COMP_INTER_PRED_MODES);
|
||||
if (DEBUG_BLOCK_INFO)
|
||||
|
@ -1583,7 +1583,7 @@ static int decode_b(Dav1dTileContext *const t,
|
|||
dav1d_msac_decode_bool_adapt(&ts->msac,
|
||||
ts->cdf.m.wedge_comp[ctx]);
|
||||
if (b->comp_type == COMP_INTER_WEDGE)
|
||||
b->wedge_idx = dav1d_msac_decode_symbol_adapt(&ts->msac,
|
||||
b->wedge_idx = dav1d_msac_decode_symbol_adapt16(&ts->msac,
|
||||
ts->cdf.m.wedge_idx[ctx], 16);
|
||||
} else {
|
||||
b->comp_type = COMP_INTER_SEG;
|
||||
|
@ -1737,7 +1737,7 @@ static int decode_b(Dav1dTileContext *const t,
|
|||
dav1d_msac_decode_bool_adapt(&ts->msac,
|
||||
ts->cdf.m.interintra[ii_sz_grp]))
|
||||
{
|
||||
b->interintra_mode = dav1d_msac_decode_symbol_adapt(&ts->msac,
|
||||
b->interintra_mode = dav1d_msac_decode_symbol_adapt4(&ts->msac,
|
||||
ts->cdf.m.interintra_mode[ii_sz_grp],
|
||||
N_INTER_INTRA_PRED_MODES);
|
||||
const int wedge_ctx = dav1d_wedge_ctx_lut[bs];
|
||||
|
@ -1745,7 +1745,7 @@ static int decode_b(Dav1dTileContext *const t,
|
|||
dav1d_msac_decode_bool_adapt(&ts->msac,
|
||||
ts->cdf.m.interintra_wedge[wedge_ctx]);
|
||||
if (b->interintra_type == INTER_INTRA_WEDGE)
|
||||
b->wedge_idx = dav1d_msac_decode_symbol_adapt(&ts->msac,
|
||||
b->wedge_idx = dav1d_msac_decode_symbol_adapt16(&ts->msac,
|
||||
ts->cdf.m.wedge_idx[wedge_ctx], 16);
|
||||
} else {
|
||||
b->interintra_type = INTER_INTRA_NONE;
|
||||
|
@ -1778,7 +1778,7 @@ static int decode_b(Dav1dTileContext *const t,
|
|||
f->frame_hdr->warp_motion && (mask[0] | mask[1]);
|
||||
|
||||
b->motion_mode = allow_warp ?
|
||||
dav1d_msac_decode_symbol_adapt(&ts->msac,
|
||||
dav1d_msac_decode_symbol_adapt4(&ts->msac,
|
||||
ts->cdf.m.motion_mode[bs], 3) :
|
||||
dav1d_msac_decode_bool_adapt(&ts->msac, ts->cdf.m.obmc[bs]);
|
||||
if (b->motion_mode == MM_WARP) {
|
||||
|
@ -1817,7 +1817,7 @@ static int decode_b(Dav1dTileContext *const t,
|
|||
const int comp = b->comp_type != COMP_INTER_NONE;
|
||||
const int ctx1 = get_filter_ctx(t->a, &t->l, comp, 0, b->ref[0],
|
||||
by4, bx4);
|
||||
filter[0] = dav1d_msac_decode_symbol_adapt(&ts->msac,
|
||||
filter[0] = dav1d_msac_decode_symbol_adapt4(&ts->msac,
|
||||
ts->cdf.m.filter[0][ctx1],
|
||||
DAV1D_N_SWITCHABLE_FILTERS);
|
||||
if (f->seq_hdr->dual_filter) {
|
||||
|
@ -1826,7 +1826,7 @@ static int decode_b(Dav1dTileContext *const t,
|
|||
if (DEBUG_BLOCK_INFO)
|
||||
printf("Post-subpel_filter1[%d,ctx=%d]: r=%d\n",
|
||||
filter[0], ctx1, ts->msac.rng);
|
||||
filter[1] = dav1d_msac_decode_symbol_adapt(&ts->msac,
|
||||
filter[1] = dav1d_msac_decode_symbol_adapt4(&ts->msac,
|
||||
ts->cdf.m.filter[1][ctx2],
|
||||
DAV1D_N_SWITCHABLE_FILTERS);
|
||||
if (DEBUG_BLOCK_INFO)
|
||||
|
@ -2021,7 +2021,7 @@ static int decode_sb(Dav1dTileContext *const t, const enum BlockLevel bl,
|
|||
} else {
|
||||
const unsigned n_part = bl == BL_8X8 ? N_SUB8X8_PARTITIONS :
|
||||
bl == BL_128X128 ? N_PARTITIONS - 2 : N_PARTITIONS;
|
||||
bp = dav1d_msac_decode_symbol_adapt(&t->ts->msac, pc, n_part);
|
||||
bp = dav1d_msac_decode_symbol_adapt16(&t->ts->msac, pc, n_part);
|
||||
if (f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I422 &&
|
||||
(bp == PARTITION_V || bp == PARTITION_V4 ||
|
||||
bp == PARTITION_T_LEFT_SPLIT || bp == PARTITION_T_RIGHT_SPLIT))
|
||||
|
@ -2365,7 +2365,7 @@ static void read_restoration_info(Dav1dTileContext *const t,
|
|||
Dav1dTileState *const ts = t->ts;
|
||||
|
||||
if (frame_type == DAV1D_RESTORATION_SWITCHABLE) {
|
||||
const int filter = dav1d_msac_decode_symbol_adapt(&ts->msac,
|
||||
const int filter = dav1d_msac_decode_symbol_adapt4(&ts->msac,
|
||||
ts->cdf.m.restore_switchable, 3);
|
||||
lr->type = filter ? filter == 2 ? DAV1D_RESTORATION_SGRPROJ :
|
||||
DAV1D_RESTORATION_WIENER :
|
||||
|
@ -2692,7 +2692,9 @@ int dav1d_decode_frame(Dav1dFrameContext *const f) {
|
|||
freep(&f->lf.level);
|
||||
freep(&f->frame_thread.b);
|
||||
f->lf.mask = malloc(f->sb128w * f->sb128h * sizeof(*f->lf.mask));
|
||||
f->lf.level = malloc(f->sb128w * f->sb128h * 32 * 32 *
|
||||
// over-allocate by 3 bytes since some of the SIMD implementations
|
||||
// index this from the level type and can thus over-read by up to 3
|
||||
f->lf.level = malloc(3 + f->sb128w * f->sb128h * 32 * 32 *
|
||||
sizeof(*f->lf.level));
|
||||
if (!f->lf.mask || !f->lf.level) goto error;
|
||||
if (c->n_fc > 1) {
|
||||
|
|
|
@ -45,7 +45,7 @@ typedef void (*itx_1d_fn)(const coef *in, ptrdiff_t in_s,
|
|||
static void NOINLINE
|
||||
inv_txfm_add_c(pixel *dst, const ptrdiff_t stride,
|
||||
coef *const coeff, const int eob,
|
||||
const int w, const int h, const int shift1, const int shift2,
|
||||
const int w, const int h, const int shift,
|
||||
const itx_1d_fn first_1d_fn, const itx_1d_fn second_1d_fn,
|
||||
const int has_dconly HIGHBD_DECL_SUFFIX)
|
||||
{
|
||||
|
@ -53,8 +53,7 @@ inv_txfm_add_c(pixel *dst, const ptrdiff_t stride,
|
|||
assert((h >= 4 || h <= 64) && (w >= 4 || w <= 64));
|
||||
const int is_rect2 = w * 2 == h || h * 2 == w;
|
||||
const int bitdepth = bitdepth_from_max(bitdepth_max);
|
||||
const int rnd1 = (1 << shift1) >> 1;
|
||||
const int rnd2 = (1 << shift2) >> 1;
|
||||
const int rnd = (1 << shift) >> 1;
|
||||
|
||||
if (has_dconly && eob == 0) {
|
||||
int dc = coeff[0];
|
||||
|
@ -62,9 +61,9 @@ inv_txfm_add_c(pixel *dst, const ptrdiff_t stride,
|
|||
if (is_rect2)
|
||||
dc = (dc * 2896 + 2048) >> 12;
|
||||
dc = (dc * 2896 + 2048) >> 12;
|
||||
dc = (dc + rnd1) >> shift1;
|
||||
dc = (dc + rnd) >> shift;
|
||||
dc = (dc * 2896 + 2048) >> 12;
|
||||
dc = (dc + rnd2) >> shift2;
|
||||
dc = (dc + 8) >> 4;
|
||||
for (j = 0; j < h; j++)
|
||||
for (i = 0; i < w; i++)
|
||||
dst[i + j * PXSTRIDE(stride)] =
|
||||
|
@ -93,9 +92,9 @@ inv_txfm_add_c(pixel *dst, const ptrdiff_t stride,
|
|||
}
|
||||
for (j = 0; j < w; j++)
|
||||
#if BITDEPTH == 8
|
||||
tmp[i * w + j] = (tmp[i * w + j] + (rnd1)) >> shift1;
|
||||
tmp[i * w + j] = (tmp[i * w + j] + rnd) >> shift;
|
||||
#else
|
||||
tmp[i * w + j] = iclip((tmp[i * w + j] + (rnd1)) >> shift1,
|
||||
tmp[i * w + j] = iclip((tmp[i * w + j] + rnd) >> shift,
|
||||
-col_clip_max - 1, col_clip_max);
|
||||
#endif
|
||||
}
|
||||
|
@ -106,12 +105,12 @@ inv_txfm_add_c(pixel *dst, const ptrdiff_t stride,
|
|||
for (j = 0; j < h; j++)
|
||||
dst[i + j * PXSTRIDE(stride)] =
|
||||
iclip_pixel(dst[i + j * PXSTRIDE(stride)] +
|
||||
((out[j] + (rnd2)) >> shift2));
|
||||
((out[j] + 8) >> 4));
|
||||
}
|
||||
memset(coeff, 0, sizeof(*coeff) * sh * sw);
|
||||
}
|
||||
|
||||
#define inv_txfm_fn(type1, type2, w, h, shift1, shift2, has_dconly) \
|
||||
#define inv_txfm_fn(type1, type2, w, h, shift, has_dconly) \
|
||||
static void \
|
||||
inv_txfm_add_##type1##_##type2##_##w##x##h##_c(pixel *dst, \
|
||||
const ptrdiff_t stride, \
|
||||
|
@ -119,57 +118,57 @@ inv_txfm_add_##type1##_##type2##_##w##x##h##_c(pixel *dst, \
|
|||
const int eob \
|
||||
HIGHBD_DECL_SUFFIX) \
|
||||
{ \
|
||||
inv_txfm_add_c(dst, stride, coeff, eob, w, h, shift1, shift2, \
|
||||
inv_txfm_add_c(dst, stride, coeff, eob, w, h, shift, \
|
||||
inv_##type1##w##_1d, inv_##type2##h##_1d, has_dconly \
|
||||
HIGHBD_TAIL_SUFFIX); \
|
||||
}
|
||||
|
||||
#define inv_txfm_fn64(w, h, shift1, shift2) \
|
||||
inv_txfm_fn(dct, dct, w, h, shift1, shift2, 1)
|
||||
#define inv_txfm_fn64(w, h, shift) \
|
||||
inv_txfm_fn(dct, dct, w, h, shift, 1)
|
||||
|
||||
#define inv_txfm_fn32(w, h, shift1, shift2) \
|
||||
inv_txfm_fn64(w, h, shift1, shift2) \
|
||||
inv_txfm_fn(identity, identity, w, h, shift1, shift2, 0)
|
||||
#define inv_txfm_fn32(w, h, shift) \
|
||||
inv_txfm_fn64(w, h, shift) \
|
||||
inv_txfm_fn(identity, identity, w, h, shift, 0)
|
||||
|
||||
#define inv_txfm_fn16(w, h, shift1, shift2) \
|
||||
inv_txfm_fn32(w, h, shift1, shift2) \
|
||||
inv_txfm_fn(adst, dct, w, h, shift1, shift2, 0) \
|
||||
inv_txfm_fn(dct, adst, w, h, shift1, shift2, 0) \
|
||||
inv_txfm_fn(adst, adst, w, h, shift1, shift2, 0) \
|
||||
inv_txfm_fn(dct, flipadst, w, h, shift1, shift2, 0) \
|
||||
inv_txfm_fn(flipadst, dct, w, h, shift1, shift2, 0) \
|
||||
inv_txfm_fn(adst, flipadst, w, h, shift1, shift2, 0) \
|
||||
inv_txfm_fn(flipadst, adst, w, h, shift1, shift2, 0) \
|
||||
inv_txfm_fn(flipadst, flipadst, w, h, shift1, shift2, 0) \
|
||||
inv_txfm_fn(identity, dct, w, h, shift1, shift2, 0) \
|
||||
inv_txfm_fn(dct, identity, w, h, shift1, shift2, 0) \
|
||||
#define inv_txfm_fn16(w, h, shift) \
|
||||
inv_txfm_fn32(w, h, shift) \
|
||||
inv_txfm_fn(adst, dct, w, h, shift, 0) \
|
||||
inv_txfm_fn(dct, adst, w, h, shift, 0) \
|
||||
inv_txfm_fn(adst, adst, w, h, shift, 0) \
|
||||
inv_txfm_fn(dct, flipadst, w, h, shift, 0) \
|
||||
inv_txfm_fn(flipadst, dct, w, h, shift, 0) \
|
||||
inv_txfm_fn(adst, flipadst, w, h, shift, 0) \
|
||||
inv_txfm_fn(flipadst, adst, w, h, shift, 0) \
|
||||
inv_txfm_fn(flipadst, flipadst, w, h, shift, 0) \
|
||||
inv_txfm_fn(identity, dct, w, h, shift, 0) \
|
||||
inv_txfm_fn(dct, identity, w, h, shift, 0) \
|
||||
|
||||
#define inv_txfm_fn84(w, h, shift1, shift2) \
|
||||
inv_txfm_fn16(w, h, shift1, shift2) \
|
||||
inv_txfm_fn(identity, flipadst, w, h, shift1, shift2, 0) \
|
||||
inv_txfm_fn(flipadst, identity, w, h, shift1, shift2, 0) \
|
||||
inv_txfm_fn(identity, adst, w, h, shift1, shift2, 0) \
|
||||
inv_txfm_fn(adst, identity, w, h, shift1, shift2, 0) \
|
||||
#define inv_txfm_fn84(w, h, shift) \
|
||||
inv_txfm_fn16(w, h, shift) \
|
||||
inv_txfm_fn(identity, flipadst, w, h, shift, 0) \
|
||||
inv_txfm_fn(flipadst, identity, w, h, shift, 0) \
|
||||
inv_txfm_fn(identity, adst, w, h, shift, 0) \
|
||||
inv_txfm_fn(adst, identity, w, h, shift, 0) \
|
||||
|
||||
inv_txfm_fn84( 4, 4, 0, 4)
|
||||
inv_txfm_fn84( 4, 8, 0, 4)
|
||||
inv_txfm_fn84( 4, 16, 1, 4)
|
||||
inv_txfm_fn84( 8, 4, 0, 4)
|
||||
inv_txfm_fn84( 8, 8, 1, 4)
|
||||
inv_txfm_fn84( 8, 16, 1, 4)
|
||||
inv_txfm_fn32( 8, 32, 2, 4)
|
||||
inv_txfm_fn84(16, 4, 1, 4)
|
||||
inv_txfm_fn84(16, 8, 1, 4)
|
||||
inv_txfm_fn16(16, 16, 2, 4)
|
||||
inv_txfm_fn32(16, 32, 1, 4)
|
||||
inv_txfm_fn64(16, 64, 2, 4)
|
||||
inv_txfm_fn32(32, 8, 2, 4)
|
||||
inv_txfm_fn32(32, 16, 1, 4)
|
||||
inv_txfm_fn32(32, 32, 2, 4)
|
||||
inv_txfm_fn64(32, 64, 1, 4)
|
||||
inv_txfm_fn64(64, 16, 2, 4)
|
||||
inv_txfm_fn64(64, 32, 1, 4)
|
||||
inv_txfm_fn64(64, 64, 2, 4)
|
||||
inv_txfm_fn84( 4, 4, 0)
|
||||
inv_txfm_fn84( 4, 8, 0)
|
||||
inv_txfm_fn84( 4, 16, 1)
|
||||
inv_txfm_fn84( 8, 4, 0)
|
||||
inv_txfm_fn84( 8, 8, 1)
|
||||
inv_txfm_fn84( 8, 16, 1)
|
||||
inv_txfm_fn32( 8, 32, 2)
|
||||
inv_txfm_fn84(16, 4, 1)
|
||||
inv_txfm_fn84(16, 8, 1)
|
||||
inv_txfm_fn16(16, 16, 2)
|
||||
inv_txfm_fn32(16, 32, 1)
|
||||
inv_txfm_fn64(16, 64, 2)
|
||||
inv_txfm_fn32(32, 8, 2)
|
||||
inv_txfm_fn32(32, 16, 1)
|
||||
inv_txfm_fn32(32, 32, 2)
|
||||
inv_txfm_fn64(32, 64, 1)
|
||||
inv_txfm_fn64(64, 16, 2)
|
||||
inv_txfm_fn64(64, 32, 1)
|
||||
inv_txfm_fn64(64, 64, 2)
|
||||
|
||||
static void inv_txfm_add_wht_wht_4x4_c(pixel *dst, const ptrdiff_t stride,
|
||||
coef *const coeff, const int eob
|
||||
|
|
|
@ -53,6 +53,7 @@ typedef struct Dav1dLoopFilterDSPContext {
|
|||
} Dav1dLoopFilterDSPContext;
|
||||
|
||||
bitfn_decls(void dav1d_loop_filter_dsp_init, Dav1dLoopFilterDSPContext *c);
|
||||
bitfn_decls(void dav1d_loop_filter_dsp_init_arm, Dav1dLoopFilterDSPContext *c);
|
||||
bitfn_decls(void dav1d_loop_filter_dsp_init_x86, Dav1dLoopFilterDSPContext *c);
|
||||
|
||||
#endif /* DAV1D_SRC_LOOPFILTER_H */
|
||||
|
|
|
@ -250,7 +250,11 @@ void bitfn(dav1d_loop_filter_dsp_init)(Dav1dLoopFilterDSPContext *const c) {
|
|||
c->loop_filter_sb[1][0] = loop_filter_h_sb128uv_c;
|
||||
c->loop_filter_sb[1][1] = loop_filter_v_sb128uv_c;
|
||||
|
||||
#if HAVE_ASM && ARCH_X86
|
||||
#if HAVE_ASM
|
||||
#if ARCH_AARCH64 || ARCH_ARM
|
||||
bitfn(dav1d_loop_filter_dsp_init_arm)(c);
|
||||
#elif ARCH_X86
|
||||
bitfn(dav1d_loop_filter_dsp_init_x86)(c);
|
||||
#endif
|
||||
#endif
|
||||
}
|
||||
|
|
|
@ -86,12 +86,14 @@ if is_asm_enabled
|
|||
)
|
||||
libdav1d_tmpl_sources += files(
|
||||
'arm/cdef_init_tmpl.c',
|
||||
'arm/loopfilter_init_tmpl.c',
|
||||
'arm/looprestoration_init_tmpl.c',
|
||||
'arm/mc_init_tmpl.c',
|
||||
)
|
||||
if host_machine.cpu_family() == 'aarch64'
|
||||
libdav1d_sources += files(
|
||||
'arm/64/cdef.S',
|
||||
'arm/64/loopfilter.S',
|
||||
'arm/64/looprestoration.S',
|
||||
'arm/64/mc.S',
|
||||
)
|
||||
|
@ -118,19 +120,30 @@ if is_asm_enabled
|
|||
|
||||
# NASM source files
|
||||
libdav1d_sources_asm = files(
|
||||
'x86/cdef.asm',
|
||||
'x86/cdef_ssse3.asm',
|
||||
'x86/cpuid.asm',
|
||||
'x86/msac.asm',
|
||||
)
|
||||
|
||||
if dav1d_bitdepths.contains('8')
|
||||
libdav1d_sources_asm += files(
|
||||
'x86/cdef.asm',
|
||||
'x86/cdef_sse.asm',
|
||||
'x86/ipred.asm',
|
||||
'x86/ipred_ssse3.asm',
|
||||
'x86/itx.asm',
|
||||
'x86/itx_ssse3.asm',
|
||||
'x86/loopfilter.asm',
|
||||
'x86/looprestoration.asm',
|
||||
'x86/looprestoration_ssse3.asm',
|
||||
'x86/mc.asm',
|
||||
'x86/mc_ssse3.asm',
|
||||
'x86/itx_ssse3.asm',
|
||||
'x86/ipred_ssse3.asm',
|
||||
)
|
||||
endif
|
||||
|
||||
if dav1d_bitdepths.contains('16')
|
||||
libdav1d_sources_asm += files(
|
||||
)
|
||||
endif
|
||||
|
||||
# Compile the ASM sources with NASM
|
||||
libdav1d_nasm_objs = nasm_gen.process(libdav1d_sources_asm)
|
||||
|
@ -139,8 +152,10 @@ endif
|
|||
|
||||
|
||||
|
||||
api_export_flags = []
|
||||
|
||||
#
|
||||
# Windows .rc file
|
||||
# Windows .rc file and API export flags
|
||||
#
|
||||
|
||||
if host_machine.system() == 'windows' and get_option('default_library') != 'static'
|
||||
|
@ -162,6 +177,8 @@ if host_machine.system() == 'windows' and get_option('default_library') != 'stat
|
|||
)
|
||||
|
||||
libdav1d_rc_obj = winmod.compile_resources(rc_file)
|
||||
|
||||
api_export_flags = ['-DDAV1D_BUILDING_DLL']
|
||||
else
|
||||
libdav1d_rc_obj = []
|
||||
endif
|
||||
|
@ -180,7 +197,7 @@ libdav1d_entrypoints_objs = static_library('dav1d_entrypoint',
|
|||
|
||||
include_directories : dav1d_inc_dirs,
|
||||
dependencies: [stdatomic_dependency],
|
||||
c_args : [stackalign_flag, stackrealign_flag],
|
||||
c_args : [stackalign_flag, stackrealign_flag, api_export_flags],
|
||||
install : false,
|
||||
build_by_default : false,
|
||||
).extract_all_objects()
|
||||
|
@ -222,7 +239,7 @@ libdav1d = library('dav1d',
|
|||
thread_dependency,
|
||||
thread_compat_dep,
|
||||
],
|
||||
c_args : [stackalign_flag],
|
||||
c_args : [stackalign_flag, api_export_flags],
|
||||
version : dav1d_soname_version,
|
||||
soversion : dav1d_soversion,
|
||||
install : true,
|
||||
|
|
|
@ -58,8 +58,8 @@ static inline void ctx_refill(MsacContext *s) {
|
|||
* necessary), and stores them back in the decoder context.
|
||||
* dif: The new value of dif.
|
||||
* rng: The new value of the range. */
|
||||
static inline void ctx_norm(MsacContext *s, ec_win dif, uint32_t rng) {
|
||||
const uint16_t d = 15 - (31 ^ clz(rng));
|
||||
static inline void ctx_norm(MsacContext *s, ec_win dif, unsigned rng) {
|
||||
const int d = 15 ^ (31 ^ clz(rng));
|
||||
assert(rng <= 65535U);
|
||||
s->cnt -= d;
|
||||
s->dif = ((dif + 1) << d) - 1; /* Shift in 1s in the LSBs */
|
||||
|
@ -69,18 +69,17 @@ static inline void ctx_norm(MsacContext *s, ec_win dif, uint32_t rng) {
|
|||
}
|
||||
|
||||
unsigned dav1d_msac_decode_bool_equi(MsacContext *const s) {
|
||||
ec_win v, vw, dif = s->dif;
|
||||
uint16_t r = s->rng;
|
||||
unsigned ret;
|
||||
ec_win vw, dif = s->dif;
|
||||
unsigned ret, v, r = s->rng;
|
||||
assert((dif >> (EC_WIN_SIZE - 16)) < r);
|
||||
// When the probability is 1/2, f = 16384 >> EC_PROB_SHIFT = 256 and we can
|
||||
// replace the multiply with a simple shift.
|
||||
v = ((r >> 8) << 7) + EC_MIN_PROB;
|
||||
vw = v << (EC_WIN_SIZE - 16);
|
||||
vw = (ec_win)v << (EC_WIN_SIZE - 16);
|
||||
ret = dif >= vw;
|
||||
dif -= ret*vw;
|
||||
v += ret*(r - 2*v);
|
||||
ctx_norm(s, dif, (unsigned) v);
|
||||
ctx_norm(s, dif, v);
|
||||
return !ret;
|
||||
}
|
||||
|
||||
|
@ -88,59 +87,57 @@ unsigned dav1d_msac_decode_bool_equi(MsacContext *const s) {
|
|||
* f: The probability that the bit is one
|
||||
* Return: The value decoded (0 or 1). */
|
||||
unsigned dav1d_msac_decode_bool(MsacContext *const s, const unsigned f) {
|
||||
ec_win v, vw, dif = s->dif;
|
||||
uint16_t r = s->rng;
|
||||
unsigned ret;
|
||||
ec_win vw, dif = s->dif;
|
||||
unsigned ret, v, r = s->rng;
|
||||
assert((dif >> (EC_WIN_SIZE - 16)) < r);
|
||||
v = ((r >> 8) * (f >> EC_PROB_SHIFT) >> (7 - EC_PROB_SHIFT)) + EC_MIN_PROB;
|
||||
vw = v << (EC_WIN_SIZE - 16);
|
||||
vw = (ec_win)v << (EC_WIN_SIZE - 16);
|
||||
ret = dif >= vw;
|
||||
dif -= ret*vw;
|
||||
v += ret*(r - 2*v);
|
||||
ctx_norm(s, dif, (unsigned) v);
|
||||
ctx_norm(s, dif, v);
|
||||
return !ret;
|
||||
}
|
||||
|
||||
unsigned dav1d_msac_decode_bools(MsacContext *const c, const unsigned l) {
|
||||
int v = 0;
|
||||
for (int n = (int) l - 1; n >= 0; n--)
|
||||
v = (v << 1) | dav1d_msac_decode_bool_equi(c);
|
||||
unsigned dav1d_msac_decode_bools(MsacContext *const s, unsigned n) {
|
||||
unsigned v = 0;
|
||||
while (n--)
|
||||
v = (v << 1) | dav1d_msac_decode_bool_equi(s);
|
||||
return v;
|
||||
}
|
||||
|
||||
int dav1d_msac_decode_subexp(MsacContext *const c, const int ref,
|
||||
int dav1d_msac_decode_subexp(MsacContext *const s, const int ref,
|
||||
const int n, const unsigned k)
|
||||
{
|
||||
int i = 0;
|
||||
int a = 0;
|
||||
int b = k;
|
||||
while ((2 << b) < n) {
|
||||
if (!dav1d_msac_decode_bool_equi(c)) break;
|
||||
if (!dav1d_msac_decode_bool_equi(s)) break;
|
||||
b = k + i++;
|
||||
a = (1 << b);
|
||||
}
|
||||
const unsigned v = dav1d_msac_decode_bools(c, b) + a;
|
||||
const unsigned v = dav1d_msac_decode_bools(s, b) + a;
|
||||
return ref * 2 <= n ? inv_recenter(ref, v) :
|
||||
n - 1 - inv_recenter(n - 1 - ref, v);
|
||||
}
|
||||
|
||||
int dav1d_msac_decode_uniform(MsacContext *const c, const unsigned n) {
|
||||
int dav1d_msac_decode_uniform(MsacContext *const s, const unsigned n) {
|
||||
assert(n > 0);
|
||||
const int l = ulog2(n) + 1;
|
||||
assert(l > 1);
|
||||
const unsigned m = (1 << l) - n;
|
||||
const unsigned v = dav1d_msac_decode_bools(c, l - 1);
|
||||
return v < m ? v : (v << 1) - m + dav1d_msac_decode_bool_equi(c);
|
||||
const unsigned v = dav1d_msac_decode_bools(s, l - 1);
|
||||
return v < m ? v : (v << 1) - m + dav1d_msac_decode_bool_equi(s);
|
||||
}
|
||||
|
||||
/* Decodes a symbol given an inverse cumulative distribution function (CDF)
|
||||
* table in Q15. */
|
||||
static unsigned decode_symbol(MsacContext *const s, const uint16_t *const cdf,
|
||||
const unsigned n_symbols)
|
||||
const size_t n_symbols)
|
||||
{
|
||||
ec_win u, v = s->rng, r = s->rng >> 8;
|
||||
const ec_win c = s->dif >> (EC_WIN_SIZE - 16);
|
||||
unsigned ret = 0;
|
||||
const unsigned c = s->dif >> (EC_WIN_SIZE - 16);
|
||||
unsigned u, v = s->rng, r = s->rng >> 8, ret = 0;
|
||||
|
||||
assert(!cdf[n_symbols - 1]);
|
||||
|
||||
|
@ -148,18 +145,21 @@ static unsigned decode_symbol(MsacContext *const s, const uint16_t *const cdf,
|
|||
u = v;
|
||||
v = r * (cdf[ret++] >> EC_PROB_SHIFT);
|
||||
v >>= 7 - EC_PROB_SHIFT;
|
||||
v += EC_MIN_PROB * (n_symbols - ret);
|
||||
v += EC_MIN_PROB * (int) (n_symbols - ret);
|
||||
} while (c < v);
|
||||
|
||||
assert(u <= s->rng);
|
||||
|
||||
ctx_norm(s, s->dif - (v << (EC_WIN_SIZE - 16)), (unsigned) (u - v));
|
||||
ctx_norm(s, s->dif - ((ec_win)v << (EC_WIN_SIZE - 16)), u - v);
|
||||
return ret - 1;
|
||||
}
|
||||
|
||||
static void update_cdf(uint16_t *const cdf, const unsigned val,
|
||||
const unsigned n_symbols)
|
||||
unsigned dav1d_msac_decode_symbol_adapt_c(MsacContext *const s,
|
||||
uint16_t *const cdf,
|
||||
const size_t n_symbols)
|
||||
{
|
||||
const unsigned val = decode_symbol(s, cdf, n_symbols);
|
||||
if (s->allow_update_cdf) {
|
||||
const unsigned count = cdf[n_symbols];
|
||||
const int rate = ((count >> 4) | 4) + (n_symbols > 3);
|
||||
unsigned i;
|
||||
|
@ -168,24 +168,16 @@ static void update_cdf(uint16_t *const cdf, const unsigned val,
|
|||
for (; i < n_symbols - 1; i++)
|
||||
cdf[i] -= cdf[i] >> rate;
|
||||
cdf[n_symbols] = count + (count < 32);
|
||||
}
|
||||
|
||||
unsigned dav1d_msac_decode_symbol_adapt(MsacContext *const c,
|
||||
uint16_t *const cdf,
|
||||
const unsigned n_symbols)
|
||||
{
|
||||
const unsigned val = decode_symbol(c, cdf, n_symbols);
|
||||
if(c->allow_update_cdf)
|
||||
update_cdf(cdf, val, n_symbols);
|
||||
}
|
||||
return val;
|
||||
}
|
||||
|
||||
unsigned dav1d_msac_decode_bool_adapt(MsacContext *const c,
|
||||
unsigned dav1d_msac_decode_bool_adapt(MsacContext *const s,
|
||||
uint16_t *const cdf)
|
||||
{
|
||||
const unsigned bit = dav1d_msac_decode_bool(c, *cdf);
|
||||
const unsigned bit = dav1d_msac_decode_bool(s, *cdf);
|
||||
|
||||
if(c->allow_update_cdf){
|
||||
if (s->allow_update_cdf) {
|
||||
// update_cdf() specialized for boolean CDFs
|
||||
const unsigned count = cdf[1];
|
||||
const int rate = (count >> 4) | 4;
|
||||
|
|
|
@ -38,20 +38,37 @@ typedef struct MsacContext {
|
|||
const uint8_t *buf_pos;
|
||||
const uint8_t *buf_end;
|
||||
ec_win dif;
|
||||
uint16_t rng;
|
||||
unsigned rng;
|
||||
int cnt;
|
||||
int allow_update_cdf;
|
||||
} MsacContext;
|
||||
|
||||
void dav1d_msac_init(MsacContext *c, const uint8_t *data, size_t sz,
|
||||
void dav1d_msac_init(MsacContext *s, const uint8_t *data, size_t sz,
|
||||
int disable_cdf_update_flag);
|
||||
unsigned dav1d_msac_decode_symbol_adapt(MsacContext *s, uint16_t *cdf,
|
||||
const unsigned n_symbols);
|
||||
unsigned dav1d_msac_decode_bool_equi(MsacContext *const s);
|
||||
unsigned dav1d_msac_decode_symbol_adapt_c(MsacContext *s, uint16_t *cdf,
|
||||
size_t n_symbols);
|
||||
unsigned dav1d_msac_decode_bool_equi(MsacContext *s);
|
||||
unsigned dav1d_msac_decode_bool(MsacContext *s, unsigned f);
|
||||
unsigned dav1d_msac_decode_bool_adapt(MsacContext *s, uint16_t *cdf);
|
||||
unsigned dav1d_msac_decode_bools(MsacContext *c, unsigned l);
|
||||
int dav1d_msac_decode_subexp(MsacContext *c, int ref, int n, unsigned k);
|
||||
int dav1d_msac_decode_uniform(MsacContext *c, unsigned n);
|
||||
unsigned dav1d_msac_decode_bools(MsacContext *s, unsigned n);
|
||||
int dav1d_msac_decode_subexp(MsacContext *s, int ref, int n, unsigned k);
|
||||
int dav1d_msac_decode_uniform(MsacContext *s, unsigned n);
|
||||
|
||||
/* Supported n_symbols ranges: adapt4: 1-5, adapt8: 1-8, adapt16: 4-16 */
|
||||
#if ARCH_X86_64 && HAVE_ASM
|
||||
unsigned dav1d_msac_decode_symbol_adapt4_sse2(MsacContext *s, uint16_t *cdf,
|
||||
size_t n_symbols);
|
||||
unsigned dav1d_msac_decode_symbol_adapt8_sse2(MsacContext *s, uint16_t *cdf,
|
||||
size_t n_symbols);
|
||||
unsigned dav1d_msac_decode_symbol_adapt16_sse2(MsacContext *s, uint16_t *cdf,
|
||||
size_t n_symbols);
|
||||
#define dav1d_msac_decode_symbol_adapt4 dav1d_msac_decode_symbol_adapt4_sse2
|
||||
#define dav1d_msac_decode_symbol_adapt8 dav1d_msac_decode_symbol_adapt8_sse2
|
||||
#define dav1d_msac_decode_symbol_adapt16 dav1d_msac_decode_symbol_adapt16_sse2
|
||||
#else
|
||||
#define dav1d_msac_decode_symbol_adapt4 dav1d_msac_decode_symbol_adapt_c
|
||||
#define dav1d_msac_decode_symbol_adapt8 dav1d_msac_decode_symbol_adapt_c
|
||||
#define dav1d_msac_decode_symbol_adapt16 dav1d_msac_decode_symbol_adapt_c
|
||||
#endif
|
||||
|
||||
#endif /* DAV1D_SRC_MSAC_H */
|
||||
|
|
|
@ -107,7 +107,9 @@ static int decode_coefs(Dav1dTileContext *const t,
|
|||
uint16_t *const txtp_cdf = intra ?
|
||||
ts->cdf.m.txtp_intra[set_idx][t_dim->min][y_mode_nofilt] :
|
||||
ts->cdf.m.txtp_inter[set_idx][t_dim->min];
|
||||
idx = dav1d_msac_decode_symbol_adapt(&ts->msac, txtp_cdf, set_cnt);
|
||||
idx = (set_cnt <= 8 ? dav1d_msac_decode_symbol_adapt8 :
|
||||
dav1d_msac_decode_symbol_adapt16)(&ts->msac, txtp_cdf, set_cnt);
|
||||
|
||||
if (dbg)
|
||||
printf("Post-txtp[%d->%d][%d->%d][%d][%d->%d]: r=%d\n",
|
||||
set, set_idx, tx, t_dim->min, intra ? (int)y_mode_nofilt : -1,
|
||||
|
@ -122,19 +124,19 @@ static int decode_coefs(Dav1dTileContext *const t,
|
|||
const enum TxClass tx_class = dav1d_tx_type_class[*txtp];
|
||||
const int is_1d = tx_class != TX_CLASS_2D;
|
||||
switch (tx2dszctx) {
|
||||
#define case_sz(sz, bin) \
|
||||
#define case_sz(sz, bin, ns) \
|
||||
case sz: { \
|
||||
uint16_t *const eob_bin_cdf = ts->cdf.coef.eob_bin_##bin[chroma][is_1d]; \
|
||||
eob_bin = dav1d_msac_decode_symbol_adapt(&ts->msac, eob_bin_cdf, 5 + sz); \
|
||||
eob_bin = dav1d_msac_decode_symbol_adapt##ns(&ts->msac, eob_bin_cdf, 5 + sz); \
|
||||
break; \
|
||||
}
|
||||
case_sz(0, 16);
|
||||
case_sz(1, 32);
|
||||
case_sz(2, 64);
|
||||
case_sz(3, 128);
|
||||
case_sz(4, 256);
|
||||
case_sz(5, 512);
|
||||
case_sz(6, 1024);
|
||||
case_sz(0, 16, 4);
|
||||
case_sz(1, 32, 8);
|
||||
case_sz(2, 64, 8);
|
||||
case_sz(3, 128, 8);
|
||||
case_sz(4, 256, 16);
|
||||
case_sz(5, 512, 16);
|
||||
case_sz(6, 1024, 16);
|
||||
#undef case_sz
|
||||
}
|
||||
if (dbg)
|
||||
|
@ -179,7 +181,7 @@ static int decode_coefs(Dav1dTileContext *const t,
|
|||
uint16_t *const lo_cdf = is_last ?
|
||||
ts->cdf.coef.eob_base_tok[t_dim->ctx][chroma][ctx] :
|
||||
ts->cdf.coef.base_tok[t_dim->ctx][chroma][ctx];
|
||||
int tok = dav1d_msac_decode_symbol_adapt(&ts->msac, lo_cdf,
|
||||
int tok = dav1d_msac_decode_symbol_adapt4(&ts->msac, lo_cdf,
|
||||
4 - is_last) + is_last;
|
||||
if (dbg)
|
||||
printf("Post-lo_tok[%d][%d][%d][%d=%d=%d]: r=%d\n",
|
||||
|
@ -190,7 +192,7 @@ static int decode_coefs(Dav1dTileContext *const t,
|
|||
if (tok == 3) {
|
||||
const int br_ctx = get_br_ctx(levels, rc, tx, tx_class);
|
||||
do {
|
||||
const int tok_br = dav1d_msac_decode_symbol_adapt(&ts->msac,
|
||||
const int tok_br = dav1d_msac_decode_symbol_adapt4(&ts->msac,
|
||||
br_cdf[br_ctx], 4);
|
||||
if (dbg)
|
||||
printf("Post-hi_tok[%d][%d][%d][%d=%d=%d->%d]: r=%d\n",
|
||||
|
|
|
@ -113,7 +113,7 @@ SECTION .text
|
|||
paddw m15, m5
|
||||
%endmacro
|
||||
|
||||
%macro cdef_filter_fn 3 ; w, h, stride
|
||||
%macro CDEF_FILTER 3 ; w, h, stride
|
||||
INIT_YMM avx2
|
||||
%if %1 != 4 || %2 != 8
|
||||
cglobal cdef_filter_%1x%2, 4, 9, 16, 2 * 16 + (%2+4)*%3, \
|
||||
|
@ -135,7 +135,7 @@ cglobal cdef_filter_%1x%2, 4, 10, 16, 2 * 16 + (%2+4)*%3, \
|
|||
lea dst4q, [dstq+strideq*4]
|
||||
%endif
|
||||
lea stride3q, [strideq*3]
|
||||
test edged, 2 ; have_right
|
||||
test edgeb, 2 ; have_right
|
||||
jz .no_right
|
||||
pmovzxbw m1, [dstq+strideq*0]
|
||||
pmovzxbw m2, [dstq+strideq*1]
|
||||
|
@ -217,13 +217,13 @@ cglobal cdef_filter_%1x%2, 4, 10, 16, 2 * 16 + (%2+4)*%3, \
|
|||
|
||||
; top
|
||||
DEFINE_ARGS dst, stride, left, top2, pri, sec, stride3, top1, edge
|
||||
test edged, 4 ; have_top
|
||||
test edgeb, 4 ; have_top
|
||||
jz .no_top
|
||||
mov top1q, [top2q+0*gprsize]
|
||||
mov top2q, [top2q+1*gprsize]
|
||||
test edged, 1 ; have_left
|
||||
test edgeb, 1 ; have_left
|
||||
jz .top_no_left
|
||||
test edged, 2 ; have_right
|
||||
test edgeb, 2 ; have_right
|
||||
jz .top_no_right
|
||||
pmovzxbw m1, [top1q-(%1/2)]
|
||||
pmovzxbw m2, [top2q-(%1/2)]
|
||||
|
@ -239,7 +239,7 @@ cglobal cdef_filter_%1x%2, 4, 10, 16, 2 * 16 + (%2+4)*%3, \
|
|||
movd [px-1*%3+%1*2], xm14
|
||||
jmp .top_done
|
||||
.top_no_left:
|
||||
test edged, 2 ; have_right
|
||||
test edgeb, 2 ; have_right
|
||||
jz .top_no_left_right
|
||||
pmovzxbw m1, [top1q]
|
||||
pmovzxbw m2, [top2q]
|
||||
|
@ -272,7 +272,7 @@ cglobal cdef_filter_%1x%2, 4, 10, 16, 2 * 16 + (%2+4)*%3, \
|
|||
.top_done:
|
||||
|
||||
; left
|
||||
test edged, 1 ; have_left
|
||||
test edgeb, 1 ; have_left
|
||||
jz .no_left
|
||||
pmovzxbw xm1, [leftq+ 0]
|
||||
%if %2 == 8
|
||||
|
@ -304,12 +304,12 @@ cglobal cdef_filter_%1x%2, 4, 10, 16, 2 * 16 + (%2+4)*%3, \
|
|||
|
||||
; bottom
|
||||
DEFINE_ARGS dst, stride, dst8, dummy1, pri, sec, stride3, dummy3, edge
|
||||
test edged, 8 ; have_bottom
|
||||
test edgeb, 8 ; have_bottom
|
||||
jz .no_bottom
|
||||
lea dst8q, [dstq+%2*strideq]
|
||||
test edged, 1 ; have_left
|
||||
test edgeb, 1 ; have_left
|
||||
jz .bottom_no_left
|
||||
test edged, 2 ; have_right
|
||||
test edgeb, 2 ; have_right
|
||||
jz .bottom_no_right
|
||||
pmovzxbw m1, [dst8q-(%1/2)]
|
||||
pmovzxbw m2, [dst8q+strideq-(%1/2)]
|
||||
|
@ -328,7 +328,7 @@ cglobal cdef_filter_%1x%2, 4, 10, 16, 2 * 16 + (%2+4)*%3, \
|
|||
movd [px+(%2+1)*%3+%1*2], xm14
|
||||
jmp .bottom_done
|
||||
.bottom_no_left:
|
||||
test edged, 2 ; have_right
|
||||
test edgeb, 2 ; have_right
|
||||
jz .bottom_no_left_right
|
||||
pmovzxbw m1, [dst8q]
|
||||
pmovzxbw m2, [dst8q+strideq]
|
||||
|
@ -362,50 +362,49 @@ cglobal cdef_filter_%1x%2, 4, 10, 16, 2 * 16 + (%2+4)*%3, \
|
|||
|
||||
; actual filter
|
||||
INIT_YMM avx2
|
||||
DEFINE_ARGS dst, stride, pridmp, damping, pri, sec, stride3, secdmp
|
||||
DEFINE_ARGS dst, stride, pridmp, damping, pri, secdmp, stride3, zero
|
||||
%undef edged
|
||||
; register to shuffle values into after packing
|
||||
vbroadcasti128 m12, [shufb_lohi]
|
||||
|
||||
movifnidn prid, prim
|
||||
movifnidn secd, secm
|
||||
mov dampingd, r7m
|
||||
|
||||
mov pridmpd, prid
|
||||
mov secdmpd, secd
|
||||
or pridmpd, 1
|
||||
or secdmpd, 1
|
||||
lzcnt pridmpd, pridmpd
|
||||
lzcnt secdmpd, secdmpd
|
||||
lea pridmpd, [pridmpd+dampingd-31]
|
||||
lea secdmpd, [secdmpd+dampingd-31]
|
||||
xor dampingd, dampingd
|
||||
test pridmpd, pridmpd
|
||||
cmovl pridmpd, dampingd
|
||||
test secdmpd, secdmpd
|
||||
cmovl secdmpd, dampingd
|
||||
lzcnt pridmpd, prid
|
||||
%if UNIX64
|
||||
movd xm0, prid
|
||||
movd xm1, secdmpd
|
||||
%endif
|
||||
lzcnt secdmpd, secdmpm
|
||||
sub dampingd, 31
|
||||
xor zerod, zerod
|
||||
add pridmpd, dampingd
|
||||
cmovl pridmpd, zerod
|
||||
add secdmpd, dampingd
|
||||
cmovl secdmpd, zerod
|
||||
mov [rsp+0], pridmpq ; pri_shift
|
||||
mov [rsp+8], secdmpq ; sec_shift
|
||||
|
||||
DEFINE_ARGS dst, stride, pridmp, table, pri, sec, stride3, secdmp
|
||||
DEFINE_ARGS dst, stride, pridmp, table, pri, secdmp, stride3
|
||||
lea tableq, [tap_table]
|
||||
vpbroadcastb m13, [tableq+pridmpq] ; pri_shift_mask
|
||||
vpbroadcastb m14, [tableq+secdmpq] ; sec_shift_mask
|
||||
|
||||
; pri/sec_taps[k] [4 total]
|
||||
DEFINE_ARGS dst, stride, dummy, table, pri, sec, stride3
|
||||
movd xm0, prid
|
||||
movd xm1, secd
|
||||
DEFINE_ARGS dst, stride, dir, table, pri, sec, stride3
|
||||
%if UNIX64
|
||||
vpbroadcastb m0, xm0 ; pri_strength
|
||||
vpbroadcastb m1, xm1 ; sec_strength
|
||||
%else
|
||||
vpbroadcastb m0, prim
|
||||
vpbroadcastb m1, secm
|
||||
%endif
|
||||
and prid, 1
|
||||
lea priq, [tableq+priq*2+8] ; pri_taps
|
||||
lea secq, [tableq+12] ; sec_taps
|
||||
|
||||
; off1/2/3[k] [6 total] from [tapq+12+(dir+0/2/6)*2+k]
|
||||
DEFINE_ARGS dst, stride, dir, tap, pri, sec, stride3
|
||||
mov dird, r6m
|
||||
lea dirq, [tapq+dirq*2+14]
|
||||
lea dirq, [tableq+dirq*2+14]
|
||||
%if %1*%2*2/mmsize > 1
|
||||
%if %1 == 4
|
||||
DEFINE_ARGS dst, stride, dir, stk, pri, sec, stride3, h, off, k
|
||||
|
@ -476,9 +475,9 @@ cglobal cdef_filter_%1x%2, 4, 10, 16, 2 * 16 + (%2+4)*%3, \
|
|||
RET
|
||||
%endmacro
|
||||
|
||||
cdef_filter_fn 8, 8, 32
|
||||
cdef_filter_fn 4, 8, 32
|
||||
cdef_filter_fn 4, 4, 32
|
||||
CDEF_FILTER 8, 8, 32
|
||||
CDEF_FILTER 4, 8, 32
|
||||
CDEF_FILTER 4, 4, 32
|
||||
|
||||
INIT_YMM avx2
|
||||
cglobal cdef_dir, 3, 4, 15, src, stride, var, stride3
|
||||
|
@ -614,9 +613,9 @@ cglobal cdef_dir, 3, 4, 15, src, stride, var, stride3
|
|||
paddw m11, m13 ; partial_sum_alt[3/2] right
|
||||
vbroadcasti128 m13, [div_table+32]
|
||||
paddw m4, m5 ; partial_sum_alt[3/2] left
|
||||
pshuflw m11, m11, q3012
|
||||
punpckhwd m6, m4, m11
|
||||
punpcklwd m4, m11
|
||||
pshuflw m5, m11, q3012
|
||||
punpckhwd m6, m11, m4
|
||||
punpcklwd m4, m5
|
||||
pmaddwd m6, m6
|
||||
pmaddwd m4, m4
|
||||
pmulld m6, m12
|
||||
|
@ -642,14 +641,14 @@ cglobal cdef_dir, 3, 4, 15, src, stride, var, stride3
|
|||
paddw m6, m7
|
||||
paddw m1, m3 ; partial_sum_alt[0/1] right
|
||||
paddw m5, m6 ; partial_sum_alt[0/1] left
|
||||
pshuflw m1, m1, q3012
|
||||
punpckhwd m6, m5, m1
|
||||
punpcklwd m5, m1
|
||||
pmaddwd m6, m6
|
||||
pshuflw m0, m1, q3012
|
||||
punpckhwd m1, m5
|
||||
punpcklwd m5, m0
|
||||
pmaddwd m1, m1
|
||||
pmaddwd m5, m5
|
||||
pmulld m6, m12
|
||||
pmulld m1, m12
|
||||
pmulld m5, m13
|
||||
paddd m5, m6 ; cost1[a-d] | cost3[a-d]
|
||||
paddd m5, m1 ; cost1[a-d] | cost3[a-d]
|
||||
|
||||
mova xm0, [pd_47130256+ 16]
|
||||
mova m1, [pd_47130256]
|
||||
|
@ -661,11 +660,10 @@ cglobal cdef_dir, 3, 4, 15, src, stride, var, stride3
|
|||
|
||||
; now find the best cost
|
||||
pmaxsd xm2, xm0, xm1
|
||||
pshufd xm3, xm2, q3232
|
||||
pshufd xm3, xm2, q1032
|
||||
pmaxsd xm2, xm3
|
||||
pshufd xm3, xm2, q1111
|
||||
pmaxsd xm2, xm3
|
||||
pshufd xm2, xm2, q0000 ; best cost
|
||||
pshufd xm3, xm2, q2301
|
||||
pmaxsd xm2, xm3 ; best cost
|
||||
|
||||
; find the idx using minpos
|
||||
; make everything other than the best cost negative via subtraction
|
||||
|
@ -676,7 +674,7 @@ cglobal cdef_dir, 3, 4, 15, src, stride, var, stride3
|
|||
phminposuw xm3, xm3
|
||||
|
||||
; convert idx to 32-bits
|
||||
psrldq xm3, 2
|
||||
psrld xm3, 16
|
||||
movd eax, xm3
|
||||
|
||||
; get idx^4 complement
|
||||
|
|
|
@ -29,15 +29,19 @@
|
|||
#include "src/cdef.h"
|
||||
|
||||
decl_cdef_fn(dav1d_cdef_filter_8x8_avx2);
|
||||
decl_cdef_fn(dav1d_cdef_filter_8x8_sse4);
|
||||
decl_cdef_fn(dav1d_cdef_filter_8x8_ssse3);
|
||||
|
||||
decl_cdef_fn(dav1d_cdef_filter_4x8_avx2);
|
||||
decl_cdef_fn(dav1d_cdef_filter_4x8_sse4);
|
||||
decl_cdef_fn(dav1d_cdef_filter_4x8_ssse3);
|
||||
|
||||
decl_cdef_fn(dav1d_cdef_filter_4x4_avx2);
|
||||
decl_cdef_fn(dav1d_cdef_filter_4x4_sse4);
|
||||
decl_cdef_fn(dav1d_cdef_filter_4x4_ssse3);
|
||||
|
||||
decl_cdef_dir_fn(dav1d_cdef_dir_avx2);
|
||||
decl_cdef_dir_fn(dav1d_cdef_dir_sse4);
|
||||
decl_cdef_dir_fn(dav1d_cdef_dir_ssse3);
|
||||
|
||||
void bitfn(dav1d_cdef_dsp_init_x86)(Dav1dCdefDSPContext *const c) {
|
||||
|
@ -45,13 +49,22 @@ void bitfn(dav1d_cdef_dsp_init_x86)(Dav1dCdefDSPContext *const c) {
|
|||
|
||||
if (!(flags & DAV1D_X86_CPU_FLAG_SSSE3)) return;
|
||||
|
||||
#if BITDEPTH ==8
|
||||
#if BITDEPTH == 8
|
||||
c->dir = dav1d_cdef_dir_ssse3;
|
||||
c->fb[0] = dav1d_cdef_filter_8x8_ssse3;
|
||||
c->fb[1] = dav1d_cdef_filter_4x8_ssse3;
|
||||
c->fb[2] = dav1d_cdef_filter_4x4_ssse3;
|
||||
#endif
|
||||
|
||||
if (!(flags & DAV1D_X86_CPU_FLAG_SSE41)) return;
|
||||
|
||||
#if BITDEPTH == 8
|
||||
c->dir = dav1d_cdef_dir_sse4;
|
||||
c->fb[0] = dav1d_cdef_filter_8x8_sse4;
|
||||
c->fb[1] = dav1d_cdef_filter_4x8_sse4;
|
||||
c->fb[2] = dav1d_cdef_filter_4x4_sse4;
|
||||
#endif
|
||||
|
||||
if (!(flags & DAV1D_X86_CPU_FLAG_AVX2)) return;
|
||||
|
||||
#if BITDEPTH == 8 && ARCH_X86_64
|
||||
|
|
|
@ -31,16 +31,26 @@ SECTION_RODATA 16
|
|||
|
||||
%if ARCH_X86_32
|
||||
pb_0: times 16 db 0
|
||||
pb_0xFF: times 16 db 0xFF
|
||||
%endif
|
||||
pw_128: times 8 dw 128
|
||||
pw_256: times 8 dw 256
|
||||
pw_2048: times 8 dw 2048
|
||||
%if ARCH_X86_32
|
||||
pw_0x7FFF: times 8 dw 0x7FFF
|
||||
pd_0to7: dd 0, 4, 2, 6, 1, 5, 3, 7
|
||||
div_table: dw 840, 840, 420, 420, 280, 280, 210, 210, 168, 168, 140, 140, 120, 120, 105, 105
|
||||
pw_0x8000: times 8 dw 0x8000
|
||||
%endif
|
||||
div_table_sse4: dd 840, 420, 280, 210, 168, 140, 120, 105
|
||||
dd 420, 210, 140, 105, 105, 105, 105, 105
|
||||
div_table_ssse3: dw 840, 840, 420, 420, 280, 280, 210, 210, 168, 168, 140, 140, 120, 120, 105, 105
|
||||
dw 420, 420, 210, 210, 140, 140, 105, 105, 105, 105, 105, 105, 105, 105, 105, 105
|
||||
shufb_lohi: db 0, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15
|
||||
shufw_6543210x: db 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1, 14, 15
|
||||
tap_table: dw 4, 2, 3, 3, 2, 1
|
||||
tap_table: ; masks for 8-bit shift emulation
|
||||
db 0xFF, 0x7F, 0x3F, 0x1F, 0x0F, 0x07, 0x03, 0x01
|
||||
; weights
|
||||
db 4, 2, 3, 3, 2, 1
|
||||
; taps indices
|
||||
db -1 * 16 + 1, -2 * 16 + 2
|
||||
db 0 * 16 + 1, -1 * 16 + 2
|
||||
db 0 * 16 + 1, 0 * 16 + 2
|
||||
|
@ -59,8 +69,6 @@ tap_table: dw 4, 2, 3, 3, 2, 1
|
|||
|
||||
SECTION .text
|
||||
|
||||
INIT_XMM ssse3
|
||||
|
||||
%macro movif32 2
|
||||
%if ARCH_X86_32
|
||||
mov %1, %2
|
||||
|
@ -111,23 +119,32 @@ INIT_XMM ssse3
|
|||
%endif
|
||||
%endmacro
|
||||
|
||||
%macro ACCUMULATE_TAP 6 ; tap_offset, shift, strength, mul_tap, w, stride
|
||||
%if ARCH_X86_64
|
||||
%macro ACCUMULATE_TAP 7 ; tap_offset, shift, shift_mask, strength, mul_tap, w, stride
|
||||
; load p0/p1
|
||||
movsx offq, byte [dirq+kq+%1] ; off1
|
||||
%if %5 == 4
|
||||
movq m5, [stkq+offq*2+%6*0] ; p0
|
||||
movhps m5, [stkq+offq*2+%6*1]
|
||||
%if %6 == 4
|
||||
movq m5, [stkq+offq*2+%7*0] ; p0
|
||||
movhps m5, [stkq+offq*2+%7*1]
|
||||
%else
|
||||
movu m5, [stkq+offq*2+%6*0] ; p0
|
||||
movu m5, [stkq+offq*2+%7*0] ; p0
|
||||
%endif
|
||||
neg offq ; -off1
|
||||
%if %5 == 4
|
||||
movq m6, [stkq+offq*2+%6*0] ; p1
|
||||
movhps m6, [stkq+offq*2+%6*1]
|
||||
%if %6 == 4
|
||||
movq m6, [stkq+offq*2+%7*0] ; p1
|
||||
movhps m6, [stkq+offq*2+%7*1]
|
||||
%else
|
||||
movu m6, [stkq+offq*2+%6*0] ; p1
|
||||
movu m6, [stkq+offq*2+%7*0] ; p1
|
||||
%endif
|
||||
%if cpuflag(sse4)
|
||||
; out of bounds values are set to a value that is both a large unsigned
|
||||
; value and a negative signed value.
|
||||
; use signed max and unsigned min to remove them
|
||||
pmaxsw m7, m5
|
||||
pminuw m8, m5
|
||||
pmaxsw m7, m6
|
||||
pminuw m8, m6
|
||||
%else
|
||||
%if ARCH_X86_64
|
||||
pcmpeqw m9, m14, m5
|
||||
pcmpeqw m10, m14, m6
|
||||
pandn m9, m5
|
||||
|
@ -136,77 +153,42 @@ INIT_XMM ssse3
|
|||
pminsw m8, m5 ; min after p0
|
||||
pmaxsw m7, m10 ; max after p1
|
||||
pminsw m8, m6 ; min after p1
|
||||
%else
|
||||
pcmpeqw m3, m5, OUT_OF_BOUNDS_MEM
|
||||
pandn m3, m5
|
||||
pmaxsw m7, m3 ; max after p0
|
||||
pminsw m8, m5 ; min after p0
|
||||
pcmpeqw m3, m6, OUT_OF_BOUNDS_MEM
|
||||
pandn m3, m6
|
||||
pmaxsw m7, m3 ; max after p1
|
||||
pminsw m8, m6 ; min after p1
|
||||
%endif
|
||||
%endif
|
||||
|
||||
; accumulate sum[m13] over p0/p1
|
||||
psubw m5, m4 ; diff_p0(p0 - px)
|
||||
psubw m6, m4 ; diff_p1(p1 - px)
|
||||
pabsw m9, m5
|
||||
pabsw m10, m6
|
||||
mova m12, m9
|
||||
psrlw m9, %2
|
||||
psignw m11, %4, m5
|
||||
psubusw m5, %3, m9
|
||||
mova m9, m10
|
||||
pminsw m5, m12 ; constrain(diff_p0)
|
||||
psrlw m10, %2
|
||||
psignw m12, %4, m6
|
||||
psubusw m6, %3, m10
|
||||
pmullw m5, m11 ; constrain(diff_p0) * taps
|
||||
pminsw m6, m9 ; constrain(diff_p1)
|
||||
pmullw m6, m12 ; constrain(diff_p1) * taps
|
||||
paddw m13, m5
|
||||
paddw m13, m6
|
||||
packsswb m5, m6 ; convert pixel diff to 8-bit
|
||||
%if ARCH_X86_64 && cpuflag(sse4)
|
||||
pshufb m5, m14 ; group diffs p0 and p1 into pairs
|
||||
%else
|
||||
; load p0
|
||||
movsx offq, byte [dirq+kq+%1] ; off1
|
||||
%if %5 == 4
|
||||
movq m5, [stkq+offq*2+%6*0] ; p0
|
||||
movhps m5, [stkq+offq*2+%6*1]
|
||||
pshufb m5, [PIC_sym(shufb_lohi)]
|
||||
%endif
|
||||
pabsb m6, m5
|
||||
psignb m9, %5, m5
|
||||
%if ARCH_X86_64
|
||||
psrlw m10, m6, %2 ; emulate 8-bit shift
|
||||
pand m10, %3
|
||||
psubusb m5, %4, m10
|
||||
%else
|
||||
movu m5, [stkq+offq*2+%6*0] ; p0
|
||||
psrlw m5, m6, %2 ; emulate 8-bit shift
|
||||
pand m5, %3
|
||||
paddusb m5, %4
|
||||
pxor m5, [PIC_sym(pb_0xFF)]
|
||||
%endif
|
||||
pcmpeqw m3, m5, [PIC_sym(pw_0x7FFF)]
|
||||
pandn m3, m5
|
||||
pmaxsw m7, m3 ; max after p0
|
||||
pminsw m8, m5 ; min after p0
|
||||
|
||||
; accumulate sum[m7] over p0
|
||||
psubw m5, m4 ; diff_p0(p0 - px)
|
||||
psignw m6, %4, m5 ; constrain(diff_p0)
|
||||
pabsw m5, m5
|
||||
mova m3, m5
|
||||
psrlw m5, %2
|
||||
paddsw m5, %3
|
||||
pandn m5, [PIC_sym(pw_0x7FFF)]
|
||||
pminsw m5, m3
|
||||
pmullw m5, m6 ; constrain(diff_p0) * taps
|
||||
pminub m5, m6 ; constrain(diff_p)
|
||||
pmaddubsw m5, m9 ; constrain(diff_p) * taps
|
||||
paddw m13, m5
|
||||
|
||||
; load p1
|
||||
neg offq ; -off1
|
||||
%if %5 == 4
|
||||
movq m5, [stkq+offq*2+%6*0] ; p1
|
||||
movhps m5, [stkq+offq*2+%6*1]
|
||||
%else
|
||||
movu m5, [stkq+offq*2+%6*0] ; p1
|
||||
%endif
|
||||
pcmpeqw m3, m5, [PIC_sym(pw_0x7FFF)]
|
||||
pandn m3, m5
|
||||
pmaxsw m7, m3 ; max after p1
|
||||
pminsw m8, m5 ; min after p1
|
||||
|
||||
; accumulate sum[m7] over p1
|
||||
psubw m5, m4 ; diff_p1(p1 - px)
|
||||
psignw m6, %4, m5 ; constrain(diff_p1)
|
||||
pabsw m5, m5
|
||||
mova m3, m5
|
||||
psrlw m5, %2
|
||||
paddsw m5, %3
|
||||
pandn m5, [PIC_sym(pw_0x7FFF)]
|
||||
pminsw m5, m3
|
||||
pmullw m5, m6 ; constrain(diff_p1) * taps
|
||||
paddw m13, m5
|
||||
%endif
|
||||
%endmacro
|
||||
|
||||
%macro PMOVZXBW 2-3 0 ; %3 = half
|
||||
|
@ -250,17 +232,28 @@ INIT_XMM ssse3
|
|||
%endif
|
||||
%endmacro
|
||||
|
||||
%macro cdef_filter_fn 3 ; w, h, stride
|
||||
%macro CDEF_FILTER 3 ; w, h, stride
|
||||
|
||||
%if cpuflag(sse4)
|
||||
%define OUT_OF_BOUNDS 0x80008000
|
||||
%else
|
||||
%define OUT_OF_BOUNDS 0x7FFF7FFF
|
||||
%endif
|
||||
|
||||
%if ARCH_X86_64
|
||||
cglobal cdef_filter_%1x%2, 4, 9, 16, 3 * 16 + (%2+4)*%3, \
|
||||
dst, stride, left, top, pri, sec, stride3, dst4, edge
|
||||
pcmpeqw m14, m14
|
||||
%if cpuflag(sse4)
|
||||
psllw m14, 15 ; 0x8000
|
||||
%else
|
||||
psrlw m14, 1 ; 0x7FFF
|
||||
%endif
|
||||
pxor m15, m15
|
||||
|
||||
%define px rsp+3*16+2*%3
|
||||
%else
|
||||
cglobal cdef_filter_%1x%2, 2, 7, 8, - 5 * 16 - (%2+4)*%3, \
|
||||
cglobal cdef_filter_%1x%2, 2, 7, 8, - 7 * 16 - (%2+4)*%3, \
|
||||
dst, stride, left, top, stride3, dst4, edge
|
||||
SAVE_ARG left, 2
|
||||
SAVE_ARG top, 3
|
||||
|
@ -272,9 +265,15 @@ cglobal cdef_filter_%1x%2, 2, 7, 8, - 5 * 16 - (%2+4)*%3, \
|
|||
%define PIC_reg r2
|
||||
LEA PIC_reg, PIC_base_offset
|
||||
|
||||
%if cpuflag(sse4)
|
||||
%define OUT_OF_BOUNDS_MEM [PIC_sym(pw_0x8000)]
|
||||
%else
|
||||
%define OUT_OF_BOUNDS_MEM [PIC_sym(pw_0x7FFF)]
|
||||
%endif
|
||||
|
||||
%define m15 [PIC_sym(pb_0)]
|
||||
|
||||
%define px esp+5*16+2*%3
|
||||
%define px esp+7*16+2*%3
|
||||
%endif
|
||||
|
||||
mov edged, r8m
|
||||
|
@ -311,15 +310,15 @@ cglobal cdef_filter_%1x%2, 2, 7, 8, - 5 * 16 - (%2+4)*%3, \
|
|||
mova [px+5*%3], m5
|
||||
mova [px+6*%3], m6
|
||||
mova [px+7*%3], m7
|
||||
mov dword [px+4*%3+%1*2], 0x7FFF7FFF
|
||||
mov dword [px+5*%3+%1*2], 0x7FFF7FFF
|
||||
mov dword [px+6*%3+%1*2], 0x7FFF7FFF
|
||||
mov dword [px+7*%3+%1*2], 0x7FFF7FFF
|
||||
mov dword [px+4*%3+%1*2], OUT_OF_BOUNDS
|
||||
mov dword [px+5*%3+%1*2], OUT_OF_BOUNDS
|
||||
mov dword [px+6*%3+%1*2], OUT_OF_BOUNDS
|
||||
mov dword [px+7*%3+%1*2], OUT_OF_BOUNDS
|
||||
%endif
|
||||
mov dword [px+0*%3+%1*2], 0x7FFF7FFF
|
||||
mov dword [px+1*%3+%1*2], 0x7FFF7FFF
|
||||
mov dword [px+2*%3+%1*2], 0x7FFF7FFF
|
||||
mov dword [px+3*%3+%1*2], 0x7FFF7FFF
|
||||
mov dword [px+0*%3+%1*2], OUT_OF_BOUNDS
|
||||
mov dword [px+1*%3+%1*2], OUT_OF_BOUNDS
|
||||
mov dword [px+2*%3+%1*2], OUT_OF_BOUNDS
|
||||
mov dword [px+3*%3+%1*2], OUT_OF_BOUNDS
|
||||
.body_done:
|
||||
|
||||
; top
|
||||
|
@ -371,8 +370,8 @@ cglobal cdef_filter_%1x%2, 2, 7, 8, - 5 * 16 - (%2+4)*%3, \
|
|||
mova [px-1*%3-8*2], m1
|
||||
mova [px-1*%3-0*2], m3
|
||||
%endif
|
||||
mov dword [px-2*%3+%1*2], 0x7FFF7FFF
|
||||
mov dword [px-1*%3+%1*2], 0x7FFF7FFF
|
||||
mov dword [px-2*%3+%1*2], OUT_OF_BOUNDS
|
||||
mov dword [px-1*%3+%1*2], OUT_OF_BOUNDS
|
||||
jmp .top_done
|
||||
.top_no_left:
|
||||
test edged, 2 ; have_right
|
||||
|
@ -392,24 +391,24 @@ cglobal cdef_filter_%1x%2, 2, 7, 8, - 5 * 16 - (%2+4)*%3, \
|
|||
%endif
|
||||
mova [px-2*%3], m0
|
||||
mova [px-1*%3], m1
|
||||
mov dword [px-2*%3-4], 0x7FFF7FFF
|
||||
mov dword [px-1*%3-4], 0x7FFF7FFF
|
||||
mov dword [px-2*%3-4], OUT_OF_BOUNDS
|
||||
mov dword [px-1*%3-4], OUT_OF_BOUNDS
|
||||
jmp .top_done
|
||||
.top_no_left_right:
|
||||
PMOVZXBW m0, [top1q], %1 == 4
|
||||
PMOVZXBW m1, [top2q], %1 == 4
|
||||
mova [px-2*%3], m0
|
||||
mova [px-1*%3], m1
|
||||
mov dword [px-2*%3+%1*2], 0x7FFF7FFF
|
||||
mov dword [px-1*%3+%1*2], 0x7FFF7FFF
|
||||
mov dword [px-2*%3-4], 0X7FFF7FFF
|
||||
mov dword [px-1*%3-4], 0X7FFF7FFF
|
||||
mov dword [px-2*%3+%1*2], OUT_OF_BOUNDS
|
||||
mov dword [px-1*%3+%1*2], OUT_OF_BOUNDS
|
||||
mov dword [px-2*%3-4], OUT_OF_BOUNDS
|
||||
mov dword [px-1*%3-4], OUT_OF_BOUNDS
|
||||
jmp .top_done
|
||||
.no_top:
|
||||
%if ARCH_X86_64
|
||||
SWAP m0, m14
|
||||
%else
|
||||
mova m0, [PIC_sym(pw_0x7FFF)]
|
||||
mova m0, OUT_OF_BOUNDS_MEM
|
||||
%endif
|
||||
movu [px-2*%3-4], m0
|
||||
movu [px-1*%3-4], m0
|
||||
|
@ -455,15 +454,15 @@ cglobal cdef_filter_%1x%2, 2, 7, 8, - 5 * 16 - (%2+4)*%3, \
|
|||
movd [px+3*%3-4], m2
|
||||
jmp .left_done
|
||||
.no_left:
|
||||
mov dword [px+0*%3-4], 0x7FFF7FFF
|
||||
mov dword [px+1*%3-4], 0x7FFF7FFF
|
||||
mov dword [px+2*%3-4], 0x7FFF7FFF
|
||||
mov dword [px+3*%3-4], 0x7FFF7FFF
|
||||
mov dword [px+0*%3-4], OUT_OF_BOUNDS
|
||||
mov dword [px+1*%3-4], OUT_OF_BOUNDS
|
||||
mov dword [px+2*%3-4], OUT_OF_BOUNDS
|
||||
mov dword [px+3*%3-4], OUT_OF_BOUNDS
|
||||
%if %2 == 8
|
||||
mov dword [px+4*%3-4], 0x7FFF7FFF
|
||||
mov dword [px+5*%3-4], 0x7FFF7FFF
|
||||
mov dword [px+6*%3-4], 0x7FFF7FFF
|
||||
mov dword [px+7*%3-4], 0x7FFF7FFF
|
||||
mov dword [px+4*%3-4], OUT_OF_BOUNDS
|
||||
mov dword [px+5*%3-4], OUT_OF_BOUNDS
|
||||
mov dword [px+6*%3-4], OUT_OF_BOUNDS
|
||||
mov dword [px+7*%3-4], OUT_OF_BOUNDS
|
||||
%endif
|
||||
.left_done:
|
||||
|
||||
|
@ -513,10 +512,10 @@ cglobal cdef_filter_%1x%2, 2, 7, 8, - 5 * 16 - (%2+4)*%3, \
|
|||
mova [px+(%2+0)*%3-0*2], m2
|
||||
mova [px+(%2+1)*%3-8*2], m1
|
||||
mova [px+(%2+1)*%3-0*2], m3
|
||||
mov dword [px+(%2-1)*%3+8*2], 0x7FFF7FFF ; overwritten by first mova
|
||||
mov dword [px+(%2-1)*%3+8*2], OUT_OF_BOUNDS ; overwritten by first mova
|
||||
%endif
|
||||
mov dword [px+(%2+0)*%3+%1*2], 0x7FFF7FFF
|
||||
mov dword [px+(%2+1)*%3+%1*2], 0x7FFF7FFF
|
||||
mov dword [px+(%2+0)*%3+%1*2], OUT_OF_BOUNDS
|
||||
mov dword [px+(%2+1)*%3+%1*2], OUT_OF_BOUNDS
|
||||
jmp .bottom_done
|
||||
.bottom_no_left:
|
||||
test edged, 2 ; have_right
|
||||
|
@ -536,24 +535,24 @@ cglobal cdef_filter_%1x%2, 2, 7, 8, - 5 * 16 - (%2+4)*%3, \
|
|||
%endif
|
||||
mova [px+(%2+0)*%3], m0
|
||||
mova [px+(%2+1)*%3], m1
|
||||
mov dword [px+(%2+0)*%3-4], 0x7FFF7FFF
|
||||
mov dword [px+(%2+1)*%3-4], 0x7FFF7FFF
|
||||
mov dword [px+(%2+0)*%3-4], OUT_OF_BOUNDS
|
||||
mov dword [px+(%2+1)*%3-4], OUT_OF_BOUNDS
|
||||
jmp .bottom_done
|
||||
.bottom_no_left_right:
|
||||
PMOVZXBW m0, [dst8q+strideq*0], %1 == 4
|
||||
PMOVZXBW m1, [dst8q+strideq*1], %1 == 4
|
||||
mova [px+(%2+0)*%3], m0
|
||||
mova [px+(%2+1)*%3], m1
|
||||
mov dword [px+(%2+0)*%3+%1*2], 0x7FFF7FFF
|
||||
mov dword [px+(%2+1)*%3+%1*2], 0x7FFF7FFF
|
||||
mov dword [px+(%2+0)*%3-4], 0x7FFF7FFF
|
||||
mov dword [px+(%2+1)*%3-4], 0x7FFF7FFF
|
||||
mov dword [px+(%2+0)*%3+%1*2], OUT_OF_BOUNDS
|
||||
mov dword [px+(%2+1)*%3+%1*2], OUT_OF_BOUNDS
|
||||
mov dword [px+(%2+0)*%3-4], OUT_OF_BOUNDS
|
||||
mov dword [px+(%2+1)*%3-4], OUT_OF_BOUNDS
|
||||
jmp .bottom_done
|
||||
.no_bottom:
|
||||
%if ARCH_X86_64
|
||||
SWAP m0, m14
|
||||
%else
|
||||
mova m0, [PIC_sym(pw_0x7FFF)]
|
||||
mova m0, OUT_OF_BOUNDS_MEM
|
||||
%endif
|
||||
movu [px+(%2+0)*%3-4], m0
|
||||
movu [px+(%2+1)*%3-4], m0
|
||||
|
@ -592,47 +591,74 @@ cglobal cdef_filter_%1x%2, 2, 7, 8, - 5 * 16 - (%2+4)*%3, \
|
|||
cmovl pridmpd, dampingd
|
||||
neg secdmpd
|
||||
cmovl secdmpd, dampingd
|
||||
%if ARCH_X86_64
|
||||
mov [rsp+ 0], pridmpq ; pri_shift
|
||||
mov [rsp+16], secdmpq ; sec_shift
|
||||
%if ARCH_X86_32
|
||||
mov dword [esp+ 4], 0 ; zero upper 32 bits of psraw
|
||||
mov dword [esp+20], 0 ; source operand in ACCUMULATE_TAP
|
||||
%define PIC_reg r6
|
||||
%else
|
||||
mov [esp+0x00], pridmpd
|
||||
mov [esp+0x30], secdmpd
|
||||
mov dword [esp+0x04], 0 ; zero upper 32 bits of psrlw
|
||||
mov dword [esp+0x34], 0 ; source operand in ACCUMULATE_TAP
|
||||
%define PIC_reg r4
|
||||
LOAD_PIC_REG 8
|
||||
%endif
|
||||
|
||||
; pri/sec_taps[k] [4 total]
|
||||
DEFINE_ARGS dst, stride, tap, dummy, pri, sec
|
||||
DEFINE_ARGS dst, stride, pridmp, table, pri, sec, secdmp
|
||||
lea tableq, [PIC_sym(tap_table)]
|
||||
%if ARCH_X86_64
|
||||
mova m14, [pw_256]
|
||||
%else
|
||||
%define m14 [PIC_sym(pw_256)]
|
||||
SWAP m2, m11
|
||||
SWAP m3, m12
|
||||
%endif
|
||||
movd m2, [tableq+pridmpq]
|
||||
movd m3, [tableq+secdmpq]
|
||||
pshufb m2, m15 ; pri_shift_mask
|
||||
pshufb m3, m15 ; sec_shift_mask
|
||||
%if ARCH_X86_64
|
||||
SWAP m2, m11
|
||||
SWAP m3, m12
|
||||
%else
|
||||
%define PIC_reg r6
|
||||
mov PIC_reg, r4
|
||||
DEFINE_ARGS dst, stride, dir, table, pri, sec, secdmp
|
||||
LOAD_ARG pri
|
||||
LOAD_ARG dir, 1
|
||||
mova [esp+0x10], m2
|
||||
mova [esp+0x40], m3
|
||||
%endif
|
||||
|
||||
; pri/sec_taps[k] [4 total]
|
||||
DEFINE_ARGS dst, stride, dummy, tap, pri, sec
|
||||
movd m0, prid
|
||||
movd m1, secd
|
||||
pshufb m0, m14
|
||||
pshufb m1, m14
|
||||
%if ARCH_X86_32
|
||||
mova m2, [PIC_sym(pw_0x7FFF)]
|
||||
pandn m0, m2
|
||||
pandn m1, m2
|
||||
%if ARCH_X86_64
|
||||
pshufb m0, m15
|
||||
pshufb m1, m15
|
||||
%else
|
||||
mova m2, m15
|
||||
mova m3, [PIC_sym(pb_0xFF)]
|
||||
pshufb m0, m2
|
||||
pshufb m1, m2
|
||||
pxor m0, m3
|
||||
pxor m1, m3
|
||||
mova [esp+0x20], m0
|
||||
mova [esp+0x30], m1
|
||||
mova [esp+0x50], m1
|
||||
%endif
|
||||
and prid, 1
|
||||
lea tapq, [PIC_sym(tap_table)]
|
||||
lea priq, [tapq+priq*4] ; pri_taps
|
||||
lea secq, [tapq+8] ; sec_taps
|
||||
lea priq, [tapq+8+priq*2] ; pri_taps
|
||||
lea secq, [tapq+12] ; sec_taps
|
||||
|
||||
%if ARCH_X86_64 && cpuflag(sse4)
|
||||
mova m14, [shufb_lohi]
|
||||
%endif
|
||||
|
||||
; off1/2/3[k] [6 total] from [tapq+14+(dir+0/2/6)*2+k]
|
||||
DEFINE_ARGS dst, stride, tap, dir, pri, sec
|
||||
DEFINE_ARGS dst, stride, dir, tap, pri, sec
|
||||
%if ARCH_X86_64
|
||||
mov dird, r6m
|
||||
lea tapq, [tapq+dirq*2+12]
|
||||
lea dirq, [tapq+14+dirq*2]
|
||||
DEFINE_ARGS dst, stride, dir, stk, pri, sec, h, off, k
|
||||
%else
|
||||
LOAD_ARG dir, 1
|
||||
lea tapd, [tapd+dird*2+12]
|
||||
lea dird, [tapd+14+dird*2]
|
||||
DEFINE_ARGS dst, stride, dir, stk, pri, sec
|
||||
%define hd dword [esp+8]
|
||||
%define offq dstq
|
||||
|
@ -640,9 +666,9 @@ cglobal cdef_filter_%1x%2, 2, 7, 8, - 5 * 16 - (%2+4)*%3, \
|
|||
%endif
|
||||
mov hd, %1*%2*2/mmsize
|
||||
lea stkq, [px]
|
||||
movif32 [esp+0x1C], strided
|
||||
movif32 [esp+0x3C], strided
|
||||
.v_loop:
|
||||
movif32 [esp+0x18], dstd
|
||||
movif32 [esp+0x38], dstd
|
||||
mov kq, 1
|
||||
%if %1 == 4
|
||||
movq m4, [stkq+%3*0]
|
||||
|
@ -652,7 +678,7 @@ cglobal cdef_filter_%1x%2, 2, 7, 8, - 5 * 16 - (%2+4)*%3, \
|
|||
%endif
|
||||
|
||||
%if ARCH_X86_32
|
||||
%xdefine m11 m6
|
||||
%xdefine m9 m3
|
||||
%xdefine m13 m7
|
||||
%xdefine m7 m0
|
||||
%xdefine m8 m1
|
||||
|
@ -663,36 +689,41 @@ cglobal cdef_filter_%1x%2, 2, 7, 8, - 5 * 16 - (%2+4)*%3, \
|
|||
mova m8, m4 ; min
|
||||
.k_loop:
|
||||
%if ARCH_X86_64
|
||||
movd m2, [priq+kq*2] ; pri_taps
|
||||
movd m3, [secq+kq*2] ; sec_taps
|
||||
pshufb m2, m14
|
||||
pshufb m3, m14
|
||||
ACCUMULATE_TAP 0*2, [rsp+ 0], m0, m2, %1, %3
|
||||
ACCUMULATE_TAP 2*2, [rsp+16], m1, m3, %1, %3
|
||||
ACCUMULATE_TAP 6*2, [rsp+16], m1, m3, %1, %3
|
||||
movd m2, [priq+kq] ; pri_taps
|
||||
movd m3, [secq+kq] ; sec_taps
|
||||
pshufb m2, m15
|
||||
pshufb m3, m15
|
||||
ACCUMULATE_TAP 0*2, [rsp+ 0], m11, m0, m2, %1, %3
|
||||
ACCUMULATE_TAP 2*2, [rsp+16], m12, m1, m3, %1, %3
|
||||
ACCUMULATE_TAP 6*2, [rsp+16], m12, m1, m3, %1, %3
|
||||
%else
|
||||
movd m2, [priq+kq*2] ; pri_taps
|
||||
pshufb m2, m14
|
||||
ACCUMULATE_TAP 0*2, [esp+0x00], [esp+0x20], m2, %1, %3
|
||||
movd m2, [priq+kq] ; pri_taps
|
||||
pshufb m2, m15
|
||||
ACCUMULATE_TAP 0*2, [esp+0x00], [esp+0x10], [esp+0x20], m2, %1, %3
|
||||
|
||||
movd m2, [secq+kq*2] ; sec_taps
|
||||
pshufb m2, m14
|
||||
ACCUMULATE_TAP 2*2, [esp+0x10], [esp+0x30], m2, %1, %3
|
||||
ACCUMULATE_TAP 6*2, [esp+0x10], [esp+0x30], m2, %1, %3
|
||||
movd m2, [secq+kq] ; sec_taps
|
||||
pshufb m2, m15
|
||||
ACCUMULATE_TAP 2*2, [esp+0x30], [esp+0x40], [esp+0x50], m2, %1, %3
|
||||
ACCUMULATE_TAP 6*2, [esp+0x30], [esp+0x40], [esp+0x50], m2, %1, %3
|
||||
%endif
|
||||
|
||||
dec kq
|
||||
jge .k_loop
|
||||
|
||||
pcmpgtw m11, m15, m13
|
||||
paddw m13, m11
|
||||
%if cpuflag(sse4)
|
||||
pcmpgtw m6, m15, m13
|
||||
%else
|
||||
pxor m6, m6
|
||||
pcmpgtw m6, m13
|
||||
%endif
|
||||
paddw m13, m6
|
||||
pmulhrsw m13, [PIC_sym(pw_2048)]
|
||||
paddw m4, m13
|
||||
pminsw m4, m7
|
||||
pmaxsw m4, m8
|
||||
packuswb m4, m4
|
||||
movif32 dstd, [esp+0x18]
|
||||
movif32 strided, [esp+0x1C]
|
||||
movif32 dstd, [esp+0x38]
|
||||
movif32 strided, [esp+0x3C]
|
||||
%if %1 == 4
|
||||
movd [dstq+strideq*0], m4
|
||||
psrlq m4, 32
|
||||
|
@ -715,11 +746,10 @@ cglobal cdef_filter_%1x%2, 2, 7, 8, - 5 * 16 - (%2+4)*%3, \
|
|||
RET
|
||||
%endmacro
|
||||
|
||||
cdef_filter_fn 8, 8, 32
|
||||
cdef_filter_fn 4, 8, 32
|
||||
cdef_filter_fn 4, 4, 32
|
||||
|
||||
%macro MULLD 2
|
||||
%if cpuflag(sse4)
|
||||
pmulld %1, %2
|
||||
%else
|
||||
%if ARCH_X86_32
|
||||
%define m15 m1
|
||||
%endif
|
||||
|
@ -727,10 +757,12 @@ cdef_filter_fn 4, 4, 32
|
|||
pmullw %1, %2
|
||||
pslld m15, 16
|
||||
paddd %1, m15
|
||||
%endif
|
||||
%endmacro
|
||||
|
||||
%if ARCH_X86_64
|
||||
cglobal cdef_dir, 3, 4, 16, src, stride, var, stride3
|
||||
%macro CDEF_DIR 0
|
||||
%if ARCH_X86_64
|
||||
cglobal cdef_dir, 3, 5, 16, 32, src, stride, var, stride3
|
||||
lea stride3q, [strideq*3]
|
||||
movq m1, [srcq+strideq*0]
|
||||
movhps m1, [srcq+strideq*1]
|
||||
|
@ -785,7 +817,7 @@ cglobal cdef_dir, 3, 4, 16, src, stride, var, stride3
|
|||
pmaddwd m9, m9
|
||||
phaddd m9, m8
|
||||
SWAP m8, m9
|
||||
MULLD m8, [div_table+48]
|
||||
MULLD m8, [div_table%+SUFFIX+48]
|
||||
|
||||
pslldq m9, m1, 2
|
||||
psrldq m10, m1, 14
|
||||
|
@ -819,8 +851,8 @@ cglobal cdef_dir, 3, 4, 16, src, stride, var, stride3
|
|||
punpcklwd m9, m10
|
||||
pmaddwd m11, m11
|
||||
pmaddwd m9, m9
|
||||
MULLD m11, [div_table+16]
|
||||
MULLD m9, [div_table+0]
|
||||
MULLD m11, [div_table%+SUFFIX+16]
|
||||
MULLD m9, [div_table%+SUFFIX+0]
|
||||
paddd m9, m11 ; cost[0a-d]
|
||||
|
||||
pslldq m10, m0, 14
|
||||
|
@ -855,8 +887,8 @@ cglobal cdef_dir, 3, 4, 16, src, stride, var, stride3
|
|||
punpcklwd m10, m11
|
||||
pmaddwd m12, m12
|
||||
pmaddwd m10, m10
|
||||
MULLD m12, [div_table+16]
|
||||
MULLD m10, [div_table+0]
|
||||
MULLD m12, [div_table%+SUFFIX+16]
|
||||
MULLD m10, [div_table%+SUFFIX+0]
|
||||
paddd m10, m12 ; cost[4a-d]
|
||||
phaddd m9, m10 ; cost[0a/b,4a/b]
|
||||
|
||||
|
@ -881,14 +913,14 @@ cglobal cdef_dir, 3, 4, 16, src, stride, var, stride3
|
|||
paddw m4, m6
|
||||
paddw m5, m15 ; partial_sum_alt[3] right
|
||||
paddw m4, m14 ; partial_sum_alt[3] left
|
||||
pshuflw m5, m5, q3012
|
||||
punpckhwd m6, m4, m5
|
||||
punpcklwd m4, m5
|
||||
pmaddwd m6, m6
|
||||
pshuflw m6, m5, q3012
|
||||
punpckhwd m5, m4
|
||||
punpcklwd m4, m6
|
||||
pmaddwd m5, m5
|
||||
pmaddwd m4, m4
|
||||
MULLD m6, [div_table+48]
|
||||
MULLD m4, [div_table+32]
|
||||
paddd m4, m6 ; cost[7a-d]
|
||||
MULLD m5, [div_table%+SUFFIX+48]
|
||||
MULLD m4, [div_table%+SUFFIX+32]
|
||||
paddd m4, m5 ; cost[7a-d]
|
||||
|
||||
pslldq m5, m10, 6
|
||||
psrldq m6, m10, 10
|
||||
|
@ -901,14 +933,14 @@ cglobal cdef_dir, 3, 4, 16, src, stride, var, stride3
|
|||
paddw m5, m11
|
||||
paddw m6, m12
|
||||
paddw m5, m13
|
||||
pshuflw m6, m6, q3012
|
||||
punpckhwd m7, m5, m6
|
||||
punpcklwd m5, m6
|
||||
pmaddwd m7, m7
|
||||
pshuflw m7, m6, q3012
|
||||
punpckhwd m6, m5
|
||||
punpcklwd m5, m7
|
||||
pmaddwd m6, m6
|
||||
pmaddwd m5, m5
|
||||
MULLD m7, [div_table+48]
|
||||
MULLD m5, [div_table+32]
|
||||
paddd m5, m7 ; cost[5a-d]
|
||||
MULLD m6, [div_table%+SUFFIX+48]
|
||||
MULLD m5, [div_table%+SUFFIX+32]
|
||||
paddd m5, m6 ; cost[5a-d]
|
||||
|
||||
pslldq m6, m1, 2
|
||||
psrldq m7, m1, 14
|
||||
|
@ -921,14 +953,14 @@ cglobal cdef_dir, 3, 4, 16, src, stride, var, stride3
|
|||
paddw m6, m10
|
||||
paddw m7, m13 ; partial_sum_alt[3] right
|
||||
paddw m6, m12 ; partial_sum_alt[3] left
|
||||
pshuflw m7, m7, q3012
|
||||
punpckhwd m10, m6, m7
|
||||
punpcklwd m6, m7
|
||||
pmaddwd m10, m10
|
||||
pshuflw m10, m7, q3012
|
||||
punpckhwd m7, m6
|
||||
punpcklwd m6, m10
|
||||
pmaddwd m7, m7
|
||||
pmaddwd m6, m6
|
||||
MULLD m10, [div_table+48]
|
||||
MULLD m6, [div_table+32]
|
||||
paddd m6, m10 ; cost[1a-d]
|
||||
MULLD m7, [div_table%+SUFFIX+48]
|
||||
MULLD m6, [div_table%+SUFFIX+32]
|
||||
paddd m6, m7 ; cost[1a-d]
|
||||
|
||||
pshufd m0, m0, q1032
|
||||
pshufd m1, m1, q1032
|
||||
|
@ -946,61 +978,62 @@ cglobal cdef_dir, 3, 4, 16, src, stride, var, stride3
|
|||
paddw m10, m14
|
||||
paddw m11, m2
|
||||
paddw m10, m3
|
||||
pshuflw m11, m11, q3012
|
||||
punpckhwd m12, m10, m11
|
||||
punpcklwd m10, m11
|
||||
pmaddwd m12, m12
|
||||
pshuflw m12, m11, q3012
|
||||
punpckhwd m11, m10
|
||||
punpcklwd m10, m12
|
||||
pmaddwd m11, m11
|
||||
pmaddwd m10, m10
|
||||
MULLD m12, [div_table+48]
|
||||
MULLD m10, [div_table+32]
|
||||
paddd m10, m12 ; cost[3a-d]
|
||||
MULLD m11, [div_table%+SUFFIX+48]
|
||||
MULLD m10, [div_table%+SUFFIX+32]
|
||||
paddd m10, m11 ; cost[3a-d]
|
||||
|
||||
phaddd m0, m9, m8 ; cost[0,4,2,6]
|
||||
phaddd m6, m5
|
||||
phaddd m10, m4
|
||||
phaddd m1, m6, m10 ; cost[1,5,3,7]
|
||||
phaddd m9, m8 ; cost[0,4,2,6]
|
||||
phaddd m6, m10
|
||||
phaddd m5, m4
|
||||
phaddd m6, m5 ; cost[1,3,5,7]
|
||||
pshufd m4, m9, q3120
|
||||
|
||||
pcmpgtd m2, m1, m0 ; [1/5/3/7] > [0/4/2/6]
|
||||
pand m3, m2, m1
|
||||
pandn m4, m2, m0
|
||||
por m3, m4 ; higher 4 values
|
||||
pshufd m1, m1, q2301
|
||||
pshufd m0, m0, q2301
|
||||
pand m1, m2, m1
|
||||
pandn m4, m2, m0
|
||||
por m0, m4, m1 ; 4 values at idx^4 offset
|
||||
pand m14, m2, [pd_0to7+16]
|
||||
pandn m15, m2, [pd_0to7]
|
||||
por m15, m14
|
||||
; now find the best cost
|
||||
%if cpuflag(sse4)
|
||||
pmaxsd m9, m6
|
||||
pshufd m0, m9, q1032
|
||||
pmaxsd m0, m9
|
||||
pshufd m1, m0, q2301
|
||||
pmaxsd m0, m1 ; best cost
|
||||
%else
|
||||
pcmpgtd m0, m9, m6
|
||||
pand m9, m0
|
||||
pandn m0, m6
|
||||
por m9, m0
|
||||
pshufd m1, m9, q1032
|
||||
pcmpgtd m0, m9, m1
|
||||
pand m9, m0
|
||||
pandn m0, m1
|
||||
por m9, m0
|
||||
pshufd m1, m9, q2301
|
||||
pcmpgtd m0, m9, m1
|
||||
pand m9, m0
|
||||
pandn m0, m1
|
||||
por m0, m9
|
||||
%endif
|
||||
|
||||
punpckhqdq m4, m3, m0
|
||||
punpcklqdq m3, m0
|
||||
pcmpgtd m5, m4, m3 ; [2or3-6or7] > [0or1/4or5]
|
||||
punpcklqdq m5, m5
|
||||
pand m6, m5, m4
|
||||
pandn m7, m5, m3
|
||||
por m6, m7 ; { highest 2 values, complements at idx^4 }
|
||||
movhlps m14, m15
|
||||
pand m14, m5, m14
|
||||
pandn m13, m5, m15
|
||||
por m15, m13, m14
|
||||
|
||||
pshufd m7, m6, q3311
|
||||
pcmpgtd m8, m7, m6 ; [4or5or6or7] > [0or1or2or3]
|
||||
punpcklqdq m8, m8
|
||||
pand m9, m8, m7
|
||||
pandn m10, m8, m6
|
||||
por m9, m10 ; max
|
||||
movhlps m10, m9 ; complement at idx^4
|
||||
psubd m9, m10
|
||||
psrld m9, 10
|
||||
movd [varq], m9
|
||||
pshufd m14, m15, q1111
|
||||
pand m14, m8, m14
|
||||
pandn m13, m8, m15
|
||||
por m15, m13, m14
|
||||
movd eax, m15
|
||||
%else
|
||||
; get direction and variance
|
||||
punpckhdq m1, m4, m6
|
||||
punpckldq m4, m6
|
||||
psubd m2, m0, m1
|
||||
psubd m3, m0, m4
|
||||
mova [rsp+0x00], m2 ; emulate ymm in stack
|
||||
mova [rsp+0x10], m3
|
||||
pcmpeqd m1, m0 ; compute best cost mask
|
||||
pcmpeqd m4, m0
|
||||
packssdw m4, m1
|
||||
pmovmskb eax, m4 ; get byte-idx from mask
|
||||
tzcnt eax, eax
|
||||
mov r1d, [rsp+rax*2] ; get idx^4 complement from emulated ymm
|
||||
shr eax, 1 ; get direction by converting byte-idx to word-idx
|
||||
shr r1d, 10
|
||||
mov [varq], r1d
|
||||
%else
|
||||
cglobal cdef_dir, 3, 5, 16, 96, src, stride, var, stride3
|
||||
%define PIC_reg r4
|
||||
LEA PIC_reg, PIC_base_offset
|
||||
|
@ -1065,7 +1098,7 @@ cglobal cdef_dir, 3, 5, 16, 96, src, stride, var, stride3
|
|||
pmaddwd m0, m0
|
||||
|
||||
phaddd m2, m0
|
||||
MULLD m2, [PIC_sym(div_table)+48]
|
||||
MULLD m2, [PIC_sym(div_table%+SUFFIX)+48]
|
||||
mova [esp+0x30], m2
|
||||
|
||||
mova m1, [esp+0x10]
|
||||
|
@ -1103,8 +1136,8 @@ cglobal cdef_dir, 3, 5, 16, 96, src, stride, var, stride3
|
|||
punpcklwd m0, m1
|
||||
pmaddwd m2, m2
|
||||
pmaddwd m0, m0
|
||||
MULLD m2, [PIC_sym(div_table)+16]
|
||||
MULLD m0, [PIC_sym(div_table)+0]
|
||||
MULLD m2, [PIC_sym(div_table%+SUFFIX)+16]
|
||||
MULLD m0, [PIC_sym(div_table%+SUFFIX)+0]
|
||||
paddd m0, m2 ; cost[0a-d]
|
||||
mova [esp+0x40], m0
|
||||
|
||||
|
@ -1144,8 +1177,8 @@ cglobal cdef_dir, 3, 5, 16, 96, src, stride, var, stride3
|
|||
punpcklwd m0, m1
|
||||
pmaddwd m2, m2
|
||||
pmaddwd m0, m0
|
||||
MULLD m2, [PIC_sym(div_table)+16]
|
||||
MULLD m0, [PIC_sym(div_table)+0]
|
||||
MULLD m2, [PIC_sym(div_table%+SUFFIX)+16]
|
||||
MULLD m0, [PIC_sym(div_table%+SUFFIX)+0]
|
||||
paddd m0, m2 ; cost[4a-d]
|
||||
phaddd m1, [esp+0x40], m0 ; cost[0a/b,4a/b]
|
||||
phaddd m1, [esp+0x30] ; cost[0,4,2,6]
|
||||
|
@ -1181,8 +1214,8 @@ cglobal cdef_dir, 3, 5, 16, 96, src, stride, var, stride3
|
|||
punpcklwd m0, m1
|
||||
pmaddwd m2, m2
|
||||
pmaddwd m0, m0
|
||||
MULLD m2, [PIC_sym(div_table)+48]
|
||||
MULLD m0, [PIC_sym(div_table)+32]
|
||||
MULLD m2, [PIC_sym(div_table%+SUFFIX)+48]
|
||||
MULLD m0, [PIC_sym(div_table%+SUFFIX)+32]
|
||||
paddd m0, m2 ; cost[7a-d]
|
||||
mova [esp+0x40], m0
|
||||
|
||||
|
@ -1197,44 +1230,44 @@ cglobal cdef_dir, 3, 5, 16, 96, src, stride, var, stride3
|
|||
paddw m0, m1
|
||||
paddw m7, m4
|
||||
paddw m0, m2
|
||||
pshuflw m7, m7, q3012
|
||||
punpckhwd m2, m0, m7
|
||||
punpcklwd m0, m7
|
||||
pmaddwd m2, m2
|
||||
pshuflw m2, m7, q3012
|
||||
punpckhwd m7, m0
|
||||
punpcklwd m0, m2
|
||||
pmaddwd m7, m7
|
||||
pmaddwd m0, m0
|
||||
MULLD m2, [PIC_sym(div_table)+48]
|
||||
MULLD m0, [PIC_sym(div_table)+32]
|
||||
paddd m0, m2 ; cost[5a-d]
|
||||
MULLD m7, [PIC_sym(div_table%+SUFFIX)+48]
|
||||
MULLD m0, [PIC_sym(div_table%+SUFFIX)+32]
|
||||
paddd m0, m7 ; cost[5a-d]
|
||||
mova [esp+0x50], m0
|
||||
|
||||
mova m1, [esp+0x10]
|
||||
mova m7, [esp+0x10]
|
||||
mova m2, [esp+0x20]
|
||||
pslldq m0, m1, 2
|
||||
psrldq m1, 14
|
||||
pslldq m0, m7, 2
|
||||
psrldq m7, 14
|
||||
pslldq m4, m2, 4
|
||||
psrldq m2, 12
|
||||
pslldq m5, m3, 6
|
||||
psrldq m6, m3, 10
|
||||
paddw m0, [esp+0x00]
|
||||
paddw m1, m2
|
||||
paddw m7, m2
|
||||
paddw m4, m5
|
||||
paddw m1, m6 ; partial_sum_alt[3] right
|
||||
paddw m7, m6 ; partial_sum_alt[3] right
|
||||
paddw m0, m4 ; partial_sum_alt[3] left
|
||||
pshuflw m1, m1, q3012
|
||||
punpckhwd m2, m0, m1
|
||||
punpcklwd m0, m1
|
||||
pmaddwd m2, m2
|
||||
pshuflw m2, m7, q3012
|
||||
punpckhwd m7, m0
|
||||
punpcklwd m0, m2
|
||||
pmaddwd m7, m7
|
||||
pmaddwd m0, m0
|
||||
MULLD m2, [PIC_sym(div_table)+48]
|
||||
MULLD m0, [PIC_sym(div_table)+32]
|
||||
paddd m0, m2 ; cost[1a-d]
|
||||
phaddd m0, [esp+0x50]
|
||||
mova [esp+0x50], m0
|
||||
MULLD m7, [PIC_sym(div_table%+SUFFIX)+48]
|
||||
MULLD m0, [PIC_sym(div_table%+SUFFIX)+32]
|
||||
paddd m0, m7 ; cost[1a-d]
|
||||
SWAP m0, m4
|
||||
|
||||
pshufd m0, [esp+0x00], q1032
|
||||
pshufd m1, [esp+0x10], q1032
|
||||
pshufd m2, [esp+0x20], q1032
|
||||
pshufd m3, m3, q1032
|
||||
mova [esp+0x00], m4
|
||||
|
||||
pslldq m4, m0, 6
|
||||
psrldq m0, 10
|
||||
|
@ -1247,60 +1280,76 @@ cglobal cdef_dir, 3, 5, 16, 96, src, stride, var, stride3
|
|||
paddw m5, m6
|
||||
paddw m0, m2
|
||||
paddw m4, m5
|
||||
pshuflw m0, m0, q3012
|
||||
punpckhwd m2, m4, m0
|
||||
punpcklwd m4, m0
|
||||
pmaddwd m2, m2
|
||||
pshuflw m2, m0, q3012
|
||||
punpckhwd m0, m4
|
||||
punpcklwd m4, m2
|
||||
pmaddwd m0, m0
|
||||
pmaddwd m4, m4
|
||||
MULLD m2, [PIC_sym(div_table)+48]
|
||||
MULLD m4, [PIC_sym(div_table)+32]
|
||||
paddd m4, m2 ; cost[3a-d]
|
||||
phaddd m4, [esp+0x40]
|
||||
MULLD m0, [PIC_sym(div_table%+SUFFIX)+48]
|
||||
MULLD m4, [PIC_sym(div_table%+SUFFIX)+32]
|
||||
paddd m4, m0 ; cost[3a-d]
|
||||
|
||||
mova m1, [esp+0x50]
|
||||
mova m1, [esp+0x00]
|
||||
mova m2, [esp+0x50]
|
||||
mova m0, [esp+0x30] ; cost[0,4,2,6]
|
||||
phaddd m1, m4 ; cost[1,5,3,7]
|
||||
phaddd m1, m4
|
||||
phaddd m2, [esp+0x40] ; cost[1,3,5,7]
|
||||
phaddd m1, m2
|
||||
pshufd m2, m0, q3120
|
||||
|
||||
pcmpgtd m2, m1, m0 ; [1/5/3/7] > [0/4/2/6]
|
||||
pand m3, m2, m1
|
||||
pandn m4, m2, m0
|
||||
por m3, m4 ; higher 4 values
|
||||
pshufd m1, m1, q2301
|
||||
pshufd m0, m0, q2301
|
||||
pand m1, m2, m1
|
||||
pandn m4, m2, m0
|
||||
por m0, m4, m1 ; 4 values at idx^4 offset
|
||||
pand m5, m2, [PIC_sym(pd_0to7)+16]
|
||||
pandn m6, m2, [PIC_sym(pd_0to7)]
|
||||
por m6, m5
|
||||
; now find the best cost
|
||||
%if cpuflag(sse4)
|
||||
pmaxsd m0, m1
|
||||
pshufd m3, m0, q1032
|
||||
pmaxsd m3, m0
|
||||
pshufd m0, m3, q2301
|
||||
pmaxsd m0, m3
|
||||
%else
|
||||
pcmpgtd m3, m0, m1
|
||||
pand m0, m3
|
||||
pandn m3, m1
|
||||
por m0, m3
|
||||
pshufd m4, m0, q1032
|
||||
pcmpgtd m3, m0, m4
|
||||
pand m0, m3
|
||||
pandn m3, m4
|
||||
por m0, m3
|
||||
pshufd m4, m0, q2301
|
||||
pcmpgtd m3, m0, m4
|
||||
pand m0, m3
|
||||
pandn m3, m4
|
||||
por m0, m3
|
||||
%endif
|
||||
|
||||
punpckhqdq m4, m3, m0
|
||||
punpcklqdq m3, m0
|
||||
pcmpgtd m0, m4, m3 ; [2or3-6or7] > [0or1/4or5]
|
||||
punpcklqdq m0, m0
|
||||
pand m1, m0, m4
|
||||
pandn m7, m0, m3
|
||||
por m1, m7 ; { highest 2 values, complements at idx^4 }
|
||||
movhlps m5, m6
|
||||
pand m5, m0, m5
|
||||
pandn m3, m0, m6
|
||||
por m6, m3, m5
|
||||
|
||||
pshufd m7, m1, q3311
|
||||
pcmpgtd m2, m7, m1 ; [4or5or6or7] > [0or1or2or3]
|
||||
punpcklqdq m2, m2
|
||||
pand m0, m2, m7
|
||||
pandn m7, m2, m1
|
||||
por m0, m7 ; max
|
||||
movhlps m7, m0 ; complement at idx^4
|
||||
psubd m0, m7
|
||||
psrld m0, 10
|
||||
movd [varq], m0
|
||||
pshufd m5, m6, q1111
|
||||
pand m5, m2, m5
|
||||
pandn m3, m2, m6
|
||||
por m6, m3, m5
|
||||
movd eax, m6
|
||||
%endif
|
||||
; get direction and variance
|
||||
punpckhdq m3, m2, m1
|
||||
punpckldq m2, m1
|
||||
psubd m1, m0, m3
|
||||
psubd m4, m0, m2
|
||||
mova [esp+0x00], m1 ; emulate ymm in stack
|
||||
mova [esp+0x10], m4
|
||||
pcmpeqd m3, m0 ; compute best cost mask
|
||||
pcmpeqd m2, m0
|
||||
packssdw m2, m3
|
||||
pmovmskb eax, m2 ; get byte-idx from mask
|
||||
tzcnt eax, eax
|
||||
mov r1d, [esp+eax*2] ; get idx^4 complement from emulated ymm
|
||||
shr eax, 1 ; get direction by converting byte-idx to word-idx
|
||||
shr r1d, 10
|
||||
mov [vard], r1d
|
||||
%endif
|
||||
|
||||
RET
|
||||
%endmacro
|
||||
|
||||
INIT_XMM sse4
|
||||
CDEF_FILTER 8, 8, 32
|
||||
CDEF_FILTER 4, 8, 32
|
||||
CDEF_FILTER 4, 4, 32
|
||||
CDEF_DIR
|
||||
|
||||
INIT_XMM ssse3
|
||||
CDEF_FILTER 8, 8, 32
|
||||
CDEF_FILTER 4, 8, 32
|
||||
CDEF_FILTER 4, 4, 32
|
||||
CDEF_DIR
|
|
@ -58,6 +58,7 @@ decl_angular_ipred_fn(dav1d_ipred_dc_top_ssse3);
|
|||
decl_angular_ipred_fn(dav1d_ipred_dc_left_ssse3);
|
||||
decl_angular_ipred_fn(dav1d_ipred_h_ssse3);
|
||||
decl_angular_ipred_fn(dav1d_ipred_v_ssse3);
|
||||
decl_angular_ipred_fn(dav1d_ipred_paeth_ssse3);
|
||||
decl_angular_ipred_fn(dav1d_ipred_smooth_ssse3);
|
||||
decl_angular_ipred_fn(dav1d_ipred_smooth_v_ssse3);
|
||||
decl_angular_ipred_fn(dav1d_ipred_smooth_h_ssse3);
|
||||
|
@ -67,6 +68,10 @@ decl_cfl_pred_fn(dav1d_ipred_cfl_128_ssse3);
|
|||
decl_cfl_pred_fn(dav1d_ipred_cfl_top_ssse3);
|
||||
decl_cfl_pred_fn(dav1d_ipred_cfl_left_ssse3);
|
||||
|
||||
decl_cfl_ac_fn(dav1d_ipred_cfl_ac_420_ssse3);
|
||||
decl_cfl_ac_fn(dav1d_ipred_cfl_ac_422_ssse3);
|
||||
decl_cfl_ac_fn(dav1d_ipred_cfl_ac_444_ssse3);
|
||||
|
||||
decl_pal_pred_fn(dav1d_pal_pred_ssse3);
|
||||
|
||||
void bitfn(dav1d_intra_pred_dsp_init_x86)(Dav1dIntraPredDSPContext *const c) {
|
||||
|
@ -81,6 +86,7 @@ void bitfn(dav1d_intra_pred_dsp_init_x86)(Dav1dIntraPredDSPContext *const c) {
|
|||
c->intra_pred[LEFT_DC_PRED] = dav1d_ipred_dc_left_ssse3;
|
||||
c->intra_pred[HOR_PRED] = dav1d_ipred_h_ssse3;
|
||||
c->intra_pred[VERT_PRED] = dav1d_ipred_v_ssse3;
|
||||
c->intra_pred[PAETH_PRED] = dav1d_ipred_paeth_ssse3;
|
||||
c->intra_pred[SMOOTH_PRED] = dav1d_ipred_smooth_ssse3;
|
||||
c->intra_pred[SMOOTH_V_PRED] = dav1d_ipred_smooth_v_ssse3;
|
||||
c->intra_pred[SMOOTH_H_PRED] = dav1d_ipred_smooth_h_ssse3;
|
||||
|
@ -90,6 +96,10 @@ void bitfn(dav1d_intra_pred_dsp_init_x86)(Dav1dIntraPredDSPContext *const c) {
|
|||
c->cfl_pred[TOP_DC_PRED] = dav1d_ipred_cfl_top_ssse3;
|
||||
c->cfl_pred[LEFT_DC_PRED] = dav1d_ipred_cfl_left_ssse3;
|
||||
|
||||
c->cfl_ac[DAV1D_PIXEL_LAYOUT_I420 - 1] = dav1d_ipred_cfl_ac_420_ssse3;
|
||||
c->cfl_ac[DAV1D_PIXEL_LAYOUT_I422 - 1] = dav1d_ipred_cfl_ac_422_ssse3;
|
||||
c->cfl_ac[DAV1D_PIXEL_LAYOUT_I444 - 1] = dav1d_ipred_cfl_ac_444_ssse3;
|
||||
|
||||
c->pal_pred = dav1d_pal_pred_ssse3;
|
||||
#endif
|
||||
|
||||
|
|
Разница между файлами не показана из-за своего большого размера
Загрузить разницу
|
@ -86,6 +86,17 @@ decl_itx16_fns(16, 4, ssse3);
|
|||
decl_itx16_fns( 8, 16, ssse3);
|
||||
decl_itx16_fns(16, 8, ssse3);
|
||||
decl_itx12_fns(16, 16, ssse3);
|
||||
decl_itx2_fns ( 8, 32, ssse3);
|
||||
decl_itx2_fns (32, 8, ssse3);
|
||||
decl_itx2_fns (16, 32, ssse3);
|
||||
decl_itx2_fns (32, 16, ssse3);
|
||||
decl_itx2_fns (32, 32, ssse3);
|
||||
|
||||
decl_itx_fn(dav1d_inv_txfm_add_dct_dct_16x64_ssse3);
|
||||
decl_itx_fn(dav1d_inv_txfm_add_dct_dct_32x64_ssse3);
|
||||
decl_itx_fn(dav1d_inv_txfm_add_dct_dct_64x16_ssse3);
|
||||
decl_itx_fn(dav1d_inv_txfm_add_dct_dct_64x32_ssse3);
|
||||
decl_itx_fn(dav1d_inv_txfm_add_dct_dct_64x64_ssse3);
|
||||
|
||||
void bitfn(dav1d_itx_dsp_init_x86)(Dav1dInvTxfmDSPContext *const c) {
|
||||
#define assign_itx_fn(pfx, w, h, type, type_enum, ext) \
|
||||
|
@ -138,6 +149,16 @@ void bitfn(dav1d_itx_dsp_init_x86)(Dav1dInvTxfmDSPContext *const c) {
|
|||
assign_itx16_fn(R, 8, 16, ssse3);
|
||||
assign_itx16_fn(R, 16, 8, ssse3);
|
||||
assign_itx12_fn(, 16, 16, ssse3);
|
||||
assign_itx2_fn (R, 8, 32, ssse3);
|
||||
assign_itx2_fn (R, 32, 8, ssse3);
|
||||
assign_itx2_fn (R, 16, 32, ssse3);
|
||||
assign_itx2_fn (R, 32, 16, ssse3);
|
||||
assign_itx2_fn (, 32, 32, ssse3);
|
||||
assign_itx1_fn (R, 16, 64, ssse3);
|
||||
assign_itx1_fn (R, 32, 64, ssse3);
|
||||
assign_itx1_fn (R, 64, 16, ssse3);
|
||||
assign_itx1_fn (R, 64, 32, ssse3);
|
||||
assign_itx1_fn ( , 64, 64, ssse3);
|
||||
#endif
|
||||
|
||||
if (!(flags & DAV1D_X86_CPU_FLAG_AVX2)) return;
|
||||
|
|
Разница между файлами не показана из-за своего большого размера
Загрузить разницу
|
@ -0,0 +1,287 @@
|
|||
; Copyright © 2019, VideoLAN and dav1d authors
|
||||
; Copyright © 2019, Two Orioles, LLC
|
||||
; All rights reserved.
|
||||
;
|
||||
; Redistribution and use in source and binary forms, with or without
|
||||
; modification, are permitted provided that the following conditions are met:
|
||||
;
|
||||
; 1. Redistributions of source code must retain the above copyright notice, this
|
||||
; list of conditions and the following disclaimer.
|
||||
;
|
||||
; 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||
; this list of conditions and the following disclaimer in the documentation
|
||||
; and/or other materials provided with the distribution.
|
||||
;
|
||||
; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
|
||||
; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
|
||||
; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
|
||||
; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
|
||||
; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
|
||||
; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
|
||||
; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
%include "config.asm"
|
||||
%include "ext/x86/x86inc.asm"
|
||||
|
||||
%if ARCH_X86_64
|
||||
|
||||
SECTION_RODATA 64 ; avoids cacheline splits
|
||||
|
||||
; Unlabelled table of descending multiples of 4, placed immediately before
; pw_0xff00. The decoders address it as [pw_0xff00 - n_symbols*2], so lane
; n_symbols-1 of the load always picks up the final 0 entry; vector loads
; wider than n_symbols words run on into pw_0xff00.
dw 60, 56, 52, 48, 44, 40, 36, 32, 28, 24, 20, 16, 12, 8, 4, 0
|
||||
; Per-word mask that keeps only the high byte; applied to the broadcast rng.
pw_0xff00: times 8 dw 0xff00
|
||||
; Subtracted from the offsets above by adapt16 to obtain the offsets for
; its upper eight lanes.
pw_32: times 8 dw 32
|
||||
|
||||
; Field offsets of the decoder state passed as the first argument.
; NOTE(review): presumably mirrors the C MsacContext layout (buf_pos, buf_end,
; dif, rng, cnt, allow_update_cdf) - confirm against src/msac.h.
struc msac
|
||||
; pointer to the next unread input byte (advanced by the refill code)
.buf: resq 1
|
||||
; end-of-input pointer the refill code compares against
.end: resq 1
|
||||
; 64-bit decoding window
.dif: resq 1
|
||||
; current range, 32-bit
.rng: resd 1
|
||||
; 32-bit bit counter; a refill happens when it drops below zero
.cnt: resd 1
|
||||
; non-zero enables CDF adaptation
.update_cdf: resd 1
|
||||
endstruc
|
||||
|
||||
%define m(x) mangle(private_prefix %+ _ %+ x %+ SUFFIX)
|
||||
|
||||
SECTION .text
|
||||
|
||||
; Selects the temporary register t0 and the scratch area `buf` used to spill
; the per-lane values. Both variants address memory relative to rsp without
; allocating stack space.
; NOTE(review): t0 presumably differs because the first argument arrives in
; rcx on WIN64 and cl is needed for shift counts - confirm against x86inc.
%if WIN64
|
||||
DECLARE_REG_TMP 3
|
||||
%define buf rsp+8 ; shadow space
|
||||
%else
|
||||
DECLARE_REG_TMP 0
|
||||
%define buf rsp-40 ; red zone
|
||||
%endif
|
||||
|
||||
INIT_XMM sse2
|
||||
; msac_decode_symbol_adapt4(s, cdf, ns)
;   s   - decoder state, accessed through the msac struc offsets
;   cdf - CDF words; the word at cdf[ns] is read as the adaptation counter
;         and, when adaptation is enabled, rewritten
;   ns  - n_symbols
; Evaluates four CDF words in parallel, optionally adapts the CDF, then
; renormalizes and refills the decoder state. The symbol index is left in
; eax at RET. The .renorm and .renorm2 labels are shared: the adapt8 and
; adapt16 entry points jump into them.
cglobal msac_decode_symbol_adapt4, 3, 7, 6, s, cdf, ns
|
||||
movd m2, [sq+msac.rng]
|
||||
movq m1, [cdfq]
|
||||
lea rax, [pw_0xff00]
|
||||
movq m3, [sq+msac.dif]
|
||||
mov r3d, [sq+msac.update_cdf]
|
||||
; keep a positive copy of ns in r4 for indexing the counter word
|
||||
mov r4d, nsd
|
||||
; negate ns so [rax+nsq*2] addresses the table words below pw_0xff00
|
||||
neg nsq
|
||||
; broadcast the low word of rng to four lanes
|
||||
pshuflw m2, m2, q0000
|
||||
; spill rng so the word at [buf+14] sits right below the v[] array
|
||||
; stored at [buf+16]; .renorm reads it as u when lane 0 is selected
|
||||
movd [buf+12], m2
|
||||
; keep only the high byte of each rng word
|
||||
pand m2, [rax]
|
||||
; m0 preserves the unmodified CDF for the adaptation step
|
||||
mova m0, m1
|
||||
psrlw m1, 6
|
||||
psllw m1, 7
|
||||
; high 16 bits of ((cdf >> 6) << 7) * (rng & 0xff00)
|
||||
pmulhuw m1, m2
|
||||
; per-lane offsets: four words starting ns words below pw_0xff00
|
||||
movq m2, [rax+nsq*2]
|
||||
; broadcast the top 16 bits of dif (c) to four lanes
|
||||
pshuflw m3, m3, q3333
|
||||
paddw m1, m2
|
||||
mova [buf+16], m1
|
||||
; the saturating subtract is zero exactly where v <= c
|
||||
psubusw m1, m3
|
||||
|
||||
pxor m2, m2
|
||||
pcmpeqw m1, m2 ; c >= v
|
||||
; two mask bits per word lane
|
||||
pmovmskb eax, m1
|
||||
test r3d, r3d
|
||||
jz .renorm ; !allow_update_cdf
|
||||
|
||||
; update_cdf:
|
||||
movzx r3d, word [cdfq+r4*2] ; count
|
||||
pcmpeqw m2, m2
|
||||
mov r2d, r3d
|
||||
shr r3d, 4
|
||||
; the carry from this compare is set when ns < 4 and consumed by sbb
|
||||
cmp r4d, 4
|
||||
sbb r3d, -5 ; (count >> 4) + (n_symbols > 3) + 4
|
||||
cmp r2d, 32
|
||||
adc r2d, 0 ; count + (count < 32)
|
||||
movd m3, r3d
|
||||
pavgw m2, m1 ; i >= val ? -1 : 32768
|
||||
psubw m2, m0 ; for (i = 0; i < val; i++)
|
||||
psubw m0, m1 ; cdf[i] += (32768 - cdf[i]) >> rate;
|
||||
psraw m2, m3 ; for (; i < n_symbols - 1; i++)
|
||||
paddw m0, m2 ; cdf[i] += (( -1 - cdf[i]) >> rate) + 1;
|
||||
movq [cdfq], m0
|
||||
; store the updated counter back at cdf[ns]
|
||||
mov [cdfq+r4*2], r2w
|
||||
|
||||
|
||||
; Entry point shared with adapt8: eax holds the pmovmskb mask
|
||||
; (two bits per lane) and v[] is stored at [buf+16].
.renorm:
|
||||
; byte offset of the first lane where c >= v
|
||||
tzcnt eax, eax
|
||||
mov r4, [sq+msac.dif]
|
||||
movzx r1d, word [buf+rax+16] ; v
|
||||
movzx r2d, word [buf+rax+14] ; u
|
||||
; convert the byte offset into the symbol index returned in eax
|
||||
shr eax, 1
|
||||
; Entry point shared with adapt16: expects eax = symbol index, r1d = v,
|
||||
; r2d = u, r4 = dif and sq still valid.
.renorm2:
|
||||
not r4
|
||||
sub r2d, r1d ; rng
|
||||
shl r1, 48
|
||||
add r4, r1 ; ~dif
|
||||
mov r1d, [sq+msac.cnt]
|
||||
; copy s into t0 (when they are different registers) before ecx is
|
||||
; overwritten below
|
||||
movifnidn t0, sq
|
||||
bsr ecx, r2d
|
||||
xor ecx, 15 ; d
|
||||
shl r2d, cl
|
||||
shl r4, cl
|
||||
mov [t0+msac.rng], r2d
|
||||
not r4
|
||||
; cnt -= d
|
||||
sub r1d, ecx
|
||||
jge .end ; no refill required
|
||||
|
||||
|
||||
; refill:
|
||||
mov r2, [t0+msac.buf]
|
||||
mov rcx, [t0+msac.end]
|
||||
lea r5, [r2+8]
|
||||
; take the byte-wise path when fewer than 8 input bytes remain
|
||||
cmp r5, rcx
|
||||
jg .refill_eob
|
||||
; fast path: load 8 bytes at once and byte-swap them
|
||||
mov r2, [r2]
|
||||
lea ecx, [r1+23]
|
||||
add r1d, 16
|
||||
shr ecx, 3 ; shift_bytes
|
||||
bswap r2
|
||||
sub r5, rcx
|
||||
shl ecx, 3 ; shift_bits
|
||||
shr r2, cl
|
||||
sub ecx, r1d ; shift_bits - 16 - cnt
|
||||
mov r1d, 48
|
||||
shl r2, cl
|
||||
mov [t0+msac.buf], r5
|
||||
sub r1d, ecx ; cnt + 64 - shift_bits
|
||||
xor r4, r2
|
||||
.end:
|
||||
mov [t0+msac.cnt], r1d
|
||||
mov [t0+msac.dif], r4
|
||||
RET
|
||||
.refill_eob: ; avoid overreading the input buffer
|
||||
mov r5, rcx
|
||||
mov ecx, 40
|
||||
sub ecx, r1d ; c
|
||||
; xor one input byte at a time into dif at bit position c, stepping c
|
||||
; down by 8 until the input ends or c goes negative
.refill_eob_loop:
|
||||
cmp r2, r5
|
||||
jge .refill_eob_end ; eob reached
|
||||
movzx r1d, byte [r2]
|
||||
inc r2
|
||||
shl r1, cl
|
||||
xor r4, r1
|
||||
sub ecx, 8
|
||||
jge .refill_eob_loop
|
||||
.refill_eob_end:
|
||||
; cnt = 40 - c
|
||||
mov r1d, 40
|
||||
sub r1d, ecx
|
||||
mov [t0+msac.buf], r2
|
||||
mov [t0+msac.dif], r4
|
||||
mov [t0+msac.cnt], r1d
|
||||
RET
|
||||
|
||||
; msac_decode_symbol_adapt8(s, cdf, ns)
; Eight-lane variant of msac_decode_symbol_adapt4: same arguments, same
; scratch layout (rng spilled at [buf+12], v[] at [buf+16]) and the same
; two-bits-per-lane mask in eax, so it finishes by jumping to adapt4's
; .renorm label instead of carrying its own renormalization code.
cglobal msac_decode_symbol_adapt8, 3, 7, 6, s, cdf, ns
|
||||
movd m2, [sq+msac.rng]
|
||||
; unaligned 16-byte load of eight CDF words
|
||||
movu m1, [cdfq]
|
||||
lea rax, [pw_0xff00]
|
||||
movq m3, [sq+msac.dif]
|
||||
mov r3d, [sq+msac.update_cdf]
|
||||
; keep a positive copy of ns in r4 for indexing the counter word
|
||||
mov r4d, nsd
|
||||
neg nsq
|
||||
; broadcast the low word of rng to the low four lanes
|
||||
pshuflw m2, m2, q0000
|
||||
; spill rng so the word at [buf+14] acts as u for lane 0
|
||||
movd [buf+12], m2
|
||||
; duplicate the low half so all eight lanes hold rng
|
||||
punpcklqdq m2, m2
|
||||
; m0 preserves the unmodified CDF for the adaptation step
|
||||
mova m0, m1
|
||||
psrlw m1, 6
|
||||
; keep only the high byte of each rng word
|
||||
pand m2, [rax]
|
||||
psllw m1, 7
|
||||
pmulhuw m1, m2
|
||||
; per-lane offsets: eight words starting ns words below pw_0xff00
|
||||
movu m2, [rax+nsq*2]
|
||||
; broadcast the top 16 bits of dif (c) to the low four lanes ...
|
||||
pshuflw m3, m3, q3333
|
||||
paddw m1, m2
|
||||
; ... and then to all eight
|
||||
punpcklqdq m3, m3
|
||||
mova [buf+16], m1
|
||||
; the saturating subtract is zero exactly where v <= c
|
||||
psubusw m1, m3
|
||||
pxor m2, m2
|
||||
pcmpeqw m1, m2
|
||||
pmovmskb eax, m1
|
||||
test r3d, r3d
|
||||
; adaptation disabled: go straight to the shared renormalization
|
||||
jz m(msac_decode_symbol_adapt4).renorm
|
||||
; CDF adaptation, same arithmetic as in adapt4 but on eight lanes
|
||||
movzx r3d, word [cdfq+r4*2]
|
||||
pcmpeqw m2, m2
|
||||
mov r2d, r3d
|
||||
shr r3d, 4
|
||||
cmp r4d, 4 ; may be called with n_symbols < 4
|
||||
; rate = (count >> 4) + 5 - (ns < 4), using the carry from the cmp above
|
||||
sbb r3d, -5
|
||||
cmp r2d, 32
|
||||
; count + (count < 32)
|
||||
adc r2d, 0
|
||||
movd m3, r3d
|
||||
pavgw m2, m1
|
||||
psubw m2, m0
|
||||
psubw m0, m1
|
||||
psraw m2, m3
|
||||
paddw m0, m2
|
||||
movu [cdfq], m0
|
||||
; store the updated counter back at cdf[ns]
|
||||
mov [cdfq+r4*2], r2w
|
||||
jmp m(msac_decode_symbol_adapt4).renorm
|
||||
|
||||
; msac_decode_symbol_adapt16(s, cdf, ns)
; Sixteen-lane variant processed as two 8-word halves (m2/m0 low, m3/m1
; high). The scratch layout differs from adapt4/adapt8: rng is spilled at
; [buf-4] and v[] occupies [buf] .. [buf+31]. The lane mask is packed to one
; bit per lane, so this function has its own .renorm that loads u and v
; and then joins adapt4 at .renorm2.
cglobal msac_decode_symbol_adapt16, 3, 7, 6, s, cdf, ns
|
||||
movd m4, [sq+msac.rng]
|
||||
movu m2, [cdfq]
|
||||
lea rax, [pw_0xff00]
|
||||
movu m3, [cdfq+16]
|
||||
movq m5, [sq+msac.dif]
|
||||
mov r3d, [sq+msac.update_cdf]
|
||||
; keep a positive copy of ns in r4 for indexing the counter word
|
||||
mov r4d, nsd
|
||||
neg nsq
|
||||
%if WIN64
|
||||
sub rsp, 48 ; need 36 bytes, shadow space is only 32
|
||||
%endif
|
||||
pshuflw m4, m4, q0000
|
||||
; spill rng so the word at [buf-2] acts as u for lane 0
|
||||
movd [buf-4], m4
|
||||
punpcklqdq m4, m4
|
||||
; m0/m1 preserve the unmodified CDF halves for the adaptation step
|
||||
mova m0, m2
|
||||
psrlw m2, 6
|
||||
mova m1, m3
|
||||
psrlw m3, 6
|
||||
; keep only the high byte of each rng word
|
||||
pand m4, [rax]
|
||||
psllw m2, 7
|
||||
psllw m3, 7
|
||||
pmulhuw m2, m4
|
||||
pmulhuw m3, m4
|
||||
; offsets for the low eight lanes: words starting ns words below pw_0xff00
|
||||
movu m4, [rax+nsq*2]
|
||||
pshuflw m5, m5, q3333
|
||||
paddw m2, m4
|
||||
; offsets for the high eight lanes are the low ones minus 32 (pw_32)
|
||||
psubw m4, [rax-pw_0xff00+pw_32]
|
||||
punpcklqdq m5, m5
|
||||
paddw m3, m4
|
||||
mova [buf], m2
|
||||
mova [buf+16], m3
|
||||
; the saturating subtracts are zero exactly where v <= c
|
||||
psubusw m2, m5
|
||||
psubusw m3, m5
|
||||
pxor m4, m4
|
||||
pcmpeqw m2, m4
|
||||
pcmpeqw m3, m4
|
||||
; narrow the 16 word masks to bytes so pmovmskb yields one bit per lane
|
||||
packsswb m5, m2, m3
|
||||
pmovmskb eax, m5
|
||||
test r3d, r3d
|
||||
jz .renorm
|
||||
; CDF adaptation on both halves
|
||||
movzx r3d, word [cdfq+r4*2]
|
||||
pcmpeqw m4, m4
|
||||
mova m5, m4
|
||||
; rate = (count + 80) >> 4, i.e. (count >> 4) + 5
|
||||
lea r2d, [r3+80] ; only support n_symbols >= 4
|
||||
shr r2d, 4
|
||||
cmp r3d, 32
|
||||
; count + (count < 32)
|
||||
adc r3d, 0
|
||||
pavgw m4, m2
|
||||
pavgw m5, m3
|
||||
psubw m4, m0
|
||||
psubw m0, m2
|
||||
; m2 is free now and is reused as the shift count
|
||||
movd m2, r2d
|
||||
psubw m5, m1
|
||||
psubw m1, m3
|
||||
psraw m4, m2
|
||||
psraw m5, m2
|
||||
paddw m0, m4
|
||||
paddw m1, m5
|
||||
movu [cdfq], m0
|
||||
movu [cdfq+16], m1
|
||||
; store the updated counter back at cdf[ns]
|
||||
mov [cdfq+r4*2], r3w
|
||||
.renorm:
|
||||
; index of the first lane where c >= v; this is already the symbol index
|
||||
tzcnt eax, eax
|
||||
mov r4, [sq+msac.dif]
|
||||
; v
|
||||
movzx r1d, word [buf+rax*2]
|
||||
; u (the previous lane, or the spilled rng for lane 0)
|
||||
movzx r2d, word [buf+rax*2-2]
|
||||
%if WIN64
|
||||
add rsp, 48
|
||||
%endif
|
||||
jmp m(msac_decode_symbol_adapt4).renorm2
|
||||
|
||||
%endif
|
|
@ -62,6 +62,7 @@ static const struct {
|
|||
const char *name;
|
||||
void (*func)(void);
|
||||
} tests[] = {
|
||||
{ "msac", checkasm_check_msac },
|
||||
#if CONFIG_8BPC
|
||||
{ "cdef_8bpc", checkasm_check_cdef_8bpc },
|
||||
{ "ipred_8bpc", checkasm_check_ipred_8bpc },
|
||||
|
|
|
@ -57,6 +57,7 @@ int xor128_rand(void);
|
|||
name##_8bpc(void); \
|
||||
name##_16bpc(void)
|
||||
|
||||
void checkasm_check_msac(void);
|
||||
decl_check_bitfns(void checkasm_check_cdef);
|
||||
decl_check_bitfns(void checkasm_check_ipred);
|
||||
decl_check_bitfns(void checkasm_check_itx);
|
||||
|
|
|
@ -0,0 +1,115 @@
|
|||
/*
|
||||
* Copyright © 2019, VideoLAN and dav1d authors
|
||||
* Copyright © 2019, Two Orioles, LLC
|
||||
* All rights reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright notice, this
|
||||
* list of conditions and the following disclaimer.
|
||||
*
|
||||
* 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||
* this list of conditions and the following disclaimer in the documentation
|
||||
* and/or other materials provided with the distribution.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
|
||||
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
|
||||
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
|
||||
* ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
|
||||
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
|
||||
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
|
||||
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
#include "tests/checkasm/checkasm.h"
|
||||
|
||||
#include "src/cpu.h"
|
||||
#include "src/msac.h"
|
||||
|
||||
#include <string.h>
|
||||
|
||||
/* The normal code doesn't use function pointers */
|
||||
/* Common signature of the symbol decoders under test: decoder state, CDF
 * array and symbol count in, unsigned result out. */
typedef unsigned (*decode_symbol_adapt_fn)(MsacContext *s, uint16_t *cdf,
|
||||
size_t n_symbols);
|
||||
|
||||
/* Test-local dispatch table with one decoder entry per vector width
 * (4, 8 and 16), filled in by checkasm_check_msac(). */
typedef struct {
|
||||
decode_symbol_adapt_fn symbol_adapt4;
|
||||
decode_symbol_adapt_fn symbol_adapt8;
|
||||
decode_symbol_adapt_fn symbol_adapt16;
|
||||
} MsacDSPContext;
|
||||
|
||||
/* Fills cdf[0..16] with a random test CDF for n symbols.
 * Entries above index n receive arbitrary values, cdf[n] and cdf[n-1] are
 * set to 0, and the remaining entries are generated from index n-2 down to
 * 0 so that each is at least 1 larger than its successor. */
static void randomize_cdf(uint16_t *const cdf, int n) {
|
||||
for (int i = 16; i > n; i--)
|
||||
cdf[i] = rnd(); /* randomize padding */
|
||||
cdf[n] = cdf[n-1] = 0;
|
||||
/* the modulus bounds each new value by 32768 - n (n as decremented) */
while (--n > 0)
|
||||
cdf[n-1] = cdf[n] + rnd() % (32768 - cdf[n] - n) + 1;
|
||||
}
|
||||
|
||||
/* memcmp() on structs can have weird behavior due to padding etc. */
|
||||
/* Field-by-field comparison of two decoder states.
 * Returns non-zero if any of buf_pos, buf_end, dif, rng, cnt or
 * allow_update_cdf differs, and 0 if all of them match. */
static int msac_cmp(const MsacContext *const a, const MsacContext *const b) {
|
||||
return a->buf_pos != b->buf_pos || a->buf_end != b->buf_end ||
|
||||
a->dif != b->dif || a->rng != b->rng || a->cnt != b->cnt ||
|
||||
a->allow_update_cdf != b->allow_update_cdf;
|
||||
}
|
||||
|
||||
/* Tests c->symbol_adapt<n> for every symbol count ns in [n_min, n_max], once
 * with cdf_update = 0 and once with cdf_update = 1 (passed inverted as the
 * last argument of dav1d_msac_init()).
 * Each round starts the reference and the tested decoder from identical
 * state and identical CDF copies (cdf[0] / cdf[1]), decodes 64 symbols with
 * both, and calls fail() if the returned symbol, the decoder state or the
 * first ns + 1 CDF entries differ. A benchmark is run for the
 * cdf_update == 1, ns == n round.
 * Expands in a scope that must provide c, s_c, s_a, buf and cdf.
 * NOTE(review): check_func, call_ref, call_new, fail and bench_new are
 * presumably the usual checkasm helpers - they are not defined here. */
#define CHECK_SYMBOL_ADAPT(n, n_min, n_max) do { \
|
||||
if (check_func(c->symbol_adapt##n, "msac_decode_symbol_adapt%d", n)) { \
|
||||
for (int cdf_update = 0; cdf_update <= 1; cdf_update++) { \
|
||||
for (int ns = n_min; ns <= n_max; ns++) { \
|
||||
dav1d_msac_init(&s_c, buf, sizeof(buf), !cdf_update); \
|
||||
s_a = s_c; \
|
||||
randomize_cdf(cdf[0], ns); \
|
||||
memcpy(cdf[1], cdf[0], sizeof(*cdf)); \
|
||||
for (int i = 0; i < 64; i++) { \
|
||||
unsigned c_res = call_ref(&s_c, cdf[0], ns); \
|
||||
unsigned a_res = call_new(&s_a, cdf[1], ns); \
|
||||
if (c_res != a_res || msac_cmp(&s_c, &s_a) || \
|
||||
memcmp(cdf[0], cdf[1], sizeof(**cdf) * (ns + 1))) \
|
||||
{ \
|
||||
fail(); \
|
||||
} \
|
||||
} \
|
||||
if (cdf_update && ns == n) \
|
||||
bench_new(&s_a, cdf[0], n); \
|
||||
} \
|
||||
} \
|
||||
} \
|
||||
} while (0)
|
||||
|
||||
/* Runs the decode_symbol_adapt checks for the 4-, 8- and 16-wide decoders
 * in c against the reference, using 1024 bytes of random input, and then
 * reports the result under the name "decode_symbol_adapt". */
static void check_decode_symbol_adapt(MsacDSPContext *const c) {
|
||||
/* Use an aligned CDF buffer for more consistent benchmark
|
||||
* results, and a misaligned one for checking correctness. */
|
||||
ALIGN_STK_16(uint16_t, cdf, 2, [17]);
|
||||
MsacContext s_c, s_a;
|
||||
uint8_t buf[1024];
|
||||
for (int i = 0; i < 1024; i++)
|
||||
buf[i] = rnd();
|
||||
|
||||
declare_func(unsigned, MsacContext *s, uint16_t *cdf, size_t n_symbols);
|
||||
/* arguments: decoder width, smallest and largest symbol count to test */
CHECK_SYMBOL_ADAPT( 4, 1, 5);
|
||||
CHECK_SYMBOL_ADAPT( 8, 1, 8);
|
||||
CHECK_SYMBOL_ADAPT(16, 4, 16);
|
||||
report("decode_symbol_adapt");
|
||||
}
|
||||
|
||||
/* checkasm entry point for the msac tests.
 * Builds a dispatch table that defaults every width to the C decoder,
 * switches to the SSE2 versions on x86-64 builds with asm enabled when the
 * CPU reports SSE2, and runs the decode_symbol_adapt checks on it. */
void checkasm_check_msac(void) {
|
||||
MsacDSPContext c;
|
||||
c.symbol_adapt4 = dav1d_msac_decode_symbol_adapt_c;
|
||||
c.symbol_adapt8 = dav1d_msac_decode_symbol_adapt_c;
|
||||
c.symbol_adapt16 = dav1d_msac_decode_symbol_adapt_c;
|
||||
|
||||
#if ARCH_X86_64 && HAVE_ASM
|
||||
if (dav1d_get_cpu_flags() & DAV1D_X86_CPU_FLAG_SSE2) {
|
||||
c.symbol_adapt4 = dav1d_msac_decode_symbol_adapt4_sse2;
|
||||
c.symbol_adapt8 = dav1d_msac_decode_symbol_adapt8_sse2;
|
||||
c.symbol_adapt16 = dav1d_msac_decode_symbol_adapt16_sse2;
|
||||
}
|
||||
#endif
|
||||
|
||||
check_decode_symbol_adapt(&c);
|
||||
}
|
|
@ -34,7 +34,10 @@ endif
|
|||
libdav1d_nasm_objs_if_needed = []
|
||||
|
||||
if is_asm_enabled
|
||||
checkasm_sources = files('checkasm/checkasm.c')
|
||||
checkasm_sources = files(
|
||||
'checkasm/checkasm.c',
|
||||
'checkasm/msac.c',
|
||||
)
|
||||
|
||||
checkasm_tmpl_sources = files(
|
||||
'checkasm/cdef.c',
|
||||
|
|
Загрузка…
Ссылка в новой задаче