Add SIMD support for CDEF dering for sse2/ssse3 and neon
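The kernels are written once against the portable v128 SIMD abstraction
(av1/common/od_dering_simd.h) and compiled once per target by defining
SIMD_FUNC before including the kernel; runtime dispatch is then wired up
through the rtcd "specialize" lines. As a rough sketch of what that
dispatch amounts to (simplified; the real declarations and CPU-capability
checks live in the generated av1_rtcd.h, and setup_rtcd_internal() below
is an illustrative assumption, not the generated code):

int od_dir_find8_c(const od_dering_in *img, int stride, int32_t *var,
                   int coeff_shift);
int od_dir_find8_sse4_1(const od_dering_in *img, int stride, int32_t *var,
                        int coeff_shift);

int (*od_dir_find8)(const od_dering_in *img, int stride, int32_t *var,
                    int coeff_shift) = od_dir_find8_c;

static void setup_rtcd_internal(int flags) {
  if (flags & HAS_SSE4_1) od_dir_find8 = od_dir_find8_sse4_1;
}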
Change-Id: Ibaaed850ddceba9c3db542eaf4a1c623ce6b412b
This commit is contained in:
Parent: 7faea43653
Commit: b8ff6aaf5d
@@ -851,22 +851,6 @@ specialize qw/aom_lpf_horizontal_4 sse2 neon dspr2 msa/;
add_proto qw/void aom_lpf_horizontal_4_dual/, "uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1";
specialize qw/aom_lpf_horizontal_4_dual sse2 neon dspr2 msa/;

if (aom_config("CONFIG_CDEF") eq "yes") {
  add_proto qw/void aom_clpf_block_hbd/, "uint16_t *dst, const uint16_t *src, int dstride, int sstride, int sizex, int sizey, unsigned int strength, unsigned int bd";
  add_proto qw/void aom_clpf_hblock_hbd/, "uint16_t *dst, const uint16_t *src, int dstride, int sstride, int sizex, int sizey, unsigned int strength, unsigned int bd";
  add_proto qw/void aom_clpf_block/, "uint8_t *dst, const uint16_t *src, int dstride, int sstride, int sizex, int sizey, unsigned int strength, unsigned int bd";
  add_proto qw/void aom_clpf_hblock/, "uint8_t *dst, const uint16_t *src, int dstride, int sstride, int sizex, int sizey, unsigned int strength, unsigned int bd";
  # VS compiling for 32 bit targets does not support vector types in
  # structs as arguments, which makes the v256 type of the intrinsics
  # hard to support, so optimizations for this target are disabled.
  if ($opts{config} !~ /libs-x86-win32-vs.*/) {
    specialize qw/aom_clpf_block_hbd sse2 ssse3 sse4_1 neon/;
    specialize qw/aom_clpf_hblock_hbd sse2 ssse3 sse4_1 neon/;
    specialize qw/aom_clpf_block sse2 ssse3 sse4_1 neon/;
    specialize qw/aom_clpf_hblock sse2 ssse3 sse4_1 neon/;
  }
}

if (aom_config("CONFIG_AOM_HIGHBITDEPTH") eq "yes") {
  add_proto qw/void aom_highbd_lpf_vertical_16/, "uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd";
  specialize qw/aom_highbd_lpf_vertical_16 sse2/;
@@ -231,47 +231,32 @@ if (CONFIG_CDEF)
    "${AOM_ROOT}/av1/common/cdef.c"
    "${AOM_ROOT}/av1/common/cdef.h"
    "${AOM_ROOT}/av1/common/od_dering.c"
    "${AOM_ROOT}/av1/common/od_dering.h")
    "${AOM_ROOT}/av1/common/od_dering.h"
    "${AOM_ROOT}/av1/common/od_dering_simd.h")

set(AOM_AV1_ENCODER_SOURCES
    ${AOM_AV1_ENCODER_SOURCES}
    "${AOM_ROOT}/av1/encoder/clpf_rdo.c"
    "${AOM_ROOT}/av1/encoder/clpf_rdo.h"
    "${AOM_ROOT}/av1/encoder/pickcdef.c")

set(AOM_AV1_COMMON_SSE2_INTRIN
    ${AOM_AV1_COMMON_SSE2_INTRIN}
    "${AOM_ROOT}/av1/common/clpf_sse2.c")
    "${AOM_ROOT}/av1/common/clpf_sse2.c"
    "${AOM_ROOT}/av1/common/od_dering_sse2.c")

set(AOM_AV1_COMMON_SSSE3_INTRIN
    ${AOM_AV1_COMMON_SSSE3_INTRIN}
    "${AOM_ROOT}/av1/common/clpf_ssse3.c")
    "${AOM_ROOT}/av1/common/clpf_ssse3.c"
    "${AOM_ROOT}/av1/common/od_dering_ssse3.c")

set(AOM_AV1_COMMON_SSE4_1_INTRIN
    ${AOM_AV1_COMMON_SSE4_1_INTRIN}
    "${AOM_ROOT}/av1/common/clpf_sse4.c")
    "${AOM_ROOT}/av1/common/clpf_sse4.c"
    "${AOM_ROOT}/av1/common/od_dering_sse4.c")

set(AOM_AV1_COMMON_NEON_INTRIN
    ${AOM_AV1_COMMON_NEON_INTRIN}
    "${AOM_ROOT}/av1/common/clpf_neon.c")

set(AOM_AV1_ENCODER_SSE2_INTRIN
    ${AOM_AV1_ENCODER_SSE2_INTRIN}
    "${AOM_ROOT}/av1/encoder/clpf_rdo_sse2.c")

set(AOM_AV1_ENCODER_SSSE3_INTRIN
    ${AOM_AV1_ENCODER_SSSE3_INTRIN}
    "${AOM_ROOT}/av1/encoder/clpf_rdo_ssse3.c")

set(AOM_AV1_ENCODER_SSE4_1_INTRIN
    ${AOM_AV1_ENCODER_SSE4_1_INTRIN}
    "${AOM_ROOT}/av1/encoder/clpf_rdo_sse4.c"
    "${AOM_ROOT}/av1/common/x86/od_dering_sse4.c"
    "${AOM_ROOT}/av1/common/x86/od_dering_sse4.h")

set(AOM_AV1_ENCODER_NEON_INTRIN
    ${AOM_AV1_ENCODER_NEON_INTRIN}
    "${AOM_ROOT}/av1/encoder/clpf_rdo_neon.c")
    "${AOM_ROOT}/av1/common/clpf_neon.c"
    "${AOM_ROOT}/av1/common/od_dering_neon.c")
endif ()

if (CONFIG_EXT_INTER)
@@ -94,10 +94,13 @@ AV1_COMMON_SRCS-$(HAVE_SSE2) += common/clpf_sse2.c
AV1_COMMON_SRCS-$(HAVE_SSSE3) += common/clpf_ssse3.c
AV1_COMMON_SRCS-$(HAVE_SSE4_1) += common/clpf_sse4.c
AV1_COMMON_SRCS-$(HAVE_NEON) += common/clpf_neon.c
AV1_COMMON_SRCS-$(HAVE_SSE2) += common/od_dering_sse2.c
AV1_COMMON_SRCS-$(HAVE_SSSE3) += common/od_dering_ssse3.c
AV1_COMMON_SRCS-$(HAVE_SSE4_1) += common/od_dering_sse4.c
AV1_COMMON_SRCS-$(HAVE_NEON) += common/od_dering_neon.c
AV1_COMMON_SRCS-yes += common/od_dering.c
AV1_COMMON_SRCS-yes += common/od_dering.h
AV1_COMMON_SRCS-$(HAVE_SSE4_1) += common/x86/od_dering_sse4.c
AV1_COMMON_SRCS-$(HAVE_SSE4_1) += common/x86/od_dering_sse4.h
AV1_COMMON_SRCS-yes += common/od_dering_simd.h
AV1_COMMON_SRCS-yes += common/cdef.c
AV1_COMMON_SRCS-yes += common/cdef.h
endif
@@ -753,14 +753,25 @@ if (aom_config("CONFIG_AOM_HIGHBITDEPTH") eq "yes") {
# Deringing Functions

if (aom_config("CONFIG_CDEF") eq "yes") {
  add_proto qw/void aom_clpf_block_hbd/, "uint16_t *dst, const uint16_t *src, int dstride, int sstride, int sizex, int sizey, unsigned int strength, unsigned int bd";
  add_proto qw/void aom_clpf_hblock_hbd/, "uint16_t *dst, const uint16_t *src, int dstride, int sstride, int sizex, int sizey, unsigned int strength, unsigned int bd";
  add_proto qw/void aom_clpf_block/, "uint8_t *dst, const uint16_t *src, int dstride, int sstride, int sizex, int sizey, unsigned int strength, unsigned int bd";
  add_proto qw/void aom_clpf_hblock/, "uint8_t *dst, const uint16_t *src, int dstride, int sstride, int sizex, int sizey, unsigned int strength, unsigned int bd";
  add_proto qw/int od_dir_find8/, "const od_dering_in *img, int stride, int32_t *var, int coeff_shift";
  specialize qw/od_dir_find8 sse4_1/;

  add_proto qw/int od_filter_dering_direction_4x4/, "uint16_t *y, int ystride, const uint16_t *in, int threshold, int dir";
  specialize qw/od_filter_dering_direction_4x4 sse4_1/;

  add_proto qw/int od_filter_dering_direction_8x8/, "uint16_t *y, int ystride, const uint16_t *in, int threshold, int dir";
  specialize qw/od_filter_dering_direction_8x8 sse4_1/;
  # VS compiling for 32 bit targets does not support vector types in
  # structs as arguments, which makes the v256 type of the intrinsics
  # hard to support, so optimizations for this target are disabled.
  if ($opts{config} !~ /libs-x86-win32-vs.*/) {
    specialize qw/aom_clpf_block_hbd sse2 ssse3 sse4_1 neon/;
    specialize qw/aom_clpf_hblock_hbd sse2 ssse3 sse4_1 neon/;
    specialize qw/aom_clpf_block sse2 ssse3 sse4_1 neon/;
    specialize qw/aom_clpf_hblock sse2 ssse3 sse4_1 neon/;
    specialize qw/od_dir_find8 sse2 ssse3 sse4_1 neon/;
    specialize qw/od_filter_dering_direction_4x4 sse2 ssse3 sse4_1 neon/;
    specialize qw/od_filter_dering_direction_8x8 sse2 ssse3 sse4_1 neon/;
  }
}

# PVQ Functions
@@ -10,7 +10,7 @@
 */

#include "av1/common/clpf.h"
#include "./aom_dsp_rtcd.h"
#include "./av1_rtcd.h"
#include "aom/aom_image.h"
#include "aom_dsp/aom_dsp_common.h"
@@ -9,7 +9,7 @@
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

#include "./aom_dsp_rtcd.h"
#include "./av1_rtcd.h"
#include "aom_ports/mem.h"
#include "aom_ports/bitops.h"
#include "av1/common/clpf_simd_kernel.h"
@@ -56,10 +56,4 @@ void od_dering(uint8_t *dst, int dstride, uint16_t *y, uint16_t *in, int xdec,
               dering_list *dlist, int dering_count, int level,
               int clpf_strength, int clpf_damping, int coeff_shift,
               int skip_dering, int hbd);
int od_filter_dering_direction_4x4_c(uint16_t *y, int ystride,
                                     const uint16_t *in, int threshold,
                                     int dir);
int od_filter_dering_direction_8x8_c(uint16_t *y, int ystride,
                                     const uint16_t *in, int threshold,
                                     int dir);
#endif
@@ -8,7 +8,7 @@
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */
#include "av1/common/od_dering.h"
#ifndef AOM_COMMON_OD_DERING_X86_SSE4_H_
#define AOM_COMMON_OD_DERING_X86_SSE4_H_
#endif  // AOM_COMMON_OD_DERING_X86_SSE4_H_

#include "aom_dsp/aom_simd.h"
#define SIMD_FUNC(name) name##_neon
#include "./od_dering_simd.h"
@@ -0,0 +1,341 @@
/*
 * Copyright (c) 2016, Alliance for Open Media. All rights reserved
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

#include "./av1_rtcd.h"
#include "./od_dering.h"

/* partial A is a 16-bit vector of the form:
   [x8 x7 x6 x5 x4 x3 x2 x1] and partial B has the form:
   [0  y1 y2 y3 y4 y5 y6 y7].
   This function computes (x1^2+y1^2)*C1 + (x2^2+y2^2)*C2 + ...
   (x7^2+y7^2)*C7 + (x8^2+0^2)*C8 where the C1..C8 constants are in const1
   and const2. */
static INLINE v128 fold_mul_and_sum(v128 partiala, v128 partialb, v128 const1,
                                    v128 const2) {
  v128 tmp;
  /* Reverse partial B. */
  partialb = v128_shuffle_8(
      partialb, v128_from_32(0x0f0e0100, 0x03020504, 0x07060908, 0x0b0a0d0c));
  /* Interleave the x and y values of identical indices and pair x8 with 0. */
  tmp = partiala;
  partiala = v128_ziplo_16(partialb, partiala);
  partialb = v128_ziphi_16(partialb, tmp);
  /* Square and add the corresponding x and y values. */
  partiala = v128_madd_s16(partiala, partiala);
  partialb = v128_madd_s16(partialb, partialb);
  /* Multiply by constant. */
  partiala = v128_mullo_s32(partiala, const1);
  partialb = v128_mullo_s32(partialb, const2);
  /* Sum all results. */
  partiala = v128_add_32(partiala, partialb);
  return partiala;
}
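
/* A note on the weights fed to fold_mul_and_sum() by compute_directions()
   further down: each diagonal partial sum covers between 1 and 8 pixels,
   so before the costs can be compared every squared sum is normalised by
   its pixel count N. Multiplying by 840/N keeps the arithmetic in
   integers, 840 being divisible by every N in 1..8: 840, 420, 280, 210
   are 840/1..840/4 and 168, 140, 120, 105 are 840/5..840/8. (This is a
   reading of the constants, added here as an aid; the change itself
   carries no such comment.) */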

static INLINE v128 hsum4(v128 x0, v128 x1, v128 x2, v128 x3) {
  v128 t0, t1, t2, t3;
  t0 = v128_ziplo_32(x1, x0);
  t1 = v128_ziplo_32(x3, x2);
  t2 = v128_ziphi_32(x1, x0);
  t3 = v128_ziphi_32(x3, x2);
  x0 = v128_ziplo_64(t1, t0);
  x1 = v128_ziphi_64(t1, t0);
  x2 = v128_ziplo_64(t3, t2);
  x3 = v128_ziphi_64(t3, t2);
  return v128_add_32(v128_add_32(x0, x1), v128_add_32(x2, x3));
}
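
/* hsum4() above is in effect a 4x4 transpose of 32-bit lanes followed by
   adds: after the zip steps, lane j of each of x0..x3 holds element 0..3
   of input vector j, so the final additions leave lane j containing the
   horizontal sum of input j -- four horizontal sums in one vector. */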

/* Computes cost for directions 0, 5, 6 and 7. We can call this function again
   to compute the remaining directions. */
static INLINE void compute_directions(v128 lines[8], int32_t tmp_cost1[4]) {
  v128 partial4a, partial4b, partial5a, partial5b, partial7a, partial7b;
  v128 partial6;
  v128 tmp;
  /* Partial sums for lines 0 and 1. */
  partial4a = v128_shl_n_byte(lines[0], 14);
  partial4b = v128_shr_n_byte(lines[0], 2);
  partial4a = v128_add_16(partial4a, v128_shl_n_byte(lines[1], 12));
  partial4b = v128_add_16(partial4b, v128_shr_n_byte(lines[1], 4));
  tmp = v128_add_16(lines[0], lines[1]);
  partial5a = v128_shl_n_byte(tmp, 10);
  partial5b = v128_shr_n_byte(tmp, 6);
  partial7a = v128_shl_n_byte(tmp, 4);
  partial7b = v128_shr_n_byte(tmp, 12);
  partial6 = tmp;

  /* Partial sums for lines 2 and 3. */
  partial4a = v128_add_16(partial4a, v128_shl_n_byte(lines[2], 10));
  partial4b = v128_add_16(partial4b, v128_shr_n_byte(lines[2], 6));
  partial4a = v128_add_16(partial4a, v128_shl_n_byte(lines[3], 8));
  partial4b = v128_add_16(partial4b, v128_shr_n_byte(lines[3], 8));
  tmp = v128_add_16(lines[2], lines[3]);
  partial5a = v128_add_16(partial5a, v128_shl_n_byte(tmp, 8));
  partial5b = v128_add_16(partial5b, v128_shr_n_byte(tmp, 8));
  partial7a = v128_add_16(partial7a, v128_shl_n_byte(tmp, 6));
  partial7b = v128_add_16(partial7b, v128_shr_n_byte(tmp, 10));
  partial6 = v128_add_16(partial6, tmp);

  /* Partial sums for lines 4 and 5. */
  partial4a = v128_add_16(partial4a, v128_shl_n_byte(lines[4], 6));
  partial4b = v128_add_16(partial4b, v128_shr_n_byte(lines[4], 10));
  partial4a = v128_add_16(partial4a, v128_shl_n_byte(lines[5], 4));
  partial4b = v128_add_16(partial4b, v128_shr_n_byte(lines[5], 12));
  tmp = v128_add_16(lines[4], lines[5]);
  partial5a = v128_add_16(partial5a, v128_shl_n_byte(tmp, 6));
  partial5b = v128_add_16(partial5b, v128_shr_n_byte(tmp, 10));
  partial7a = v128_add_16(partial7a, v128_shl_n_byte(tmp, 8));
  partial7b = v128_add_16(partial7b, v128_shr_n_byte(tmp, 8));
  partial6 = v128_add_16(partial6, tmp);

  /* Partial sums for lines 6 and 7. */
  partial4a = v128_add_16(partial4a, v128_shl_n_byte(lines[6], 2));
  partial4b = v128_add_16(partial4b, v128_shr_n_byte(lines[6], 14));
  partial4a = v128_add_16(partial4a, lines[7]);
  tmp = v128_add_16(lines[6], lines[7]);
  partial5a = v128_add_16(partial5a, v128_shl_n_byte(tmp, 4));
  partial5b = v128_add_16(partial5b, v128_shr_n_byte(tmp, 12));
  partial7a = v128_add_16(partial7a, v128_shl_n_byte(tmp, 10));
  partial7b = v128_add_16(partial7b, v128_shr_n_byte(tmp, 6));
  partial6 = v128_add_16(partial6, tmp);

  /* Compute costs in terms of partial sums. */
  partial4a =
      fold_mul_and_sum(partial4a, partial4b, v128_from_32(210, 280, 420, 840),
                       v128_from_32(105, 120, 140, 168));
  partial7a =
      fold_mul_and_sum(partial7a, partial7b, v128_from_32(210, 420, 0, 0),
                       v128_from_32(105, 105, 105, 140));
  partial5a =
      fold_mul_and_sum(partial5a, partial5b, v128_from_32(210, 420, 0, 0),
                       v128_from_32(105, 105, 105, 140));
  partial6 = v128_madd_s16(partial6, partial6);
  partial6 = v128_mullo_s32(partial6, v128_dup_32(105));

  partial4a = hsum4(partial4a, partial5a, partial6, partial7a);
  v128_store_unaligned(tmp_cost1, partial4a);
}
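
/* How the byte shifts above accumulate diagonals: v128_shl_n_byte /
   v128_shr_n_byte move whole 16-bit pixels (2 bytes per lane), so
   shifting line k one lane further than line k+1 lines the rows up along
   a 45-degree diagonal before the adds (partial4); shifting once per
   *pair* of lines gives the two half-slope directions (partial5 and
   partial7, mirrored); and no shift at all gives plain column sums
   (partial6). The a/b halves collect the lanes that fall off either end
   of the 128-bit register. (Interpretive note added in editing.) */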

/* transpose and reverse the order of the lines -- equivalent to a 90-degree
   counter-clockwise rotation of the pixels. */
static INLINE void array_reverse_transpose_8x8(v128 *in, v128 *res) {
  const v128 tr0_0 = v128_ziplo_16(in[1], in[0]);
  const v128 tr0_1 = v128_ziplo_16(in[3], in[2]);
  const v128 tr0_2 = v128_ziphi_16(in[1], in[0]);
  const v128 tr0_3 = v128_ziphi_16(in[3], in[2]);
  const v128 tr0_4 = v128_ziplo_16(in[5], in[4]);
  const v128 tr0_5 = v128_ziplo_16(in[7], in[6]);
  const v128 tr0_6 = v128_ziphi_16(in[5], in[4]);
  const v128 tr0_7 = v128_ziphi_16(in[7], in[6]);

  const v128 tr1_0 = v128_ziplo_32(tr0_1, tr0_0);
  const v128 tr1_1 = v128_ziplo_32(tr0_5, tr0_4);
  const v128 tr1_2 = v128_ziphi_32(tr0_1, tr0_0);
  const v128 tr1_3 = v128_ziphi_32(tr0_5, tr0_4);
  const v128 tr1_4 = v128_ziplo_32(tr0_3, tr0_2);
  const v128 tr1_5 = v128_ziplo_32(tr0_7, tr0_6);
  const v128 tr1_6 = v128_ziphi_32(tr0_3, tr0_2);
  const v128 tr1_7 = v128_ziphi_32(tr0_7, tr0_6);

  res[7] = v128_ziplo_64(tr1_1, tr1_0);
  res[6] = v128_ziphi_64(tr1_1, tr1_0);
  res[5] = v128_ziplo_64(tr1_3, tr1_2);
  res[4] = v128_ziphi_64(tr1_3, tr1_2);
  res[3] = v128_ziplo_64(tr1_5, tr1_4);
  res[2] = v128_ziphi_64(tr1_5, tr1_4);
  res[1] = v128_ziplo_64(tr1_7, tr1_6);
  res[0] = v128_ziphi_64(tr1_7, tr1_6);
}

int SIMD_FUNC(od_dir_find8)(const od_dering_in *img, int stride, int32_t *var,
                            int coeff_shift) {
  int i;
  int32_t cost[8];
  int32_t best_cost = 0;
  int best_dir = 0;
  v128 lines[8];
  for (i = 0; i < 8; i++) {
    lines[i] = v128_load_unaligned(&img[i * stride]);
    lines[i] =
        v128_sub_16(v128_shr_s16(lines[i], coeff_shift), v128_dup_16(128));
  }

  /* Compute "mostly vertical" directions. */
  compute_directions(lines, cost + 4);

  array_reverse_transpose_8x8(lines, lines);

  /* Compute "mostly horizontal" directions. */
  compute_directions(lines, cost);

  for (i = 0; i < 8; i++) {
    if (cost[i] > best_cost) {
      best_cost = cost[i];
      best_dir = i;
    }
  }

  /* Difference between the optimal variance and the variance along the
     orthogonal direction. Again, the sum(x^2) terms cancel out. */
  *var = best_cost - cost[(best_dir + 4) & 7];
  /* We'd normally divide by 840, but dividing by 1024 is close enough
     for what we're going to do with this. */
  *var >>= 10;
  return best_dir;
}

static INLINE v128 od_cmplt_abs_epi16(v128 in, v128 threshold) {
  return v128_cmplt_s16(v128_abs_s16(in), threshold);
}
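
/* In the two filter kernels below the threshold test is branchless:
   od_cmplt_abs_epi16() returns an all-ones mask where |p| < threshold,
   and the following v128_and() keeps or zeroes each tap's contribution.
   The 4x4 kernel applies taps {4, 1} at distances off1/off2 along the
   chosen direction (the shift left by 2 is the multiply by 4); the 8x8
   kernel applies taps {3, 2, 1} at off1/off2/off3 (p plus 2p, then 2p,
   then p). */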

int SIMD_FUNC(od_filter_dering_direction_4x4)(uint16_t *y, int ystride,
                                              const uint16_t *in, int threshold,
                                              int dir) {
  int i;
  v128 sum;
  v128 p;
  v128 cmp;
  v128 row;
  v128 res;
  v128 tmp;
  v128 thresh;
  v128 total_abs;
  int off1, off2;
  off1 = OD_DIRECTION_OFFSETS_TABLE[dir][0];
  off2 = OD_DIRECTION_OFFSETS_TABLE[dir][1];
  total_abs = v128_zero();
  thresh = v128_dup_16(threshold);
  for (i = 0; i < 4; i += 2) {
    sum = v128_zero();
    row = v128_from_v64(v64_load_aligned(&in[(i + 1) * OD_FILT_BSTRIDE]),
                        v64_load_aligned(&in[i * OD_FILT_BSTRIDE]));

    /*p = in[i*OD_FILT_BSTRIDE + offset] - row*/
    tmp = v128_from_v64(v64_load_aligned(&in[(i + 1) * OD_FILT_BSTRIDE + off1]),
                        v64_load_aligned(&in[i * OD_FILT_BSTRIDE + off1]));
    p = v128_sub_16(tmp, row);
    /*if (abs(p) < thresh) sum += taps[k]*p*/
    cmp = od_cmplt_abs_epi16(p, thresh);
    p = v128_shl_n_16(p, 2);
    p = v128_and(p, cmp);
    sum = v128_add_16(sum, p);
    /*p = in[i*OD_FILT_BSTRIDE - offset] - row*/
    tmp = v128_from_v64(v64_load_aligned(&in[(i + 1) * OD_FILT_BSTRIDE - off1]),
                        v64_load_aligned(&in[i * OD_FILT_BSTRIDE - off1]));
    p = v128_sub_16(tmp, row);
    /*if (abs(p) < thresh) sum += taps[k]*p1*/
    cmp = od_cmplt_abs_epi16(p, thresh);
    p = v128_shl_n_16(p, 2);
    p = v128_and(p, cmp);
    sum = v128_add_16(sum, p);

    /*p = in[i*OD_FILT_BSTRIDE + offset] - row*/
    tmp = v128_from_v64(v64_load_aligned(&in[(i + 1) * OD_FILT_BSTRIDE + off2]),
                        v64_load_aligned(&in[i * OD_FILT_BSTRIDE + off2]));
    p = v128_sub_16(tmp, row);
    /*if (abs(p) < thresh) sum += taps[k]*p*/
    cmp = od_cmplt_abs_epi16(p, thresh);
    p = v128_and(p, cmp);
    sum = v128_add_16(sum, p);
    /*p = in[i*OD_FILT_BSTRIDE - offset] - row*/
    tmp = v128_from_v64(v64_load_aligned(&in[(i + 1) * OD_FILT_BSTRIDE - off2]),
                        v64_load_aligned(&in[i * OD_FILT_BSTRIDE - off2]));
    p = v128_sub_16(tmp, row);
    /*if (abs(p) < thresh) sum += taps[k]*p1*/
    cmp = od_cmplt_abs_epi16(p, thresh);
    p = v128_and(p, cmp);
    sum = v128_add_16(sum, p);

    /*res = row + ((sum + 8) >> 4)*/
    res = v128_add_16(sum, v128_dup_16(8));
    res = v128_shr_n_s16(res, 4);
    total_abs = v128_add_16(total_abs, v128_abs_s16(res));
    res = v128_add_16(row, res);
    v64_store_aligned(&y[i * ystride], v128_low_v64(res));
    v64_store_aligned(&y[(i + 1) * ystride], v128_high_v64(res));
  }
  return (v128_dotp_s16(total_abs, v128_dup_16(1)) + 2) >> 2;
}

int SIMD_FUNC(od_filter_dering_direction_8x8)(uint16_t *y, int ystride,
                                              const uint16_t *in, int threshold,
                                              int dir) {
  int i;
  v128 sum;
  v128 p;
  v128 cmp;
  v128 row;
  v128 res;
  v128 thresh;
  v128 total_abs;
  int off1, off2, off3;
  off1 = OD_DIRECTION_OFFSETS_TABLE[dir][0];
  off2 = OD_DIRECTION_OFFSETS_TABLE[dir][1];
  off3 = OD_DIRECTION_OFFSETS_TABLE[dir][2];
  total_abs = v128_zero();
  thresh = v128_dup_16(threshold);
  for (i = 0; i < 8; i++) {
    sum = v128_zero();
    row = v128_load_unaligned(&in[i * OD_FILT_BSTRIDE]);

    /*p = in[i*OD_FILT_BSTRIDE + offset] - row*/
    p = v128_sub_16(v128_load_unaligned(&in[i * OD_FILT_BSTRIDE + off1]), row);
    /*if (abs(p) < thresh) sum += taps[k]*p*/
    cmp = od_cmplt_abs_epi16(p, thresh);
    p = v128_add_16(p, v128_shl_n_16(p, 1));
    p = v128_and(p, cmp);
    sum = v128_add_16(sum, p);

    /*p = in[i*OD_FILT_BSTRIDE - offset] - row*/
    p = v128_sub_16(v128_load_unaligned(&in[i * OD_FILT_BSTRIDE - off1]), row);
    /*if (abs(p) < thresh) sum += taps[k]*p1*/
    cmp = od_cmplt_abs_epi16(p, thresh);
    p = v128_add_16(p, v128_shl_n_16(p, 1));
    p = v128_and(p, cmp);
    sum = v128_add_16(sum, p);

    /*p = in[i*OD_FILT_BSTRIDE + offset] - row*/
    p = v128_sub_16(v128_load_unaligned(&in[i * OD_FILT_BSTRIDE + off2]), row);
    /*if (abs(p) < thresh) sum += taps[k]*p*/
    cmp = od_cmplt_abs_epi16(p, thresh);
    p = v128_shl_n_16(p, 1);
    p = v128_and(p, cmp);
    sum = v128_add_16(sum, p);

    /*p = in[i*OD_FILT_BSTRIDE - offset] - row*/
    p = v128_sub_16(v128_load_unaligned(&in[i * OD_FILT_BSTRIDE - off2]), row);
    /*if (abs(p) < thresh) sum += taps[k]*p1*/
    cmp = od_cmplt_abs_epi16(p, thresh);
    p = v128_shl_n_16(p, 1);
    p = v128_and(p, cmp);
    sum = v128_add_16(sum, p);

    /*p = in[i*OD_FILT_BSTRIDE + offset] - row*/
    p = v128_sub_16(v128_load_unaligned(&in[i * OD_FILT_BSTRIDE + off3]), row);
    /*if (abs(p) < thresh) sum += taps[k]*p*/
    cmp = od_cmplt_abs_epi16(p, thresh);
    p = v128_and(p, cmp);
    sum = v128_add_16(sum, p);

    /*p = in[i*OD_FILT_BSTRIDE - offset] - row*/
    p = v128_sub_16(v128_load_unaligned(&in[i * OD_FILT_BSTRIDE - off3]), row);
    /*if (abs(p) < thresh) sum += taps[k]*p1*/
    cmp = od_cmplt_abs_epi16(p, thresh);
    p = v128_and(p, cmp);
    sum = v128_add_16(sum, p);

    /*res = row + ((sum + 8) >> 4)*/
    res = v128_add_16(sum, v128_dup_16(8));
    res = v128_shr_n_s16(res, 4);
    total_abs = v128_add_16(total_abs, v128_abs_s16(res));
    res = v128_add_16(row, res);
    v128_store_unaligned(&y[i * ystride], res);
  }
  return (v128_dotp_s16(total_abs, v128_dup_16(1)) + 8) >> 4;
}
@@ -0,0 +1,14 @@
/*
 * Copyright (c) 2016, Alliance for Open Media. All rights reserved
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

#include "aom_dsp/aom_simd.h"
#define SIMD_FUNC(name) name##_sse2
#include "./od_dering_simd.h"
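
Each of these thin wrappers instantiates the shared kernel under an
ISA-specific name via token pasting; with the define above, the kernel's

  int SIMD_FUNC(od_dir_find8)(...)

preprocesses to

  int od_dir_find8_sse2(...)

which is exactly the symbol the specialize lines in av1_rtcd.pl expect.
The sse4_1 and ssse3 wrappers below, and the neon one earlier, differ
only in the suffix.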
@@ -0,0 +1,14 @@
/*
 * Copyright (c) 2016, Alliance for Open Media. All rights reserved
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

#include "aom_dsp/aom_simd.h"
#define SIMD_FUNC(name) name##_sse4_1
#include "./od_dering_simd.h"
@@ -0,0 +1,14 @@
/*
 * Copyright (c) 2016, Alliance for Open Media. All rights reserved
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

#include "aom_dsp/aom_simd.h"
#define SIMD_FUNC(name) name##_ssse3
#include "./od_dering_simd.h"
@@ -1,387 +0,0 @@
/*
 * Copyright (c) 2016, Alliance for Open Media. All rights reserved
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

#include <smmintrin.h>
#include <emmintrin.h>
#include <tmmintrin.h>

#include "./av1_rtcd.h"
#include "av1/common/x86/od_dering_sse4.h"

/* partial A is a 16-bit vector of the form:
   [x8 x7 x6 x5 x4 x3 x2 x1] and partial B has the form:
   [0  y1 y2 y3 y4 y5 y6 y7].
   This function computes (x1^2+y1^2)*C1 + (x2^2+y2^2)*C2 + ...
   (x7^2+y7^2)*C7 + (x8^2+0^2)*C8 where the C1..C8 constants are in const1
   and const2. */
static INLINE __m128i fold_mul_and_sum(__m128i partiala, __m128i partialb,
                                       __m128i const1, __m128i const2) {
  __m128i tmp;
  /* Reverse partial B. */
  partialb = _mm_shuffle_epi8(
      partialb,
      _mm_set_epi8(15, 14, 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12));
  /* Interleave the x and y values of identical indices and pair x8 with 0. */
  tmp = partiala;
  partiala = _mm_unpacklo_epi16(partiala, partialb);
  partialb = _mm_unpackhi_epi16(tmp, partialb);
  /* Square and add the corresponding x and y values. */
  partiala = _mm_madd_epi16(partiala, partiala);
  partialb = _mm_madd_epi16(partialb, partialb);
  /* Multiply by constant. */
  partiala = _mm_mullo_epi32(partiala, const1);
  partialb = _mm_mullo_epi32(partialb, const2);
  /* Sum all results. */
  partiala = _mm_add_epi32(partiala, partialb);
  return partiala;
}

static INLINE __m128i hsum4(__m128i x0, __m128i x1, __m128i x2, __m128i x3) {
  __m128i t0, t1, t2, t3;
  t0 = _mm_unpacklo_epi32(x0, x1);
  t1 = _mm_unpacklo_epi32(x2, x3);
  t2 = _mm_unpackhi_epi32(x0, x1);
  t3 = _mm_unpackhi_epi32(x2, x3);
  x0 = _mm_unpacklo_epi64(t0, t1);
  x1 = _mm_unpackhi_epi64(t0, t1);
  x2 = _mm_unpacklo_epi64(t2, t3);
  x3 = _mm_unpackhi_epi64(t2, t3);
  return _mm_add_epi32(_mm_add_epi32(x0, x1), _mm_add_epi32(x2, x3));
}

/* Horizontal sum of 8x16-bit unsigned values. */
static INLINE int32_t hsum_epi16(__m128i a) {
  a = _mm_madd_epi16(a, _mm_set1_epi16(1));
  a = _mm_hadd_epi32(a, a);
  a = _mm_hadd_epi32(a, a);
  return _mm_cvtsi128_si32(a);
}

/* Computes cost for directions 0, 5, 6 and 7. We can call this function again
   to compute the remaining directions. */
static INLINE __m128i compute_directions(__m128i lines[8],
                                         int32_t tmp_cost1[4]) {
  __m128i partial4a, partial4b, partial5a, partial5b, partial7a, partial7b;
  __m128i partial6;
  __m128i tmp;
  /* Partial sums for lines 0 and 1. */
  partial4a = _mm_slli_si128(lines[0], 14);
  partial4b = _mm_srli_si128(lines[0], 2);
  partial4a = _mm_add_epi16(partial4a, _mm_slli_si128(lines[1], 12));
  partial4b = _mm_add_epi16(partial4b, _mm_srli_si128(lines[1], 4));
  tmp = _mm_add_epi16(lines[0], lines[1]);
  partial5a = _mm_slli_si128(tmp, 10);
  partial5b = _mm_srli_si128(tmp, 6);
  partial7a = _mm_slli_si128(tmp, 4);
  partial7b = _mm_srli_si128(tmp, 12);
  partial6 = tmp;

  /* Partial sums for lines 2 and 3. */
  partial4a = _mm_add_epi16(partial4a, _mm_slli_si128(lines[2], 10));
  partial4b = _mm_add_epi16(partial4b, _mm_srli_si128(lines[2], 6));
  partial4a = _mm_add_epi16(partial4a, _mm_slli_si128(lines[3], 8));
  partial4b = _mm_add_epi16(partial4b, _mm_srli_si128(lines[3], 8));
  tmp = _mm_add_epi16(lines[2], lines[3]);
  partial5a = _mm_add_epi16(partial5a, _mm_slli_si128(tmp, 8));
  partial5b = _mm_add_epi16(partial5b, _mm_srli_si128(tmp, 8));
  partial7a = _mm_add_epi16(partial7a, _mm_slli_si128(tmp, 6));
  partial7b = _mm_add_epi16(partial7b, _mm_srli_si128(tmp, 10));
  partial6 = _mm_add_epi16(partial6, tmp);

  /* Partial sums for lines 4 and 5. */
  partial4a = _mm_add_epi16(partial4a, _mm_slli_si128(lines[4], 6));
  partial4b = _mm_add_epi16(partial4b, _mm_srli_si128(lines[4], 10));
  partial4a = _mm_add_epi16(partial4a, _mm_slli_si128(lines[5], 4));
  partial4b = _mm_add_epi16(partial4b, _mm_srli_si128(lines[5], 12));
  tmp = _mm_add_epi16(lines[4], lines[5]);
  partial5a = _mm_add_epi16(partial5a, _mm_slli_si128(tmp, 6));
  partial5b = _mm_add_epi16(partial5b, _mm_srli_si128(tmp, 10));
  partial7a = _mm_add_epi16(partial7a, _mm_slli_si128(tmp, 8));
  partial7b = _mm_add_epi16(partial7b, _mm_srli_si128(tmp, 8));
  partial6 = _mm_add_epi16(partial6, tmp);

  /* Partial sums for lines 6 and 7. */
  partial4a = _mm_add_epi16(partial4a, _mm_slli_si128(lines[6], 2));
  partial4b = _mm_add_epi16(partial4b, _mm_srli_si128(lines[6], 14));
  partial4a = _mm_add_epi16(partial4a, lines[7]);
  tmp = _mm_add_epi16(lines[6], lines[7]);
  partial5a = _mm_add_epi16(partial5a, _mm_slli_si128(tmp, 4));
  partial5b = _mm_add_epi16(partial5b, _mm_srli_si128(tmp, 12));
  partial7a = _mm_add_epi16(partial7a, _mm_slli_si128(tmp, 10));
  partial7b = _mm_add_epi16(partial7b, _mm_srli_si128(tmp, 6));
  partial6 = _mm_add_epi16(partial6, tmp);

  /* Compute costs in terms of partial sums. */
  partial4a =
      fold_mul_and_sum(partial4a, partial4b, _mm_set_epi32(210, 280, 420, 840),
                       _mm_set_epi32(105, 120, 140, 168));
  partial7a =
      fold_mul_and_sum(partial7a, partial7b, _mm_set_epi32(210, 420, 0, 0),
                       _mm_set_epi32(105, 105, 105, 140));
  partial5a =
      fold_mul_and_sum(partial5a, partial5b, _mm_set_epi32(210, 420, 0, 0),
                       _mm_set_epi32(105, 105, 105, 140));
  partial6 = _mm_madd_epi16(partial6, partial6);
  partial6 = _mm_mullo_epi32(partial6, _mm_set1_epi32(105));

  partial4a = hsum4(partial4a, partial5a, partial6, partial7a);
  _mm_storeu_si128((__m128i *)tmp_cost1, partial4a);
  return partial4a;
}

/* transpose and reverse the order of the lines -- equivalent to a 90-degree
   counter-clockwise rotation of the pixels. */
static INLINE void array_reverse_transpose_8x8(__m128i *in, __m128i *res) {
  const __m128i tr0_0 = _mm_unpacklo_epi16(in[0], in[1]);
  const __m128i tr0_1 = _mm_unpacklo_epi16(in[2], in[3]);
  const __m128i tr0_2 = _mm_unpackhi_epi16(in[0], in[1]);
  const __m128i tr0_3 = _mm_unpackhi_epi16(in[2], in[3]);
  const __m128i tr0_4 = _mm_unpacklo_epi16(in[4], in[5]);
  const __m128i tr0_5 = _mm_unpacklo_epi16(in[6], in[7]);
  const __m128i tr0_6 = _mm_unpackhi_epi16(in[4], in[5]);
  const __m128i tr0_7 = _mm_unpackhi_epi16(in[6], in[7]);

  const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1);
  const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_4, tr0_5);
  const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1);
  const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_4, tr0_5);
  const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_2, tr0_3);
  const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7);
  const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_2, tr0_3);
  const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7);

  res[7] = _mm_unpacklo_epi64(tr1_0, tr1_1);
  res[6] = _mm_unpackhi_epi64(tr1_0, tr1_1);
  res[5] = _mm_unpacklo_epi64(tr1_2, tr1_3);
  res[4] = _mm_unpackhi_epi64(tr1_2, tr1_3);
  res[3] = _mm_unpacklo_epi64(tr1_4, tr1_5);
  res[2] = _mm_unpackhi_epi64(tr1_4, tr1_5);
  res[1] = _mm_unpacklo_epi64(tr1_6, tr1_7);
  res[0] = _mm_unpackhi_epi64(tr1_6, tr1_7);
}

int od_dir_find8_sse4_1(const od_dering_in *img, int stride, int32_t *var,
                        int coeff_shift) {
  int i;
  int32_t cost[8];
  int32_t best_cost = 0;
  int best_dir = 0;
  __m128i lines[8];
  __m128i dir03, dir47;
  __m128i max;
  for (i = 0; i < 8; i++) {
    lines[i] = _mm_loadu_si128((__m128i *)&img[i * stride]);
    lines[i] = _mm_sub_epi16(_mm_srai_epi16(lines[i], coeff_shift),
                             _mm_set1_epi16(128));
  }

  /* Compute "mostly vertical" directions. */
  dir47 = compute_directions(lines, cost + 4);

  array_reverse_transpose_8x8(lines, lines);

  /* Compute "mostly horizontal" directions. */
  dir03 = compute_directions(lines, cost);

#if 1
  max = _mm_max_epi32(dir03, dir47);
  max = _mm_max_epi32(max, _mm_shuffle_epi32(max, _MM_SHUFFLE(1, 0, 3, 2)));
  max = _mm_max_epi32(max, _mm_shuffle_epi32(max, _MM_SHUFFLE(2, 3, 0, 1)));
  dir03 = _mm_and_si128(_mm_cmpeq_epi32(max, dir03),
                        _mm_setr_epi32(-1, -2, -3, -4));
  dir47 = _mm_and_si128(_mm_cmpeq_epi32(max, dir47),
                        _mm_setr_epi32(-5, -6, -7, -8));
  dir03 = _mm_max_epu32(dir03, dir47);
  dir03 = _mm_max_epu32(dir03, _mm_unpackhi_epi64(dir03, dir03));
  dir03 =
      _mm_max_epu32(dir03, _mm_shufflelo_epi16(dir03, _MM_SHUFFLE(1, 0, 3, 2)));
  dir03 = _mm_xor_si128(dir03, _mm_set1_epi32(0xFFFFFFFF));

  best_dir = _mm_cvtsi128_si32(dir03);
  best_cost = _mm_cvtsi128_si32(max);
#else
  for (i = 0; i < 8; i++) {
    if (cost[i] > best_cost) {
      best_cost = cost[i];
      best_dir = i;
    }
  }
#endif
  /* Difference between the optimal variance and the variance along the
     orthogonal direction. Again, the sum(x^2) terms cancel out. */
  *var = best_cost - cost[(best_dir + 4) & 7];
  /* We'd normally divide by 840, but dividing by 1024 is close enough
     for what we're going to do with this. */
  *var >>= 10;
  return best_dir;
}

static INLINE __m128i od_cmplt_abs_epi16(__m128i in, __m128i threshold) {
  return _mm_cmplt_epi16(_mm_abs_epi16(in), threshold);
}

int od_filter_dering_direction_4x4_sse4_1(uint16_t *y, int ystride,
                                          const uint16_t *in, int threshold,
                                          int dir) {
  int i;
  __m128i sum;
  __m128i p;
  __m128i cmp;
  __m128i row;
  __m128i res;
  __m128i tmp;
  __m128i thresh;
  __m128i total_abs;
  int off1, off2;
  off1 = OD_DIRECTION_OFFSETS_TABLE[dir][0];
  off2 = OD_DIRECTION_OFFSETS_TABLE[dir][1];
  total_abs = _mm_setzero_si128();
  thresh = _mm_set1_epi16(threshold);
  for (i = 0; i < 4; i += 2) {
    sum = _mm_set1_epi16(0);
    row = _mm_unpacklo_epi64(
        _mm_loadl_epi64((__m128i *)&in[i * OD_FILT_BSTRIDE]),
        _mm_loadl_epi64((__m128i *)&in[(i + 1) * OD_FILT_BSTRIDE]));

    /*p = in[i*OD_FILT_BSTRIDE + offset] - row*/
    tmp = _mm_unpacklo_epi64(
        _mm_loadl_epi64((__m128i *)&in[i * OD_FILT_BSTRIDE + off1]),
        _mm_loadl_epi64((__m128i *)&in[(i + 1) * OD_FILT_BSTRIDE + off1]));
    p = _mm_sub_epi16(tmp, row);
    /*if (abs(p) < thresh) sum += taps[k]*p*/
    cmp = od_cmplt_abs_epi16(p, thresh);
    p = _mm_slli_epi16(p, 2);
    p = _mm_and_si128(p, cmp);
    sum = _mm_add_epi16(sum, p);
    /*p = in[i*OD_FILT_BSTRIDE - offset] - row*/
    tmp = _mm_unpacklo_epi64(
        _mm_loadl_epi64((__m128i *)&in[i * OD_FILT_BSTRIDE - off1]),
        _mm_loadl_epi64((__m128i *)&in[(i + 1) * OD_FILT_BSTRIDE - off1]));
    p = _mm_sub_epi16(tmp, row);
    /*if (abs(p) < thresh) sum += taps[k]*p1*/
    cmp = od_cmplt_abs_epi16(p, thresh);
    p = _mm_slli_epi16(p, 2);
    p = _mm_and_si128(p, cmp);
    sum = _mm_add_epi16(sum, p);

    /*p = in[i*OD_FILT_BSTRIDE + offset] - row*/
    tmp = _mm_unpacklo_epi64(
        _mm_loadl_epi64((__m128i *)&in[i * OD_FILT_BSTRIDE + off2]),
        _mm_loadl_epi64((__m128i *)&in[(i + 1) * OD_FILT_BSTRIDE + off2]));
    p = _mm_sub_epi16(tmp, row);
    /*if (abs(p) < thresh) sum += taps[k]*p*/
    cmp = od_cmplt_abs_epi16(p, thresh);
    p = _mm_and_si128(p, cmp);
    sum = _mm_add_epi16(sum, p);
    /*p = in[i*OD_FILT_BSTRIDE - offset] - row*/
    tmp = _mm_unpacklo_epi64(
        _mm_loadl_epi64((__m128i *)&in[i * OD_FILT_BSTRIDE - off2]),
        _mm_loadl_epi64((__m128i *)&in[(i + 1) * OD_FILT_BSTRIDE - off2]));
    p = _mm_sub_epi16(tmp, row);
    /*if (abs(p) < thresh) sum += taps[k]*p1*/
    cmp = od_cmplt_abs_epi16(p, thresh);
    p = _mm_and_si128(p, cmp);
    sum = _mm_add_epi16(sum, p);

    /*res = row + ((sum + 8) >> 4)*/
    res = _mm_add_epi16(sum, _mm_set1_epi16(8));
    res = _mm_srai_epi16(res, 4);
    total_abs = _mm_add_epi16(total_abs, _mm_abs_epi16(res));
    res = _mm_add_epi16(row, res);
    _mm_storel_epi64((__m128i *)&y[i * ystride], res);
    _mm_storel_epi64((__m128i *)&y[(i + 1) * ystride],
                     _mm_unpackhi_epi64(res, res));
  }
  return (hsum_epi16(total_abs) + 2) >> 2;
}

int od_filter_dering_direction_8x8_sse4_1(uint16_t *y, int ystride,
                                          const uint16_t *in, int threshold,
                                          int dir) {
  int i;
  __m128i sum;
  __m128i p;
  __m128i cmp;
  __m128i row;
  __m128i res;
  __m128i thresh;
  __m128i total_abs;
  int off1, off2, off3;
  off1 = OD_DIRECTION_OFFSETS_TABLE[dir][0];
  off2 = OD_DIRECTION_OFFSETS_TABLE[dir][1];
  off3 = OD_DIRECTION_OFFSETS_TABLE[dir][2];
  total_abs = _mm_setzero_si128();
  thresh = _mm_set1_epi16(threshold);
  for (i = 0; i < 8; i++) {
    sum = _mm_set1_epi16(0);
    row = _mm_loadu_si128((__m128i *)&in[i * OD_FILT_BSTRIDE]);

    /*p = in[i*OD_FILT_BSTRIDE + offset] - row*/
    p = _mm_sub_epi16(
        _mm_loadu_si128((__m128i *)&in[i * OD_FILT_BSTRIDE + off1]), row);
    /*if (abs(p) < thresh) sum += taps[k]*p*/
    cmp = od_cmplt_abs_epi16(p, thresh);
    p = _mm_add_epi16(p, _mm_slli_epi16(p, 1));
    p = _mm_and_si128(p, cmp);
    sum = _mm_add_epi16(sum, p);

    /*p = in[i*OD_FILT_BSTRIDE - offset] - row*/
    p = _mm_sub_epi16(
        _mm_loadu_si128((__m128i *)&in[i * OD_FILT_BSTRIDE - off1]), row);
    /*if (abs(p) < thresh) sum += taps[k]*p1*/
    cmp = od_cmplt_abs_epi16(p, thresh);
    p = _mm_add_epi16(p, _mm_slli_epi16(p, 1));
    p = _mm_and_si128(p, cmp);
    sum = _mm_add_epi16(sum, p);

    /*p = in[i*OD_FILT_BSTRIDE + offset] - row*/
    p = _mm_sub_epi16(
        _mm_loadu_si128((__m128i *)&in[i * OD_FILT_BSTRIDE + off2]), row);
    /*if (abs(p) < thresh) sum += taps[k]*p*/
    cmp = od_cmplt_abs_epi16(p, thresh);
    p = _mm_slli_epi16(p, 1);
    p = _mm_and_si128(p, cmp);
    sum = _mm_add_epi16(sum, p);

    /*p = in[i*OD_FILT_BSTRIDE - offset] - row*/
    p = _mm_sub_epi16(
        _mm_loadu_si128((__m128i *)&in[i * OD_FILT_BSTRIDE - off2]), row);
    /*if (abs(p) < thresh) sum += taps[k]*p1*/
    cmp = od_cmplt_abs_epi16(p, thresh);
    p = _mm_slli_epi16(p, 1);
    p = _mm_and_si128(p, cmp);
    sum = _mm_add_epi16(sum, p);

    /*p = in[i*OD_FILT_BSTRIDE + offset] - row*/
    p = _mm_sub_epi16(
        _mm_loadu_si128((__m128i *)&in[i * OD_FILT_BSTRIDE + off3]), row);
    /*if (abs(p) < thresh) sum += taps[k]*p*/
    cmp = od_cmplt_abs_epi16(p, thresh);
    p = _mm_and_si128(p, cmp);
    sum = _mm_add_epi16(sum, p);

    /*p = in[i*OD_FILT_BSTRIDE - offset] - row*/
    p = _mm_sub_epi16(
        _mm_loadu_si128((__m128i *)&in[i * OD_FILT_BSTRIDE - off3]), row);
    /*if (abs(p) < thresh) sum += taps[k]*p1*/
    cmp = od_cmplt_abs_epi16(p, thresh);
    p = _mm_and_si128(p, cmp);
    sum = _mm_add_epi16(sum, p);

    /*res = row + ((sum + 8) >> 4)*/
    res = _mm_add_epi16(sum, _mm_set1_epi16(8));
    res = _mm_srai_epi16(res, 4);
    total_abs = _mm_add_epi16(total_abs, _mm_abs_epi16(res));
    res = _mm_add_epi16(row, res);
    _mm_storeu_si128((__m128i *)&y[i * ystride], res);
  }
  return (hsum_epi16(total_abs) + 8) >> 4;
}
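
A side-by-side reading of the deleted file against the new
od_dering_simd.h shows why the hand-written version could be retired: the
intrinsics map nearly one-to-one onto the portable v128 layer
(_mm_madd_epi16 to v128_madd_s16, _mm_slli_si128 to v128_shl_n_byte, and
_mm_unpacklo_epi16(a, b) to v128_ziplo_16(b, a) with the operand order
swapped). The one piece with no counterpart is the #if 1 vectorised
argmax above; the generic kernel keeps the scalar best-cost loop instead.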
@@ -1,263 +0,0 @@
/*
 * Copyright (c) 2016, Alliance for Open Media. All rights reserved
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

#include "./aom_dsp_rtcd.h"
#include "aom_dsp/aom_simd.h"
#include "aom_ports/mem.h"
#include "aom_ports/bitops.h"
#include "av1/common/clpf_simd_kernel.h"

SIMD_INLINE void clip_sides(v128 *c, v128 *d, v128 *e, v128 *f, int left,
                            int right) {
  DECLARE_ALIGNED(16, static const uint64_t,
                  c_shuff[]) = { 0x0504030201000000LL, 0x0d0c0b0a09080808LL };
  DECLARE_ALIGNED(16, static const uint64_t,
                  d_shuff[]) = { 0x0605040302010000LL, 0x0e0d0c0b0a090808LL };
  DECLARE_ALIGNED(16, static const uint64_t,
                  e_shuff[]) = { 0x0707060504030201LL, 0x0f0f0e0d0c0b0a09LL };
  DECLARE_ALIGNED(16, static const uint64_t,
                  f_shuff[]) = { 0x0707070605040302LL, 0x0f0f0f0e0d0c0b0aLL };

  if (!left) {  // Left clipping
    *c = v128_shuffle_8(*c, v128_load_aligned(c_shuff));
    *d = v128_shuffle_8(*d, v128_load_aligned(d_shuff));
  }
  if (!right) {  // Right clipping
    *e = v128_shuffle_8(*e, v128_load_aligned(e_shuff));
    *f = v128_shuffle_8(*f, v128_load_aligned(f_shuff));
  }
}
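
/* The shuffle constants above do border clamping by index duplication:
   when there is no block to the left, c_shuff/d_shuff repeat each row's
   first byte index in place of the missing neighbours, and e_shuff/f_shuff
   do the same with the last index on the right edge, so the filter taps
   read the edge pixel instead of out-of-block data. (Interpretive note
   added in editing.) */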

SIMD_INLINE void read_two_lines(const uint8_t *rec, const uint8_t *org,
                                int rstride, int ostride, int x0, int y0,
                                int bottom, int right, int y, v128 *o, v128 *r,
                                v128 *a, v128 *b, v128 *c, v128 *d, v128 *e,
                                v128 *f, v128 *g, v128 *h) {
  const v64 k1 = v64_load_aligned(org);
  const v64 k2 = v64_load_aligned(org + ostride);
  const v64 l1 = v64_load_aligned(rec);
  const v64 l2 = v64_load_aligned(rec + rstride);
  const v64 l3 = v64_load_aligned(rec - (y != -y0) * rstride);
  const v64 l4 = v64_load_aligned(rec + ((y != bottom) + 1) * rstride);
  *o = v128_from_v64(k1, k2);
  *r = v128_from_v64(l1, l2);
  *a = v128_from_v64(v64_load_aligned(rec - 2 * (y != -y0) * rstride), l3);
  *b = v128_from_v64(l3, l1);
  *g = v128_from_v64(l2, l4);
  *h = v128_from_v64(l4,
                     v64_load_aligned(rec + (2 * (y != bottom) + 1) * rstride));
  *c = v128_from_v64(v64_load_unaligned(rec - 2 * !!x0),
                     v64_load_unaligned(rec - 2 * !!x0 + rstride));
  *d = v128_from_v64(v64_load_unaligned(rec - !!x0),
                     v64_load_unaligned(rec - !!x0 + rstride));
  *e = v128_from_v64(v64_load_unaligned(rec + !!right),
                     v64_load_unaligned(rec + !!right + rstride));
  *f = v128_from_v64(v64_load_unaligned(rec + 2 * !!right),
                     v64_load_unaligned(rec + 2 * !!right + rstride));
  clip_sides(c, d, e, f, x0, right);
}

void SIMD_FUNC(aom_clpf_detect)(const uint8_t *rec, const uint8_t *org,
                                int rstride, int ostride, int x0, int y0,
                                int width, int height, int *sum0, int *sum1,
                                unsigned int strength, int size,
                                unsigned int dmp) {
  const int bottom = height - 2 - y0;
  const int right = width - 8 - x0;
  ssd128_internal ssd0 = v128_ssd_u8_init();
  ssd128_internal ssd1 = v128_ssd_u8_init();
  int y;

  if (size != 8) {  // Fallback to plain C
    aom_clpf_detect_c(rec, org, rstride, ostride, x0, y0, width, height, sum0,
                      sum1, strength, size, dmp);
    return;
  }

  rec += x0 + y0 * rstride;
  org += x0 + y0 * ostride;

  for (y = 0; y < 8; y += 2) {
    v128 a, b, c, d, e, f, g, h, o, r;
    read_two_lines(rec, org, rstride, ostride, x0, y0, bottom, right, y, &o, &r,
                   &a, &b, &c, &d, &e, &f, &g, &h);
    ssd0 = v128_ssd_u8(ssd0, o, r);
    ssd1 = v128_ssd_u8(ssd1, o,
                       calc_delta(r, a, b, c, d, e, f, g, h, strength, dmp));
    rec += rstride * 2;
    org += ostride * 2;
  }
  *sum0 += v128_ssd_u8_sum(ssd0);
  *sum1 += v128_ssd_u8_sum(ssd1);
}

SIMD_INLINE void calc_delta_multi(v128 r, v128 o, v128 a, v128 b, v128 c,
                                  v128 d, v128 e, v128 f, v128 g, v128 h,
                                  ssd128_internal *ssd1, ssd128_internal *ssd2,
                                  ssd128_internal *ssd3, unsigned int dmp) {
  *ssd1 = v128_ssd_u8(*ssd1, o, calc_delta(r, a, b, c, d, e, f, g, h, 1, dmp));
  *ssd2 = v128_ssd_u8(*ssd2, o, calc_delta(r, a, b, c, d, e, f, g, h, 2, dmp));
  *ssd3 = v128_ssd_u8(*ssd3, o, calc_delta(r, a, b, c, d, e, f, g, h, 4, dmp));
}

// Test multiple filter strengths at once.
void SIMD_FUNC(aom_clpf_detect_multi)(const uint8_t *rec, const uint8_t *org,
                                      int rstride, int ostride, int x0, int y0,
                                      int width, int height, int *sum, int size,
                                      unsigned int dmp) {
  const int bottom = height - 2 - y0;
  const int right = width - 8 - x0;
  ssd128_internal ssd0 = v128_ssd_u8_init();
  ssd128_internal ssd1 = v128_ssd_u8_init();
  ssd128_internal ssd2 = v128_ssd_u8_init();
  ssd128_internal ssd3 = v128_ssd_u8_init();
  int y;

  if (size != 8) {  // Fallback to plain C
    aom_clpf_detect_multi_c(rec, org, rstride, ostride, x0, y0, width, height,
                            sum, size, dmp);
    return;
  }

  rec += x0 + y0 * rstride;
  org += x0 + y0 * ostride;

  for (y = 0; y < 8; y += 2) {
    v128 a, b, c, d, e, f, g, h, o, r;
    read_two_lines(rec, org, rstride, ostride, x0, y0, bottom, right, y, &o, &r,
                   &a, &b, &c, &d, &e, &f, &g, &h);
    ssd0 = v128_ssd_u8(ssd0, o, r);
    calc_delta_multi(r, o, a, b, c, d, e, f, g, h, &ssd1, &ssd2, &ssd3, dmp);
    rec += 2 * rstride;
    org += 2 * ostride;
  }
  sum[0] += v128_ssd_u8_sum(ssd0);
  sum[1] += v128_ssd_u8_sum(ssd1);
  sum[2] += v128_ssd_u8_sum(ssd2);
  sum[3] += v128_ssd_u8_sum(ssd3);
}

#if CONFIG_AOM_HIGHBITDEPTH
SIMD_INLINE void read_two_lines_hbd(const uint16_t *rec, const uint16_t *org,
                                    int rstride, int ostride, int x0, int y0,
                                    int bottom, int right, int y, v128 *o,
                                    v128 *r, v128 *a, v128 *b, v128 *c, v128 *d,
                                    v128 *e, v128 *f, v128 *g, v128 *h,
                                    int shift) {
  const v128 k1 = v128_shr_u16(v128_load_aligned(org), shift);
  const v128 k2 = v128_shr_u16(v128_load_aligned(org + ostride), shift);
  const v128 l1 = v128_shr_u16(v128_load_aligned(rec), shift);
  const v128 l2 = v128_shr_u16(v128_load_aligned(rec + rstride), shift);
  const v128 l3 =
      v128_shr_u16(v128_load_aligned(rec - (y != -y0) * rstride), shift);
  const v128 l4 = v128_shr_u16(
      v128_load_aligned(rec + ((y != bottom) + 1) * rstride), shift);
  *o = v128_unziplo_8(k1, k2);
  *r = v128_unziplo_8(l1, l2);
  *a = v128_unziplo_8(
      v128_shr_u16(v128_load_aligned(rec - 2 * (y != -y0) * rstride), shift),
      l3);
  *b = v128_unziplo_8(l3, l1);
  *g = v128_unziplo_8(l2, l4);
  *h = v128_unziplo_8(
      l4,
      v128_shr_u16(v128_load_unaligned(rec + (2 * (y != bottom) + 1) * rstride),
                   shift));
  *c = v128_unziplo_8(
      v128_shr_u16(v128_load_unaligned(rec - 2 * !!x0), shift),
      v128_shr_u16(v128_load_unaligned(rec - 2 * !!x0 + rstride), shift));
  *d = v128_unziplo_8(
      v128_shr_u16(v128_load_unaligned(rec - !!x0), shift),
      v128_shr_u16(v128_load_unaligned(rec - !!x0 + rstride), shift));
  *e = v128_unziplo_8(
      v128_shr_u16(v128_load_unaligned(rec + !!right), shift),
      v128_shr_u16(v128_load_unaligned(rec + !!right + rstride), shift));
  *f = v128_unziplo_8(
      v128_shr_u16(v128_load_unaligned(rec + 2 * !!right), shift),
      v128_shr_u16(v128_load_unaligned(rec + 2 * !!right + rstride), shift));
  clip_sides(c, d, e, f, x0, right);
}

void SIMD_FUNC(aom_clpf_detect_hbd)(const uint16_t *rec, const uint16_t *org,
                                    int rstride, int ostride, int x0, int y0,
                                    int width, int height, int *sum0, int *sum1,
                                    unsigned int strength, int size,
                                    unsigned int bitdepth,
                                    unsigned int damping) {
  const int shift = bitdepth - 8;
  const int bottom = height - 2 - y0;
  const int right = width - 8 - x0;
  ssd128_internal ssd0 = v128_ssd_u8_init();
  ssd128_internal ssd1 = v128_ssd_u8_init();
  int y;

  if (size != 8) {  // Fallback to plain C
    aom_clpf_detect_hbd_c(rec, org, rstride, ostride, x0, y0, width, height,
                          sum0, sum1, strength, size, bitdepth, damping);
    return;
  }

  rec += x0 + y0 * rstride;
  org += x0 + y0 * ostride;

  for (y = 0; y < 8; y += 2) {
    v128 a, b, c, d, e, f, g, h, o, r;
    read_two_lines_hbd(rec, org, rstride, ostride, x0, y0, bottom, right, y, &o,
                       &r, &a, &b, &c, &d, &e, &f, &g, &h, shift);
    ssd0 = v128_ssd_u8(ssd0, o, r);
    ssd1 = v128_ssd_u8(ssd1, o, calc_delta(r, a, b, c, d, e, f, g, h,
                                           strength >> shift, damping));
    rec += rstride * 2;
    org += ostride * 2;
  }
  *sum0 += v128_ssd_u8_sum(ssd0);
  *sum1 += v128_ssd_u8_sum(ssd1);
}

void SIMD_FUNC(aom_clpf_detect_multi_hbd)(const uint16_t *rec,
                                          const uint16_t *org, int rstride,
                                          int ostride, int x0, int y0,
                                          int width, int height, int *sum,
                                          int size, unsigned int bitdepth,
                                          unsigned int damping) {
  const int bottom = height - 2 - y0;
  const int right = width - 8 - x0;
  ssd128_internal ssd0 = v128_ssd_u8_init();
  ssd128_internal ssd1 = v128_ssd_u8_init();
  ssd128_internal ssd2 = v128_ssd_u8_init();
  ssd128_internal ssd3 = v128_ssd_u8_init();
  int y;

  if (size != 8) {  // Fallback to plain C
    aom_clpf_detect_multi_hbd_c(rec, org, rstride, ostride, x0, y0, width,
                                height, sum, size, bitdepth, damping);
    return;
  }

  rec += x0 + y0 * rstride;
  org += x0 + y0 * ostride;

  for (y = 0; y < 8; y += 2) {
    v128 a, b, c, d, e, f, g, h, o, r;
    read_two_lines_hbd(rec, org, rstride, ostride, x0, y0, bottom, right, y, &o,
                       &r, &a, &b, &c, &d, &e, &f, &g, &h, bitdepth - 8);
    ssd0 = v128_ssd_u8(ssd0, o, r);
    calc_delta_multi(r, o, a, b, c, d, e, f, g, h, &ssd1, &ssd2, &ssd3,
                     damping);
    rec += rstride * 2;
    org += ostride * 2;
  }
  sum[0] += v128_ssd_u8_sum(ssd0);
  sum[1] += v128_ssd_u8_sum(ssd1);
  sum[2] += v128_ssd_u8_sum(ssd2);
  sum[3] += v128_ssd_u8_sum(ssd3);
}
#endif
@@ -15,7 +15,7 @@
#include "third_party/googletest/src/googletest/include/gtest/gtest.h"

#include "./aom_config.h"
#include "./aom_dsp_rtcd.h"
#include "./av1_rtcd.h"
#include "aom_ports/aom_timer.h"
#include "test/acm_random.h"
#include "test/clear_system_state.h"