CDEF: Add damping to dering
high-latency, cpu-used=0:
   PSNR | PSNR Cb | PSNR Cr | PSNR HVS |    SSIM | MS SSIM | CIEDE 2000
-0.1650 |  0.2545 |  0.2977 |  -0.0423 | -0.0947 | -0.0725 |    -0.0365

low-latency, cpu-used=0:
   PSNR | PSNR Cb | PSNR Cr | PSNR HVS |    SSIM | MS SSIM | CIEDE 2000
-0.4006 |  0.0501 | -0.0108 |  -0.1790 | -0.1660 | -0.1992 |    -0.2135

low-latency, cpu-used=4:
   PSNR | PSNR Cb | PSNR Cr | PSNR HVS |    SSIM | MS SSIM | CIEDE 2000
-0.5508 | -0.2445 | -0.2762 |  -0.1981 | -0.2878 | -0.2228 |    -0.3733

Change-Id: Ia20df28c8bbb6182215b02016053af33bd498145
This commit is contained in:
Родитель
1d18460fab
Коммит
8ff52fccfa
|
@ -227,6 +227,7 @@ if (CONFIG_CDEF)
|
|||
"${AOM_ROOT}/av1/common/clpf.c"
|
||||
"${AOM_ROOT}/av1/common/clpf.h"
|
||||
"${AOM_ROOT}/av1/common/clpf_simd.h"
|
||||
"${AOM_ROOT}/av1/common/cdef_simd.h"
|
||||
"${AOM_ROOT}/av1/common/cdef.c"
|
||||
"${AOM_ROOT}/av1/common/cdef.h"
|
||||
"${AOM_ROOT}/av1/common/od_dering.c"
|
||||
|
|
|
@ -89,6 +89,7 @@ ifeq ($(CONFIG_CDEF),yes)
|
|||
AV1_COMMON_SRCS-yes += common/clpf.c
|
||||
AV1_COMMON_SRCS-yes += common/clpf.h
|
||||
AV1_COMMON_SRCS-yes += common/clpf_simd.h
|
||||
AV1_COMMON_SRCS-yes += common/cdef_simd.h
|
||||
AV1_COMMON_SRCS-$(HAVE_SSE2) += common/clpf_sse2.c
|
||||
AV1_COMMON_SRCS-$(HAVE_SSSE3) += common/clpf_ssse3.c
|
||||
AV1_COMMON_SRCS-$(HAVE_SSE4_1) += common/clpf_sse4.c
|
||||
|
|
|
@ -626,8 +626,8 @@ if (aom_config("CONFIG_CDEF") eq "yes") {
|
|||
add_proto qw/void aom_clpf_block/, "uint8_t *dst, const uint16_t *src, int dstride, int sstride, int sizex, int sizey, unsigned int strength, unsigned int bd";
|
||||
add_proto qw/void aom_clpf_hblock/, "uint8_t *dst, const uint16_t *src, int dstride, int sstride, int sizex, int sizey, unsigned int strength, unsigned int bd";
|
||||
add_proto qw/int od_dir_find8/, "const od_dering_in *img, int stride, int32_t *var, int coeff_shift";
|
||||
add_proto qw/void od_filter_dering_direction_4x4/, "uint16_t *y, int ystride, const uint16_t *in, int threshold, int dir";
|
||||
add_proto qw/void od_filter_dering_direction_8x8/, "uint16_t *y, int ystride, const uint16_t *in, int threshold, int dir";
|
||||
add_proto qw/void od_filter_dering_direction_4x4/, "uint16_t *y, int ystride, const uint16_t *in, int threshold, int dir, int damping";
|
||||
add_proto qw/void od_filter_dering_direction_8x8/, "uint16_t *y, int ystride, const uint16_t *in, int threshold, int dir, int damping";
|
||||
|
||||
add_proto qw/void copy_8x8_16bit_to_8bit/, "uint8_t *dst, int dstride, const uint16_t *src, int sstride";
|
||||
add_proto qw/void copy_4x4_16bit_to_8bit/, "uint8_t *dst, int dstride, const uint16_t *src, int sstride";
|
||||
|
|
|
@ -23,6 +23,18 @@
|
|||
#include "av1/common/onyxc_int.h"
|
||||
#include "./od_dering.h"
|
||||
|
||||
static INLINE int sign(int i) { return i < 0 ? -1 : 1; }
|
||||
|
||||
static INLINE int constrain(int diff, int threshold, unsigned int damping) {
|
||||
return threshold
|
||||
? sign(diff) *
|
||||
AOMMIN(
|
||||
abs(diff),
|
||||
AOMMAX(0, threshold - (abs(diff) >>
|
||||
(damping - get_msb(threshold)))))
|
||||
: 0;
|
||||
}
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
|
|
@ -0,0 +1,27 @@
|
|||
/*
|
||||
* Copyright (c) 2016, Alliance for Open Media. All rights reserved
|
||||
*
|
||||
* This source code is subject to the terms of the BSD 2 Clause License and
|
||||
* the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
|
||||
* was not distributed with this source code in the LICENSE file, you can
|
||||
* obtain it at www.aomedia.org/license/software. If the Alliance for Open
|
||||
* Media Patent License 1.0 was not distributed with this source code in the
|
||||
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
|
||||
*/
|
||||
#ifndef AV1_COMMON_CDEF_SIMD_H_
|
||||
#define AV1_COMMON_CDEF_SIMD_H_
|
||||
|
||||
#include "aom_dsp/aom_simd.h"
|
||||
|
||||
// sign(a-b) * min(abs(a-b), max(0, threshold - (abs(a-b) >> adjdamp)))
|
||||
SIMD_INLINE v128 constrain16(v128 a, v128 b, unsigned int threshold,
|
||||
unsigned int adjdamp) {
|
||||
v128 diff = v128_sub_16(a, b);
|
||||
const v128 sign = v128_shr_n_s16(diff, 15);
|
||||
diff = v128_abs_s16(diff);
|
||||
const v128 s =
|
||||
v128_ssub_u16(v128_dup_16(threshold), v128_shr_u16(diff, adjdamp));
|
||||
return v128_xor(v128_add_16(sign, v128_min_s16(diff, s)), sign);
|
||||
}
|
||||
|
||||
#endif // AV1_COMMON_CDEF_SIMD_H_
|
|
@ -9,18 +9,12 @@
|
|||
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
|
||||
*/
|
||||
|
||||
#include "av1/common/clpf.h"
|
||||
#include "./clpf.h"
|
||||
#include "./av1_rtcd.h"
|
||||
#include "./cdef.h"
|
||||
#include "aom/aom_image.h"
|
||||
#include "aom_dsp/aom_dsp_common.h"
|
||||
|
||||
static int sign(int i) { return i < 0 ? -1 : 1; }
|
||||
|
||||
static int constrain(int x, int s, unsigned int damping) {
|
||||
return sign(x) *
|
||||
AOMMIN(abs(x), AOMMAX(0, s - (abs(x) >> (damping - get_msb(s)))));
|
||||
}
|
||||
|
||||
int av1_clpf_sample(int X, int A, int B, int C, int D, int E, int F, int G,
|
||||
int H, int s, unsigned int dmp) {
|
||||
int delta = 1 * constrain(A - X, s, dmp) + 3 * constrain(B - X, s, dmp) +
|
||||
|
|
|
@ -10,8 +10,9 @@
|
|||
*/
|
||||
|
||||
#include "./av1_rtcd.h"
|
||||
#include "aom_ports/mem.h"
|
||||
#include "./cdef_simd.h"
|
||||
#include "aom_ports/bitops.h"
|
||||
#include "aom_ports/mem.h"
|
||||
|
||||
// sign(a - b) * min(abs(a - b), max(0, strength - (abs(a - b) >> adjdamp)))
|
||||
SIMD_INLINE v128 constrain(v256 a, v256 b, unsigned int strength,
|
||||
|
@ -242,17 +243,6 @@ void SIMD_FUNC(aom_clpf_hblock)(uint8_t *dst, const uint16_t *src, int dstride,
|
|||
}
|
||||
}
|
||||
|
||||
// sign(a - b) * min(abs(a - b), max(0, strength - (abs(a - b) >> adjdamp)))
|
||||
SIMD_INLINE v128 constrain_hbd(v128 a, v128 b, unsigned int strength,
|
||||
unsigned int adjdamp) {
|
||||
v128 diff = v128_sub_16(a, b);
|
||||
const v128 sign = v128_shr_n_s16(diff, 15);
|
||||
diff = v128_abs_s16(diff);
|
||||
const v128 s =
|
||||
v128_ssub_u16(v128_dup_16(strength), v128_shr_u16(diff, adjdamp));
|
||||
return v128_xor(v128_add_16(sign, v128_min_s16(diff, s)), sign);
|
||||
}
|
||||
|
||||
// delta = 1/16 * constrain(a, x, s, d) + 3/16 * constrain(b, x, s, d) +
|
||||
// 1/16 * constrain(c, x, s, d) + 3/16 * constrain(d, x, s, d) +
|
||||
// 3/16 * constrain(e, x, s, d) + 1/16 * constrain(f, x, s, d) +
|
||||
|
@ -261,13 +251,12 @@ SIMD_INLINE v128 calc_delta_hbd(v128 x, v128 a, v128 b, v128 c, v128 d, v128 e,
|
|||
v128 f, v128 g, v128 h, unsigned int s,
|
||||
unsigned int dmp) {
|
||||
const v128 bdeg = v128_add_16(
|
||||
v128_add_16(constrain_hbd(b, x, s, dmp), constrain_hbd(d, x, s, dmp)),
|
||||
v128_add_16(constrain_hbd(e, x, s, dmp), constrain_hbd(g, x, s, dmp)));
|
||||
v128_add_16(constrain16(b, x, s, dmp), constrain16(d, x, s, dmp)),
|
||||
v128_add_16(constrain16(e, x, s, dmp), constrain16(g, x, s, dmp)));
|
||||
const v128 delta = v128_add_16(
|
||||
v128_add_16(
|
||||
v128_add_16(constrain_hbd(a, x, s, dmp), constrain_hbd(c, x, s, dmp)),
|
||||
v128_add_16(constrain_hbd(f, x, s, dmp),
|
||||
constrain_hbd(h, x, s, dmp))),
|
||||
v128_add_16(constrain16(a, x, s, dmp), constrain16(c, x, s, dmp)),
|
||||
v128_add_16(constrain16(f, x, s, dmp), constrain16(h, x, s, dmp))),
|
||||
v128_add_16(v128_add_16(bdeg, bdeg), bdeg));
|
||||
return v128_add_16(
|
||||
x,
|
||||
|
@ -297,9 +286,9 @@ static void calc_delta_hbd8(v128 o, v128 a, v128 b, v128 c, v128 d, v128 e,
|
|||
SIMD_INLINE v128 calc_hdelta_hbd(v128 x, v128 a, v128 b, v128 c, v128 d,
|
||||
unsigned int s, unsigned int dmp) {
|
||||
const v128 bc =
|
||||
v128_add_16(constrain_hbd(b, x, s, dmp), constrain_hbd(c, x, s, dmp));
|
||||
v128_add_16(constrain16(b, x, s, dmp), constrain16(c, x, s, dmp));
|
||||
const v128 delta = v128_add_16(
|
||||
v128_add_16(constrain_hbd(a, x, s, dmp), constrain_hbd(d, x, s, dmp)),
|
||||
v128_add_16(constrain16(a, x, s, dmp), constrain16(d, x, s, dmp)),
|
||||
v128_add_16(v128_add_16(bc, bc), bc));
|
||||
return v128_add_16(
|
||||
x,
|
||||
|
|
|
@ -115,7 +115,7 @@ int od_dir_find8_c(const uint16_t *img, int stride, int32_t *var,
|
|||
/* Smooth in the direction detected. */
|
||||
void od_filter_dering_direction_8x8_c(uint16_t *y, int ystride,
|
||||
const uint16_t *in, int threshold,
|
||||
int dir) {
|
||||
int dir, int damping) {
|
||||
int i;
|
||||
int j;
|
||||
int k;
|
||||
|
@ -134,8 +134,8 @@ void od_filter_dering_direction_8x8_c(uint16_t *y, int ystride,
|
|||
xx;
|
||||
p1 = in[i * OD_FILT_BSTRIDE + j - OD_DIRECTION_OFFSETS_TABLE[dir][k]] -
|
||||
xx;
|
||||
if (abs(p0) < threshold) sum += taps[k] * p0;
|
||||
if (abs(p1) < threshold) sum += taps[k] * p1;
|
||||
sum += taps[k] * constrain(p0, threshold, damping);
|
||||
sum += taps[k] * constrain(p1, threshold, damping);
|
||||
}
|
||||
sum = (sum + 8) >> 4;
|
||||
yy = xx + sum;
|
||||
|
@ -147,7 +147,7 @@ void od_filter_dering_direction_8x8_c(uint16_t *y, int ystride,
|
|||
/* Smooth in the direction detected. */
|
||||
void od_filter_dering_direction_4x4_c(uint16_t *y, int ystride,
|
||||
const uint16_t *in, int threshold,
|
||||
int dir) {
|
||||
int dir, int damping) {
|
||||
int i;
|
||||
int j;
|
||||
int k;
|
||||
|
@ -166,8 +166,8 @@ void od_filter_dering_direction_4x4_c(uint16_t *y, int ystride,
|
|||
xx;
|
||||
p1 = in[i * OD_FILT_BSTRIDE + j - OD_DIRECTION_OFFSETS_TABLE[dir][k]] -
|
||||
xx;
|
||||
if (abs(p0) < threshold) sum += taps[k] * p0;
|
||||
if (abs(p1) < threshold) sum += taps[k] * p1;
|
||||
sum += taps[k] * constrain(p0, threshold, damping);
|
||||
sum += taps[k] * constrain(p1, threshold, damping);
|
||||
}
|
||||
sum = (sum + 8) >> 4;
|
||||
yy = xx + sum;
|
||||
|
@ -298,6 +298,7 @@ void od_dering(uint8_t *dst, int dstride, uint16_t *y, uint16_t *in, int xdec,
|
|||
od_filter_dering_direction_func filter_dering_direction[OD_DERINGSIZES] = {
|
||||
od_filter_dering_direction_4x4, od_filter_dering_direction_8x8
|
||||
};
|
||||
clpf_damping += coeff_shift;
|
||||
bsize = OD_DERING_SIZE_LOG2 - xdec;
|
||||
if (!skip_dering) {
|
||||
if (pli == 0) {
|
||||
|
@ -325,7 +326,7 @@ void od_dering(uint8_t *dst, int dstride, uint16_t *y, uint16_t *in, int xdec,
|
|||
(filter_dering_direction[bsize - OD_LOG_BSIZE0])(
|
||||
&y[bi << 2 * bsize], 1 << bsize,
|
||||
&in[(by * OD_FILT_BSTRIDE << bsize) + (bx << bsize)],
|
||||
od_adjust_thresh(threshold, var[by][bx]), dir[by][bx]);
|
||||
od_adjust_thresh(threshold, var[by][bx]), dir[by][bx], 6);
|
||||
}
|
||||
} else {
|
||||
for (bi = 0; bi < dering_count; bi++) {
|
||||
|
@ -334,7 +335,7 @@ void od_dering(uint8_t *dst, int dstride, uint16_t *y, uint16_t *in, int xdec,
|
|||
(filter_dering_direction[bsize - OD_LOG_BSIZE0])(
|
||||
&y[bi << 2 * bsize], 1 << bsize,
|
||||
&in[(by * OD_FILT_BSTRIDE << bsize) + (bx << bsize)], threshold,
|
||||
dir[by][bx]);
|
||||
dir[by][bx], threshold == 0 ? 0 : get_msb(threshold) + 1);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -356,14 +357,14 @@ void od_dering(uint8_t *dst, int dstride, uint16_t *y, uint16_t *in, int xdec,
|
|||
dst ? (uint16_t *)dst + py * dstride + px : &y[bi << 2 * bsize],
|
||||
in + py * OD_FILT_BSTRIDE + px, dst && hbd ? dstride : 1 << bsize,
|
||||
OD_FILT_BSTRIDE, 1 << bsize, 1 << bsize,
|
||||
clpf_strength << coeff_shift, clpf_damping + coeff_shift);
|
||||
clpf_strength << coeff_shift, clpf_damping);
|
||||
} else {
|
||||
// Do clpf and write the result to an 8 bit destination
|
||||
(!threshold || (dir[by][bx] < 4 && dir[by][bx]) ? aom_clpf_block
|
||||
: aom_clpf_hblock)(
|
||||
dst + py * dstride + px, in + py * OD_FILT_BSTRIDE + px, dstride,
|
||||
OD_FILT_BSTRIDE, 1 << bsize, 1 << bsize,
|
||||
clpf_strength << coeff_shift, clpf_damping + coeff_shift);
|
||||
clpf_strength << coeff_shift, clpf_damping);
|
||||
}
|
||||
}
|
||||
} else {
|
||||
|
|
|
@ -41,7 +41,8 @@ typedef struct {
|
|||
|
||||
typedef void (*od_filter_dering_direction_func)(uint16_t *y, int ystride,
|
||||
const uint16_t *in,
|
||||
int threshold, int dir);
|
||||
int threshold, int dir,
|
||||
int damping);
|
||||
void copy_dering_16bit_to_16bit(uint16_t *dst, int dstride, uint16_t *src,
|
||||
dering_list *dlist, int dering_count,
|
||||
int bsize);
|
||||
|
|
|
@ -10,6 +10,7 @@
|
|||
*/
|
||||
|
||||
#include "./av1_rtcd.h"
|
||||
#include "./cdef_simd.h"
|
||||
#include "./od_dering.h"
|
||||
|
||||
/* partial A is a 16-bit vector of the form:
|
||||
|
@ -210,141 +211,109 @@ int SIMD_FUNC(od_dir_find8)(const od_dering_in *img, int stride, int32_t *var,
|
|||
return best_dir;
|
||||
}
|
||||
|
||||
static INLINE v128 od_cmplt_abs_epi16(v128 in, v128 threshold) {
|
||||
return v128_cmplt_s16(v128_abs_s16(in), threshold);
|
||||
}
|
||||
|
||||
void SIMD_FUNC(od_filter_dering_direction_4x4)(uint16_t *y, int ystride,
|
||||
const uint16_t *in,
|
||||
int threshold, int dir) {
|
||||
int threshold, int dir,
|
||||
int damping) {
|
||||
int i;
|
||||
v128 sum;
|
||||
v128 p;
|
||||
v128 cmp;
|
||||
v128 row;
|
||||
v128 res;
|
||||
v128 tmp;
|
||||
v128 thresh;
|
||||
int off1, off2;
|
||||
off1 = OD_DIRECTION_OFFSETS_TABLE[dir][0];
|
||||
off2 = OD_DIRECTION_OFFSETS_TABLE[dir][1];
|
||||
thresh = v128_dup_16(threshold);
|
||||
v128 p0, p1, sum, row, res;
|
||||
int o1 = OD_DIRECTION_OFFSETS_TABLE[dir][0];
|
||||
int o2 = OD_DIRECTION_OFFSETS_TABLE[dir][1];
|
||||
|
||||
if (threshold) damping -= get_msb(threshold);
|
||||
for (i = 0; i < 4; i += 2) {
|
||||
sum = v128_zero();
|
||||
row = v128_from_v64(v64_load_aligned(&in[(i + 1) * OD_FILT_BSTRIDE]),
|
||||
v64_load_aligned(&in[i * OD_FILT_BSTRIDE]));
|
||||
row = v128_from_v64(v64_load_aligned(&in[i * OD_FILT_BSTRIDE]),
|
||||
v64_load_aligned(&in[(i + 1) * OD_FILT_BSTRIDE]));
|
||||
|
||||
/*p = in[i*OD_FILT_BSTRIDE + offset] - row*/
|
||||
tmp = v128_from_v64(v64_load_aligned(&in[(i + 1) * OD_FILT_BSTRIDE + off1]),
|
||||
v64_load_aligned(&in[i * OD_FILT_BSTRIDE + off1]));
|
||||
p = v128_sub_16(tmp, row);
|
||||
/*if (abs(p) < thresh) sum += taps[k]*p*/
|
||||
cmp = od_cmplt_abs_epi16(p, thresh);
|
||||
p = v128_shl_n_16(p, 2);
|
||||
p = v128_and(p, cmp);
|
||||
sum = v128_add_16(sum, p);
|
||||
/*p = in[i*OD_FILT_BSTRIDE - offset] - row*/
|
||||
tmp = v128_from_v64(v64_load_aligned(&in[(i + 1) * OD_FILT_BSTRIDE - off1]),
|
||||
v64_load_aligned(&in[i * OD_FILT_BSTRIDE - off1]));
|
||||
p = v128_sub_16(tmp, row);
|
||||
/*if (abs(p) < thresh) sum += taps[k]*p1*/
|
||||
cmp = od_cmplt_abs_epi16(p, thresh);
|
||||
p = v128_shl_n_16(p, 2);
|
||||
p = v128_and(p, cmp);
|
||||
sum = v128_add_16(sum, p);
|
||||
// p0 = constrain16(in[i*OD_FILT_BSTRIDE + offset], row, threshold, damping)
|
||||
p0 = v128_from_v64(v64_load_unaligned(&in[i * OD_FILT_BSTRIDE + o1]),
|
||||
v64_load_unaligned(&in[(i + 1) * OD_FILT_BSTRIDE + o1]));
|
||||
p0 = constrain16(p0, row, threshold, damping);
|
||||
|
||||
/*p = in[i*OD_FILT_BSTRIDE + offset] - row*/
|
||||
tmp = v128_from_v64(v64_load_aligned(&in[(i + 1) * OD_FILT_BSTRIDE + off2]),
|
||||
v64_load_aligned(&in[i * OD_FILT_BSTRIDE + off2]));
|
||||
p = v128_sub_16(tmp, row);
|
||||
/*if (abs(p) < thresh) sum += taps[k]*p*/
|
||||
cmp = od_cmplt_abs_epi16(p, thresh);
|
||||
p = v128_and(p, cmp);
|
||||
sum = v128_add_16(sum, p);
|
||||
/*p = in[i*OD_FILT_BSTRIDE - offset] - row*/
|
||||
tmp = v128_from_v64(v64_load_aligned(&in[(i + 1) * OD_FILT_BSTRIDE - off2]),
|
||||
v64_load_aligned(&in[i * OD_FILT_BSTRIDE - off2]));
|
||||
p = v128_sub_16(tmp, row);
|
||||
/*if (abs(p) < thresh) sum += taps[k]*p1*/
|
||||
cmp = od_cmplt_abs_epi16(p, thresh);
|
||||
p = v128_and(p, cmp);
|
||||
sum = v128_add_16(sum, p);
|
||||
// p1 = constrain16(in[i*OD_FILT_BSTRIDE - offset], row, threshold, damping)
|
||||
p1 = v128_from_v64(v64_load_unaligned(&in[i * OD_FILT_BSTRIDE - o1]),
|
||||
v64_load_unaligned(&in[(i + 1) * OD_FILT_BSTRIDE - o1]));
|
||||
p1 = constrain16(p1, row, threshold, damping);
|
||||
|
||||
/*res = row + ((sum + 8) >> 4)*/
|
||||
// sum += 4 * (p0 + p1)
|
||||
sum = v128_add_16(sum, v128_shl_n_16(v128_add_16(p0, p1), 2));
|
||||
|
||||
// p0 = constrain16(in[i*OD_FILT_BSTRIDE + offset], row, threshold, damping)
|
||||
p0 = v128_from_v64(v64_load_unaligned(&in[i * OD_FILT_BSTRIDE + o2]),
|
||||
v64_load_unaligned(&in[(i + 1) * OD_FILT_BSTRIDE + o2]));
|
||||
p0 = constrain16(p0, row, threshold, damping);
|
||||
|
||||
// p1 = constrain16(in[i*OD_FILT_BSTRIDE - offset], row, threshold, damping)
|
||||
p1 = v128_from_v64(v64_load_unaligned(&in[i * OD_FILT_BSTRIDE - o2]),
|
||||
v64_load_unaligned(&in[(i + 1) * OD_FILT_BSTRIDE - o2]));
|
||||
p1 = constrain16(p1, row, threshold, damping);
|
||||
|
||||
// sum += 1 * (p0 + p1)
|
||||
sum = v128_add_16(sum, v128_add_16(p0, p1));
|
||||
|
||||
// res = row + ((sum + 8) >> 4)
|
||||
res = v128_add_16(sum, v128_dup_16(8));
|
||||
res = v128_shr_n_s16(res, 4);
|
||||
res = v128_add_16(row, res);
|
||||
v64_store_aligned(&y[i * ystride], v128_low_v64(res));
|
||||
v64_store_aligned(&y[(i + 1) * ystride], v128_high_v64(res));
|
||||
v64_store_aligned(&y[i * ystride], v128_high_v64(res));
|
||||
v64_store_aligned(&y[(i + 1) * ystride], v128_low_v64(res));
|
||||
}
|
||||
}
|
||||
|
||||
void SIMD_FUNC(od_filter_dering_direction_8x8)(uint16_t *y, int ystride,
|
||||
const uint16_t *in,
|
||||
int threshold, int dir) {
|
||||
int threshold, int dir,
|
||||
int damping) {
|
||||
int i;
|
||||
v128 sum;
|
||||
v128 p0, p1;
|
||||
v128 cmp;
|
||||
v128 row;
|
||||
v128 res;
|
||||
v128 thresh;
|
||||
int off1, off2, off3;
|
||||
off1 = OD_DIRECTION_OFFSETS_TABLE[dir][0];
|
||||
off2 = OD_DIRECTION_OFFSETS_TABLE[dir][1];
|
||||
off3 = OD_DIRECTION_OFFSETS_TABLE[dir][2];
|
||||
thresh = v128_dup_16(threshold);
|
||||
v128 sum, p0, p1, row, res;
|
||||
int o1 = OD_DIRECTION_OFFSETS_TABLE[dir][0];
|
||||
int o2 = OD_DIRECTION_OFFSETS_TABLE[dir][1];
|
||||
int o3 = OD_DIRECTION_OFFSETS_TABLE[dir][2];
|
||||
|
||||
if (threshold) damping -= get_msb(threshold);
|
||||
for (i = 0; i < 8; i++) {
|
||||
sum = v128_zero();
|
||||
row = v128_load_aligned(&in[i * OD_FILT_BSTRIDE]);
|
||||
|
||||
/*p0 = in[i*OD_FILT_BSTRIDE + offset] - row*/
|
||||
p0 = v128_sub_16(v128_load_unaligned(&in[i * OD_FILT_BSTRIDE + off1]), row);
|
||||
/*p0 = abs(p0) < thresh ? p0 : 0*/
|
||||
cmp = od_cmplt_abs_epi16(p0, thresh);
|
||||
p0 = v128_and(p0, cmp);
|
||||
// p0 = constrain16(in[i*OD_FILT_BSTRIDE + offset], row, threshold, damping)
|
||||
p0 = v128_load_unaligned(&in[i * OD_FILT_BSTRIDE + o1]);
|
||||
p0 = constrain16(p0, row, threshold, damping);
|
||||
|
||||
/*p1 = in[i*OD_FILT_BSTRIDE - offset] - row*/
|
||||
p1 = v128_sub_16(v128_load_unaligned(&in[i * OD_FILT_BSTRIDE - off1]), row);
|
||||
/*p1 = abs(p1) < thresh ? p1 : 0*/
|
||||
cmp = od_cmplt_abs_epi16(p1, thresh);
|
||||
p1 = v128_and(p1, cmp);
|
||||
/*sum += 3*(p0 + p1)*/
|
||||
// p1 = constrain16(in[i*OD_FILT_BSTRIDE - offset], row, threshold, damping)
|
||||
p1 = v128_load_unaligned(&in[i * OD_FILT_BSTRIDE - o1]);
|
||||
p1 = constrain16(p1, row, threshold, damping);
|
||||
|
||||
// sum += 3 * (p0 + p1)
|
||||
p0 = v128_add_16(p0, p1);
|
||||
p0 = v128_add_16(p0, v128_shl_n_16(p0, 1));
|
||||
sum = v128_add_16(sum, p0);
|
||||
|
||||
/*p0 = in[i*OD_FILT_BSTRIDE + offset] - row*/
|
||||
p0 = v128_sub_16(v128_load_unaligned(&in[i * OD_FILT_BSTRIDE + off2]), row);
|
||||
/*p0 = abs(p0) < thresh ? p0 : 0*/
|
||||
cmp = od_cmplt_abs_epi16(p0, thresh);
|
||||
p0 = v128_and(p0, cmp);
|
||||
// p0 = constrain16(in[i*OD_FILT_BSTRIDE + offset], row, threshold, damping)
|
||||
p0 = v128_load_unaligned(&in[i * OD_FILT_BSTRIDE + o2]);
|
||||
p0 = constrain16(p0, row, threshold, damping);
|
||||
|
||||
/*p1 = in[i*OD_FILT_BSTRIDE - offset] - row*/
|
||||
p1 = v128_sub_16(v128_load_unaligned(&in[i * OD_FILT_BSTRIDE - off2]), row);
|
||||
/*p1 = abs(p1) < thresh ? p1 : 0*/
|
||||
cmp = od_cmplt_abs_epi16(p1, thresh);
|
||||
p1 = v128_and(p1, cmp);
|
||||
/* sum += 2*(p0 + p1)*/
|
||||
// p1 = constrain16(in[i*OD_FILT_BSTRIDE - offset], row, threshold, damping)
|
||||
p1 = v128_load_unaligned(&in[i * OD_FILT_BSTRIDE - o2]);
|
||||
p1 = constrain16(p1, row, threshold, damping);
|
||||
|
||||
// sum += 2 * (p0 + p1)
|
||||
p0 = v128_shl_n_16(v128_add_16(p0, p1), 1);
|
||||
sum = v128_add_16(sum, p0);
|
||||
|
||||
/*p0 = in[i*OD_FILT_BSTRIDE + offset] - row*/
|
||||
p0 = v128_sub_16(v128_load_unaligned(&in[i * OD_FILT_BSTRIDE + off3]), row);
|
||||
/*p0 = abs(p0) < thresh ? p0 : 0*/
|
||||
cmp = od_cmplt_abs_epi16(p0, thresh);
|
||||
p0 = v128_and(p0, cmp);
|
||||
// p0 = constrain16(in[i*OD_FILT_BSTRIDE + offset], row, threshold, damping)
|
||||
p0 = v128_load_unaligned(&in[i * OD_FILT_BSTRIDE + o3]);
|
||||
p0 = constrain16(p0, row, threshold, damping);
|
||||
|
||||
/*p1 = in[i*OD_FILT_BSTRIDE - offset] - row*/
|
||||
p1 = v128_sub_16(v128_load_unaligned(&in[i * OD_FILT_BSTRIDE - off3]), row);
|
||||
/*p1 = abs(p1) < thresh ? p1 : 0*/
|
||||
cmp = od_cmplt_abs_epi16(p1, thresh);
|
||||
p1 = v128_and(p1, cmp);
|
||||
/*sum += (p0 + p1)*/
|
||||
// p1 = constrain16(in[i*OD_FILT_BSTRIDE - offset], row, threshold, damping)
|
||||
p1 = v128_load_unaligned(&in[i * OD_FILT_BSTRIDE - o3]);
|
||||
p1 = constrain16(p1, row, threshold, damping);
|
||||
|
||||
// sum += (p0 + p1)
|
||||
p0 = v128_add_16(p0, p1);
|
||||
sum = v128_add_16(sum, p0);
|
||||
|
||||
/*res = row + ((sum + 8) >> 4)*/
|
||||
// res = row + ((sum + 8) >> 4)
|
||||
res = v128_add_16(sum, v128_dup_16(8));
|
||||
res = v128_shr_n_s16(res, 4);
|
||||
res = v128_add_16(row, res);
|
||||
|
|
|
@ -27,10 +27,9 @@ using libaom_test::ACMRandom;
|
|||
|
||||
namespace {
|
||||
|
||||
typedef void (*dering_dir_t)(uint16_t *y, int ystride, const uint16_t *in,
|
||||
int threshold, int dir);
|
||||
|
||||
typedef std::tr1::tuple<dering_dir_t, dering_dir_t, int> dering_dir_param_t;
|
||||
typedef std::tr1::tuple<od_filter_dering_direction_func,
|
||||
od_filter_dering_direction_func, int>
|
||||
dering_dir_param_t;
|
||||
|
||||
class CDEFDeringDirTest : public ::testing::TestWithParam<dering_dir_param_t> {
|
||||
public:
|
||||
|
@ -45,18 +44,15 @@ class CDEFDeringDirTest : public ::testing::TestWithParam<dering_dir_param_t> {
|
|||
|
||||
protected:
|
||||
int bsize;
|
||||
dering_dir_t dering;
|
||||
dering_dir_t ref_dering;
|
||||
od_filter_dering_direction_func dering;
|
||||
od_filter_dering_direction_func ref_dering;
|
||||
};
|
||||
|
||||
typedef CDEFDeringDirTest CDEFDeringSpeedTest;
|
||||
|
||||
void test_dering(int bsize, int iterations,
|
||||
void (*dering)(uint16_t *y, int ystride, const uint16_t *in,
|
||||
int threshold, int dir),
|
||||
void (*ref_dering)(uint16_t *y, int ystride,
|
||||
const uint16_t *in, int threshold,
|
||||
int dir)) {
|
||||
od_filter_dering_direction_func dering,
|
||||
od_filter_dering_direction_func ref_dering) {
|
||||
const int size = 8;
|
||||
const int ysize = size + 2 * OD_FILT_VBORDER;
|
||||
ACMRandom rnd(ACMRandom::DeterministicSeed());
|
||||
|
@ -67,60 +63,65 @@ void test_dering(int bsize, int iterations,
|
|||
memset(d, 0, sizeof(d));
|
||||
|
||||
int error = 0, threshold = 0, dir;
|
||||
int boundary, depth, bits, level, count, errdepth = 0, errthreshold = 0,
|
||||
errboundary = 0;
|
||||
int boundary, damping, depth, bits, level, count,
|
||||
errdepth = 0, errthreshold = 0, errboundary = 0, errdamping = 0;
|
||||
unsigned int pos = 0;
|
||||
|
||||
for (boundary = 0; boundary < 16; boundary++) {
|
||||
for (depth = 8; depth <= 12; depth += 2) {
|
||||
for (count = 0; count < iterations; count++) {
|
||||
for (level = 0; level < (1 << depth) && !error;
|
||||
level += (1 + 4 * !!boundary) << (depth - 8)) {
|
||||
for (bits = 1; bits <= depth && !error; bits++) {
|
||||
for (unsigned int i = 0; i < sizeof(s) / sizeof(*s); i++)
|
||||
s[i] = clamp((rnd.Rand16() & ((1 << bits) - 1)) + level, 0,
|
||||
(1 << depth) - 1);
|
||||
if (boundary) {
|
||||
if (boundary & 1) { // Left
|
||||
for (int i = 0; i < ysize; i++)
|
||||
for (int j = 0; j < OD_FILT_HBORDER; j++)
|
||||
s[i * OD_FILT_BSTRIDE + j] = OD_DERING_VERY_LARGE;
|
||||
for (damping = 5 + depth - 8; damping < 7 + depth - 8; damping++) {
|
||||
for (count = 0; count < iterations; count++) {
|
||||
for (level = 0; level < (1 << depth) && !error;
|
||||
level += (1 + 4 * !!boundary) << (depth - 8)) {
|
||||
for (bits = 1; bits <= depth && !error; bits++) {
|
||||
for (unsigned int i = 0; i < sizeof(s) / sizeof(*s); i++)
|
||||
s[i] = clamp((rnd.Rand16() & ((1 << bits) - 1)) + level, 0,
|
||||
(1 << depth) - 1);
|
||||
if (boundary) {
|
||||
if (boundary & 1) { // Left
|
||||
for (int i = 0; i < ysize; i++)
|
||||
for (int j = 0; j < OD_FILT_HBORDER; j++)
|
||||
s[i * OD_FILT_BSTRIDE + j] = OD_DERING_VERY_LARGE;
|
||||
}
|
||||
if (boundary & 2) { // Right
|
||||
for (int i = 0; i < ysize; i++)
|
||||
for (int j = OD_FILT_HBORDER + size; j < OD_FILT_BSTRIDE;
|
||||
j++)
|
||||
s[i * OD_FILT_BSTRIDE + j] = OD_DERING_VERY_LARGE;
|
||||
}
|
||||
if (boundary & 4) { // Above
|
||||
for (int i = 0; i < OD_FILT_VBORDER; i++)
|
||||
for (int j = 0; j < OD_FILT_BSTRIDE; j++)
|
||||
s[i * OD_FILT_BSTRIDE + j] = OD_DERING_VERY_LARGE;
|
||||
}
|
||||
if (boundary & 8) { // Below
|
||||
for (int i = OD_FILT_VBORDER + size; i < ysize; i++)
|
||||
for (int j = 0; j < OD_FILT_BSTRIDE; j++)
|
||||
s[i * OD_FILT_BSTRIDE + j] = OD_DERING_VERY_LARGE;
|
||||
}
|
||||
}
|
||||
if (boundary & 2) { // Right
|
||||
for (int i = 0; i < ysize; i++)
|
||||
for (int j = OD_FILT_HBORDER + size; j < OD_FILT_BSTRIDE; j++)
|
||||
s[i * OD_FILT_BSTRIDE + j] = OD_DERING_VERY_LARGE;
|
||||
}
|
||||
if (boundary & 4) { // Above
|
||||
for (int i = 0; i < OD_FILT_VBORDER; i++)
|
||||
for (int j = 0; j < OD_FILT_BSTRIDE; j++)
|
||||
s[i * OD_FILT_BSTRIDE + j] = OD_DERING_VERY_LARGE;
|
||||
}
|
||||
if (boundary & 8) { // Below
|
||||
for (int i = OD_FILT_VBORDER + size; i < ysize; i++)
|
||||
for (int j = 0; j < OD_FILT_BSTRIDE; j++)
|
||||
s[i * OD_FILT_BSTRIDE + j] = OD_DERING_VERY_LARGE;
|
||||
}
|
||||
}
|
||||
for (dir = 0; dir < 8; dir++) {
|
||||
for (threshold = 0; threshold < 64 << (depth - 8) && !error;
|
||||
threshold += !error << (depth - 8)) {
|
||||
ref_dering(ref_d, size, s + OD_FILT_HBORDER +
|
||||
OD_FILT_VBORDER * OD_FILT_BSTRIDE,
|
||||
threshold, dir);
|
||||
// If dering and ref_dering are the same, we're just testing
|
||||
// speed
|
||||
if (dering != ref_dering)
|
||||
ASM_REGISTER_STATE_CHECK(dering(
|
||||
d, size,
|
||||
s + OD_FILT_HBORDER + OD_FILT_VBORDER * OD_FILT_BSTRIDE,
|
||||
threshold, dir));
|
||||
if (ref_dering != dering) {
|
||||
for (pos = 0; pos < sizeof(d) / sizeof(*d) && !error; pos++) {
|
||||
error = ref_d[pos] != d[pos];
|
||||
errdepth = depth;
|
||||
errthreshold = threshold;
|
||||
errboundary = boundary;
|
||||
for (dir = 0; dir < 8; dir++) {
|
||||
for (threshold = 0; threshold < 64 << (depth - 8) && !error;
|
||||
threshold += (1 + 4 * !!boundary) << (depth - 8)) {
|
||||
ref_dering(ref_d, size, s + OD_FILT_HBORDER +
|
||||
OD_FILT_VBORDER * OD_FILT_BSTRIDE,
|
||||
threshold, dir, damping);
|
||||
// If dering and ref_dering are the same, we're just testing
|
||||
// speed
|
||||
if (dering != ref_dering)
|
||||
ASM_REGISTER_STATE_CHECK(dering(
|
||||
d, size,
|
||||
s + OD_FILT_HBORDER + OD_FILT_VBORDER * OD_FILT_BSTRIDE,
|
||||
threshold, dir, damping));
|
||||
if (ref_dering != dering) {
|
||||
for (pos = 0; pos < sizeof(d) / sizeof(*d) && !error;
|
||||
pos++) {
|
||||
error = ref_d[pos] != d[pos];
|
||||
errdepth = depth;
|
||||
errthreshold = threshold;
|
||||
errboundary = boundary;
|
||||
errdamping = damping;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -138,6 +139,7 @@ void test_dering(int bsize, int iterations,
|
|||
<< " (" << (int16_t)ref_d[pos] << " : " << (int16_t)d[pos]
|
||||
<< ") " << std::endl
|
||||
<< "threshold: " << errthreshold << std::endl
|
||||
<< "damping: " << errdamping << std::endl
|
||||
<< "depth: " << errdepth << std::endl
|
||||
<< "size: " << bsize << std::endl
|
||||
<< "boundary: " << errboundary << std::endl
|
||||
|
@ -145,12 +147,8 @@ void test_dering(int bsize, int iterations,
|
|||
}
|
||||
|
||||
void test_dering_speed(int bsize, int iterations,
|
||||
void (*dering)(uint16_t *y, int ystride,
|
||||
const uint16_t *in, int threshold,
|
||||
int dir),
|
||||
void (*ref_dering)(uint16_t *y, int ystride,
|
||||
const uint16_t *in, int threshold,
|
||||
int dir)) {
|
||||
od_filter_dering_direction_func dering,
|
||||
od_filter_dering_direction_func ref_dering) {
|
||||
aom_usec_timer ref_timer;
|
||||
aom_usec_timer timer;
|
||||
|
||||
|
|
Загрузка…
Ссылка в новой задаче