Limit line buffer to 6 lines
Change-Id: I6fedfa6427865e9a37fbdf9d9c1bf8be55222cba
This commit is contained in:
Родитель
d280a84554
Коммит
3c33def72c
|
@ -853,6 +853,7 @@ specialize qw/aom_lpf_horizontal_4_dual sse2 neon dspr2 msa/;
|
|||
|
||||
if (aom_config("CONFIG_CDEF") eq "yes") {
|
||||
add_proto qw/void aom_clpf_block_hbd/, "const uint16_t *src, uint16_t *dst, int sstride, int dstride, int x0, int y0, int sizex, int sizey, unsigned int strength, unsigned int bd";
|
||||
add_proto qw/void aom_clpf_hblock_hbd/, "const uint16_t *src, uint16_t *dst, int sstride, int dstride, int x0, int y0, int sizex, int sizey, unsigned int strength, unsigned int bd";
|
||||
if (aom_config("CONFIG_AOM_HIGHBITDEPTH") eq "yes") {
|
||||
add_proto qw/void aom_clpf_detect_hbd/, "const uint16_t *rec, const uint16_t *org, int rstride, int ostride, int x0, int y0, int width, int height, int *sum0, int *sum1, unsigned int strength, int size, unsigned int bd, unsigned int dmp";
|
||||
add_proto qw/void aom_clpf_detect_multi_hbd/, "const uint16_t *rec, const uint16_t *org, int rstride, int ostride, int x0, int y0, int width, int height, int *sum, int size, unsigned int bd, unsigned int dmp";
|
||||
|
@ -866,6 +867,7 @@ if (aom_config("CONFIG_CDEF") eq "yes") {
|
|||
}
|
||||
# Register the SIMD variants of both high bit depth CLPF block filters (the
# full filter and the horizontal-only one).  Configurations matching
# libs-x86-win32-vs* are left without specializations.
if ($opts{config} !~ /libs-x86-win32-vs.*/) {
  foreach my $clpf_fn (qw/aom_clpf_block_hbd aom_clpf_hblock_hbd/) {
    specialize($clpf_fn, qw/sse2 ssse3 sse4_1 neon/);
  }
}
|
||||
add_proto qw/void aom_clpf_block/, "const uint8_t *src, uint8_t *dst, int sstride, int dstride, int x0, int y0, int sizex, int sizey, unsigned int strength, unsigned int bd";
|
||||
add_proto qw/void aom_clpf_detect/, "const uint8_t *rec, const uint8_t *org, int rstride, int ostride, int x0, int y0, int width, int height, int *sum0, int *sum1, unsigned int strength, int size, unsigned int dmp";
|
||||
|
|
|
@ -31,6 +31,13 @@ int av1_clpf_sample(int X, int A, int B, int C, int D, int E, int F, int G,
|
|||
return (8 + delta - (delta < 0)) >> 4;
|
||||
}
|
||||
|
||||
// Horizontal CLPF correction for a single pixel.
//   X      - the pixel being filtered
//   A, D   - the neighbours given weight 1
//   B, C   - the neighbours given weight 3
//   s, dmp - strength and damping, passed through unchanged to constrain()
// Returns the 1-3-3-1 weighted sum of the constrained differences to X,
// biased by 4 (and by one less when the sum is negative) and shifted right
// by 3.  The caller is expected to add the result to X.
int av1_clpf_hsample(int X, int A, int B, int C, int D, int s,
                     unsigned int dmp) {
  const int outer = constrain(A - X, s, dmp) + constrain(D - X, s, dmp);
  const int inner = constrain(B - X, s, dmp) + constrain(C - X, s, dmp);
  const int sum = outer + 3 * inner;
  const int bias = 4 - (sum < 0);

  return (sum + bias) >> 3;
}
|
||||
|
||||
void aom_clpf_block_c(const uint8_t *src, uint8_t *dst, int sstride,
|
||||
int dstride, int x0, int y0, int sizex, int sizey,
|
||||
unsigned int strength, unsigned int damping) {
|
||||
|
@ -78,3 +85,22 @@ void aom_clpf_block_hbd_c(const uint16_t *src, uint16_t *dst, int sstride,
|
|||
}
|
||||
}
|
||||
}
|
||||
|
||||
// TODO(stemidts): Put under CONFIG_AOM_HIGHBITDEPTH if CDEF does 8 bit
// internally
//
// C implementation of the horizontal-only CLPF for 16 bit samples.  Every
// pixel of the sizex x sizey block whose top left corner is (x0, y0) is read
// from src together with the two pixels on either side of it in the same
// row, and the filtered value is written to the same coordinates in dst.
//   sstride, dstride - row pitch of src and dst, in samples
//   strength, damping - forwarded to av1_clpf_hsample()
// Columns x0 - 2 .. x0 + sizex + 1 of each row of src are read, so they must
// be addressable.
void aom_clpf_hblock_hbd_c(const uint16_t *src, uint16_t *dst, int sstride,
                           int dstride, int x0, int y0, int sizex, int sizey,
                           unsigned int strength, unsigned int damping) {
  int row, col;

  for (row = y0; row < y0 + sizey; row++) {
    const int in_base = row * sstride;
    const int out_base = row * dstride;

    for (col = x0; col < x0 + sizex; col++) {
      const int pos = in_base + col;
      const int centre = src[pos];
      const int correction =
          av1_clpf_hsample(centre, src[pos - 2], src[pos - 1], src[pos + 1],
                           src[pos + 2], strength, damping);

      dst[out_base + col] = centre + correction;
    }
  }
}
|
||||
|
|
|
@ -175,6 +175,36 @@ static void calc_delta_hbd8(v128 o, v128 a, v128 b, v128 c, v128 d, v128 e,
|
|||
v128_store_aligned(dst, calc_delta_hbd(o, a, b, c, d, e, f, g, h, s, dmp));
|
||||
}
|
||||
|
||||
// delta = 1/16 * constrain(a, x, s, dmp) + 3/16 * constrain(b, x, s, dmp) +
|
||||
// 3/16 * constrain(c, x, s, dmp) + 1/16 * constrain(d, x, s, dmp)
|
||||
SIMD_INLINE v128 calc_hdelta_hbd(v128 x, v128 a, v128 b, v128 c, v128 d,
|
||||
unsigned int s, unsigned int dmp) {
|
||||
const v128 bc =
|
||||
v128_add_16(constrain_hbd(b, x, s, dmp), constrain_hbd(c, x, s, dmp));
|
||||
const v128 delta = v128_add_16(
|
||||
v128_add_16(constrain_hbd(a, x, s, dmp), constrain_hbd(d, x, s, dmp)),
|
||||
v128_add_16(v128_add_16(bc, bc), bc));
|
||||
return v128_add_16(
|
||||
x,
|
||||
v128_shr_s16(
|
||||
v128_add_16(v128_dup_16(4),
|
||||
v128_add_16(delta, v128_cmplt_s16(delta, v128_zero()))),
|
||||
3));
|
||||
}
|
||||
|
||||
// Runs calc_hdelta_hbd() on one vector holding two rows of four pixels and
// stores the result: the high 64 bits go to dst, the low 64 bits to the next
// row at dst + dstride.
static void calc_hdelta_hbd4(v128 o, v128 a, v128 b, v128 c, v128 d,
                             uint16_t *dst, unsigned int s, unsigned int dmp,
                             int dstride) {
  const v128 filtered = calc_hdelta_hbd(o, a, b, c, d, s, dmp);
  uint16_t *const first_row = dst;
  uint16_t *const second_row = dst + dstride;

  v64_store_aligned(first_row, v128_high_v64(filtered));
  v64_store_aligned(second_row, v128_low_v64(filtered));
}
|
||||
|
||||
// Runs calc_hdelta_hbd() on one vector (a single row of eight pixels) and
// writes the filtered row to dst with an aligned 128 bit store.
static void calc_hdelta_hbd8(v128 o, v128 a, v128 b, v128 c, v128 d,
                             uint16_t *dst, unsigned int s, unsigned int dmp) {
  const v128 filtered = calc_hdelta_hbd(o, a, b, c, d, s, dmp);

  v128_store_aligned(dst, filtered);
}
|
||||
|
||||
// Process blocks of width 4, two lines at time.
|
||||
SIMD_INLINE void clpf_block_hbd4(const uint16_t *src, uint16_t *dst,
|
||||
int sstride, int dstride, int x0, int y0,
|
||||
|
@ -236,6 +266,57 @@ SIMD_INLINE void clpf_block_hbd(const uint16_t *src, uint16_t *dst, int sstride,
|
|||
}
|
||||
}
|
||||
|
||||
// Process blocks of width 4, horizontal filter, two lines at time.
|
||||
SIMD_INLINE void clpf_hblock_hbd4(const uint16_t *src, uint16_t *dst,
|
||||
int sstride, int dstride, int x0, int y0,
|
||||
int sizey, unsigned int strength,
|
||||
unsigned int dmp) {
|
||||
int y;
|
||||
|
||||
dst += x0 + y0 * dstride;
|
||||
src += x0 + y0 * sstride;
|
||||
|
||||
for (y = 0; y < sizey; y += 2) {
|
||||
const v128 a = v128_from_v64(v64_load_unaligned(src - 2),
|
||||
v64_load_unaligned(src - 2 + sstride));
|
||||
const v128 b = v128_from_v64(v64_load_unaligned(src - 1),
|
||||
v64_load_unaligned(src - 1 + sstride));
|
||||
const v128 c = v128_from_v64(v64_load_unaligned(src + 1),
|
||||
v64_load_unaligned(src + 1 + sstride));
|
||||
const v128 d = v128_from_v64(v64_load_unaligned(src + 2),
|
||||
v64_load_unaligned(src + 2 + sstride));
|
||||
|
||||
calc_hdelta_hbd4(v128_from_v64(v64_load_unaligned(src),
|
||||
v64_load_unaligned(src + sstride)),
|
||||
a, b, c, d, dst, strength, dmp, dstride);
|
||||
src += sstride * 2;
|
||||
dst += dstride * 2;
|
||||
}
|
||||
}
|
||||
|
||||
// Process blocks of width 8, horizontal filter, two lines at time.
|
||||
SIMD_INLINE void clpf_hblock_hbd(const uint16_t *src, uint16_t *dst,
|
||||
int sstride, int dstride, int x0, int y0,
|
||||
int sizey, unsigned int strength,
|
||||
unsigned int dmp) {
|
||||
int y;
|
||||
|
||||
dst += x0 + y0 * dstride;
|
||||
src += x0 + y0 * sstride;
|
||||
|
||||
for (y = 0; y < sizey; y++) {
|
||||
const v128 o = v128_load_aligned(src);
|
||||
const v128 a = v128_load_unaligned(src - 2);
|
||||
const v128 b = v128_load_unaligned(src - 1);
|
||||
const v128 c = v128_load_unaligned(src + 1);
|
||||
const v128 d = v128_load_unaligned(src + 2);
|
||||
|
||||
calc_hdelta_hbd8(o, a, b, c, d, dst, strength, dmp);
|
||||
src += sstride;
|
||||
dst += dstride;
|
||||
}
|
||||
}
|
||||
|
||||
void SIMD_FUNC(aom_clpf_block_hbd)(const uint16_t *src, uint16_t *dst,
|
||||
int sstride, int dstride, int x0, int y0,
|
||||
int sizex, int sizey, unsigned int strength,
|
||||
|
@ -251,4 +332,20 @@ void SIMD_FUNC(aom_clpf_block_hbd)(const uint16_t *src, uint16_t *dst,
|
|||
src, dst, sstride, dstride, x0, y0, sizey, strength, dmp);
|
||||
}
|
||||
}
|
||||
|
||||
void SIMD_FUNC(aom_clpf_hblock_hbd)(const uint16_t *src, uint16_t *dst,
|
||||
int sstride, int dstride, int x0, int y0,
|
||||
int sizex, int sizey, unsigned int strength,
|
||||
unsigned int dmp) {
|
||||
if ((sizex != 4 && sizex != 8) || ((sizey & 1) && sizex == 4)) {
|
||||
// Fallback to C for odd sizes:
|
||||
// * block width not 4 or 8
|
||||
// * block heights not a multiple of 2 if the block width is 4
|
||||
aom_clpf_hblock_hbd_c(src, dst, sstride, dstride, x0, y0, sizex, sizey,
|
||||
strength, dmp);
|
||||
} else {
|
||||
(sizex == 4 ? clpf_hblock_hbd4 : clpf_hblock_hbd)(
|
||||
src, dst, sstride, dstride, x0, y0, sizey, strength, dmp);
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
|
|
@ -288,9 +288,10 @@ void od_dering(uint16_t *y, uint16_t *in, int xdec,
|
|||
by = dlist[bi].by;
|
||||
bx = dlist[bi].bx;
|
||||
|
||||
aom_clpf_block_hbd(in, &y[((bi - by) << 2 * bsize) - (bx << bsize)],
|
||||
OD_FILT_BSTRIDE, 1 << bsize, bx << bsize, by << bsize,
|
||||
1 << bsize, 1 << bsize, clpf_strength << coeff_shift,
|
||||
clpf_damping + coeff_shift);
|
||||
(!threshold || (dir[by][bx] < 4 && dir[by][bx]) ? aom_clpf_block_hbd
|
||||
: aom_clpf_hblock_hbd)(
|
||||
in, &y[((bi - by) << 2 * bsize) - (bx << bsize)], OD_FILT_BSTRIDE,
|
||||
1 << bsize, bx << bsize, by << bsize, 1 << bsize, 1 << bsize,
|
||||
clpf_strength << coeff_shift, clpf_damping + coeff_shift);
|
||||
}
|
||||
}
|
||||
|
|
Загрузка…
Ссылка в новой задаче