diff --git a/aom_dsp/aom_dsp_rtcd_defs.pl b/aom_dsp/aom_dsp_rtcd_defs.pl index fc3456dfb..620333637 100644 --- a/aom_dsp/aom_dsp_rtcd_defs.pl +++ b/aom_dsp/aom_dsp_rtcd_defs.pl @@ -853,6 +853,7 @@ specialize qw/aom_lpf_horizontal_4_dual sse2 neon dspr2 msa/; if (aom_config("CONFIG_CDEF") eq "yes") { add_proto qw/void aom_clpf_block_hbd/, "const uint16_t *src, uint16_t *dst, int sstride, int dstride, int x0, int y0, int sizex, int sizey, unsigned int strength, unsigned int bd"; + add_proto qw/void aom_clpf_hblock_hbd/, "const uint16_t *src, uint16_t *dst, int sstride, int dstride, int x0, int y0, int sizex, int sizey, unsigned int strength, unsigned int bd"; if (aom_config("CONFIG_AOM_HIGHBITDEPTH") eq "yes") { add_proto qw/void aom_clpf_detect_hbd/, "const uint16_t *rec, const uint16_t *org, int rstride, int ostride, int x0, int y0, int width, int height, int *sum0, int *sum1, unsigned int strength, int size, unsigned int bd, unsigned int dmp"; add_proto qw/void aom_clpf_detect_multi_hbd/, "const uint16_t *rec, const uint16_t *org, int rstride, int ostride, int x0, int y0, int width, int height, int *sum, int size, unsigned int bd, unsigned int dmp"; @@ -866,6 +867,7 @@ if (aom_config("CONFIG_CDEF") eq "yes") { } if ($opts{config} !~ /libs-x86-win32-vs.*/) { specialize qw/aom_clpf_block_hbd sse2 ssse3 sse4_1 neon/; + specialize qw/aom_clpf_hblock_hbd sse2 ssse3 sse4_1 neon/; } add_proto qw/void aom_clpf_block/, "const uint8_t *src, uint8_t *dst, int sstride, int dstride, int x0, int y0, int sizex, int sizey, unsigned int strength, unsigned int bd"; add_proto qw/void aom_clpf_detect/, "const uint8_t *rec, const uint8_t *org, int rstride, int ostride, int x0, int y0, int width, int height, int *sum0, int *sum1, unsigned int strength, int size, unsigned int dmp"; diff --git a/av1/common/clpf.c b/av1/common/clpf.c index bd3202d99..4de033d7d 100644 --- a/av1/common/clpf.c +++ b/av1/common/clpf.c @@ -31,6 +31,13 @@ int av1_clpf_sample(int X, int A, int B, int C, int D, 
int E, int F, int G, return (8 + delta - (delta < 0)) >> 4; } +int av1_clpf_hsample(int X, int A, int B, int C, int D, int s, + unsigned int dmp) { + int delta = 1 * constrain(A - X, s, dmp) + 3 * constrain(B - X, s, dmp) + + 3 * constrain(C - X, s, dmp) + 1 * constrain(D - X, s, dmp); + return (4 + delta - (delta < 0)) >> 3; +} + void aom_clpf_block_c(const uint8_t *src, uint8_t *dst, int sstride, int dstride, int x0, int y0, int sizex, int sizey, unsigned int strength, unsigned int damping) { @@ -78,3 +85,22 @@ void aom_clpf_block_hbd_c(const uint16_t *src, uint16_t *dst, int sstride, } } } + +// TODO(stemidts): Put under CONFIG_AOM_HIGHBITDEPTH if CDEF do 8 bit internally +void aom_clpf_hblock_hbd_c(const uint16_t *src, uint16_t *dst, int sstride, + int dstride, int x0, int y0, int sizex, int sizey, + unsigned int strength, unsigned int damping) { + int x, y; + + for (y = y0; y < y0 + sizey; y++) { + for (x = x0; x < x0 + sizex; x++) { + const int X = src[y * sstride + x]; + const int A = src[y * sstride + x - 2]; + const int B = src[y * sstride + x - 1]; + const int C = src[y * sstride + x + 1]; + const int D = src[y * sstride + x + 2]; + const int delta = av1_clpf_hsample(X, A, B, C, D, strength, damping); + dst[y * dstride + x] = X + delta; + } + } +} diff --git a/av1/common/clpf_simd.h b/av1/common/clpf_simd.h index 12152c11f..53c3552da 100644 --- a/av1/common/clpf_simd.h +++ b/av1/common/clpf_simd.h @@ -175,6 +175,36 @@ static void calc_delta_hbd8(v128 o, v128 a, v128 b, v128 c, v128 d, v128 e, v128_store_aligned(dst, calc_delta_hbd(o, a, b, c, d, e, f, g, h, s, dmp)); } +// delta = 1/8 * constrain(a, x, s, dmp) + 3/8 * constrain(b, x, s, dmp) + +// 3/8 * constrain(c, x, s, dmp) + 1/8 * constrain(d, x, s, dmp) +SIMD_INLINE v128 calc_hdelta_hbd(v128 x, v128 a, v128 b, v128 c, v128 d, + unsigned int s, unsigned int dmp) { + const v128 bc = + v128_add_16(constrain_hbd(b, x, s, dmp), constrain_hbd(c, x, s, dmp)); + const v128 delta = v128_add_16( 
v128_add_16(constrain_hbd(a, x, s, dmp), constrain_hbd(d, x, s, dmp)), + v128_add_16(v128_add_16(bc, bc), bc)); + return v128_add_16( + x, + v128_shr_s16( + v128_add_16(v128_dup_16(4), + v128_add_16(delta, v128_cmplt_s16(delta, v128_zero()))), + 3)); +} + +static void calc_hdelta_hbd4(v128 o, v128 a, v128 b, v128 c, v128 d, + uint16_t *dst, unsigned int s, unsigned int dmp, + int dstride) { + o = calc_hdelta_hbd(o, a, b, c, d, s, dmp); + v64_store_aligned(dst, v128_high_v64(o)); + v64_store_aligned(dst + dstride, v128_low_v64(o)); +} + +static void calc_hdelta_hbd8(v128 o, v128 a, v128 b, v128 c, v128 d, + uint16_t *dst, unsigned int s, unsigned int dmp) { + v128_store_aligned(dst, calc_hdelta_hbd(o, a, b, c, d, s, dmp)); +} + // Process blocks of width 4, two lines at time. SIMD_INLINE void clpf_block_hbd4(const uint16_t *src, uint16_t *dst, int sstride, int dstride, int x0, int y0, @@ -236,6 +266,57 @@ SIMD_INLINE void clpf_block_hbd(const uint16_t *src, uint16_t *dst, int sstride, } } +// Process blocks of width 4, horizontal filter, two lines at a time. 
+SIMD_INLINE void clpf_hblock_hbd4(const uint16_t *src, uint16_t *dst, + int sstride, int dstride, int x0, int y0, + int sizey, unsigned int strength, + unsigned int dmp) { + int y; + + dst += x0 + y0 * dstride; + src += x0 + y0 * sstride; + + for (y = 0; y < sizey; y += 2) { + const v128 a = v128_from_v64(v64_load_unaligned(src - 2), + v64_load_unaligned(src - 2 + sstride)); + const v128 b = v128_from_v64(v64_load_unaligned(src - 1), + v64_load_unaligned(src - 1 + sstride)); + const v128 c = v128_from_v64(v64_load_unaligned(src + 1), + v64_load_unaligned(src + 1 + sstride)); + const v128 d = v128_from_v64(v64_load_unaligned(src + 2), + v64_load_unaligned(src + 2 + sstride)); + + calc_hdelta_hbd4(v128_from_v64(v64_load_unaligned(src), + v64_load_unaligned(src + sstride)), + a, b, c, d, dst, strength, dmp, dstride); + src += sstride * 2; + dst += dstride * 2; + } +} + +// Process blocks of width 8, horizontal filter, one line at a time. +SIMD_INLINE void clpf_hblock_hbd(const uint16_t *src, uint16_t *dst, + int sstride, int dstride, int x0, int y0, + int sizey, unsigned int strength, + unsigned int dmp) { + int y; + + dst += x0 + y0 * dstride; + src += x0 + y0 * sstride; + + for (y = 0; y < sizey; y++) { + const v128 o = v128_load_aligned(src); + const v128 a = v128_load_unaligned(src - 2); + const v128 b = v128_load_unaligned(src - 1); + const v128 c = v128_load_unaligned(src + 1); + const v128 d = v128_load_unaligned(src + 2); + + calc_hdelta_hbd8(o, a, b, c, d, dst, strength, dmp); + src += sstride; + dst += dstride; + } +} + void SIMD_FUNC(aom_clpf_block_hbd)(const uint16_t *src, uint16_t *dst, int sstride, int dstride, int x0, int y0, int sizex, int sizey, unsigned int strength, @@ -251,4 +332,20 @@ void SIMD_FUNC(aom_clpf_block_hbd)(const uint16_t *src, uint16_t *dst, src, dst, sstride, dstride, x0, y0, sizey, strength, dmp); } } + +void SIMD_FUNC(aom_clpf_hblock_hbd)(const uint16_t *src, uint16_t *dst, + int sstride, int dstride, int x0, int y0, + int sizex, 
int sizey, unsigned int strength, + unsigned int dmp) { + if ((sizex != 4 && sizex != 8) || ((sizey & 1) && sizex == 4)) { + // Fallback to C for odd sizes: + // * block width not 4 or 8 + // * block heights not a multiple of 2 if the block width is 4 + aom_clpf_hblock_hbd_c(src, dst, sstride, dstride, x0, y0, sizex, sizey, + strength, dmp); + } else { + (sizex == 4 ? clpf_hblock_hbd4 : clpf_hblock_hbd)( + src, dst, sstride, dstride, x0, y0, sizey, strength, dmp); + } +} #endif diff --git a/av1/common/od_dering.c b/av1/common/od_dering.c index 491f510ba..c7061c475 100644 --- a/av1/common/od_dering.c +++ b/av1/common/od_dering.c @@ -288,9 +288,10 @@ void od_dering(uint16_t *y, uint16_t *in, int xdec, by = dlist[bi].by; bx = dlist[bi].bx; - aom_clpf_block_hbd(in, &y[((bi - by) << 2 * bsize) - (bx << bsize)], - OD_FILT_BSTRIDE, 1 << bsize, bx << bsize, by << bsize, - 1 << bsize, 1 << bsize, clpf_strength << coeff_shift, - clpf_damping + coeff_shift); + (!threshold || (dir[by][bx] < 4 && dir[by][bx]) ? aom_clpf_block_hbd + : aom_clpf_hblock_hbd)( + in, &y[((bi - by) << 2 * bsize) - (bx << bsize)], OD_FILT_BSTRIDE, + 1 << bsize, bx << bsize, by << bsize, 1 << bsize, 1 << bsize, + clpf_strength << coeff_shift, clpf_damping + coeff_shift); } }