Limit line buffer to 6 lines

Change-Id: I6fedfa6427865e9a37fbdf9d9c1bf8be55222cba
2017-03-21 12:56:17 +01:00 · 2017-03-21 12:56:17 +01:00 · 3c33def72c
--- a/aom_dsp/aom_dsp_rtcd_defs.pl
+++ b/aom_dsp/aom_dsp_rtcd_defs.pl
@ -853,6 +853,7 @@ specialize qw/aom_lpf_horizontal_4_dual sse2 neon dspr2 msa/;

 if (aom_config("CONFIG_CDEF") eq "yes") {
  add_proto qw/void aom_clpf_block_hbd/, "const uint16_t *src, uint16_t *dst, int sstride, int dstride, int x0, int y0, int sizex, int sizey, unsigned int strength, unsigned int bd";
+  add_proto qw/void aom_clpf_hblock_hbd/, "const uint16_t *src, uint16_t *dst, int sstride, int dstride, int x0, int y0, int sizex, int sizey, unsigned int strength, unsigned int bd";
  if (aom_config("CONFIG_AOM_HIGHBITDEPTH") eq "yes") {
    add_proto qw/void aom_clpf_detect_hbd/, "const uint16_t *rec, const uint16_t *org, int rstride, int ostride, int x0, int y0, int width, int height, int *sum0, int *sum1, unsigned int strength, int size, unsigned int bd, unsigned int dmp";
    add_proto qw/void aom_clpf_detect_multi_hbd/, "const uint16_t *rec, const uint16_t *org, int rstride, int ostride, int x0, int y0, int width, int height, int *sum, int size, unsigned int bd, unsigned int dmp";
@ -866,6 +867,7 @@ if (aom_config("CONFIG_CDEF") eq "yes") {
  }
  if ($opts{config} !~ /libs-x86-win32-vs.*/) {
    specialize qw/aom_clpf_block_hbd sse2 ssse3 sse4_1 neon/;
+    specialize qw/aom_clpf_hblock_hbd sse2 ssse3 sse4_1 neon/;
  }
  add_proto qw/void aom_clpf_block/, "const uint8_t *src, uint8_t *dst, int sstride, int dstride, int x0, int y0, int sizex, int sizey, unsigned int strength, unsigned int bd";
  add_proto qw/void aom_clpf_detect/, "const uint8_t *rec, const uint8_t *org, int rstride, int ostride, int x0, int y0, int width, int height, int *sum0, int *sum1, unsigned int strength, int size, unsigned int dmp";
--- a/av1/common/clpf.c
+++ b/av1/common/clpf.c
@ -31,6 +31,13 @@ int av1_clpf_sample(int X, int A, int B, int C, int D, int E, int F, int G,
  return (8 + delta - (delta < 0)) >> 4;
 }

+int av1_clpf_hsample(int X, int A, int B, int C, int D, int s,
+                     unsigned int dmp) {  // correction for X from A, B, C, D
+  int delta = 1 * constrain(A - X, s, dmp) + 3 * constrain(B - X, s, dmp) +
+              3 * constrain(C - X, s, dmp) + 1 * constrain(D - X, s, dmp);
+  return (4 + delta - (delta < 0)) >> 3;  // delta / 8, rounded to nearest
+}
+
 void aom_clpf_block_c(const uint8_t *src, uint8_t *dst, int sstride,
                      int dstride, int x0, int y0, int sizex, int sizey,
                      unsigned int strength, unsigned int damping) {
@ -78,3 +85,22 @@ void aom_clpf_block_hbd_c(const uint16_t *src, uint16_t *dst, int sstride,
    }
  }
 }
+
+// TODO(stemidts): Put under CONFIG_AOM_HIGHBITDEPTH if CDEF is 8 bit internally
+void aom_clpf_hblock_hbd_c(const uint16_t *src, uint16_t *dst, int sstride,
+                           int dstride, int x0, int y0, int sizex, int sizey,
+                           unsigned int strength, unsigned int damping) {
+  int x, y;
+
+  for (y = y0; y < y0 + sizey; y++) {
+    for (x = x0; x < x0 + sizex; x++) {
+      const int X = src[y * sstride + x];      // sample being filtered
+      const int A = src[y * sstride + x - 2];  // two to the left, same row
+      const int B = src[y * sstride + x - 1];  // one to the left
+      const int C = src[y * sstride + x + 1];  // one to the right
+      const int D = src[y * sstride + x + 2];  // two to the right
+      const int delta = av1_clpf_hsample(X, A, B, C, D, strength, damping);
+      dst[y * dstride + x] = X + delta;
+    }
+  }
+}
--- a/av1/common/clpf_simd.h
+++ b/av1/common/clpf_simd.h
@ -175,6 +175,36 @@ static void calc_delta_hbd8(v128 o, v128 a, v128 b, v128 c, v128 d, v128 e,
  v128_store_aligned(dst, calc_delta_hbd(o, a, b, c, d, e, f, g, h, s, dmp));
 }

+// delta = 1/8 * constrain(a, x, s, dmp) + 3/8 * constrain(b, x, s, dmp) +
+//         3/8 * constrain(c, x, s, dmp) + 1/8 * constrain(d, x, s, dmp)
+SIMD_INLINE v128 calc_hdelta_hbd(v128 x, v128 a, v128 b, v128 c, v128 d,
+                                 unsigned int s, unsigned int dmp) {
+  const v128 bc =  // constrained b plus constrained c
+      v128_add_16(constrain_hbd(b, x, s, dmp), constrain_hbd(c, x, s, dmp));
+  const v128 delta = v128_add_16(
+      v128_add_16(constrain_hbd(a, x, s, dmp), constrain_hbd(d, x, s, dmp)),
+      v128_add_16(v128_add_16(bc, bc), bc));  // 3 * bc
+  return v128_add_16(  // x plus the rounded, scaled delta
+      x,
+      v128_shr_s16(
+          v128_add_16(v128_dup_16(4),
+                      v128_add_16(delta, v128_cmplt_s16(delta, v128_zero()))),
+          3));  // >> 3 gives the 1/8 scaling
+}
+
+static void calc_hdelta_hbd4(v128 o, v128 a, v128 b, v128 c, v128 d,
+                             uint16_t *dst, unsigned int s, unsigned int dmp,
+                             int dstride) {
+  o = calc_hdelta_hbd(o, a, b, c, d, s, dmp);
+  v64_store_aligned(dst, v128_high_v64(o));           // high half -> dst row
+  v64_store_aligned(dst + dstride, v128_low_v64(o));  // low half -> next row
+}
+
+static void calc_hdelta_hbd8(v128 o, v128 a, v128 b, v128 c, v128 d,
+                             uint16_t *dst, unsigned int s, unsigned int dmp) {
+  v128_store_aligned(dst, calc_hdelta_hbd(o, a, b, c, d, s, dmp));  // one row
+}
+
 // Process blocks of width 4, two lines at time.
 SIMD_INLINE void clpf_block_hbd4(const uint16_t *src, uint16_t *dst,
                                 int sstride, int dstride, int x0, int y0,
@ -236,6 +266,57 @@ SIMD_INLINE void clpf_block_hbd(const uint16_t *src, uint16_t *dst, int sstride,
  }
 }

+// Process blocks of width 4, horizontal filter, two lines at a time.
+SIMD_INLINE void clpf_hblock_hbd4(const uint16_t *src, uint16_t *dst,
+                                  int sstride, int dstride, int x0, int y0,
+                                  int sizey, unsigned int strength,
+                                  unsigned int dmp) {
+  int y;
+
+  dst += x0 + y0 * dstride;  // advance to (x0, y0) in dst
+  src += x0 + y0 * sstride;  // advance to (x0, y0) in src
+
+  for (y = 0; y < sizey; y += 2) {  // two rows per iteration
+    const v128 a = v128_from_v64(v64_load_unaligned(src - 2),
+                                 v64_load_unaligned(src - 2 + sstride));
+    const v128 b = v128_from_v64(v64_load_unaligned(src - 1),
+                                 v64_load_unaligned(src - 1 + sstride));
+    const v128 c = v128_from_v64(v64_load_unaligned(src + 1),
+                                 v64_load_unaligned(src + 1 + sstride));
+    const v128 d = v128_from_v64(v64_load_unaligned(src + 2),
+                                 v64_load_unaligned(src + 2 + sstride));
+
+    calc_hdelta_hbd4(v128_from_v64(v64_load_unaligned(src),
+                                   v64_load_unaligned(src + sstride)),
+                     a, b, c, d, dst, strength, dmp, dstride);
+    src += sstride * 2;
+    dst += dstride * 2;
+  }
+}
+
+// Process blocks of width 8, horizontal filter, one line at a time.
+SIMD_INLINE void clpf_hblock_hbd(const uint16_t *src, uint16_t *dst,
+                                 int sstride, int dstride, int x0, int y0,
+                                 int sizey, unsigned int strength,
+                                 unsigned int dmp) {
+  int y;
+
+  dst += x0 + y0 * dstride;  // advance to (x0, y0) in dst
+  src += x0 + y0 * sstride;  // advance to (x0, y0) in src
+
+  for (y = 0; y < sizey; y++) {  // one row per iteration
+    const v128 o = v128_load_aligned(src);  // aligned load; a..d unaligned
+    const v128 a = v128_load_unaligned(src - 2);
+    const v128 b = v128_load_unaligned(src - 1);
+    const v128 c = v128_load_unaligned(src + 1);
+    const v128 d = v128_load_unaligned(src + 2);
+
+    calc_hdelta_hbd8(o, a, b, c, d, dst, strength, dmp);
+    src += sstride;
+    dst += dstride;
+  }
+}
+
 void SIMD_FUNC(aom_clpf_block_hbd)(const uint16_t *src, uint16_t *dst,
                                   int sstride, int dstride, int x0, int y0,
                                   int sizex, int sizey, unsigned int strength,
@ -251,4 +332,20 @@ void SIMD_FUNC(aom_clpf_block_hbd)(const uint16_t *src, uint16_t *dst,
        src, dst, sstride, dstride, x0, y0, sizey, strength, dmp);
  }
 }
+
+void SIMD_FUNC(aom_clpf_hblock_hbd)(const uint16_t *src, uint16_t *dst,
+                                    int sstride, int dstride, int x0, int y0,
+                                    int sizex, int sizey, unsigned int strength,
+                                    unsigned int dmp) {
+  if ((sizex != 4 && sizex != 8) || ((sizey & 1) && sizex == 4)) {
+    // Fall back to C for sizes the SIMD paths do not handle:
+    // * block width not 4 or 8
+    // * block height not a multiple of 2 if the block width is 4
+    aom_clpf_hblock_hbd_c(src, dst, sstride, dstride, x0, y0, sizex, sizey,
+                          strength, dmp);
+  } else {
+    (sizex == 4 ? clpf_hblock_hbd4 : clpf_hblock_hbd)(  // pick by width
+        src, dst, sstride, dstride, x0, y0, sizey, strength, dmp);
+  }
+}
 #endif
--- a/av1/common/od_dering.c
+++ b/av1/common/od_dering.c
@ -288,9 +288,10 @@ void od_dering(uint16_t *y, uint16_t *in, int xdec,
    by = dlist[bi].by;
    bx = dlist[bi].bx;

-    aom_clpf_block_hbd(in, &y[((bi - by) << 2 * bsize) - (bx << bsize)],
-                       OD_FILT_BSTRIDE, 1 << bsize, bx << bsize, by << bsize,
-                       1 << bsize, 1 << bsize, clpf_strength << coeff_shift,
-                       clpf_damping + coeff_shift);
+    (!threshold || (dir[by][bx] < 4 && dir[by][bx]) ? aom_clpf_block_hbd
+                                                    : aom_clpf_hblock_hbd)(
+        in, &y[((bi - by) << 2 * bsize) - (bx << bsize)], OD_FILT_BSTRIDE,
+        1 << bsize, bx << bsize, by << bsize, 1 << bsize, 1 << bsize,
+        clpf_strength << coeff_shift, clpf_damping + coeff_shift);
  }
 }