CDEF encode buffering optimizations

Change-Id: I6b178d5ebf353bca98f18d8add2aa8b77e03cc4f
2017-03-24 15:20:05 -04:00 · 2017-03-24 15:20:05 -04:00 · deb1950bb3
--- a/av1/encoder/pickcdef.c
+++ b/av1/encoder/pickcdef.c
@ -92,6 +92,19 @@ static double compute_dist(uint16_t *x, int xstride, uint16_t *y, int ystride,
  return sum / (double)(1 << 2 * coeff_shift);
 }

+/* Copy a vsize-row by hsize-column rectangle of 16-bit samples from src
+   into dst, one element at a time.
+
+   dst          Destination; the rectangle is written starting at dst[0].
+   dstride      Distance between consecutive destination rows, counted in
+                uint16_t elements.
+   src          Source buffer that the two offsets below are relative to.
+   src_voffset  Row of src at which the rectangle starts.
+   src_hoffset  Column of src at which the rectangle starts.
+   sstride      Distance between consecutive source rows, counted in
+                uint16_t elements.
+   vsize        Number of rows to copy.
+   hsize        Number of elements to copy per row.
+
+   No bounds checking is performed here; the caller must make sure both
+   rectangles lie inside their respective buffers.
+
+   FIXME: SSE-optimize this. */
+static void copy_sb16_16(uint16_t *dst, int dstride, const uint16_t *src,
+                         int src_voffset, int src_hoffset, int sstride,
+                         int vsize, int hsize) {
+  int r, c;
+  const uint16_t *base = &src[src_voffset * sstride + src_hoffset];
+  for (r = 0; r < vsize; r++) {
+    for (c = 0; c < hsize; c++) {
+      dst[r * dstride + c] = base[r * sstride + c];
+    }
+  }
+}
+
 void av1_cdef_search(YV12_BUFFER_CONFIG *frame, const YV12_BUFFER_CONFIG *ref,
                     AV1_COMMON *cm, MACROBLOCKD *xd) {
  int r, c;
@ -181,7 +194,8 @@ void av1_cdef_search(YV12_BUFFER_CONFIG *frame, const YV12_BUFFER_CONFIG *ref,
      int nvb, nhb;
      int gi;
      int dirinit = 0;
-      DECLARE_ALIGNED(32, uint16_t, dst[MAX_MIB_SIZE * MAX_MIB_SIZE * 8 * 8]);
+      DECLARE_ALIGNED(32, uint16_t,
+                      dst[3][MAX_MIB_SIZE * MAX_MIB_SIZE * 8 * 8]);
      DECLARE_ALIGNED(32, uint16_t,
                      tmp_dst[MAX_MIB_SIZE * MAX_MIB_SIZE * 8 * 8]);
      nhb = AOMMIN(MAX_MIB_SIZE, cm->mi_cols - MAX_MIB_SIZE * sbc);
@ -189,24 +203,23 @@ void av1_cdef_search(YV12_BUFFER_CONFIG *frame, const YV12_BUFFER_CONFIG *ref,
      dering_count = sb_compute_dering_list(cm, sbr * MAX_MIB_SIZE,
                                            sbc * MAX_MIB_SIZE, dlist);
      if (dering_count == 0) continue;
+      for (pli = 0; pli < nplanes; pli++) {
+        /* Copy the dst buffer only once since it will always be written at
+           the same place. */
+        copy_sb16_16(dst[pli], MAX_MIB_SIZE << bsize[pli], src[pli],
+                     sbr * MAX_MIB_SIZE << bsize[pli],
+                     sbc * MAX_MIB_SIZE << bsize[pli], stride[pli],
+                     nvb << bsize[pli], nhb << bsize[pli]);
+      }
      for (gi = 0; gi < TOTAL_STRENGTHS; gi++) {
        int threshold;
        int clpf_strength;
        DECLARE_ALIGNED(32, uint16_t, inbuf[OD_DERING_INBUF_SIZE]);
        uint16_t *in;
-        int j;
        level = dering_level_table[gi / CLPF_STRENGTHS];
        threshold = level << coeff_shift;
        for (pli = 0; pli < nplanes; pli++) {
          if (pli > 0 && !chroma_dering) threshold = 0;
-          for (r = 0; r < nvb << bsize[pli]; r++) {
-            for (c = 0; c < nhb << bsize[pli]; c++) {
-              dst[(r * MAX_MIB_SIZE << bsize[pli]) + c] =
-                  src[pli]
-                     [((sbr * MAX_MIB_SIZE << bsize[pli]) + r) * stride[pli] +
-                      (sbc * MAX_MIB_SIZE << bsize[pli]) + c];
-            }
-          }
          in = inbuf + OD_FILT_VBORDER * OD_FILT_BSTRIDE + OD_FILT_HBORDER;
          /* We avoid filtering the pixels for which some of the pixels to
             average
@ -214,27 +227,25 @@ void av1_cdef_search(YV12_BUFFER_CONFIG *frame, const YV12_BUFFER_CONFIG *ref,
             would add special cases for any future vectorization. */
          for (i = 0; i < OD_DERING_INBUF_SIZE; i++)
            inbuf[i] = OD_DERING_VERY_LARGE;
-          for (i = -OD_FILT_VBORDER * (sbr != 0);
-               i < (nvb << bsize[pli]) + OD_FILT_VBORDER * (sbr != nvsb - 1);
-               i++) {
-            for (j = -OD_FILT_HBORDER * (sbc != 0);
-                 j < (nhb << bsize[pli]) + OD_FILT_HBORDER * (sbc != nhsb - 1);
-                 j++) {
-              uint16_t *x;
-              x = &src[pli][(sbr * stride[pli] * MAX_MIB_SIZE << bsize[pli]) +
-                            (sbc * MAX_MIB_SIZE << bsize[pli])];
-              in[i * OD_FILT_BSTRIDE + j] = x[i * stride[pli] + j];
-            }
-          }
+          int yoff = OD_FILT_VBORDER * (sbr != 0);
+          int xoff = OD_FILT_HBORDER * (sbc != 0);
+          int ysize =
+              (nvb << bsize[pli]) + OD_FILT_VBORDER * (sbr != nvsb - 1) + yoff;
+          int xsize =
+              (nhb << bsize[pli]) + OD_FILT_HBORDER * (sbc != nhsb - 1) + xoff;
+          copy_sb16_16(&in[(-yoff * OD_FILT_BSTRIDE - xoff)], OD_FILT_BSTRIDE,
+                       src[pli], (sbr * MAX_MIB_SIZE << bsize[pli]) - yoff,
+                       (sbc * MAX_MIB_SIZE << bsize[pli]) - xoff, stride[pli],
+                       ysize, xsize);
          clpf_strength = gi % CLPF_STRENGTHS;
          od_dering(tmp_dst, in, dec[pli], dir, &dirinit, var, pli, dlist,
                    dering_count, threshold,
                    clpf_strength + (clpf_strength == 3), clpf_damping,
                    coeff_shift);
-          copy_dering_16bit_to_16bit(dst, MAX_MIB_SIZE << bsize[pli], tmp_dst,
-                                     dlist, dering_count, bsize[pli]);
+          copy_dering_16bit_to_16bit(dst[pli], MAX_MIB_SIZE << bsize[pli],
+                                     tmp_dst, dlist, dering_count, bsize[pli]);
          mse[pli][sb_count][gi] = (int)compute_dist(
-              dst, MAX_MIB_SIZE << bsize[pli],
+              dst[pli], MAX_MIB_SIZE << bsize[pli],
              &ref_coeff[pli][(sbr * stride[pli] * MAX_MIB_SIZE << bsize[pli]) +
                              (sbc * MAX_MIB_SIZE << bsize[pli])],
              stride[pli], nhb, nvb, coeff_shift, bsize[pli]);