Save on CDEF encoder buffers by computing MSE one plane at a time

Change-Id: I17d88e565a8f9908a5bad92c26b7a54fe932fea3
2017-03-24 19:12:19 -04:00 · 2017-03-24 19:12:19 -04:00 · 32282f2c41
--- a/av1/encoder/pickcdef.c
+++ b/av1/encoder/pickcdef.c
@ -137,6 +137,10 @@ void av1_cdef_search(YV12_BUFFER_CONFIG *frame, const YV12_BUFFER_CONFIG *ref,
  int quantizer;
  double lambda;
  int nplanes = 3;
+  DECLARE_ALIGNED(32, uint16_t, inbuf[OD_DERING_INBUF_SIZE]);
+  uint16_t *in;
+  DECLARE_ALIGNED(32, uint16_t, dst[MAX_MIB_SIZE * MAX_MIB_SIZE * 8 * 8]);
+  DECLARE_ALIGNED(32, uint16_t, tmp_dst[MAX_MIB_SIZE * MAX_MIB_SIZE * 8 * 8]);
  int chroma_dering =
      xd->plane[1].subsampling_x == xd->plane[1].subsampling_y &&
      xd->plane[2].subsampling_x == xd->plane[2].subsampling_y;
@ -188,16 +192,13 @@ void av1_cdef_search(YV12_BUFFER_CONFIG *frame, const YV12_BUFFER_CONFIG *ref,
      }
    }
  }
+  in = inbuf + OD_FILT_VBORDER * OD_FILT_BSTRIDE + OD_FILT_HBORDER;
  sb_count = 0;
  for (sbr = 0; sbr < nvsb; sbr++) {
    for (sbc = 0; sbc < nhsb; sbc++) {
      int nvb, nhb;
      int gi;
      int dirinit = 0;
-      DECLARE_ALIGNED(32, uint16_t,
-                      dst[3][MAX_MIB_SIZE * MAX_MIB_SIZE * 8 * 8]);
-      DECLARE_ALIGNED(32, uint16_t,
-                      tmp_dst[MAX_MIB_SIZE * MAX_MIB_SIZE * 8 * 8]);
      nhb = AOMMIN(MAX_MIB_SIZE, cm->mi_cols - MAX_MIB_SIZE * sbc);
      nvb = AOMMIN(MAX_MIB_SIZE, cm->mi_rows - MAX_MIB_SIZE * sbr);
      dering_count = sb_compute_dering_list(cm, sbr * MAX_MIB_SIZE,
@ -206,27 +207,22 @@ void av1_cdef_search(YV12_BUFFER_CONFIG *frame, const YV12_BUFFER_CONFIG *ref,
      for (pli = 0; pli < nplanes; pli++) {
        /* Copy the dst buffer only once since it will always be written at
           the same place. */
-        copy_sb16_16(dst[pli], MAX_MIB_SIZE << bsize[pli], src[pli],
+        copy_sb16_16(dst, MAX_MIB_SIZE << bsize[pli], src[pli],
                     sbr * MAX_MIB_SIZE << bsize[pli],
                     sbc * MAX_MIB_SIZE << bsize[pli], stride[pli],
                     nvb << bsize[pli], nhb << bsize[pli]);
-      }
-      for (gi = 0; gi < TOTAL_STRENGTHS; gi++) {
-        int threshold;
-        int clpf_strength;
-        DECLARE_ALIGNED(32, uint16_t, inbuf[OD_DERING_INBUF_SIZE]);
-        uint16_t *in;
-        level = dering_level_table[gi / CLPF_STRENGTHS];
-        threshold = level << coeff_shift;
-        for (pli = 0; pli < nplanes; pli++) {
+        for (i = 0; i < OD_DERING_INBUF_SIZE; i++)
+          inbuf[i] = OD_DERING_VERY_LARGE;
+        for (gi = 0; gi < TOTAL_STRENGTHS; gi++) {
+          int threshold;
+          int clpf_strength;
+          level = dering_level_table[gi / CLPF_STRENGTHS];
+          threshold = level << coeff_shift;
          if (pli > 0 && !chroma_dering) threshold = 0;
-          in = inbuf + OD_FILT_VBORDER * OD_FILT_BSTRIDE + OD_FILT_HBORDER;
          /* We avoid filtering the pixels for which some of the pixels to
             average
             are outside the frame. We could change the filter instead, but it
             would add special cases for any future vectorization. */
-          for (i = 0; i < OD_DERING_INBUF_SIZE; i++)
-            inbuf[i] = OD_DERING_VERY_LARGE;
          int yoff = OD_FILT_VBORDER * (sbr != 0);
          int xoff = OD_FILT_HBORDER * (sbc != 0);
          int ysize =
@ -242,10 +238,10 @@ void av1_cdef_search(YV12_BUFFER_CONFIG *frame, const YV12_BUFFER_CONFIG *ref,
                    dering_count, threshold,
                    clpf_strength + (clpf_strength == 3), clpf_damping,
                    coeff_shift);
-          copy_dering_16bit_to_16bit(dst[pli], MAX_MIB_SIZE << bsize[pli],
-                                     tmp_dst, dlist, dering_count, bsize[pli]);
+          copy_dering_16bit_to_16bit(dst, MAX_MIB_SIZE << bsize[pli], tmp_dst,
+                                     dlist, dering_count, bsize[pli]);
          mse[pli][sb_count][gi] = (int)compute_dist(
-              dst[pli], MAX_MIB_SIZE << bsize[pli],
+              dst, MAX_MIB_SIZE << bsize[pli],
              &ref_coeff[pli][(sbr * stride[pli] * MAX_MIB_SIZE << bsize[pli]) +
                              (sbc * MAX_MIB_SIZE << bsize[pli])],
              stride[pli], nhb, nvb, coeff_shift, bsize[pli]);