New CLPF: New kernel and RDO for strength and block size

Change-Id: I61eb08862a101df74a6b65ece459833401e81117
2016-05-06 13:48:20 +02:00 · 2016-05-06 13:48:20 +02:00 · 7560123c06
--- a/av1/av1_common.mk
+++ b/av1/av1_common.mk
@ -65,8 +65,10 @@ AV1_COMMON_SRCS-yes += common/scan.h
 # TODO(angiebird) the forward transform belongs under encoder/
 AV1_COMMON_SRCS-$(CONFIG_AV1_ENCODER) += common/av1_fwd_txfm.h
 AV1_COMMON_SRCS-$(CONFIG_AV1_ENCODER) += common/av1_fwd_txfm.c
+ifeq ($(CONFIG_CLPF),yes)
 AV1_COMMON_SRCS-yes += common/clpf.c
 AV1_COMMON_SRCS-yes += common/clpf.h
+endif
 ifeq ($(CONFIG_DERING),yes)
 AV1_COMMON_SRCS-yes += common/od_dering.c
 AV1_COMMON_SRCS-yes += common/od_dering.h
--- a/av1/av1_cx.mk
+++ b/av1/av1_cx.mk
@ -82,6 +82,10 @@ AV1_CX_SRCS-yes += encoder/temporal_filter.h
 AV1_CX_SRCS-yes += encoder/mbgraph.c
 AV1_CX_SRCS-yes += encoder/mbgraph.h
 AV1_CX_SRCS-$(CONFIG_DERING) += encoder/pickdering.c
+ifeq ($(CONFIG_CLPF),yes)
+AV1_CX_SRCS-yes += encoder/clpf_rdo.c
+AV1_CX_SRCS-yes += encoder/clpf_rdo.h
+endif

 AV1_CX_SRCS-$(HAVE_SSE2) += encoder/x86/temporal_filter_apply_sse2.asm
 AV1_CX_SRCS-$(HAVE_SSE2) += encoder/x86/quantize_sse2.c
--- a/av1/common/clpf.c
+++ b/av1/common/clpf.c
@ -9,96 +9,119 @@
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */
 #include "av1/common/clpf.h"
+#include "aom_dsp/aom_dsp_common.h"

-// Apply the filter on a single block
-static void clpf_block(const uint8_t *src, uint8_t *dst, int sstride,
-                       int dstride, int has_top, int has_left, int has_bottom,
-                       int has_right, int width, int height) {
+// Returns the width, in bits, of the field that carries cm->clpf_numblocks in
+// the frame header.  The frame dimensions are rounded up to a whole number of
+// filter blocks of (1 << (cm->clpf_size + 4)) and the resulting block count
+// is converted to a bit length.
+int av1_clpf_maxbits(const AV1_COMMON *cm) {
+  const int fb_log2 = cm->clpf_size + 4;
+  const int padded_cols =
+      ALIGN_POWER_OF_TWO(cm->mi_cols * MI_BLOCK_SIZE, fb_log2);
+  const int padded_rows =
+      ALIGN_POWER_OF_TWO(cm->mi_rows * MI_BLOCK_SIZE, fb_log2);
+  // Dividing the padded area by the area of one filter block.
+  const int max_blocks = (padded_cols * padded_rows) >> (2 * fb_log2);
+  return get_msb(max_blocks) + 1;
+}
+
+// Returns the correction to add to sample X.  The difference between each of
+// the six neighbours and X is clamped to [-b, b]; A and F are weighted 4,
+// C and D are weighted 3, B and E are weighted 1 (sum of weights 16).  The
+// weighted sum is then divided by 16 with rounding.
+int av1_clpf_sample(int X, int A, int B, int C, int D, int E, int F, int b) {
+  const int dA = clamp(A - X, -b, b);
+  const int dB = clamp(B - X, -b, b);
+  const int dC = clamp(C - X, -b, b);
+  const int dD = clamp(D - X, -b, b);
+  const int dE = clamp(E - X, -b, b);
+  const int dF = clamp(F - X, -b, b);
+  const int delta = 4 * (dA + dF) + 3 * (dC + dD) + dB + dE;
+  // The rounding offset is one smaller for negative sums.
+  const int rounding = delta < 0 ? 7 : 8;
+  return (rounding + delta) >> 4;
+}
+
+// Filters the sizex x sizey block whose top-left sample is (x0, y0), reading
+// from src and writing to dst.  Both planes are addressed with the same
+// stride.  Neighbour coordinates are clamped to [0, width - 1] horizontally
+// and [0, height - 1] vertically, so edge samples reuse the nearest sample
+// inside those bounds.
+static void clpf_block(const uint8_t *src, uint8_t *dst, int stride, int x0,
+                       int y0, int sizex, int sizey, int width, int height,
+                       unsigned int strength) {
  int x, y;
-
-  for (y = 0; y < height; y++) {
-    for (x = 0; x < width; x++) {
-      int X = src[(y + 0) * sstride + x + 0];
-      int A = has_top ? src[(y - 1) * sstride + x + 0] : X;
-      int B = has_left ? src[(y + 0) * sstride + x - 1] : X;
-      int C = has_right ? src[(y + 0) * sstride + x + 1] : X;
-      int D = has_bottom ? src[(y + 1) * sstride + x + 0] : X;
-      int delta = ((A > X) + (B > X) + (C > X) + (D > X) > 2) -
-                  ((A < X) + (B < X) + (C < X) + (D < X) > 2);
-      dst[y * dstride + x] = X + delta;
+  for (y = y0; y < y0 + sizey; y++) {
+    const uint8_t *const cur = src + y * stride;
+    const uint8_t *const above = src + AOMMAX(0, y - 1) * stride;
+    const uint8_t *const below = src + AOMMIN(height - 1, y + 1) * stride;
+    uint8_t *const out = dst + y * stride;
+    for (x = x0; x < x0 + sizex; x++) {
+      const int X = cur[x];
+      // Neighbour order: row above, two left, one left, one right, two
+      // right, row below.
+      out[x] = X + av1_clpf_sample(X, above[x], cur[AOMMAX(0, x - 2)],
+                                   cur[AOMMAX(0, x - 1)],
+                                   cur[AOMMIN(width - 1, x + 1)],
+                                   cur[AOMMIN(width - 1, x + 2)], below[x],
+                                   strength);
    }
  }
 }

-#define BS MI_SIZE *MI_BLOCK_SIZE
+// Runs the constrained low-pass filter (CLPF) over the luma plane of a frame.
+//
+// The frame is walked in square filter blocks of (1 << fb_size_log2) samples,
+// each made up of bs x bs (bs = MI_BLOCK_SIZE) sub-blocks.  A filter block
+// whose visible sub-blocks are all skip coded is copied from rec to dst
+// unchanged.  Otherwise, when enable_fb_flag is set, decision() is asked
+// whether to filter the block and is handed blocks + (number of non-skip
+// filter blocks seen so far); when enable_fb_flag is clear the block is always
+// filtered.  Inside a filtered block only the non-skip sub-blocks are
+// filtered, the skip ones are copied.
+//
+// dst and rec are both addressed with rec->y_stride.  org is not read here,
+// it is only forwarded to decision().
+//
+// Returns the number of filter blocks that were not entirely skip coded.
+int av1_clpf_frame(const YV12_BUFFER_CONFIG *dst, const YV12_BUFFER_CONFIG *rec,
+                   const YV12_BUFFER_CONFIG *org, const AV1_COMMON *cm,
+                   int enable_fb_flag, unsigned int strength,
+                   unsigned int fb_size_log2, uint8_t *blocks,
+                   int (*decision)(int, int, const YV12_BUFFER_CONFIG *,
+                                   const YV12_BUFFER_CONFIG *,
+                                   const AV1_COMMON *cm, int, int, int,
+                                   unsigned int, unsigned int, uint8_t *)) {
+  /* Constrained low-pass filter (CLPF) */
+  int c, k, l, m, n;
+  int width = rec->y_crop_width;
+  int height = rec->y_crop_height;
+  int xpos, ypos;
+  int stride_y = rec->y_stride;
+  const int bs = MI_BLOCK_SIZE;
+  int num_fb_hor = (width + (1 << fb_size_log2) - bs) >> fb_size_log2;
+  int num_fb_ver = (height + (1 << fb_size_log2) - bs) >> fb_size_log2;
+  int block_index = 0;

-// Iterate over blocks within a superblock
-static void av1_clpf_sb(const YV12_BUFFER_CONFIG *frame_buffer,
-                        const AV1_COMMON *cm, MACROBLOCKD *xd,
-                        MODE_INFO *const *mi_8x8, int xpos, int ypos) {
-  // Temporary buffer (to allow SIMD parallelism)
-  uint8_t buf_unaligned[BS * BS + 15];
-  uint8_t *buf = (uint8_t *)(((intptr_t)buf_unaligned + 15) & ~15);
-  int x, y, p;
-
-  for (p = 0; p < (CLPF_FILTER_ALL_PLANES ? MAX_MB_PLANE : 1); p++) {
-    for (y = 0; y < MI_BLOCK_SIZE && ypos + y < cm->mi_rows; y++) {
-      for (x = 0; x < MI_BLOCK_SIZE && xpos + x < cm->mi_cols; x++) {
-        const MB_MODE_INFO *mbmi =
-            &mi_8x8[(ypos + y) * cm->mi_stride + xpos + x]->mbmi;
-
-        // Do not filter if there is no residual
-        if (!mbmi->skip) {
-          // Do not filter frame edges
-          int has_top = ypos + y > 0;
-          int has_left = xpos + x > 0;
-          int has_bottom = ypos + y < cm->mi_rows - 1;
-          int has_right = xpos + x < cm->mi_cols - 1;
-#if CLPF_ALLOW_BLOCK_PARALLELISM
-          // Do not filter superblock edges
-          has_top &= !!y;
-          has_left &= !!x;
-          has_bottom &= y != MI_BLOCK_SIZE - 1;
-          has_right &= x != MI_BLOCK_SIZE - 1;
-#endif
-          av1_setup_dst_planes(xd->plane, frame_buffer, ypos + y, xpos + x);
-          clpf_block(
-              xd->plane[p].dst.buf, CLPF_ALLOW_PIXEL_PARALLELISM
-                                        ? buf + y * MI_SIZE * BS + x * MI_SIZE
-                                        : xd->plane[p].dst.buf,
-              xd->plane[p].dst.stride,
-              CLPF_ALLOW_PIXEL_PARALLELISM ? BS : xd->plane[p].dst.stride,
-              has_top, has_left, has_bottom, has_right,
-              MI_SIZE >> xd->plane[p].subsampling_x,
-              MI_SIZE >> xd->plane[p].subsampling_y);
+  // Iterate over all filter blocks
+  for (k = 0; k < num_fb_ver; k++) {
+    for (l = 0; l < num_fb_hor; l++) {
+      int h, w;
+      int allskip = 1;
+      // A filter block is "all skip" when every sub-block that starts inside
+      // the visible frame is skip coded.
+      for (m = 0; allskip && m < (1 << fb_size_log2) / bs; m++) {
+        for (n = 0; allskip && n < (1 << fb_size_log2) / bs; n++) {
+          xpos = (l << fb_size_log2) + n * bs;
+          ypos = (k << fb_size_log2) + m * bs;
+          if (xpos < width && ypos < height) {
+            allskip &=
+                cm->mi_grid_visible[ypos / bs * cm->mi_stride + xpos / bs]
+                    ->mbmi.skip;
+          }
        }
      }
-    }
-#if CLPF_ALLOW_PIXEL_PARALLELISM
-    for (y = 0; y < MI_BLOCK_SIZE && ypos + y < cm->mi_rows; y++) {
-      for (x = 0; x < MI_BLOCK_SIZE && xpos + x < cm->mi_cols; x++) {
-        const MB_MODE_INFO *mbmi =
-            &mi_8x8[(ypos + y) * cm->mi_stride + xpos + x]->mbmi;
-        av1_setup_dst_planes(xd->plane, frame_buffer, ypos + y, xpos + x);
-        if (!mbmi->skip) {
-          int i = 0;
-          for (i = 0; i<MI_SIZE>> xd->plane[p].subsampling_y; i++)
-            memcpy(xd->plane[p].dst.buf + i * xd->plane[p].dst.stride,
-                   buf + (y * MI_SIZE + i) * BS + x * MI_SIZE,
-                   MI_SIZE >> xd->plane[p].subsampling_x);
+
+      // Calculate the actual filter block size near frame edges
+      h = AOMMIN(height, (k + 1) << fb_size_log2) & ((1 << fb_size_log2) - 1);
+      w = AOMMIN(width, (l + 1) << fb_size_log2) & ((1 << fb_size_log2) - 1);
+      // A remainder of zero means the block is full size.
+      h += !h << fb_size_log2;
+      w += !w << fb_size_log2;
+      if (!allskip &&  // Do not filter the block if all is skip encoded
+          (!enable_fb_flag ||
+           decision(k, l, rec, org, cm, bs, w / bs, h / bs, strength,
+                    fb_size_log2, blocks + block_index))) {
+        // Iterate over all smaller blocks inside the filter block
+        for (m = 0; m < (h + bs - 1) / bs; m++) {
+          for (n = 0; n < (w + bs - 1) / bs; n++) {
+            xpos = (l << fb_size_log2) + n * bs;
+            ypos = (k << fb_size_log2) + m * bs;
+            if (!cm->mi_grid_visible[ypos / bs * cm->mi_stride + xpos / bs]
+                     ->mbmi.skip) {
+              // Not skip block, apply the filter
+              clpf_block(rec->y_buffer, dst->y_buffer, stride_y, xpos, ypos, bs,
+                         bs, width, height, strength);
+            } else {  // Skip block, copy instead
+              // Copy bs bytes per row with memcpy: no alignment or aliasing
+              // assumptions about the sample buffers are needed.
+              for (c = 0; c < bs; c++)
+                memcpy(dst->y_buffer + (ypos + c) * stride_y + xpos,
+                       rec->y_buffer + (ypos + c) * stride_y + xpos, bs);
+            }
+          }
        }
+      } else {  // Entire filter block is skip, copy
+        for (m = 0; m < h; m++)
+          memcpy(dst->y_buffer + ((k << fb_size_log2) + m) * stride_y +
+                     (l << fb_size_log2),
+                 rec->y_buffer + ((k << fb_size_log2) + m) * stride_y +
+                     (l << fb_size_log2),
+                 w);
      }
+      // Every filter block that is not all skip owns one entry of blocks[],
+      // whether or not it ended up being filtered.
+      block_index += !allskip;
    }
-#endif
  }
-}

-// Iterate over the superblocks of an entire frame
-void av1_clpf_frame(const YV12_BUFFER_CONFIG *frame, const AV1_COMMON *cm,
-                    MACROBLOCKD *xd) {
-  int x, y;
-
-  for (y = 0; y < cm->mi_rows; y += MI_BLOCK_SIZE)
-    for (x = 0; x < cm->mi_cols; x += MI_BLOCK_SIZE)
-      av1_clpf_sb(frame, cm, xd, cm->mi_grid_visible, x, y);
+  return block_index;
 }
--- a/av1/common/clpf.h
+++ b/av1/common/clpf.h
@ -13,15 +13,17 @@

 #include "av1/common/reconinter.h"

-// Configuration
-#define CLPF_ALLOW_PIXEL_PARALLELISM \
-  1  // 1 = SIMD friendly (adds a buffer requirement)
-#define CLPF_ALLOW_BLOCK_PARALLELISM \
-  0  // 1 = MT friendly (degrades quality slighty)
-#define CLPF_FILTER_ALL_PLANES \
-  0  // 1 = filter both luma and chroma, 0 = filter only luma
+#define MAX_FB_SIZE 128

-void av1_clpf_frame(const YV12_BUFFER_CONFIG *frame, const AV1_COMMON *cm,
-                    MACROBLOCKD *xd);
+int av1_clpf_maxbits(const AV1_COMMON *cm);
+int av1_clpf_sample(int X, int A, int B, int C, int D, int E, int F, int b);
+int av1_clpf_frame(const YV12_BUFFER_CONFIG *dst, const YV12_BUFFER_CONFIG *rec,
+                   const YV12_BUFFER_CONFIG *org, const AV1_COMMON *cm,
+                   int enable_fb_flag, unsigned int strength,
+                   unsigned int fb_size_log2, uint8_t *blocks,
+                   int (*decision)(int, int, const YV12_BUFFER_CONFIG *,
+                                   const YV12_BUFFER_CONFIG *,
+                                   const AV1_COMMON *cm, int, int, int,
+                                   unsigned int, unsigned int, uint8_t *));

 #endif
--- a/av1/common/onyxc_int.h
+++ b/av1/common/onyxc_int.h
@ -147,7 +147,10 @@ typedef struct AV1Common {
 #endif

 #if CONFIG_CLPF
-  int clpf;
+  int clpf_numblocks;
+  int clpf_size;
+  int clpf_strength;
+  uint8_t *clpf_blocks;
 #endif

  YV12_BUFFER_CONFIG *frame_to_show;
--- a/av1/decoder/decodeframe.c
+++ b/av1/decoder/decodeframe.c
@ -817,7 +817,26 @@ static void setup_loopfilter(struct loopfilter *lf,

 #if CONFIG_CLPF
 static void setup_clpf(AV1_COMMON *cm, struct aom_read_bit_buffer *rb) {
-  cm->clpf = aom_rb_read_literal(rb, 1);
+  // Header layout: 2 bits strength; if non-zero, 2 bits size; if that is
+  // non-zero as well, a block count followed by one flag bit per block.
+  int idx;
+
+  cm->clpf_blocks = 0;
+  cm->clpf_strength = aom_rb_read_literal(rb, 2);
+  if (!cm->clpf_strength) return;  // Nothing else is coded
+
+  cm->clpf_size = aom_rb_read_literal(rb, 2);
+  if (!cm->clpf_size) return;  // No per-block flags are coded
+
+  // NOTE(review): the count comes straight from the bitstream and is not
+  // checked here against the number of non-skip filter blocks in the frame
+  // -- confirm that readers of clpf_blocks cannot index past it.
+  cm->clpf_numblocks = aom_rb_read_literal(rb, av1_clpf_maxbits(cm));
+  CHECK_MEM_ERROR(cm, cm->clpf_blocks, aom_malloc(cm->clpf_numblocks));
+  for (idx = 0; idx < cm->clpf_numblocks; idx++)
+    cm->clpf_blocks[idx] = aom_rb_read_literal(rb, 1);
+}
+
+// Decision callback with the signature av1_clpf_frame() expects.  It only
+// reports the flag stored at *bit; every other argument is ignored.
+static int clpf_bit(int k, int l, const YV12_BUFFER_CONFIG *rec,
+                    const YV12_BUFFER_CONFIG *org, const AV1_COMMON *cm,
+                    int block_size, int w, int h, unsigned int strength,
+                    unsigned int fb_size_log2, uint8_t *bit) {
+  (void)k;
+  (void)l;
+  (void)rec;
+  (void)org;
+  (void)cm;
+  (void)block_size;
+  (void)w;
+  (void)h;
+  (void)strength;
+  (void)fb_size_log2;
+  return *bit;
 }
 #endif

@ -2240,8 +2259,22 @@ void av1_decode_frame(AV1Decoder *pbi, const uint8_t *data,
  }

 #if CONFIG_CLPF
-  if (cm->clpf && !cm->skip_loop_filter)
-    av1_clpf_frame(&pbi->cur_buf->buf, cm, &pbi->mb);
+  if (cm->clpf_strength && !cm->skip_loop_filter) {
+    YV12_BUFFER_CONFIG dst;  // Buffer for the result
+
+    dst = pbi->cur_buf->buf;
+    CHECK_MEM_ERROR(cm, dst.y_buffer, aom_malloc(dst.y_stride * dst.y_height));
+
+    av1_clpf_frame(&dst, &pbi->cur_buf->buf, 0, cm, !!cm->clpf_size,
+                   cm->clpf_strength + (cm->clpf_strength == 3),
+                   4 + cm->clpf_size, cm->clpf_blocks, clpf_bit);
+
+    // Copy result
+    memcpy(pbi->cur_buf->buf.y_buffer, dst.y_buffer,
+           dst.y_height * dst.y_stride);
+    aom_free(dst.y_buffer);
+  }
+  if (cm->clpf_blocks) aom_free(cm->clpf_blocks);
 #endif
 #if CONFIG_DERING
  if (cm->dering_level && !cm->skip_loop_filter) {
--- a/av1/encoder/bitstream.c
+++ b/av1/encoder/bitstream.c
@ -1091,7 +1091,22 @@ static void encode_loopfilter(struct loopfilter *lf,

 #if CONFIG_CLPF
 static void encode_clpf(const AV1_COMMON *cm, struct aom_write_bit_buffer *wb) {
-  aom_wb_write_literal(wb, cm->clpf, 1);
+  // Header layout: 2 bits strength; if non-zero, 2 bits size; if that is
+  // non-zero as well, the block count followed by one flag bit per block.
+  int idx;
+
+  aom_wb_write_literal(wb, cm->clpf_strength, 2);
+  if (!cm->clpf_strength) return;
+
+  aom_wb_write_literal(wb, cm->clpf_size, 2);
+  if (!cm->clpf_size) return;
+
+  // TODO(stemidts): The number of bits to transmit could be
+  // implicitly deduced if transmitted after the filter block or
+  // after the frame (when it's known whether the block is all
+  // skip and implicitly unfiltered).  And the bits do not have
+  // 50% probability, so a more efficient coding is possible.
+  aom_wb_write_literal(wb, cm->clpf_numblocks, av1_clpf_maxbits(cm));
+  for (idx = 0; idx < cm->clpf_numblocks; idx++)
+    aom_wb_write_literal(wb, cm->clpf_blocks[idx], 1);
 }
 #endif

--- a/av1/encoder/clpf_rdo.c
+++ b/av1/encoder/clpf_rdo.c
@ -0,0 +1,221 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "av1/common/clpf.h"
+#include "aom/aom_integer.h"
+#include "av1/common/quant_common.h"
+
+// Calculate the error of a filtered and unfiltered block
+// Adds, for the 8x8 block at (x0, y0), the squared error against org of the
+// unfiltered samples to *sum0 and of the samples filtered with the given
+// strength to *sum1.  rec is addressed with stride, org with so.  Neighbour
+// coordinates are clamped to [0, width - 1] and [0, height - 1].
+static void detect_clpf(const uint8_t *rec, const uint8_t *org, int x0, int y0,
+                        int width, int height, int so, int stride, int *sum0,
+                        int *sum1, unsigned int strength) {
+  int row, col;
+  for (row = y0; row < y0 + 8; row++) {
+    const uint8_t *const cur = rec + row * stride;
+    const uint8_t *const above = rec + AOMMAX(0, row - 1) * stride;
+    const uint8_t *const below = rec + AOMMIN(height - 1, row + 1) * stride;
+    const uint8_t *const src = org + row * so;
+    for (col = x0; col < x0 + 8; col++) {
+      const int O = src[col];
+      const int X = cur[col];
+      const int Y =
+          X + av1_clpf_sample(X, above[col], cur[AOMMAX(0, col - 2)],
+                              cur[AOMMAX(0, col - 1)],
+                              cur[AOMMIN(width - 1, col + 1)],
+                              cur[AOMMIN(width - 1, col + 2)], below[col],
+                              strength);
+      const int err_unfiltered = O - X;
+      const int err_filtered = O - Y;
+      *sum0 += err_unfiltered * err_unfiltered;
+      *sum1 += err_filtered * err_filtered;
+    }
+  }
+}
+
+// Adds, for the 8x8 block at (x0, y0), the squared error against org to
+// sum[0] for the unfiltered samples and to sum[1], sum[2], sum[3] for the
+// samples filtered with strength 1, 2 and 4 respectively.  rec is addressed
+// with stride, org with so.
+static void detect_multi_clpf(const uint8_t *rec, const uint8_t *org, int x0,
+                              int y0, int width, int height, int so, int stride,
+                              int *sum) {
+  static const int strengths[3] = { 1, 2, 4 };
+  int row, col, s;
+
+  for (row = y0; row < y0 + 8; row++) {
+    const uint8_t *const cur = rec + row * stride;
+    const uint8_t *const above = rec + AOMMAX(0, row - 1) * stride;
+    const uint8_t *const below = rec + AOMMIN(height - 1, row + 1) * stride;
+    const uint8_t *const src = org + row * so;
+    for (col = x0; col < x0 + 8; col++) {
+      const int O = src[col];
+      const int X = cur[col];
+      const int A = above[col];
+      const int B = cur[AOMMAX(0, col - 2)];
+      const int C = cur[AOMMAX(0, col - 1)];
+      const int D = cur[AOMMIN(width - 1, col + 1)];
+      const int E = cur[AOMMIN(width - 1, col + 2)];
+      const int F = below[col];
+      sum[0] += (O - X) * (O - X);
+      for (s = 0; s < 3; s++) {
+        const int Y = X + av1_clpf_sample(X, A, B, C, D, E, F, strengths[s]);
+        sum[s + 1] += (O - Y) * (O - Y);
+      }
+    }
+  }
+}
+
+// Decides whether filter block (k, l) should be filtered.  The squared error
+// against org is summed over the non-skip sub-blocks of the w x h grid, once
+// unfiltered and once filtered with the given strength.  The verdict (1 when
+// filtering gives the strictly smaller error, else 0) is stored in *res and
+// also returned.
+int av1_clpf_decision(int k, int l, const YV12_BUFFER_CONFIG *rec,
+                      const YV12_BUFFER_CONFIG *org, const AV1_COMMON *cm,
+                      int block_size, int w, int h, unsigned int strength,
+                      unsigned int fb_size_log2, uint8_t *res) {
+  const int bs = MI_BLOCK_SIZE;
+  const int fb_x = l << fb_size_log2;
+  const int fb_y = k << fb_size_log2;
+  int sse_unfiltered = 0;
+  int sse_filtered = 0;
+  int row, col;
+
+  for (row = 0; row < h; row++) {
+    const int ypos = fb_y + row * block_size;
+    for (col = 0; col < w; col++) {
+      const int xpos = fb_x + col * block_size;
+      // Skip coded sub-blocks are never filtered, so they do not vote.
+      if (cm->mi_grid_visible[ypos / bs * cm->mi_stride + xpos / bs]
+              ->mbmi.skip)
+        continue;
+      detect_clpf(rec->y_buffer, org->y_buffer, xpos, ypos, rec->y_crop_width,
+                  rec->y_crop_height, org->y_stride, rec->y_stride,
+                  &sse_unfiltered, &sse_filtered, strength);
+    }
+  }
+  *res = sse_filtered < sse_unfiltered;
+  return *res;
+}
+
+// Calculate the square error of all filter settings.  Result:
+// res[0][0]   : unfiltered
+// res[0][1-3] : strength=1,2,4, no signals
+// res[1][0]   : (bit count, fb size = 128)
+// res[1][1-3] : strength=1,2,4, fb size = 128
+// res[2][0]   : (bit count, fb size = 64)
+// res[2][1-3] : strength=1,2,4, fb size = 64
+// res[3][0]   : (bit count, fb size = 32)
+// res[3][1-3] : strength=1,2,4, fb size = 32
+//
+// The function recurses over a quadtree: each level halves the block, lets
+// the four quadrants add their errors to res, and then keeps, per strength,
+// the smaller of "this block unfiltered" and "this block filtered" for the
+// row of res that belongs to this level.  The smallest level sums the errors
+// of the non-skip block_size x block_size blocks in its w x h grid.
+//
+// Returns non-zero when at least one non-skip block was found in the area.
+static int clpf_rdo(int y, int x, const YV12_BUFFER_CONFIG *rec,
+                    const YV12_BUFFER_CONFIG *org, const AV1_COMMON *cm,
+                    unsigned int block_size, unsigned int fb_size_log2, int w,
+                    int h, int64_t res[4][4]) {
+  int i, m, n, filtered = 0;
+  int sum[4];
+  int bslog = get_msb(block_size);
+  sum[0] = sum[1] = sum[2] = sum[3] = 0;
+  if (fb_size_log2 > (unsigned int)get_msb(MAX_FB_SIZE) - 3) {
+    int w1, h1, w2, h2, level;
+    // Saved running totals.  They must be as wide as res[][]: the totals
+    // cover every block visited so far and would be truncated by an int.
+    int64_t sum1, sum2, sum3, oldfiltered;
+
+    fb_size_log2--;
+    w1 = AOMMIN(1 << (fb_size_log2 - bslog), w);
+    h1 = AOMMIN(1 << (fb_size_log2 - bslog), h);
+    w2 = AOMMIN(w - (1 << (fb_size_log2 - bslog)), w >> 1);
+    h2 = AOMMIN(h - (1 << (fb_size_log2 - bslog)), h >> 1);
+    level = get_msb(MAX_FB_SIZE) - fb_size_log2;
+    sum1 = res[level][1];
+    sum2 = res[level][2];
+    sum3 = res[level][3];
+    oldfiltered = res[level][0];
+    // Column 0 is borrowed to collect the unfiltered error of this block.
+    res[level][0] = 0;
+
+    filtered =
+        clpf_rdo(y, x, rec, org, cm, block_size, fb_size_log2, w1, h1, res);
+    if (1 << (fb_size_log2 - bslog) < w)
+      filtered |= clpf_rdo(y, x + (1 << fb_size_log2), rec, org, cm, block_size,
+                           fb_size_log2, w2, h1, res);
+    if (1 << (fb_size_log2 - bslog) < h) {
+      filtered |= clpf_rdo(y + (1 << fb_size_log2), x, rec, org, cm, block_size,
+                           fb_size_log2, w1, h2, res);
+      filtered |= clpf_rdo(y + (1 << fb_size_log2), x + (1 << fb_size_log2),
+                           rec, org, cm, block_size, fb_size_log2, w2, h2, res);
+    }
+
+    // Per strength: previous total plus the better of unfiltered/filtered.
+    res[level][1] = AOMMIN(sum1 + res[level][0], res[level][1]);
+    res[level][2] = AOMMIN(sum2 + res[level][0], res[level][2]);
+    res[level][3] = AOMMIN(sum3 + res[level][0], res[level][3]);
+    res[level][0] = oldfiltered + filtered;  // Number of signal bits
+    return filtered;
+  }
+
+  for (m = 0; m < h; m++) {
+    for (n = 0; n < w; n++) {
+      int xpos = x + n * block_size;
+      int ypos = y + m * block_size;
+      if (!cm->mi_grid_visible[ypos / MI_BLOCK_SIZE * cm->mi_stride +
+                               xpos / MI_BLOCK_SIZE]
+               ->mbmi.skip) {
+        detect_multi_clpf(rec->y_buffer, org->y_buffer, xpos, ypos,
+                          rec->y_crop_width, rec->y_crop_height, org->y_stride,
+                          rec->y_stride, sum);
+        filtered = 1;
+      }
+    }
+  }
+
+  // The same errors are added to every row; the callers further up replace
+  // the entries of their own row afterwards.
+  for (i = 0; i < 4; i++) {
+    res[i][0] += sum[0];
+    res[i][1] += sum[1];
+    res[i][2] += sum[2];
+    res[i][3] += sum[3];
+  }
+  return filtered;
+}
+
+// Picks the CLPF strength and filter block size for a whole frame.
+//
+// clpf_rdo() is run over every MAX_FB_SIZE block to fill a 4x4 table of
+// squared errors (row 0: no block signalling, rows 1-3: filter block size
+// 128, 64, 32; column 0: unfiltered / bit count, columns 1-3: strength 1, 2,
+// 4).  A bit cost scaled by a qindex dependent lambda is added to the
+// signalled entries and the cheapest entry wins.
+//
+// *best_strength receives 0 (do not filter), 1, 2 or 4.
+// *best_bs receives 0 (no block signalling) or the log2 of the filter block
+// size: 7, 6 or 5.
+void av1_clpf_test_frame(const YV12_BUFFER_CONFIG *rec,
+                         const YV12_BUFFER_CONFIG *org, const AV1_COMMON *cm,
+                         int *best_strength, int *best_bs) {
+  static const double lambda_square[] = {
+    // exp((i - 15.4244) / 8.4010)
+    0.159451, 0.179607, 0.202310, 0.227884, 0.256690, 0.289138, 0.325687,
+    0.366856, 0.413230, 0.465465, 0.524303, 0.590579, 0.665233, 0.749323,
+    0.844044, 0.950737, 1.070917, 1.206289, 1.358774, 1.530533, 1.724004,
+    1.941931, 2.187406, 2.463911, 2.775368, 3.126195, 3.521370, 3.966498,
+    4.467893, 5.032669, 5.668837, 6.385421, 7.192586, 8.101784, 9.125911,
+    10.27949, 11.57890, 13.04256, 14.69124, 16.54832, 18.64016, 20.99641,
+    23.65052, 26.64013, 30.00764, 33.80084, 38.07352, 42.88630, 48.30746,
+    54.41389, 61.29221, 69.04002, 77.76720, 87.59756, 98.67056, 111.1432,
+    125.1926, 141.0179, 158.8436, 178.9227, 201.5399, 227.0160, 255.7126,
+    288.0366
+  };
+  int i, j, k, l;
+  int64_t best, sums[4][4];
+  int width = rec->y_crop_width, height = rec->y_crop_height;
+  const int bs = MI_BLOCK_SIZE;
+  int fb_size_log2 = get_msb(MAX_FB_SIZE);
+  int num_fb_ver = (height + (1 << fb_size_log2) - bs) >> fb_size_log2;
+  int num_fb_hor = (width + (1 << fb_size_log2) - bs) >> fb_size_log2;
+  double lambda;
+
+  memset(sums, 0, sizeof(sums));
+
+  for (k = 0; k < num_fb_ver; k++) {
+    for (l = 0; l < num_fb_hor; l++) {
+      // Calculate the block size after frame border clipping
+      int h =
+          AOMMIN(height, (k + 1) << fb_size_log2) & ((1 << fb_size_log2) - 1);
+      int w =
+          AOMMIN(width, (l + 1) << fb_size_log2) & ((1 << fb_size_log2) - 1);
+      h += !h << fb_size_log2;
+      w += !w << fb_size_log2;
+      clpf_rdo(k << fb_size_log2, l << fb_size_log2, rec, org, cm, bs,
+               fb_size_log2, w / bs, h / bs, sums);
+    }
+  }
+
+  // The lambda does not depend on the table row, look it up once.
+  lambda = lambda_square[av1_get_qindex(&cm->seg, 0, cm->base_qindex) >> 2];
+
+  for (j = 0; j < 4; j++) {
+    // Estimate the bit costs and adjust the square errors.  Only rows 1-3
+    // hold a bit count in column 0 and only they are charged a cost; row 0
+    // holds a squared error there, which must not be pushed through the
+    // floating point to integer conversion because it may be out of range.
+    const int64_t cost =
+        j ? (int64_t)(1.2 * lambda * (sums[j][0] + 2 + 2) + 0.5) : 0;
+    // The table position goes into the low four bits so that the minimum
+    // search below also yields the position of the minimum.
+    for (i = 0; i < 4; i++)
+      sums[j][i] = ((sums[j][i] + (i && j) * cost) << 4) + j * 4 + i;
+  }
+
+  best = (int64_t)1 << 62;
+  for (i = 0; i < 4; i++)
+    for (j = 0; j < 4; j++)
+      // Column 0 of rows 1-3 is a bit count, not a candidate.
+      if ((!i || j) && sums[i][j] < best) best = sums[i][j];
+  best &= 15;
+  *best_bs = (best > 3) * (5 + (best < 12) + (best < 8));
+  *best_strength = best ? 1 << ((best - 1) & 3) : 0;
+}
--- a/av1/encoder/clpf_rdo.h
+++ b/av1/encoder/clpf_rdo.h
@ -0,0 +1,26 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AV1_ENCODER_CLPF_RDO_H_
+#define AV1_ENCODER_CLPF_RDO_H_
+
+#include "av1/common/reconinter.h"
+
+// Decision callback for av1_clpf_frame(): compares the squared error against
+// org of filter block (k, l) with and without filtering at the given
+// strength.  Stores 1 in *res and returns 1 when filtering gives the smaller
+// error, otherwise stores and returns 0.  w and h are the block dimensions in
+// units of block_size.
+int av1_clpf_decision(int k, int l, const YV12_BUFFER_CONFIG *rec,
+                      const YV12_BUFFER_CONFIG *org, const AV1_COMMON *cm,
+                      int block_size, int w, int h, unsigned int strength,
+                      unsigned int fb_size_log2, uint8_t *res);
+
+// Searches the filter strength and filter block size for a frame.
+// *best_strength is set to 0 (no filtering), 1, 2 or 4; *best_bs is set to 0
+// (no per-block signalling) or to the log2 of the filter block size.
+void av1_clpf_test_frame(const YV12_BUFFER_CONFIG *rec,
+                         const YV12_BUFFER_CONFIG *org, const AV1_COMMON *cm,
+                         int *best_strength, int *best_bs);
+
+#endif  // AV1_ENCODER_CLPF_RDO_H_
--- a/av1/encoder/encoder.c
+++ b/av1/encoder/encoder.c
@ -18,6 +18,7 @@
 #include "av1/common/alloccommon.h"
 #if CONFIG_CLPF
 #include "av1/common/clpf.h"
+#include "av1/encoder/clpf_rdo.h"
 #endif
 #if CONFIG_DERING
 #include "av1/common/dering.h"
@ -2478,6 +2479,47 @@ static void loopfilter_frame(AV1_COMP *cpi, AV1_COMMON *cm) {
      av1_loop_filter_frame(cm->frame_to_show, cm, xd, lf->filter_level, 0, 0);
  }

+#if CONFIG_CLPF
+  cm->clpf_strength = 0;
+  cm->clpf_size = 2;
+  CHECK_MEM_ERROR(
+      cm, cm->clpf_blocks,
+      aom_malloc(((cm->frame_to_show->y_crop_width + 31) & ~31) *
+                     ((cm->frame_to_show->y_crop_height + 31) & ~31) >>
+                 10));
+  if (!is_lossless_requested(&cpi->oxcf)) {
+    // Test CLPF
+    int i, hq = 1;
+    // TODO(yaowu): investigate per-segment CLPF decision and
+    // an optimal threshold, use 80 for now.
+    for (i = 0; i < MAX_SEGMENTS; i++)
+      hq &= av1_get_qindex(&cm->seg, i, cm->base_qindex) < 80;
+
+    // Don't try filter if the entire image is nearly losslessly encoded
+    if (!hq) {
+      // Find the best strength and block size for the entire frame
+      int fb_size_log2, strength;
+      av1_clpf_test_frame(&cpi->last_frame_uf, cpi->Source, cm, &strength,
+                          &fb_size_log2);
+
+      if (!fb_size_log2) fb_size_log2 = get_msb(MAX_FB_SIZE);
+
+      if (!strength) {  // Better to disable for the whole frame?
+        cm->clpf_strength = 0;
+      } else {
+        // Apply the filter using the chosen strength
+        cm->clpf_strength = strength - (strength == 4);
+        cm->clpf_size =
+            fb_size_log2 ? fb_size_log2 - get_msb(MAX_FB_SIZE) + 3 : 0;
+        aom_yv12_copy_frame(cm->frame_to_show, &cpi->last_frame_uf);
+        cm->clpf_numblocks =
+            av1_clpf_frame(cm->frame_to_show, &cpi->last_frame_uf, cpi->Source,
+                           cm, !!cm->clpf_size, strength, 4 + cm->clpf_size,
+                           cm->clpf_blocks, av1_clpf_decision);
+      }
+    }
+  }
+#endif
 #if CONFIG_DERING
  if (is_lossless_requested(&cpi->oxcf)) {
    cm->dering_level = 0;
@ -2488,65 +2530,6 @@ static void loopfilter_frame(AV1_COMP *cpi, AV1_COMMON *cm) {
  }
 #endif  // CONFIG_DERING

-#if CONFIG_CLPF
-  cm->clpf = 0;
-  if (!is_lossless_requested(&cpi->oxcf)) {
-    // Test CLPF
-    int i, hq = 1;
-    uint64_t before, after;
-    // TODO(yaowu): investigate per-segment CLPF decision and
-    // an optimal threshold, use 80 for now.
-    for (i = 0; i < MAX_SEGMENTS; i++)
-      hq &= av1_get_qindex(&cm->seg, i, cm->base_qindex) < 80;
-
-    if (!hq) {  // Don't try filter if the entire image is nearly losslessly
-                // encoded
-#if CLPF_FILTER_ALL_PLANES
-      aom_yv12_copy_frame(cm->frame_to_show, &cpi->last_frame_uf);
-      before =
-          get_sse(cpi->Source->y_buffer, cpi->Source->y_stride,
-                  cm->frame_to_show->y_buffer, cm->frame_to_show->y_stride,
-                  cpi->Source->y_crop_width, cpi->Source->y_crop_height) +
-          get_sse(cpi->Source->u_buffer, cpi->Source->uv_stride,
-                  cm->frame_to_show->u_buffer, cm->frame_to_show->uv_stride,
-                  cpi->Source->uv_crop_width, cpi->Source->uv_crop_height) +
-          get_sse(cpi->Source->v_buffer, cpi->Source->uv_stride,
-                  cm->frame_to_show->v_buffer, cm->frame_to_show->uv_stride,
-                  cpi->Source->uv_crop_width, cpi->Source->uv_crop_height);
-      av1_clpf_frame(cm->frame_to_show, cm, xd);
-      after = get_sse(cpi->Source->y_buffer, cpi->Source->y_stride,
-                      cm->frame_to_show->y_buffer, cm->frame_to_show->y_stride,
-                      cpi->Source->y_crop_width, cpi->Source->y_crop_height) +
-              get_sse(cpi->Source->u_buffer, cpi->Source->uv_stride,
-                      cm->frame_to_show->u_buffer, cm->frame_to_show->uv_stride,
-                      cpi->Source->uv_crop_width, cpi->Source->uv_crop_height) +
-              get_sse(cpi->Source->v_buffer, cpi->Source->uv_stride,
-                      cm->frame_to_show->v_buffer, cm->frame_to_show->uv_stride,
-                      cpi->Source->uv_crop_width, cpi->Source->uv_crop_height);
-#else
-      aom_yv12_copy_y(cm->frame_to_show, &cpi->last_frame_uf);
-      before = get_sse(cpi->Source->y_buffer, cpi->Source->y_stride,
-                       cm->frame_to_show->y_buffer, cm->frame_to_show->y_stride,
-                       cpi->Source->y_crop_width, cpi->Source->y_crop_height);
-      av1_clpf_frame(cm->frame_to_show, cm, xd);
-      after = get_sse(cpi->Source->y_buffer, cpi->Source->y_stride,
-                      cm->frame_to_show->y_buffer, cm->frame_to_show->y_stride,
-                      cpi->Source->y_crop_width, cpi->Source->y_crop_height);
-#endif
-      if (before < after) {
-// No improvement, restore original
-#if CLPF_FILTER_ALL_PLANES
-        aom_yv12_copy_frame(&cpi->last_frame_uf, cm->frame_to_show);
-#else
-        aom_yv12_copy_y(&cpi->last_frame_uf, cm->frame_to_show);
-#endif
-      } else {
-        cm->clpf = 1;
-      }
-    }
-  }
-#endif
-
  aom_extend_frame_inner_borders(cm->frame_to_show);
 }

@ -3649,6 +3632,10 @@ static void encode_frame_to_data_rate(AV1_COMP *cpi, size_t *size,
  }
 #endif  // CONFIG_EXT_REFS

+#if CONFIG_CLPF
+  aom_free(cm->clpf_blocks);
+#endif
+
  if (cm->seg.update_map) update_reference_segmentation_map(cpi);

  if (frame_is_intra_only(cm) == 0) {