From 51b7a998077d56cca5b2171f1dda99f0cc43c7a0 Mon Sep 17 00:00:00 2001
From: Jean-Marc Valin <jmvalin@jmvalin.ca>
Date: Wed, 24 Feb 2016 19:36:34 -0500
Subject: [PATCH] Adds the Daala deringing filter as experimental

The deringing filter documentation is in:
J.-M. Valin, The Daala Directional Deringing Filter, arXiv:1602.05975 [cs.MM],
2016.
http://arxiv.org/pdf/1602.05975.pdf

ntt-short1 results:
          MEDIUM (%) HIGH (%)
    PSNR  -2.488088 -2.003235
 PSNRHVS  -1.588932 -0.797850
    SSIM  -1.522767 -1.287861
FASTSSIM   4.307822  3.983496

subset1 improvement is around 2-3% on PSNR (but the rate is mostly outside
the AWCY testing range)

Change-Id: Ic02344ce9faa509f5c3a50a0fb7a7b84a7977e72
---
 configure                  |   1 +
 vp10/common/blockd.h       |   2 +
 vp10/common/dering.c       | 153 +++++++++++++++++
 vp10/common/dering.h       |  31 ++++
 vp10/common/od_dering.c    | 343 +++++++++++++++++++++++++++++++++++++
 vp10/common/od_dering.h    |  83 +++++++++
 vp10/common/odintrin.h     |  29 ++++
 vp10/common/onyxc_int.h    |   3 +
 vp10/decoder/decodeframe.c |  29 ++++
 vp10/encoder/bitstream.c   |  21 +++
 vp10/encoder/encoder.c     |  13 ++
 vp10/encoder/pickdering.c  | 180 +++++++++++++++++++
 vp10/vp10_common.mk        |   4 +
 vp10/vp10cx.mk             |   1 +
 14 files changed, 893 insertions(+)
 create mode 100644 vp10/common/dering.c
 create mode 100644 vp10/common/dering.h
 create mode 100644 vp10/common/od_dering.c
 create mode 100644 vp10/common/od_dering.h
 create mode 100644 vp10/common/odintrin.h
 create mode 100644 vp10/encoder/pickdering.c

diff --git a/configure b/configure
index 5f8e2e32a..22d13d796 100755
--- a/configure
+++ b/configure
@@ -251,6 +251,7 @@ EXPERIMENT_LIST="
     emulate_hardware
     misc_fixes
     clpf
+    dering
 "
 CONFIG_LIST="
     dependency_tracking
diff --git a/vp10/common/blockd.h b/vp10/common/blockd.h
index 213b27ae0..c60453d5d 100644
--- a/vp10/common/blockd.h
+++ b/vp10/common/blockd.h
@@ -88,6 +88,8 @@ typedef struct {
 
   // TODO(slavarnway): Delete and use bmi[3].as_mv[] instead.
   int_mv mv[2];
+  /* deringing gain *per-superblock* */
+  int8_t dering_gain;
 } MB_MODE_INFO;
 
 typedef struct MODE_INFO {
diff --git a/vp10/common/dering.c b/vp10/common/dering.c
new file mode 100644
index 000000000..2d5f1a725
--- /dev/null
+++ b/vp10/common/dering.c
@@ -0,0 +1,153 @@
+/*
+ *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <string.h>
+#include <math.h>
+
+#include "./vpx_scale_rtcd.h"
+#include "vpx/vpx_integer.h"
+#include "vp10/common/dering.h"
+#include "vp10/common/onyxc_int.h"
+#include "vp10/common/reconinter.h"
+#include "vp10/common/od_dering.h"
+
+/* Level for gain index gi, clamped to [gi, MAX_DERING_LEVEL-1]. */
+int compute_level_from_index(int global_level, int gi) {
+  static const double dering_gains[4] = {0, .7, 1, 1.4};
+  int level;
+  if (global_level == 0) return 0;
+  level = (int)floor(.5 + global_level*dering_gains[gi]);
+  return clamp(level, gi, MAX_DERING_LEVEL-1);
+}
+
+/* Returns 1 when every mode-info unit of the superblock at (mi_row, mi_col)
+   that lies inside the frame has its skip flag set, and 0 otherwise. */
+int sb_all_skip(const VP10_COMMON *const cm, int mi_row, int mi_col) {
+  int r;
+  int c;
+  /* Clip the superblock to the frame at the bottom and right edges. */
+  const int maxr = VPXMIN(cm->mi_rows - mi_row, MI_BLOCK_SIZE);
+  const int maxc = VPXMIN(cm->mi_cols - mi_col, MI_BLOCK_SIZE);
+  for (r = 0; r < maxr; r++) {
+    const int row_base = (mi_row + r)*cm->mi_stride + mi_col;
+    for (c = 0; c < maxc; c++) {
+      /* The first non-skipped unit settles the answer. */
+      if (!cm->mi_grid_visible[row_base + c]->mbmi.skip) return 0;
+    }
+  }
+  return 1;
+}
+
+/* Derings the three planes set up from frame, one superblock at a time. The
+   filter reads an int16 copy (src) of the pixels and writes back via xd. */
+void vp10_dering_frame(YV12_BUFFER_CONFIG *frame, VP10_COMMON *cm,
+                       MACROBLOCKD *xd, int global_level) {
+  int r, c, sbr, sbc, nhsb, nvsb;
+  dering_in *src[3];
+  unsigned char *bskip;
+  int dir[OD_DERING_NBLOCKS][OD_DERING_NBLOCKS] = {{0}};
+  int stride;
+  int bsize[3];
+  int dec[3];
+  int pli;
+  int coeff_shift = VPXMAX(cm->bit_depth - 8, 0);
+  nvsb = (cm->mi_rows + MI_BLOCK_SIZE - 1)/MI_BLOCK_SIZE;
+  nhsb = (cm->mi_cols + MI_BLOCK_SIZE - 1)/MI_BLOCK_SIZE;
+  bskip = vpx_malloc(sizeof(*bskip)*cm->mi_rows*cm->mi_cols);
+  vp10_setup_dst_planes(xd->plane, frame, 0, 0);
+  for (pli = 0; pli < 3; pli++) {
+    dec[pli] = xd->plane[pli].subsampling_x;
+    bsize[pli] = 8 >> dec[pli];
+  }
+  stride = bsize[0]*cm->mi_cols;
+  for (pli = 0; pli < 3; pli++) {
+    src[pli] = vpx_malloc(sizeof(*src[pli])*cm->mi_rows*cm->mi_cols*64);
+    for (r = 0; r < bsize[pli]*cm->mi_rows; ++r) {
+      for (c = 0; c < bsize[pli]*cm->mi_cols; ++c) {
+#if CONFIG_VPX_HIGHBITDEPTH
+        if (cm->use_highbitdepth) {
+          src[pli][r * stride + c] =
+              CONVERT_TO_SHORTPTR(xd->plane[pli].dst.buf)
+              [r * xd->plane[pli].dst.stride + c];
+        } else {
+#endif
+          src[pli][r * stride + c] =
+              xd->plane[pli].dst.buf[r * xd->plane[pli].dst.stride + c];
+#if CONFIG_VPX_HIGHBITDEPTH
+        }
+#endif
+      }
+    }
+  }
+  for (r = 0; r < cm->mi_rows; ++r) {
+    for (c = 0; c < cm->mi_cols; ++c) {
+      const MB_MODE_INFO *mbmi =
+          &cm->mi_grid_visible[r * cm->mi_stride + c]->mbmi;
+      bskip[r * cm->mi_cols + c] = mbmi->skip;
+    }
+  }
+  for (sbr = 0; sbr < nvsb; sbr++) {
+    for (sbc = 0; sbc < nhsb; sbc++) {
+      int level;
+      int nhb, nvb;
+      nhb = VPXMIN(MI_BLOCK_SIZE, cm->mi_cols - MI_BLOCK_SIZE*sbc);
+      nvb = VPXMIN(MI_BLOCK_SIZE, cm->mi_rows - MI_BLOCK_SIZE*sbr);
+      for (pli = 0; pli < 3; pli++) {
+        int16_t dst[MI_BLOCK_SIZE*MI_BLOCK_SIZE*8*8];
+        int threshold;
+#if DERING_REFINEMENT
+        level = compute_level_from_index(
+            global_level,
+            cm->mi_grid_visible[MI_BLOCK_SIZE*sbr*cm->mi_stride +
+            MI_BLOCK_SIZE*sbc]->mbmi.dering_gain);
+#else
+        level = global_level;
+#endif
+        /* FIXME: This is a temporary hack that uses more conservative
+           deringing for chroma. */
+        if (pli) level = level*2/3;
+        if (sb_all_skip(cm, sbr*MI_BLOCK_SIZE, sbc*MI_BLOCK_SIZE)) level = 0;
+        threshold = level << coeff_shift;
+        od_dering(
+            &OD_DERING_VTBL_C,
+            dst,
+            MI_BLOCK_SIZE*bsize[pli],
+            &src[pli][sbr*stride*bsize[pli]*MI_BLOCK_SIZE +
+            sbc*bsize[pli]*MI_BLOCK_SIZE],
+            stride, nhb, nvb, sbc, sbr, nhsb, nvsb, dec[pli], dir, pli,
+            &bskip[MI_BLOCK_SIZE*sbr*cm->mi_cols + MI_BLOCK_SIZE*sbc],
+            cm->mi_cols, threshold, OD_DERING_NO_CHECK_OVERLAP, coeff_shift);
+        for (r = 0; r < bsize[pli]*nvb; ++r) {
+          for (c = 0; c < bsize[pli]*nhb; ++c) {
+#if CONFIG_VPX_HIGHBITDEPTH
+            if (cm->use_highbitdepth) {
+              CONVERT_TO_SHORTPTR(xd->plane[pli].dst.buf)
+                  [xd->plane[pli].dst.stride*(bsize[pli]*MI_BLOCK_SIZE*sbr + r)
+                  + sbc*bsize[pli]*MI_BLOCK_SIZE + c] =
+                  dst[r * MI_BLOCK_SIZE * bsize[pli] + c];
+            } else {
+#endif
+              xd->plane[pli].dst.buf[xd->plane[pli].dst.stride*
+                  (bsize[pli]*MI_BLOCK_SIZE*sbr + r) +
+                  sbc*bsize[pli]*MI_BLOCK_SIZE + c] =
+                  dst[r * MI_BLOCK_SIZE * bsize[pli] + c];
+#if CONFIG_VPX_HIGHBITDEPTH
+            }
+#endif
+          }
+        }
+      }
+    }
+  }
+  for (pli = 0; pli < 3; pli++) {
+    vpx_free(src[pli]);
+  }
+  vpx_free(bskip);
+}
diff --git a/vp10/common/dering.h b/vp10/common/dering.h
new file mode 100644
index 000000000..9a292cb28
--- /dev/null
+++ b/vp10/common/dering.h
@@ -0,0 +1,31 @@
+#ifndef VP10_COMMON_DERING_H_
+#define VP10_COMMON_DERING_H_
+
+#include "vp10/common/od_dering.h"
+#include "vp10/common/onyxc_int.h"
+#include "vpx/vpx_integer.h"
+#include "./vpx_config.h"
+#include "vpx_ports/mem.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define DERING_LEVEL_BITS 6
+#define MAX_DERING_LEVEL (1 << DERING_LEVEL_BITS)
+
+#define DERING_REFINEMENT 1
+
+int compute_level_from_index(int global_level, int gi);
+int sb_all_skip(const VP10_COMMON *const cm, int mi_row, int mi_col);
+void vp10_dering_frame(YV12_BUFFER_CONFIG *frame, VP10_COMMON *cm,
+                       MACROBLOCKD *xd, int global_level);
+
+int vp10_dering_search(YV12_BUFFER_CONFIG *frame, const YV12_BUFFER_CONFIG *ref,
+                      VP10_COMMON *cm,
+                      MACROBLOCKD *xd);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+#endif  // VP10_COMMON_DERING_H_
diff --git a/vp10/common/od_dering.c b/vp10/common/od_dering.c
new file mode 100644
index 000000000..5a6b21fa5
--- /dev/null
+++ b/vp10/common/od_dering.c
@@ -0,0 +1,343 @@
+/*Daala video codec
+Copyright (c) 2014-2016 Daala project contributors.  All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+- Redistributions of source code must retain the above copyright notice, this
+  list of conditions and the following disclaimer.
+
+- Redistributions in binary form must reproduce the above copyright notice,
+  this list of conditions and the following disclaimer in the documentation
+  and/or other materials provided with the distribution.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.*/
+
+#ifdef HAVE_CONFIG_H
+# include "config.h"
+#endif
+
+#include <stdlib.h>
+#include <math.h>
+#include "dering.h"
+
+const od_dering_opt_vtbl OD_DERING_VTBL_C = {
+  {od_filter_dering_direction_4x4_c, od_filter_dering_direction_8x8_c},
+  {od_filter_dering_orthogonal_4x4_c, od_filter_dering_orthogonal_8x8_c}
+};
+
+/* Generated from gen_filter_tables.c. */
+const int OD_DIRECTION_OFFSETS_TABLE[8][3] = {
+  {-1*OD_FILT_BSTRIDE + 1, -2*OD_FILT_BSTRIDE + 2, -3*OD_FILT_BSTRIDE + 3  },
+  { 0*OD_FILT_BSTRIDE + 1, -1*OD_FILT_BSTRIDE + 2, -1*OD_FILT_BSTRIDE + 3  },
+  { 0*OD_FILT_BSTRIDE + 1,  0*OD_FILT_BSTRIDE + 2,  0*OD_FILT_BSTRIDE + 3  },
+  { 0*OD_FILT_BSTRIDE + 1,  1*OD_FILT_BSTRIDE + 2,  1*OD_FILT_BSTRIDE + 3  },
+  { 1*OD_FILT_BSTRIDE + 1,  2*OD_FILT_BSTRIDE + 2,  3*OD_FILT_BSTRIDE + 3  },
+  { 1*OD_FILT_BSTRIDE + 0,  2*OD_FILT_BSTRIDE + 1,  3*OD_FILT_BSTRIDE + 1  },
+  { 1*OD_FILT_BSTRIDE + 0,  2*OD_FILT_BSTRIDE + 0,  3*OD_FILT_BSTRIDE + 0  },
+  { 1*OD_FILT_BSTRIDE + 0,  2*OD_FILT_BSTRIDE - 1,  3*OD_FILT_BSTRIDE - 1  },
+};
+
+const double OD_DERING_GAIN_TABLE[OD_DERING_LEVELS] = {
+  0, 0.5, 0.707, 1, 1.41, 2
+};
+
+/* Detect direction. 0 means 45-degree up-right, 2 is horizontal, and so on.
+   The search minimizes the weighted variance along all the lines in a
+   particular direction, i.e. the squared error between the input and a
+   "predicted" block where each pixel is replaced by the average along a line
+   in a particular direction. Since each direction has the same sum(x^2) term,
+   that term is never computed. See Section 2, step 2, of:
+   http://jmvalin.ca/notes/intra_paint.pdf */
+static int od_dir_find8(const dering_in *img, int stride, int32_t *var,
+    int coeff_shift) {
+  int i;
+  int cost[8] = {0};
+  int partial[8][15] = {{0}};
+  int best_cost = 0;
+  int best_dir = 0;
+  for (i = 0; i < 8; i++) {
+    int j;
+    for (j = 0; j < 8; j++) {
+      int x;
+      x = img[i*stride + j] >> coeff_shift;
+      partial[0][i + j] += x;
+      partial[1][i + j/2] += x;
+      partial[2][i] += x;
+      partial[3][3 + i - j/2] += x;
+      partial[4][7 + i - j] += x;
+      partial[5][3 - i/2 + j] += x;
+      partial[6][j] += x;
+      partial[7][i/2 + j] += x;
+    }
+  }
+  for (i = 0; i < 8; i++) {
+    cost[2] += partial[2][i]*partial[2][i] >> 3;
+    cost[6] += partial[6][i]*partial[6][i] >> 3;
+  }
+  for (i = 0; i < 7; i++) {
+    cost[0] += OD_DIVU_SMALL(partial[0][i]*partial[0][i], i + 1)
+     + OD_DIVU_SMALL(partial[0][14 - i]*partial[0][14 - i], i + 1);
+    cost[4] += OD_DIVU_SMALL(partial[4][i]*partial[4][i], i + 1)
+     + OD_DIVU_SMALL(partial[4][14 - i]*partial[4][14 - i], i + 1);
+  }
+  cost[0] += partial[0][7]*partial[0][8 - 1] >> 3;
+  cost[4] += partial[4][7]*partial[4][8 - 1] >> 3;
+  for (i = 1; i < 8; i += 2) {
+    int j;
+    for (j = 0; j < 4 + 1; j++) {
+      cost[i] += partial[i][3 + j]*partial[i][3 + j] >> 3;
+    }
+    for (j = 0; j < 4 - 1; j++) {
+      cost[i] += OD_DIVU_SMALL(partial[i][j]*partial[i][j], 2*j + 2)
+       + OD_DIVU_SMALL(partial[i][10 - j]*partial[i][10 - j], 2*j + 2);
+    }
+  }
+  for (i = 0; i < 8; i++) {
+    if (cost[i] > best_cost) {
+      best_cost = cost[i];
+      best_dir = i;
+    }
+  }
+  /* Difference between the optimal variance and the variance along the
+     orthogonal direction. Again, the sum(x^2) terms cancel out. */
+  *var = best_cost - cost[(best_dir + 4) & 7];
+  return best_dir;
+}
+
+#define OD_DERING_VERY_LARGE (30000)
+#define OD_DERING_INBUF_SIZE ((OD_BSIZE_MAX + 2*OD_FILT_BORDER)*\
+ (OD_BSIZE_MAX + 2*OD_FILT_BORDER))
+
+/* Smooth in the direction detected. */
+void od_filter_dering_direction_c(int16_t *y, int ystride, const int16_t *in,
+ int ln, int threshold, int dir) {
+  int i;
+  int j;
+  int k;
+  static const int taps[3] = {3, 2, 2};
+  for (i = 0; i < 1 << ln; i++) {
+    for (j = 0; j < 1 << ln; j++) {
+      int16_t sum;
+      int16_t xx;
+      int16_t yy;
+      xx = in[i*OD_FILT_BSTRIDE + j];
+      sum= 0;
+      for (k = 0; k < 3; k++) {
+        int16_t p0;
+        int16_t p1;
+        p0 = in[i*OD_FILT_BSTRIDE + j + OD_DIRECTION_OFFSETS_TABLE[dir][k]]
+         - xx;
+        p1 = in[i*OD_FILT_BSTRIDE + j - OD_DIRECTION_OFFSETS_TABLE[dir][k]]
+         - xx;
+        if (abs(p0) < threshold) sum += taps[k]*p0;
+        if (abs(p1) < threshold) sum += taps[k]*p1;
+      }
+      yy = xx + ((sum + 8) >> 4);
+      y[i*ystride + j] = yy;
+    }
+  }
+}
+
+void od_filter_dering_direction_4x4_c(int16_t *y, int ystride,
+ const int16_t *in, int threshold, int dir) {
+  od_filter_dering_direction_c(y, ystride, in, 2, threshold, dir);
+}
+
+void od_filter_dering_direction_8x8_c(int16_t *y, int ystride,
+ const int16_t *in, int threshold, int dir) {
+  od_filter_dering_direction_c(y, ystride, in, 3, threshold, dir);
+}
+
+/* Smooth in the direction orthogonal to what was detected. */
+void od_filter_dering_orthogonal_c(int16_t *y, int ystride, const int16_t *in,
+ const dering_in *x, int xstride, int ln, int threshold, int dir) {
+  int i;
+  int j;
+  int offset;
+  if (dir <= 4) offset = OD_FILT_BSTRIDE;
+  else offset = 1;
+  for (i = 0; i < 1 << ln; i++) {
+    for (j = 0; j < 1 << ln; j++) {
+      int16_t athresh;
+      int16_t yy;
+      int16_t sum;
+      int16_t p;
+      /* Deringing orthogonal to the direction uses a tighter threshold
+         because we want to be conservative. We've presumably already
+         achieved some deringing, so the amount of change is expected
+         to be low. Also, since we might be filtering across an edge, we
+         want to make sure not to blur it. That being said, we might want
+         to be a little bit more aggressive on pure horizontal/vertical
+         since the ringing there tends to be directional, so it doesn't
+         get removed by the directional filtering. */
+      athresh = OD_MINI(threshold, threshold/3
+       + abs(in[i*OD_FILT_BSTRIDE + j] - x[i*xstride + j]));
+      yy = in[i*OD_FILT_BSTRIDE + j];
+      sum = 0;
+      p = in[i*OD_FILT_BSTRIDE + j + offset] - yy;
+      if (abs(p) < athresh) sum += p;
+      p = in[i*OD_FILT_BSTRIDE + j - offset] - yy;
+      if (abs(p) < athresh) sum += p;
+      p = in[i*OD_FILT_BSTRIDE + j + 2*offset] - yy;
+      if (abs(p) < athresh) sum += p;
+      p = in[i*OD_FILT_BSTRIDE + j - 2*offset] - yy;
+      if (abs(p) < athresh) sum += p;
+      y[i*ystride + j] = yy + ((3*sum + 8) >> 4);
+    }
+  }
+}
+
+void od_filter_dering_orthogonal_4x4_c(int16_t *y, int ystride,
+ const int16_t *in, const dering_in *x, int xstride, int threshold, int dir) {
+  od_filter_dering_orthogonal_c(y, ystride, in, x, xstride, 2, threshold, dir);
+}
+
+void od_filter_dering_orthogonal_8x8_c(int16_t *y, int ystride,
+ const int16_t *in, const dering_in *x, int xstride, int threshold, int dir) {
+  od_filter_dering_orthogonal_c(y, ystride, in, x, xstride, 3, threshold, dir);
+}
+
+/* This table approximates x^0.16 with the index being log2(x). It is clamped
+   to [.5, 3]. The table is computed as:
+   round(256*min(3, max(.5, 1.08*(sqrt(2)*2.^([0:17]+8)/256/256).^.16))) */
+static const int16_t OD_THRESH_TABLE_Q8[18] = {
+  128, 134, 150, 168, 188, 210, 234, 262,
+  292, 327, 365, 408, 455, 509, 569, 635,
+  710, 768,
+};
+
+/* Compute deringing filter threshold for each 8x8 block based on the
+   directional variance difference. A high variance difference means that we
+   have a highly directional pattern (e.g. a high contrast edge), so we can
+   apply more deringing. A low variance means that we either have a low
+   contrast edge, or a non-directional texture, so we want to be careful not
+   to blur. */
+static void od_compute_thresh(int thresh[OD_DERING_NBLOCKS][OD_DERING_NBLOCKS],
+ int threshold, int32_t var[OD_DERING_NBLOCKS][OD_DERING_NBLOCKS],
+ int32_t sb_var, int nhb, int nvb) {
+  int bx;
+  int by;
+  for (by = 0; by < nvb; by++) {
+    for (bx = 0; bx < nhb; bx++) {
+      int v1;
+      int v2;
+      /* We use both the variance of 8x8 blocks and the variance of the
+         entire superblock to determine the threshold. */
+      v1 = OD_MINI(32767, var[by][bx] >> 6);
+      v2 = OD_MINI(32767, sb_var/(OD_BSIZE_MAX*OD_BSIZE_MAX));
+      thresh[by][bx] = threshold*OD_THRESH_TABLE_Q8[OD_CLAMPI(0,
+       OD_ILOG(v1*v2) - 9, 17)] >> 8;
+    }
+  }
+}
+
+void od_dering(const od_dering_opt_vtbl *vtbl, int16_t *y, int ystride,
+ const dering_in *x, int xstride, int nhb, int nvb, int sbx, int sby, int nhsb,
+ int nvsb, int xdec, int dir[OD_DERING_NBLOCKS][OD_DERING_NBLOCKS], int pli,
+ unsigned char *bskip, int skip_stride, int threshold, int overlap,
+ int coeff_shift) {
+  int i;
+  int j;
+  int bx;
+  int by;
+  int16_t inbuf[OD_DERING_INBUF_SIZE];
+  int16_t *in;
+  int bsize;
+  int varsum = 0;
+  int32_t var[OD_DERING_NBLOCKS][OD_DERING_NBLOCKS];
+  int thresh[OD_DERING_NBLOCKS][OD_DERING_NBLOCKS];
+  bsize = 3 - xdec;
+  in = inbuf + OD_FILT_BORDER*OD_FILT_BSTRIDE + OD_FILT_BORDER;
+  /* We avoid filtering the pixels for which some of the pixels to average
+     are outside the frame. We could change the filter instead, but it would
+     add special cases for any future vectorization. */
+  for (i = 0; i < OD_DERING_INBUF_SIZE; i++) inbuf[i] = OD_DERING_VERY_LARGE;
+  for (i = -OD_FILT_BORDER*(sby != 0); i < (nvb << bsize)
+   + OD_FILT_BORDER*(sby != nvsb - 1); i++) {
+    for (j = -OD_FILT_BORDER*(sbx != 0); j < (nhb << bsize)
+     + OD_FILT_BORDER*(sbx != nhsb - 1); j++) {
+      in[i*OD_FILT_BSTRIDE + j] = x[i*xstride + j];
+    }
+  }
+  if (pli == 0) {
+    for (by = 0; by < nvb; by++) {
+      for (bx = 0; bx < nhb; bx++) {
+        dir[by][bx] = od_dir_find8(&x[8*by*xstride + 8*bx], xstride,
+         &var[by][bx], coeff_shift);
+        varsum += var[by][bx];
+      }
+    }
+    od_compute_thresh(thresh, threshold, var, varsum, nhb, nvb);
+  }
+  else {
+    for (by = 0; by < nvb; by++) {
+      for (bx = 0; bx < nhb; bx++) {
+        thresh[by][bx] = threshold;
+      }
+    }
+  }
+  for (by = 0; by < nvb; by++) {
+    for (bx = 0; bx < nhb; bx++) {
+      int skip;
+# if defined(DAALA_ODINTRIN)
+      int xstart;
+      int ystart;
+      int xend;
+      int yend;
+      xstart = ystart = 0;
+      xend = yend = (2 >> xdec);
+      if (overlap) {
+        xstart -= (sbx != 0);
+        ystart -= (sby != 0);
+        xend += (sbx != nhsb - 1);
+        yend += (sby != nvsb - 1);
+      }
+      skip = 1;
+      /* We look at whether the current block and its 4x4 surrounding (due to
+         lapping) are skipped to avoid filtering the same content multiple
+         times. */
+      for (i = ystart; i < yend; i++) {
+        for (j = xstart; j < xend; j++) {
+          skip = skip && bskip[((by << 1 >> xdec) + i)*skip_stride
+           + (bx << 1 >> xdec) + j];
+        }
+      }
+#else
+      skip = bskip[by*skip_stride + bx];
+#endif
+      if (skip) thresh[by][bx] = 0;
+    }
+  }
+  for (by = 0; by < nvb; by++) {
+    for (bx = 0; bx < nhb; bx++) {
+      (vtbl->filter_dering_direction[bsize - OD_LOG_BSIZE0])(
+       &y[(by*ystride << bsize) + (bx << bsize)], ystride,
+       &in[(by*OD_FILT_BSTRIDE << bsize) + (bx << bsize)],
+       thresh[by][bx], dir[by][bx]);
+    }
+  }
+  for (i = 0; i < nvb << bsize; i++) {
+    for (j = 0; j < nhb << bsize; j++) {
+      in[i*OD_FILT_BSTRIDE + j] = y[i*ystride + j];
+    }
+  }
+  for (by = 0; by < nvb; by++) {
+    for (bx = 0; bx < nhb; bx++) {
+      (vtbl->filter_dering_orthogonal[bsize - OD_LOG_BSIZE0])(
+       &y[(by*ystride << bsize) + (bx << bsize)], ystride,
+       &in[(by*OD_FILT_BSTRIDE << bsize) + (bx << bsize)],
+       &x[(by*xstride << bsize) + (bx << bsize)], xstride,
+       thresh[by][bx], dir[by][bx]);
+    }
+  }
+}
diff --git a/vp10/common/od_dering.h b/vp10/common/od_dering.h
new file mode 100644
index 000000000..1a77ce0ba
--- /dev/null
+++ b/vp10/common/od_dering.h
@@ -0,0 +1,83 @@
+/*Daala video codec
+Copyright (c) 2003-2010 Daala project contributors.  All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+- Redistributions of source code must retain the above copyright notice, this
+  list of conditions and the following disclaimer.
+
+- Redistributions in binary form must reproduce the above copyright notice,
+  this list of conditions and the following disclaimer in the documentation
+  and/or other materials provided with the distribution.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.*/
+
+#if !defined(_dering_H)
+# define _dering_H (1)
+
+# include "odintrin.h"
+
+# if defined(DAALA_ODINTRIN)
+#  include "filter.h"
+typedef int16_t dering_in;
+# endif
+
+#define OD_DERINGSIZES (2)
+
+#define OD_DERING_NO_CHECK_OVERLAP (0)
+#define OD_DERING_CHECK_OVERLAP (1)
+
+#define OD_DERING_LEVELS (6)
+extern const double OD_DERING_GAIN_TABLE[OD_DERING_LEVELS];
+
+#define OD_DERING_NBLOCKS (OD_BSIZE_MAX/8)
+
+#define OD_FILT_BORDER (3)
+#define OD_FILT_BSTRIDE (OD_BSIZE_MAX + 2*OD_FILT_BORDER)
+
+extern const int OD_DIRECTION_OFFSETS_TABLE[8][3];
+
+typedef void (*od_filter_dering_direction_func)(int16_t *y, int ystride,
+ const int16_t *in, int threshold, int dir);
+typedef void (*od_filter_dering_orthogonal_func)(int16_t *y, int ystride,
+ const int16_t *in, const dering_in *x, int xstride, int threshold, int dir);
+
+struct od_dering_opt_vtbl {
+  od_filter_dering_direction_func filter_dering_direction[OD_DERINGSIZES];
+  od_filter_dering_orthogonal_func filter_dering_orthogonal[OD_DERINGSIZES];
+};
+typedef struct od_dering_opt_vtbl od_dering_opt_vtbl;
+
+/* nhb, nvb: horizontal then vertical block counts, as in the definition. */
+void od_dering(const od_dering_opt_vtbl *vtbl, int16_t *y, int ystride,
+ const dering_in *x, int xstride, int nhb, int nvb, int sbx, int sby, int nhsb,
+ int nvsb, int xdec, int dir[OD_DERING_NBLOCKS][OD_DERING_NBLOCKS], int pli,
+ unsigned char *bskip, int skip_stride, int threshold, int overlap,
+ int coeff_shift);
+void od_filter_dering_direction_c(int16_t *y, int ystride, const int16_t *in,
+ int ln, int threshold, int dir);
+void od_filter_dering_orthogonal_c(int16_t *y, int ystride, const int16_t *in,
+ const dering_in *x, int xstride, int ln, int threshold, int dir);
+
+extern const od_dering_opt_vtbl OD_DERING_VTBL_C;
+
+void od_filter_dering_direction_4x4_c(int16_t *y, int ystride,
+ const int16_t *in, int threshold, int dir);
+void od_filter_dering_direction_8x8_c(int16_t *y, int ystride,
+ const int16_t *in, int threshold, int dir);
+void od_filter_dering_orthogonal_4x4_c(int16_t *y, int ystride,
+ const int16_t *in, const dering_in *x, int xstride, int threshold, int dir);
+void od_filter_dering_orthogonal_8x8_c(int16_t *y, int ystride,
+ const int16_t *in, const dering_in *x, int xstride, int threshold, int dir);
+
+#endif
diff --git a/vp10/common/odintrin.h b/vp10/common/odintrin.h
new file mode 100644
index 000000000..24dd78f5b
--- /dev/null
+++ b/vp10/common/odintrin.h
@@ -0,0 +1,29 @@
+#include "vp10/common/enums.h"
+#include "vpx/vpx_integer.h"
+#include "vpx_dsp/vpx_dsp_common.h"
+#include "vpx_ports/bitops.h"
+
+/*Smallest blocks are 4x4*/
+# define OD_LOG_BSIZE0 (2)
+/*There are 5 block sizes total (4x4, 8x8, 16x16, 32x32 and 64x64).*/
+# define OD_NBSIZES    (5)
+/*The log of the maximum length of the side of a block.*/
+# define OD_LOG_BSIZE_MAX (OD_LOG_BSIZE0 + OD_NBSIZES - 1)
+/*The maximum length of the side of a block.*/
+# define OD_BSIZE_MAX     (1 << OD_LOG_BSIZE_MAX)
+
+typedef int od_coeff;
+
+typedef int16_t dering_in;
+
+#define OD_DIVU_SMALL(_x, _d) ((_x) / (_d))
+
+#define OD_MINI VPXMIN
+#define OD_CLAMPI(min, val, max) clamp((val), (min), (max))
+
+#  define OD_ILOG_NZ(x) get_msb(x)
+/*Note that __builtin_clz is not defined when x == 0, according to the gcc
+   documentation (and that of the x86 BSR instruction that implements it), so
+   we have to special-case it.
+  We define a special version of the macro to use when x can be zero.*/
+#  define OD_ILOG(x) ((x) ? OD_ILOG_NZ(x) : 0)
diff --git a/vp10/common/onyxc_int.h b/vp10/common/onyxc_int.h
index 63cbf1942..c8cfe92e1 100644
--- a/vp10/common/onyxc_int.h
+++ b/vp10/common/onyxc_int.h
@@ -312,6 +312,9 @@ typedef struct VP10Common {
   // - this is intentionally not placed in FRAME_CONTEXT since it's reset upon
   // each keyframe and not used afterwards
   vpx_prob kf_y_prob[INTRA_MODES][INTRA_MODES][INTRA_MODES - 1];
+#if CONFIG_DERING
+  int dering_level;
+#endif
 } VP10_COMMON;
 
 // TODO(hkuang): Don't need to lock the whole pool after implementing atomic
diff --git a/vp10/decoder/decodeframe.c b/vp10/decoder/decodeframe.c
index 5bc5a7a29..c2513379a 100644
--- a/vp10/decoder/decodeframe.c
+++ b/vp10/decoder/decodeframe.c
@@ -14,6 +14,7 @@
 #include "./vp10_rtcd.h"
 #include "./vpx_dsp_rtcd.h"
 #include "./vpx_scale_rtcd.h"
+#include "./vpx_config.h"
 
 #include "vpx_dsp/bitreader_buffer.h"
 #include "vpx_dsp/bitreader.h"
@@ -29,6 +30,9 @@
 #include "vp10/common/clpf.h"
 #endif
 #include "vp10/common/common.h"
+#if CONFIG_DERING
+#include "vp10/common/dering.h"
+#endif  // CONFIG_DERING
 #include "vp10/common/entropy.h"
 #include "vp10/common/entropymode.h"
 #include "vp10/common/idct.h"
@@ -967,6 +971,17 @@ static void decode_partition(VP10Decoder *const pbi, MACROBLOCKD *const xd,
   if (bsize >= BLOCK_8X8 &&
       (bsize == BLOCK_8X8 || partition != PARTITION_SPLIT))
     dec_update_partition_context(xd, mi_row, mi_col, subsize, num_8x8_wh);
+
+#if DERING_REFINEMENT
+  if (bsize == BLOCK_64X64) {
+    if (cm->dering_level != 0 && !sb_all_skip(cm, mi_row, mi_col)) {
+      cm->mi_grid_visible[mi_row*cm->mi_stride + mi_col]->mbmi.dering_gain =
+          vpx_read_literal(r, 2);
+    } else {
+      cm->mi_grid_visible[mi_row*cm->mi_stride + mi_col]->mbmi.dering_gain = 0;
+    }
+  }
+#endif
 }
 
 static void setup_token_decoder(const uint8_t *data, const uint8_t *data_end,
@@ -1103,6 +1118,12 @@ static void setup_clpf(VP10_COMMON *cm, struct vpx_read_bit_buffer *rb) {
 }
 #endif
 
+#if CONFIG_DERING
+static void setup_dering(VP10_COMMON *cm, struct vpx_read_bit_buffer *rb) {
+  cm->dering_level = vpx_rb_read_literal(rb,  DERING_LEVEL_BITS);
+}
+#endif  // CONFIG_DERING
+
 static INLINE int read_delta_q(struct vpx_read_bit_buffer *rb) {
   return vpx_rb_read_bit(rb)
              ? vpx_rb_read_inv_signed_literal(rb, CONFIG_MISC_FIXES ? 6 : 4)
@@ -1603,6 +1624,11 @@ static const uint8_t *decode_tiles(VP10Decoder *pbi, const uint8_t *data,
   if (cm->clpf && !cm->skip_loop_filter)
     vp10_clpf_frame(&pbi->cur_buf->buf, cm, &pbi->mb);
 #endif
+#if CONFIG_DERING
+  if (cm->dering_level && !cm->skip_loop_filter) {
+    vp10_dering_frame(&pbi->cur_buf->buf, cm, &pbi->mb, cm->dering_level);
+  }
+#endif  // CONFIG_DERING
 
   // Get last tile data.
   tile_data = pbi->tile_data + tile_cols * tile_rows - 1;
@@ -2088,6 +2114,9 @@ static size_t read_uncompressed_header(VP10Decoder *pbi,
   setup_loopfilter(&cm->lf, rb);
 #if CONFIG_CLPF
   setup_clpf(cm, rb);
+#endif
+#if CONFIG_DERING
+  setup_dering(cm, rb);
 #endif
   setup_quantization(cm, rb);
 #if CONFIG_VPX_HIGHBITDEPTH
diff --git a/vp10/encoder/bitstream.c b/vp10/encoder/bitstream.c
index ddfa32a98..471d39cf2 100644
--- a/vp10/encoder/bitstream.c
+++ b/vp10/encoder/bitstream.c
@@ -22,6 +22,9 @@
 #if CONFIG_CLPF
 #include "vp10/common/clpf.h"
 #endif
+#if CONFIG_DERING
+#include "vp10/common/dering.h"
+#endif  // CONFIG_DERING
 #include "vp10/common/entropy.h"
 #include "vp10/common/entropymode.h"
 #include "vp10/common/entropymv.h"
@@ -602,6 +605,15 @@ static void write_modes_sb(VP10_COMP *cpi, const TileInfo *const tile,
   if (bsize >= BLOCK_8X8 &&
       (bsize == BLOCK_8X8 || partition != PARTITION_SPLIT))
     update_partition_context(xd, mi_row, mi_col, subsize, bsize);
+
+#if DERING_REFINEMENT
+  if (bsize == BLOCK_64X64 && cm->dering_level != 0 &&
+      !sb_all_skip(cm, mi_row, mi_col)) {
+    vpx_write_literal(
+        w, cm->mi_grid_visible[mi_row*cm->mi_stride + mi_col]->mbmi.dering_gain,
+        2);
+  }
+#endif
 }
 
 static void write_modes(VP10_COMP *cpi, const TileInfo *const tile,
@@ -848,6 +860,12 @@ static void encode_clpf(const VP10_COMMON *cm,
 }
 #endif
 
+#if CONFIG_DERING
+static void encode_dering(int level, struct vpx_write_bit_buffer *wb) {
+  vpx_wb_write_literal(wb, level, DERING_LEVEL_BITS);
+}
+#endif  // CONFIG_DERING
+
 static void write_delta_q(struct vpx_write_bit_buffer *wb, int delta_q) {
   if (delta_q != 0) {
     vpx_wb_write_bit(wb, 1);
@@ -1317,6 +1335,9 @@ static void write_uncompressed_header(VP10_COMP *cpi,
 #if CONFIG_CLPF
   encode_clpf(cm, wb);
 #endif
+#if CONFIG_DERING
+  encode_dering(cm->dering_level, wb);
+#endif  // CONFIG_DERING
   encode_quantization(cm, wb);
   encode_segmentation(cm, xd, wb);
 #if CONFIG_MISC_FIXES
diff --git a/vp10/encoder/encoder.c b/vp10/encoder/encoder.c
index 9857c0ee2..01be7cc70 100644
--- a/vp10/encoder/encoder.c
+++ b/vp10/encoder/encoder.c
@@ -18,6 +18,9 @@
 #if CONFIG_CLPF
 #include "vp10/common/clpf.h"
 #endif
+#if CONFIG_DERING
+#include "vp10/common/dering.h"
+#endif  // CONFIG_DERING
 #include "vp10/common/filter.h"
 #include "vp10/common/idct.h"
 #include "vp10/common/reconinter.h"
@@ -2432,6 +2435,16 @@ static void loopfilter_frame(VP10_COMP *cpi, VP10_COMMON *cm) {
       vp10_loop_filter_frame(cm->frame_to_show, cm, xd, lf->filter_level, 0, 0);
   }
 
+#if CONFIG_DERING
+  if (is_lossless_requested(&cpi->oxcf)) {
+    cm->dering_level = 0;
+  } else {
+    cm->dering_level = vp10_dering_search(cm->frame_to_show, cpi->Source, cm,
+                                          xd);
+    vp10_dering_frame(cm->frame_to_show, cm, xd, cm->dering_level);
+  }
+#endif  // CONFIG_DERING
+
 #if CONFIG_CLPF
   cm->clpf = 0;
   if (!is_lossless_requested(&cpi->oxcf)) {
diff --git a/vp10/encoder/pickdering.c b/vp10/encoder/pickdering.c
new file mode 100644
index 000000000..479ce0c16
--- /dev/null
+++ b/vp10/encoder/pickdering.c
@@ -0,0 +1,180 @@
+/*
+ *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <string.h>
+
+#include "./vpx_scale_rtcd.h"
+#include "vp10/common/dering.h"
+#include "vp10/common/onyxc_int.h"
+#include "vp10/common/reconinter.h"
+#include "vp10/encoder/encoder.h"
+#include "vpx/vpx_integer.h"
+
+/* Returns the sum of squared differences between x and y over 8*nvb rows of
+ * 8*nhb samples each, divided by 2^(2*coeff_shift). */
+static double compute_dist(int16_t *x, int xstride, int16_t *y, int ystride,
+    int nhb, int nvb, int coeff_shift) {
+  int row, col;
+  double acc = 0;
+  for (row = 0; row < nvb << 3; row++) {
+    for (col = 0; col < nhb << 3; col++) {
+      const double diff = x[row*xstride + col] - y[row*ystride + col];
+      acc += diff*diff;
+    }
+  }
+  return acc/(double)(1 << 2*coeff_shift);
+}
+
+/* Searches for the deringing level giving the lowest luma squared error
+ * between the deringed `frame` and the source `ref`. Every level in
+ * [0, MAX_DERING_LEVEL) is tried on every superblock. With DERING_REFINEMENT
+ * the per-superblock gain index is also stored in mbmi.dering_gain.
+ * Returns the selected frame-level value, or 0 when a work buffer cannot be
+ * allocated. */
+int vp10_dering_search(YV12_BUFFER_CONFIG *frame, const YV12_BUFFER_CONFIG *ref,
+                       VP10_COMMON *cm, MACROBLOCKD *xd) {
+  int r, c;
+  int sbr, sbc;
+  int nhsb, nvsb;
+  dering_in *src;
+  int16_t *ref_coeff;
+  unsigned char *bskip;
+  int dir[OD_DERING_NBLOCKS][OD_DERING_NBLOCKS] = {{0}};
+  int stride;
+  int bsize[3];
+  int dec[3];
+  int pli;
+  int (*mse)[MAX_DERING_LEVEL];
+  double tot_mse[MAX_DERING_LEVEL] = {0};
+  int level;
+  int best_level = 0;
+  int global_level;
+  double best_tot_mse = 1e15;
+  int coeff_shift = VPXMAX(cm->bit_depth - 8, 0);
+  vp10_setup_dst_planes(xd->plane, frame, 0, 0);
+  nvsb = (cm->mi_rows + MI_BLOCK_SIZE - 1)/MI_BLOCK_SIZE;
+  nhsb = (cm->mi_cols + MI_BLOCK_SIZE - 1)/MI_BLOCK_SIZE;
+  src = vpx_malloc(sizeof(*src)*cm->mi_rows*cm->mi_cols*64);
+  ref_coeff = vpx_malloc(sizeof(*ref_coeff)*cm->mi_rows*cm->mi_cols*64);
+  bskip = vpx_malloc(sizeof(*bskip)*cm->mi_rows*cm->mi_cols);
+  mse = vpx_malloc(nvsb*nhsb*sizeof(*mse));
+  /* On allocation failure skip the search and fall back to level 0. */
+  if (!src || !ref_coeff || !bskip || !mse) goto cleanup;
+  for (pli = 0; pli < 3; pli++) {
+    dec[pli] = xd->plane[pli].subsampling_x;
+    bsize[pli] = 8 >> dec[pli];
+  }
+  stride = bsize[0]*cm->mi_cols;
+  /* Copy the luma plane of both frames into contiguous work buffers. */
+  for (r = 0; r < bsize[0]*cm->mi_rows; ++r) {
+    for (c = 0; c < bsize[0]*cm->mi_cols; ++c) {
+#if CONFIG_VPX_HIGHBITDEPTH
+      if (cm->use_highbitdepth) {
+        src[r * stride + c] =
+            CONVERT_TO_SHORTPTR(xd->plane[0].dst.buf)
+            [r*xd->plane[0].dst.stride + c];
+        ref_coeff[r * stride + c] =
+            CONVERT_TO_SHORTPTR(ref->y_buffer)[r * ref->y_stride + c];
+      } else {
+#endif
+        src[r * stride + c] =
+            xd->plane[0].dst.buf[r*xd->plane[0].dst.stride + c];
+        ref_coeff[r * stride + c] = ref->y_buffer[r * ref->y_stride + c];
+#if CONFIG_VPX_HIGHBITDEPTH
+      }
+#endif
+    }
+  }
+  /* Gather the skip flag of every mode-info block for od_dering(). */
+  for (r = 0; r < cm->mi_rows; ++r) {
+    for (c = 0; c < cm->mi_cols; ++c) {
+      const MB_MODE_INFO *mbmi =
+          &cm->mi_grid_visible[r * cm->mi_stride + c]->mbmi;
+      bskip[r * cm->mi_cols + c] = mbmi->skip;
+    }
+  }
+  /* Measure the distortion of each superblock at every candidate level. */
+  for (sbr = 0; sbr < nvsb; sbr++) {
+    for (sbc = 0; sbc < nhsb; sbc++) {
+      int nvb, nhb;
+      int16_t dst[MI_BLOCK_SIZE*MI_BLOCK_SIZE*8*8];
+      nhb = VPXMIN(MI_BLOCK_SIZE, cm->mi_cols - MI_BLOCK_SIZE*sbc);
+      nvb = VPXMIN(MI_BLOCK_SIZE, cm->mi_rows - MI_BLOCK_SIZE*sbr);
+      /* mse[][] has MAX_DERING_LEVEL columns, so bound the loop by it. */
+      for (level = 0; level < MAX_DERING_LEVEL; level++) {
+        int threshold = level << coeff_shift;
+        od_dering(
+            &OD_DERING_VTBL_C, dst, MI_BLOCK_SIZE*bsize[0],
+            &src[sbr*stride*bsize[0]*MI_BLOCK_SIZE +
+            sbc*bsize[0]*MI_BLOCK_SIZE],
+            cm->mi_cols*bsize[0], nhb, nvb, sbc, sbr, nhsb, nvsb, 0, dir, 0,
+            &bskip[MI_BLOCK_SIZE*sbr*cm->mi_cols + MI_BLOCK_SIZE*sbc],
+            cm->mi_cols, threshold, OD_DERING_NO_CHECK_OVERLAP, coeff_shift);
+        mse[nhsb*sbr+sbc][level] = compute_dist(
+            dst, MI_BLOCK_SIZE*bsize[0],
+            &ref_coeff[sbr*stride*bsize[0]*MI_BLOCK_SIZE +
+            sbc*bsize[0]*MI_BLOCK_SIZE],
+            stride, nhb, nvb, coeff_shift);
+        tot_mse[level] += mse[nhsb*sbr+sbc][level];
+      }
+    }
+  }
+#if DERING_REFINEMENT
+  /* Search for the best global level one value at a time. */
+  for (global_level = 2; global_level < MAX_DERING_LEVEL; global_level++) {
+    double sum_mse = 0;
+    for (sbr = 0; sbr < nvsb; sbr++) {
+      for (sbc = 0; sbc < nhsb; sbc++) {
+        int gi;
+        int best_mse = mse[nhsb*sbr+sbc][0];
+        for (gi = 1; gi < 4; gi++) {
+          level = compute_level_from_index(global_level, gi);
+          if (mse[nhsb*sbr+sbc][level] < best_mse) {
+            best_mse = mse[nhsb*sbr+sbc][level];
+          }
+        }
+        sum_mse += best_mse;
+      }
+    }
+    if (sum_mse < best_tot_mse) {
+      best_level = global_level;
+      best_tot_mse = sum_mse;
+    }
+  }
+  /* Pick, per superblock, the gain index that works best with best_level. */
+  for (sbr = 0; sbr < nvsb; sbr++) {
+    for (sbc = 0; sbc < nhsb; sbc++) {
+      int gi;
+      int best_gi = 0;
+      int best_mse = mse[nhsb*sbr+sbc][0];
+      for (gi = 1; gi < 4; gi++) {
+        level = compute_level_from_index(best_level, gi);
+        if (mse[nhsb*sbr+sbc][level] < best_mse) {
+          best_gi = gi;
+          best_mse = mse[nhsb*sbr+sbc][level];
+        }
+      }
+      cm->mi_grid_visible[MI_BLOCK_SIZE*sbr*cm->mi_stride + MI_BLOCK_SIZE*sbc]->
+          mbmi.dering_gain = best_gi;
+    }
+  }
+#else
+  /* Without refinement, take the level with the lowest total error. */
+  for (level = 0; level < MAX_DERING_LEVEL; level++) {
+    if (tot_mse[level] < tot_mse[best_level]) best_level = level;
+  }
+#endif
+cleanup:
+  vpx_free(src);
+  vpx_free(ref_coeff);
+  vpx_free(bskip);
+  vpx_free(mse);
+  return best_level;
+}
diff --git a/vp10/vp10_common.mk b/vp10/vp10_common.mk
index dcb2dd915..b01ffd47b 100644
--- a/vp10/vp10_common.mk
+++ b/vp10/vp10_common.mk
@@ -62,6 +62,10 @@ VP10_COMMON_SRCS-yes += common/vp10_fwd_txfm.h
 VP10_COMMON_SRCS-yes += common/vp10_fwd_txfm.c
 VP10_COMMON_SRCS-yes += common/clpf.c
 VP10_COMMON_SRCS-yes += common/clpf.h
+VP10_COMMON_SRCS-yes += common/od_dering.c
+VP10_COMMON_SRCS-yes += common/od_dering.h
+VP10_COMMON_SRCS-yes += common/dering.c
+VP10_COMMON_SRCS-yes += common/dering.h
 
 ifneq ($(CONFIG_VPX_HIGHBITDEPTH),yes)
 VP10_COMMON_SRCS-$(HAVE_DSPR2)  += common/mips/dspr2/itrans4_dspr2.c
diff --git a/vp10/vp10cx.mk b/vp10/vp10cx.mk
index b5a9cc091..3aaa0b0df 100644
--- a/vp10/vp10cx.mk
+++ b/vp10/vp10cx.mk
@@ -79,6 +79,7 @@ VP10_CX_SRCS-yes += encoder/temporal_filter.c
 VP10_CX_SRCS-yes += encoder/temporal_filter.h
 VP10_CX_SRCS-yes += encoder/mbgraph.c
 VP10_CX_SRCS-yes += encoder/mbgraph.h
+VP10_CX_SRCS-yes += encoder/pickdering.c
 
 VP10_CX_SRCS-$(HAVE_SSE2) += encoder/x86/temporal_filter_apply_sse2.asm
 VP10_CX_SRCS-$(HAVE_SSE2) += encoder/x86/quantize_sse2.c