Adds the Daala deringing filter as experimental

The deringing filter documentation is in:
J.-M. Valin, The Daala Directional Deringing Filter,
arXiv:1602.05975 [cs.MM], 2016.
http://arxiv.org/pdf/1602.05975.pdf

ntt-short1 results:
            MEDIUM (%)   HIGH (%)
PSNR        -2.488088   -2.003235
PSNRHVS     -1.588932   -0.797850
SSIM        -1.522767   -1.287861
FASTSSIM     4.307822    3.983496

subset1 improvement is around 2-3% on PSNR (but the rate is mostly
outside the AWCY testing range)

Change-Id: Ic02344ce9faa509f5c3a50a0fb7a7b84a7977e72
2016-02-24 19:36:34 -05:00 · 2016-02-24 19:36:34 -05:00 · 51b7a99807
--- a/1
+++ b/1
@ -251,6 +251,7 @@ EXPERIMENT_LIST="
    emulate_hardware
    misc_fixes
    clpf
+    dering
 "
 CONFIG_LIST="
    dependency_tracking
--- a/vp10/common/blockd.h
+++ b/vp10/common/blockd.h
@ -88,6 +88,8 @@ typedef struct {

  // TODO(slavarnway): Delete and use bmi[3].as_mv[] instead.
  int_mv mv[2];
+  /* deringing gain *per-superblock* */
+  int8_t dering_gain;
 } MB_MODE_INFO;

 typedef struct MODE_INFO {
--- a/vp10/common/dering.c
+++ b/vp10/common/dering.c
@ -0,0 +1,153 @@
+/*
+ *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <string.h>
+#include <math.h>
+
+#include "./vpx_scale_rtcd.h"
+#include "vpx/vpx_integer.h"
+#include "vp10/common/dering.h"
+#include "vp10/common/onyxc_int.h"
+#include "vp10/common/reconinter.h"
+#include "vp10/common/od_dering.h"
+
+
+/* Converts a per-superblock gain index into a deringing level.
+   gi selects one of the four entries of the gain table below. A frame-level
+   setting of zero disables deringing regardless of gi; otherwise the frame
+   level is scaled by the selected gain, rounded to nearest, and limited to
+   the range [gi, MAX_DERING_LEVEL - 1]. */
+int compute_level_from_index(int global_level, int gi) {
+  static const double dering_gains[4] = {0, .7, 1, 1.4};
+  if (global_level != 0) {
+    const int scaled = (int)floor(.5 + global_level*dering_gains[gi]);
+    return clamp(scaled, gi, MAX_DERING_LEVEL-1);
+  }
+  return 0;
+}
+
+/* Returns 1 when every mode-info unit of the superblock whose top-left
+   corner is (mi_row, mi_col) has its skip flag set, and 0 otherwise. The
+   scan is cropped to the frame when the superblock extends past the right
+   or bottom edge. */
+int sb_all_skip(const VP10_COMMON *const cm, int mi_row, int mi_col) {
+  const int rows = VPXMIN(cm->mi_rows - mi_row, MI_BLOCK_SIZE);
+  const int cols = VPXMIN(cm->mi_cols - mi_col, MI_BLOCK_SIZE);
+  int y, x;
+  for (y = 0; y < rows; y++) {
+    const int row_base = (mi_row + y)*cm->mi_stride + mi_col;
+    for (x = 0; x < cols; x++) {
+      /* One coded unit is enough to make the answer "no". */
+      if (!cm->mi_grid_visible[row_base + x]->mbmi.skip) return 0;
+    }
+  }
+  return 1;
+}
+
+/* Applies the deringing filter in place to the three planes of `frame`.
+ *
+ * frame:        reconstructed frame to filter; vp10_setup_dst_planes() binds
+ *               its planes to xd->plane.
+ * cm:           supplies the frame size in mode-info units, the bit depth
+ *               and the per-block skip / dering_gain values.
+ * xd:           its plane[] destination pointers are re-pointed at `frame`.
+ * global_level: frame-level deringing strength; with DERING_REFINEMENT
+ *               enabled each superblock derives its own level from it.
+ *
+ * Each plane is first copied into a 16-bit work buffer, so the filter always
+ * reads unfiltered pixels even after neighbouring superblocks have been
+ * written back to the frame.
+ *
+ * NOTE(review): the vpx_malloc() results are used without a NULL check, as
+ * in the original code -- confirm the project's out-of-memory policy. */
+void vp10_dering_frame(YV12_BUFFER_CONFIG *frame, VP10_COMMON *cm,
+                       MACROBLOCKD *xd, int global_level) {
+  int r, c;
+  int sbr, sbc;
+  int nhsb, nvsb;
+  dering_in *src[3];
+  unsigned char *bskip;
+  int dir[OD_DERING_NBLOCKS][OD_DERING_NBLOCKS] = {{0}};
+  int stride;
+  int bsize[3];
+  int dec[3];
+  int pli;
+  int coeff_shift = VPXMAX(cm->bit_depth - 8, 0);
+  /* Number of superblock rows / columns, rounding partial ones up. */
+  nvsb = (cm->mi_rows + MI_BLOCK_SIZE - 1)/MI_BLOCK_SIZE;
+  nhsb = (cm->mi_cols + MI_BLOCK_SIZE - 1)/MI_BLOCK_SIZE;
+  bskip = vpx_malloc(sizeof(*bskip)*cm->mi_rows*cm->mi_cols);
+  vp10_setup_dst_planes(xd->plane, frame, 0, 0);
+  for (pli = 0; pli < 3; pli++) {
+    dec[pli] = xd->plane[pli].subsampling_x;
+    bsize[pli] = 8 >> dec[pli];
+  }
+  /* All work buffers share the luma row pitch. */
+  stride = bsize[0]*cm->mi_cols;
+  for (pli = 0; pli < 3; pli++) {
+    /* Size by the sample type (sizeof(*src) would be a pointer size):
+       64 samples per mode-info unit. */
+    src[pli] = vpx_malloc(sizeof(*src[pli])*cm->mi_rows*cm->mi_cols*64);
+    for (r = 0; r < bsize[pli]*cm->mi_rows; ++r) {
+      for (c = 0; c < bsize[pli]*cm->mi_cols; ++c) {
+#if CONFIG_VPX_HIGHBITDEPTH
+        if (cm->use_highbitdepth) {
+          src[pli][r * stride + c] =
+              CONVERT_TO_SHORTPTR(xd->plane[pli].dst.buf)
+              [r * xd->plane[pli].dst.stride + c];
+        } else {
+#endif
+          src[pli][r * stride + c] =
+              xd->plane[pli].dst.buf[r * xd->plane[pli].dst.stride + c];
+#if CONFIG_VPX_HIGHBITDEPTH
+        }
+#endif
+      }
+    }
+  }
+  /* Flatten the skip flags into a dense mi_rows x mi_cols map. */
+  for (r = 0; r < cm->mi_rows; ++r) {
+    for (c = 0; c < cm->mi_cols; ++c) {
+      const MB_MODE_INFO *mbmi =
+          &cm->mi_grid_visible[r * cm->mi_stride + c]->mbmi;
+      bskip[r * cm->mi_cols + c] = mbmi->skip;
+    }
+  }
+  for (sbr = 0; sbr < nvsb; sbr++) {
+    for (sbc = 0; sbc < nhsb; sbc++) {
+      int sb_level;
+      int all_skip;
+      int nhb, nvb;
+      /* Blocks actually present in this (possibly partial) superblock. */
+      nhb = VPXMIN(MI_BLOCK_SIZE, cm->mi_cols - MI_BLOCK_SIZE*sbc);
+      nvb = VPXMIN(MI_BLOCK_SIZE, cm->mi_rows - MI_BLOCK_SIZE*sbr);
+      /* The base level and the all-skip test are the same for every plane,
+         so evaluate them once per superblock. */
+#if DERING_REFINEMENT
+      sb_level = compute_level_from_index(
+          global_level,
+          cm->mi_grid_visible[MI_BLOCK_SIZE*sbr*cm->mi_stride +
+          MI_BLOCK_SIZE*sbc]->mbmi.dering_gain);
+#else
+      sb_level = global_level;
+#endif
+      all_skip = sb_all_skip(cm, sbr*MI_BLOCK_SIZE, sbc*MI_BLOCK_SIZE);
+      for (pli = 0; pli < 3; pli++) {
+        int16_t dst[MI_BLOCK_SIZE*MI_BLOCK_SIZE*8*8];
+        int level;
+        int threshold;
+        level = sb_level;
+        /* FIXME: This is a temporary hack that uses more conservative
+           deringing for chroma. */
+        if (pli) level = level*2/3;
+        if (all_skip) level = 0;
+        threshold = level << coeff_shift;
+        od_dering(
+            &OD_DERING_VTBL_C,
+            dst,
+            MI_BLOCK_SIZE*bsize[pli],
+            &src[pli][sbr*stride*bsize[pli]*MI_BLOCK_SIZE +
+            sbc*bsize[pli]*MI_BLOCK_SIZE],
+            stride, nhb, nvb, sbc, sbr, nhsb, nvsb, dec[pli], dir, pli,
+            &bskip[MI_BLOCK_SIZE*sbr*cm->mi_cols + MI_BLOCK_SIZE*sbc],
+            cm->mi_cols, threshold, OD_DERING_NO_CHECK_OVERLAP, coeff_shift);
+        /* Write the filtered superblock back into the frame. */
+        for (r = 0; r < bsize[pli]*nvb; ++r) {
+          for (c = 0; c < bsize[pli]*nhb; ++c) {
+#if CONFIG_VPX_HIGHBITDEPTH
+            if (cm->use_highbitdepth) {
+              CONVERT_TO_SHORTPTR(xd->plane[pli].dst.buf)
+                  [xd->plane[pli].dst.stride*(bsize[pli]*MI_BLOCK_SIZE*sbr + r)
+                  + sbc*bsize[pli]*MI_BLOCK_SIZE + c] =
+                  dst[r * MI_BLOCK_SIZE * bsize[pli] + c];
+            } else {
+#endif
+              xd->plane[pli].dst.buf[xd->plane[pli].dst.stride*
+                  (bsize[pli]*MI_BLOCK_SIZE*sbr + r) +
+                  sbc*bsize[pli]*MI_BLOCK_SIZE + c] =
+                  dst[r * MI_BLOCK_SIZE * bsize[pli] + c];
+#if CONFIG_VPX_HIGHBITDEPTH
+            }
+#endif
+          }
+        }
+      }
+    }
+  }
+  for (pli = 0; pli < 3; pli++) {
+    vpx_free(src[pli]);
+  }
+  vpx_free(bskip);
+}
--- a/vp10/common/dering.h
+++ b/vp10/common/dering.h
@ -0,0 +1,31 @@
+#ifndef VP10_COMMON_DERING_H_
+#define VP10_COMMON_DERING_H_
+
+#include "vp10/common/od_dering.h"
+#include "vp10/common/onyxc_int.h"
+#include "vpx/vpx_integer.h"
+#include "./vpx_config.h"
+#include "vpx_ports/mem.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/* Number of bits used to code the frame-level deringing level. */
+#define DERING_LEVEL_BITS 6
+/* Number of representable levels; valid levels are 0..MAX_DERING_LEVEL-1. */
+#define MAX_DERING_LEVEL (1 << DERING_LEVEL_BITS)
+
+/* When non-zero, a gain index is coded per 64x64 superblock and used to
+   derive that superblock's level from the frame-level one. */
+#define DERING_REFINEMENT 1
+
+/* Maps a superblock gain index gi to a level derived from global_level. */
+int compute_level_from_index(int global_level, int gi);
+/* Returns non-zero when every block of the superblock at (mi_row, mi_col)
+   is skipped. */
+int sb_all_skip(const VP10_COMMON *const cm, int mi_row, int mi_col);
+/* Applies the deringing filter in place to all planes of frame. */
+void vp10_dering_frame(YV12_BUFFER_CONFIG *frame, VP10_COMMON *cm,
+                       MACROBLOCKD *xd, int global_level);
+
+/* Encoder-side search, defined in vp10/encoder/pickdering.c. The encoder
+   stores the return value as the frame-level deringing level. */
+int vp10_dering_search(YV12_BUFFER_CONFIG *frame, const YV12_BUFFER_CONFIG *ref,
+                      VP10_COMMON *cm,
+                      MACROBLOCKD *xd);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+#endif  // VP10_COMMON_DERING_H_
--- a/vp10/common/od_dering.c
+++ b/vp10/common/od_dering.c
@ -0,0 +1,343 @@
+/*Daala video codec
+Copyright (c) 2014-2016 Daala project contributors.  All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+- Redistributions of source code must retain the above copyright notice, this
+  list of conditions and the following disclaimer.
+
+- Redistributions in binary form must reproduce the above copyright notice,
+  this list of conditions and the following disclaimer in the documentation
+  and/or other materials provided with the distribution.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.*/
+
+#ifdef HAVE_CONFIG_H
+# include "config.h"
+#endif
+
+#include <stdlib.h>
+#include <math.h>
+#include "dering.h"
+
+/* Plain C implementations of the two filter passes. Within each row the
+   first entry handles 4x4 blocks and the second 8x8 blocks. */
+const od_dering_opt_vtbl OD_DERING_VTBL_C = {
+  {od_filter_dering_direction_4x4_c, od_filter_dering_direction_8x8_c},
+  {od_filter_dering_orthogonal_4x4_c, od_filter_dering_orthogonal_8x8_c}
+};
+
+/* Generated from gen_filter_tables.c.
+   Row d lists, for direction d, the offsets (in a buffer whose row pitch is
+   OD_FILT_BSTRIDE) of the three taps on one side of the centre pixel.
+   od_filter_dering_direction_c() applies every offset with both signs. */
+const int OD_DIRECTION_OFFSETS_TABLE[8][3] = {
+  {-1*OD_FILT_BSTRIDE + 1, -2*OD_FILT_BSTRIDE + 2, -3*OD_FILT_BSTRIDE + 3  },
+  { 0*OD_FILT_BSTRIDE + 1, -1*OD_FILT_BSTRIDE + 2, -1*OD_FILT_BSTRIDE + 3  },
+  { 0*OD_FILT_BSTRIDE + 1,  0*OD_FILT_BSTRIDE + 2,  0*OD_FILT_BSTRIDE + 3  },
+  { 0*OD_FILT_BSTRIDE + 1,  1*OD_FILT_BSTRIDE + 2,  1*OD_FILT_BSTRIDE + 3  },
+  { 1*OD_FILT_BSTRIDE + 1,  2*OD_FILT_BSTRIDE + 2,  3*OD_FILT_BSTRIDE + 3  },
+  { 1*OD_FILT_BSTRIDE + 0,  2*OD_FILT_BSTRIDE + 1,  3*OD_FILT_BSTRIDE + 1  },
+  { 1*OD_FILT_BSTRIDE + 0,  2*OD_FILT_BSTRIDE + 0,  3*OD_FILT_BSTRIDE + 0  },
+  { 1*OD_FILT_BSTRIDE + 0,  2*OD_FILT_BSTRIDE - 1,  3*OD_FILT_BSTRIDE - 1  },
+};
+
+/* NOTE(review): nothing in this file reads this table -- presumably carried
+   over from the Daala source; confirm whether it is still needed. */
+const double OD_DERING_GAIN_TABLE[OD_DERING_LEVELS] = {
+  0, 0.5, 0.707, 1, 1.41, 2
+};
+
+/* Detect direction. 0 means 45-degree up-right, 2 is horizontal, and so on.
+   The search minimizes the weighted variance along all the lines in a
+   particular direction, i.e. the squared error between the input and a
+   "predicted" block where each pixel is replaced by the average along a line
+   in a particular direction. Since each direction has the same sum(x^2) term,
+   that term is never computed. See Section 2, step 2, of:
+   http://jmvalin.ca/notes/intra_paint.pdf
+
+   img:         top-left sample of the 8x8 block to analyse.
+   stride:      distance, in samples, between consecutive rows of img.
+   var:         output; cost of the best direction minus the cost of the
+                direction four steps away (its orthogonal).
+   coeff_shift: right shift applied to every sample before accumulation.
+   Returns the index (0..7) of the direction with the largest cost; ties
+   keep the lowest index. */
+static int od_dir_find8(const dering_in *img, int stride, int32_t *var,
+    int coeff_shift) {
+  int i;
+  int cost[8] = {0};
+  int partial[8][15] = {{0}};
+  int best_cost = 0;
+  int best_dir = 0;
+  /* partial[d][k] accumulates the sum of the pixels lying on line k of
+     direction d. */
+  for (i = 0; i < 8; i++) {
+    int j;
+    for (j = 0; j < 8; j++) {
+      int x;
+      x = img[i*stride + j] >> coeff_shift;
+      partial[0][i + j] += x;
+      partial[1][i + j/2] += x;
+      partial[2][i] += x;
+      partial[3][3 + i - j/2] += x;
+      partial[4][7 + i - j] += x;
+      partial[5][3 - i/2 + j] += x;
+      partial[6][j] += x;
+      partial[7][i/2 + j] += x;
+    }
+  }
+  /* Horizontal (2) and vertical (6): eight lines of eight pixels each, so
+     each squared line sum is divided by 8. */
+  for (i = 0; i < 8; i++) {
+    cost[2] += partial[2][i]*partial[2][i] >> 3;
+    cost[6] += partial[6][i]*partial[6][i] >> 3;
+  }
+  /* Diagonals (0 and 4): lines i and 14 - i both hold i + 1 pixels. */
+  for (i = 0; i < 7; i++) {
+    cost[0] += OD_DIVU_SMALL(partial[0][i]*partial[0][i], i + 1)
+     + OD_DIVU_SMALL(partial[0][14 - i]*partial[0][14 - i], i + 1);
+    cost[4] += OD_DIVU_SMALL(partial[4][i]*partial[4][i], i + 1)
+     + OD_DIVU_SMALL(partial[4][14 - i]*partial[4][14 - i], i + 1);
+  }
+  /* The central diagonal (line 7; note 8 - 1 == 7) holds 8 pixels. */
+  cost[0] += partial[0][7]*partial[0][8 - 1] >> 3;
+  cost[4] += partial[4][7]*partial[4][8 - 1] >> 3;
+  /* Odd directions: lines 3..7 hold 8 pixels; lines j and 10 - j
+     (j = 0..2) hold 2*j + 2 pixels. */
+  for (i = 1; i < 8; i += 2) {
+    int j;
+    for (j = 0; j < 4 + 1; j++) {
+      cost[i] += partial[i][3 + j]*partial[i][3 + j] >> 3;
+    }
+    for (j = 0; j < 4 - 1; j++) {
+      cost[i] += OD_DIVU_SMALL(partial[i][j]*partial[i][j], 2*j + 2)
+       + OD_DIVU_SMALL(partial[i][10 - j]*partial[i][10 - j], 2*j + 2);
+    }
+  }
+  /* The largest cost corresponds to the smallest variance along the lines. */
+  for (i = 0; i < 8; i++) {
+    if (cost[i] > best_cost) {
+      best_cost = cost[i];
+      best_dir = i;
+    }
+  }
+  /* Difference between the optimal variance and the variance along the
+     orthogonal direction. Again, the sum(x^2) terms cancel out. */
+  *var = best_cost - cost[(best_dir + 4) & 7];
+  return best_dir;
+}
+
+#define OD_DERING_VERY_LARGE (30000)
+#define OD_DERING_INBUF_SIZE ((OD_BSIZE_MAX + 2*OD_FILT_BORDER)*\
+ (OD_BSIZE_MAX + 2*OD_FILT_BORDER))
+
+/* Smooth in the direction detected.
+   y, ystride: output block and its row pitch.
+   in:         top-left pixel of the block inside a buffer whose row pitch is
+               OD_FILT_BSTRIDE; pixels up to three taps away in either
+               direction are read.
+   ln:         log2 of the block width/height (the block is square).
+   threshold:  neighbours differing from the centre pixel by this much or
+               more are left out of the average.
+   dir:        direction index into OD_DIRECTION_OFFSETS_TABLE. */
+void od_filter_dering_direction_c(int16_t *y, int ystride, const int16_t *in,
+ int ln, int threshold, int dir) {
+  int i;
+  int j;
+  int k;
+  /* Weights of the taps at distance 1, 2 and 3 from the centre pixel. */
+  static const int taps[3] = {3, 2, 2};
+  for (i = 0; i < 1 << ln; i++) {
+    for (j = 0; j < 1 << ln; j++) {
+      int16_t sum;
+      int16_t xx;
+      int16_t yy;
+      xx = in[i*OD_FILT_BSTRIDE + j];
+      sum= 0;
+      for (k = 0; k < 3; k++) {
+        int16_t p0;
+        int16_t p1;
+        /* Differences to the two neighbours at equal distance on either
+           side of the centre pixel. */
+        p0 = in[i*OD_FILT_BSTRIDE + j + OD_DIRECTION_OFFSETS_TABLE[dir][k]]
+         - xx;
+        p1 = in[i*OD_FILT_BSTRIDE + j - OD_DIRECTION_OFFSETS_TABLE[dir][k]]
+         - xx;
+        if (abs(p0) < threshold) sum += taps[k]*p0;
+        if (abs(p1) < threshold) sum += taps[k]*p1;
+      }
+      /* Weighted differences are in 1/16 units; round to nearest. */
+      yy = xx + ((sum + 8) >> 4);
+      y[i*ystride + j] = yy;
+    }
+  }
+}
+
+/* Directional pass for a 4x4 block (ln = 2). */
+void od_filter_dering_direction_4x4_c(int16_t *y, int ystride,
+ const int16_t *in, int threshold, int dir) {
+  od_filter_dering_direction_c(y, ystride, in, 2, threshold, dir);
+}
+
+/* Directional pass for an 8x8 block (ln = 3). */
+void od_filter_dering_direction_8x8_c(int16_t *y, int ystride,
+ const int16_t *in, int threshold, int dir) {
+  od_filter_dering_direction_c(y, ystride, in, 3, threshold, dir);
+}
+
+/* Smooth in the direction orthogonal to what was detected.
+   y, ystride: output block and its row pitch.
+   in:         top-left pixel of the block inside a buffer whose row pitch is
+               OD_FILT_BSTRIDE; pixels up to two steps away are read.
+   x, xstride: the corresponding block of the original input, used only to
+               measure how much each pixel has already changed.
+   ln:         log2 of the block width/height (the block is square).
+   threshold:  upper bound on the per-pixel threshold computed below.
+   dir:        detected direction; only selects between stepping across
+               rows and stepping along a row. */
+void od_filter_dering_orthogonal_c(int16_t *y, int ystride, const int16_t *in,
+ const dering_in *x, int xstride, int ln, int threshold, int dir) {
+  int i;
+  int j;
+  int offset;
+  /* Directions 0..4 step to the rows above/below; 5..7 step to the left and
+     right neighbours in the same row. */
+  if (dir <= 4) offset = OD_FILT_BSTRIDE;
+  else offset = 1;
+  for (i = 0; i < 1 << ln; i++) {
+    for (j = 0; j < 1 << ln; j++) {
+      int16_t athresh;
+      int16_t yy;
+      int16_t sum;
+      int16_t p;
+      /* Deringing orthogonal to the direction uses a tighter threshold
+         because we want to be conservative. We've presumably already
+         achieved some deringing, so the amount of change is expected
+         to be low. Also, since we might be filtering across an edge, we
+         want to make sure not to blur it. That being said, we might want
+         to be a little bit more aggressive on pure horizontal/vertical
+         since the ringing there tends to be directional, so it doesn't
+         get removed by the directional filtering. */
+      athresh = OD_MINI(threshold, threshold/3
+       + abs(in[i*OD_FILT_BSTRIDE + j] - x[i*xstride + j]));
+      yy = in[i*OD_FILT_BSTRIDE + j];
+      sum = 0;
+      /* Four neighbours at distance 1 and 2 on both sides, equal weight. */
+      p = in[i*OD_FILT_BSTRIDE + j + offset] - yy;
+      if (abs(p) < athresh) sum += p;
+      p = in[i*OD_FILT_BSTRIDE + j - offset] - yy;
+      if (abs(p) < athresh) sum += p;
+      p = in[i*OD_FILT_BSTRIDE + j + 2*offset] - yy;
+      if (abs(p) < athresh) sum += p;
+      p = in[i*OD_FILT_BSTRIDE + j - 2*offset] - yy;
+      if (abs(p) < athresh) sum += p;
+      /* Each accepted neighbour contributes 3/16 of its difference. */
+      y[i*ystride + j] = yy + ((3*sum + 8) >> 4);
+    }
+  }
+}
+
+/* Orthogonal pass for a 4x4 block (ln = 2). */
+void od_filter_dering_orthogonal_4x4_c(int16_t *y, int ystride,
+ const int16_t *in, const dering_in *x, int xstride, int threshold, int dir) {
+  od_filter_dering_orthogonal_c(y, ystride, in, x, xstride, 2, threshold, dir);
+}
+
+/* Orthogonal pass for an 8x8 block (ln = 3). */
+void od_filter_dering_orthogonal_8x8_c(int16_t *y, int ystride,
+ const int16_t *in, const dering_in *x, int xstride, int threshold, int dir) {
+  od_filter_dering_orthogonal_c(y, ystride, in, x, xstride, 3, threshold, dir);
+}
+
+/* This table approximates x^0.16 with the index being log2(x). It is clamped
+   to [.5, 3] and stored in Q8 (128 is 0.5, 768 is 3). The table is computed
+   as:
+   round(256*min(3, max(.5, 1.08*(sqrt(2)*2.^([0:17]+8)/256/256).^.16))) */
+static const int16_t OD_THRESH_TABLE_Q8[18] = {
+  128, 134, 150, 168, 188, 210, 234, 262,
+  292, 327, 365, 408, 455, 509, 569, 635,
+  710, 768,
+};
+
+/* Derives the filter threshold of every block of a superblock from its
+   directional variance difference. A large difference indicates a strongly
+   directional pattern (for instance a high-contrast edge) that tolerates
+   more deringing; a small one indicates a weak edge or a non-directional
+   texture, where the filter must be careful not to blur.
+
+   thresh:    output, one threshold per block, indexed [row][column].
+   threshold: base threshold that gets scaled per block.
+   var:       per-block variance differences, indexed [row][column].
+   sb_var:    sum of the variance differences over the superblock.
+   nhb, nvb:  number of block columns / rows to fill in. */
+static void od_compute_thresh(int thresh[OD_DERING_NBLOCKS][OD_DERING_NBLOCKS],
+ int threshold, int32_t var[OD_DERING_NBLOCKS][OD_DERING_NBLOCKS],
+ int32_t sb_var, int nhb, int nvb) {
+  int row;
+  /* The superblock term is the same for every block. */
+  const int sb_term = OD_MINI(32767, sb_var/(OD_BSIZE_MAX*OD_BSIZE_MAX));
+  for (row = 0; row < nvb; row++) {
+    int col;
+    for (col = 0; col < nhb; col++) {
+      /* Both the block's own variance and that of the whole superblock
+         enter the table lookup. */
+      const int blk_term = OD_MINI(32767, var[row][col] >> 6);
+      const int product = blk_term*sb_term;
+      const int idx = OD_CLAMPI(0, OD_ILOG(product) - 9, 17);
+      thresh[row][col] = threshold*OD_THRESH_TABLE_Q8[idx] >> 8;
+    }
+  }
+}
+
+/* Runs the directional pass followed by the orthogonal pass over one
+   superblock of one plane.
+
+   vtbl:        filter implementations, indexed by block size.
+   y, ystride:  output buffer and its row pitch; it also carries the result
+                of the first pass into the second.
+   x, xstride:  input pixels, pointing at the superblock's top-left sample.
+                Up to OD_FILT_BORDER samples outside the superblock are read
+                on each side where a neighbouring superblock exists.
+   nhb, nvb:    number of block columns / rows in this superblock.
+   sbx, sby:    column / row of this superblock in the frame.
+   nhsb, nvsb:  number of superblock columns / rows in the frame.
+   xdec:        plane decimation; blocks are (8 >> xdec) samples wide.
+   dir:         per-block directions; written when pli == 0 and only read
+                for the other planes.
+   pli:         plane index.
+   bskip, skip_stride: per-block skip flags and their row pitch; a skipped
+                block gets a zero threshold.
+   threshold:   base filter threshold for this superblock and plane.
+   overlap:     only read when DAALA_ODINTRIN is defined.
+   coeff_shift: forwarded to the direction search. */
+void od_dering(const od_dering_opt_vtbl *vtbl, int16_t *y, int ystride,
+ const dering_in *x, int xstride, int nhb, int nvb, int sbx, int sby, int nhsb,
+ int nvsb, int xdec, int dir[OD_DERING_NBLOCKS][OD_DERING_NBLOCKS], int pli,
+ unsigned char *bskip, int skip_stride, int threshold, int overlap,
+ int coeff_shift) {
+  int i;
+  int j;
+  int bx;
+  int by;
+  int16_t inbuf[OD_DERING_INBUF_SIZE];
+  int16_t *in;
+  int bsize;
+  int varsum = 0;
+  int32_t var[OD_DERING_NBLOCKS][OD_DERING_NBLOCKS];
+  int thresh[OD_DERING_NBLOCKS][OD_DERING_NBLOCKS];
+  /* log2 of the block size in this plane. */
+  bsize = 3 - xdec;
+  /* `in` addresses the superblock's top-left pixel inside the padded copy. */
+  in = inbuf + OD_FILT_BORDER*OD_FILT_BSTRIDE + OD_FILT_BORDER;
+  /* We avoid filtering the pixels for which some of the pixels to average
+     are outside the frame. We could change the filter instead, but it would
+     add special cases for any future vectorization. */
+  for (i = 0; i < OD_DERING_INBUF_SIZE; i++) inbuf[i] = OD_DERING_VERY_LARGE;
+  /* Copy the superblock plus whatever border exists inside the frame. */
+  for (i = -OD_FILT_BORDER*(sby != 0); i < (nvb << bsize)
+   + OD_FILT_BORDER*(sby != nvsb - 1); i++) {
+    for (j = -OD_FILT_BORDER*(sbx != 0); j < (nhb << bsize)
+     + OD_FILT_BORDER*(sbx != nhsb - 1); j++) {
+      in[i*OD_FILT_BSTRIDE + j] = x[i*xstride + j];
+    }
+  }
+  if (pli == 0) {
+    /* First plane: search a direction per 8x8 block and adapt the
+       thresholds to the measured variances. */
+    for (by = 0; by < nvb; by++) {
+      for (bx = 0; bx < nhb; bx++) {
+        dir[by][bx] = od_dir_find8(&x[8*by*xstride + 8*bx], xstride,
+         &var[by][bx], coeff_shift);
+        varsum += var[by][bx];
+      }
+    }
+    od_compute_thresh(thresh, threshold, var, varsum, nhb, nvb);
+  }
+  else {
+    /* Other planes reuse the directions already stored in dir and apply the
+       base threshold unchanged. */
+    for (by = 0; by < nvb; by++) {
+      for (bx = 0; bx < nhb; bx++) {
+        thresh[by][bx] = threshold;
+      }
+    }
+  }
+  /* Zero the threshold of skipped blocks. */
+  for (by = 0; by < nvb; by++) {
+    for (bx = 0; bx < nhb; bx++) {
+      int skip;
+# if defined(DAALA_ODINTRIN)
+      int xstart;
+      int ystart;
+      int xend;
+      int yend;
+      xstart = ystart = 0;
+      xend = yend = (2 >> xdec);
+      if (overlap) {
+        xstart -= (sbx != 0);
+        ystart -= (sby != 0);
+        xend += (sbx != nhsb - 1);
+        yend += (sby != nvsb - 1);
+      }
+      skip = 1;
+      /* We look at whether the current block and its 4x4 surrounding (due to
+         lapping) are skipped to avoid filtering the same content multiple
+         times. */
+      for (i = ystart; i < yend; i++) {
+        for (j = xstart; j < xend; j++) {
+          skip = skip && bskip[((by << 1 >> xdec) + i)*skip_stride
+           + (bx << 1 >> xdec) + j];
+        }
+      }
+#else
+      skip = bskip[by*skip_stride + bx];
+#endif
+      if (skip) thresh[by][bx] = 0;
+    }
+  }
+  /* First pass: filter along the detected direction, from `in` into y. */
+  for (by = 0; by < nvb; by++) {
+    for (bx = 0; bx < nhb; bx++) {
+      (vtbl->filter_dering_direction[bsize - OD_LOG_BSIZE0])(
+       &y[(by*ystride << bsize) + (bx << bsize)], ystride,
+       &in[(by*OD_FILT_BSTRIDE << bsize) + (bx << bsize)],
+       thresh[by][bx], dir[by][bx]);
+    }
+  }
+  /* Feed the first-pass result back as the input of the second pass; the
+     border samples of inbuf keep their earlier contents. */
+  for (i = 0; i < nvb << bsize; i++) {
+    for (j = 0; j < nhb << bsize; j++) {
+      in[i*OD_FILT_BSTRIDE + j] = y[i*ystride + j];
+    }
+  }
+  /* Second pass: filter orthogonally; x still holds the original pixels. */
+  for (by = 0; by < nvb; by++) {
+    for (bx = 0; bx < nhb; bx++) {
+      (vtbl->filter_dering_orthogonal[bsize - OD_LOG_BSIZE0])(
+       &y[(by*ystride << bsize) + (bx << bsize)], ystride,
+       &in[(by*OD_FILT_BSTRIDE << bsize) + (bx << bsize)],
+       &x[(by*xstride << bsize) + (bx << bsize)], xstride,
+       thresh[by][bx], dir[by][bx]);
+    }
+  }
+}
--- a/vp10/common/od_dering.h
+++ b/vp10/common/od_dering.h
@ -0,0 +1,83 @@
+/*Daala video codec
+Copyright (c) 2003-2010 Daala project contributors.  All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+- Redistributions of source code must retain the above copyright notice, this
+  list of conditions and the following disclaimer.
+
+- Redistributions in binary form must reproduce the above copyright notice,
+  this list of conditions and the following disclaimer in the documentation
+  and/or other materials provided with the distribution.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.*/
+
+#if !defined(_dering_H)
+# define _dering_H (1)
+
+# include "odintrin.h"
+
+# if defined(DAALA_ODINTRIN)
+#  include "filter.h"
+typedef int16_t dering_in;
+# endif
+
+#define OD_DERINGSIZES (2)
+
+#define OD_DERING_NO_CHECK_OVERLAP (0)
+#define OD_DERING_CHECK_OVERLAP (1)
+
+#define OD_DERING_LEVELS (6)
+extern const double OD_DERING_GAIN_TABLE[OD_DERING_LEVELS];
+
+#define OD_DERING_NBLOCKS (OD_BSIZE_MAX/8)
+
+#define OD_FILT_BORDER (3)
+#define OD_FILT_BSTRIDE (OD_BSIZE_MAX + 2*OD_FILT_BORDER)
+
+extern const int OD_DIRECTION_OFFSETS_TABLE[8][3];
+
+typedef void (*od_filter_dering_direction_func)(int16_t *y, int ystride,
+ const int16_t *in, int threshold, int dir);
+typedef void (*od_filter_dering_orthogonal_func)(int16_t *y, int ystride,
+ const int16_t *in, const dering_in *x, int xstride, int threshold, int dir);
+
+struct od_dering_opt_vtbl {
+  od_filter_dering_direction_func filter_dering_direction[OD_DERINGSIZES];
+  od_filter_dering_orthogonal_func filter_dering_orthogonal[OD_DERINGSIZES];
+};
+typedef struct od_dering_opt_vtbl od_dering_opt_vtbl;
+
+
+/* Derings one superblock of one plane: x is the input, y receives the
+   filtered output. nhb / nvb are the number of block columns / rows, in the
+   same order as in the definition in od_dering.c. */
+void od_dering(const od_dering_opt_vtbl *vtbl, int16_t *y, int ystride,
+ const dering_in *x, int xstride, int nhb, int nvb, int sbx, int sby, int nhsb,
+ int nvsb, int xdec, int dir[OD_DERING_NBLOCKS][OD_DERING_NBLOCKS], int pli,
+ unsigned char *bskip, int skip_stride, int threshold, int overlap,
+ int coeff_shift);
+void od_filter_dering_direction_c(int16_t *y, int ystride, const int16_t *in,
+ int ln, int threshold, int dir);
+void od_filter_dering_orthogonal_c(int16_t *y, int ystride, const int16_t *in,
+ const dering_in *x, int xstride, int ln, int threshold, int dir);
+
+extern const od_dering_opt_vtbl OD_DERING_VTBL_C;
+
+void od_filter_dering_direction_4x4_c(int16_t *y, int ystride,
+ const int16_t *in, int threshold, int dir);
+void od_filter_dering_direction_8x8_c(int16_t *y, int ystride,
+ const int16_t *in, int threshold, int dir);
+void od_filter_dering_orthogonal_4x4_c(int16_t *y, int ystride,
+ const int16_t *in, const dering_in *x, int xstride, int threshold, int dir);
+void od_filter_dering_orthogonal_8x8_c(int16_t *y, int ystride,
+ const int16_t *in, const dering_in *x, int xstride, int threshold, int dir);
+
+#endif
--- a/vp10/common/odintrin.h
+++ b/vp10/common/odintrin.h
@ -0,0 +1,29 @@
+/* Minimal stand-ins for the Daala odintrin.h helpers used by the deringing
+   filter, mapped onto the vpx equivalents. */
+#ifndef VP10_COMMON_ODINTRIN_H_
+#define VP10_COMMON_ODINTRIN_H_
+
+#include "vp10/common/enums.h"
+#include "vpx/vpx_integer.h"
+#include "vpx_dsp/vpx_dsp_common.h"
+#include "vpx_ports/bitops.h"
+
+/*Smallest blocks are 4x4*/
+# define OD_LOG_BSIZE0 (2)
+/*There are 5 block sizes total (4x4, 8x8, 16x16, 32x32 and 64x64).*/
+# define OD_NBSIZES    (5)
+/*The log of the maximum length of the side of a block.*/
+# define OD_LOG_BSIZE_MAX (OD_LOG_BSIZE0 + OD_NBSIZES - 1)
+/*The maximum length of the side of a block.*/
+# define OD_BSIZE_MAX     (1 << OD_LOG_BSIZE_MAX)
+
+typedef int od_coeff;
+
+/*Sample type of the deringing filter's input buffers.*/
+typedef int16_t dering_in;
+
+#define OD_DIVU_SMALL(_x, _d) ((_x) / (_d))
+
+#define OD_MINI VPXMIN
+/*Note the argument order: (min, val, max).*/
+#define OD_CLAMPI(min, val, max) clamp((val), (min), (max))
+
+#  define OD_ILOG_NZ(x) get_msb(x)
+/*Note that __builtin_clz is not defined when x == 0, according to the gcc
+ *    documentation (and that of the x86 BSR instruction that implements it), so
+ *       we have to special-case it.
+ *         We define a special version of the macro to use when x can be zero.
+ *         It evaluates its argument twice, so pass side-effect-free
+ *         expressions only.*/
+#  define OD_ILOG(x) ((x) ? OD_ILOG_NZ(x) : 0)
+
+#endif  // VP10_COMMON_ODINTRIN_H_
--- a/vp10/common/onyxc_int.h
+++ b/vp10/common/onyxc_int.h
@ -312,6 +312,9 @@ typedef struct VP10Common {
  // - this is intentionally not placed in FRAME_CONTEXT since it's reset upon
  // each keyframe and not used afterwards
  vpx_prob kf_y_prob[INTRA_MODES][INTRA_MODES][INTRA_MODES - 1];
+#if CONFIG_DERING
+  int dering_level;
+#endif
 } VP10_COMMON;

 // TODO(hkuang): Don't need to lock the whole pool after implementing atomic
--- a/vp10/decoder/decodeframe.c
+++ b/vp10/decoder/decodeframe.c
@ -14,6 +14,7 @@
 #include "./vp10_rtcd.h"
 #include "./vpx_dsp_rtcd.h"
 #include "./vpx_scale_rtcd.h"
+#include "./vpx_config.h"

 #include "vpx_dsp/bitreader_buffer.h"
 #include "vpx_dsp/bitreader.h"
@ -29,6 +30,9 @@
 #include "vp10/common/clpf.h"
 #endif
 #include "vp10/common/common.h"
+#if CONFIG_DERING
+#include "vp10/common/dering.h"
+#endif  // CONFIG_DERING
 #include "vp10/common/entropy.h"
 #include "vp10/common/entropymode.h"
 #include "vp10/common/idct.h"
@ -967,6 +971,17 @@ static void decode_partition(VP10Decoder *const pbi, MACROBLOCKD *const xd,
  if (bsize >= BLOCK_8X8 &&
      (bsize == BLOCK_8X8 || partition != PARTITION_SPLIT))
    dec_update_partition_context(xd, mi_row, mi_col, subsize, num_8x8_wh);
+
+#if DERING_REFINEMENT
+  if (bsize == BLOCK_64X64) {
+    if (cm->dering_level != 0 && !sb_all_skip(cm, mi_row, mi_col)) {
+      cm->mi_grid_visible[mi_row*cm->mi_stride + mi_col]->mbmi.dering_gain =
+          vpx_read_literal(r, 2);
+    } else {
+      cm->mi_grid_visible[mi_row*cm->mi_stride + mi_col]->mbmi.dering_gain = 0;
+    }
+  }
+#endif
 }

 static void setup_token_decoder(const uint8_t *data, const uint8_t *data_end,
@ -1103,6 +1118,12 @@ static void setup_clpf(VP10_COMMON *cm, struct vpx_read_bit_buffer *rb) {
 }
 #endif

+#if CONFIG_DERING
+/* Reads the frame-level deringing level, coded as a DERING_LEVEL_BITS-bit
+   literal, and stores it in cm->dering_level. */
+static void setup_dering(VP10_COMMON *cm, struct vpx_read_bit_buffer *rb) {
+  const int level = vpx_rb_read_literal(rb, DERING_LEVEL_BITS);
+  cm->dering_level = level;
+}
+#endif  // CONFIG_DERING
+
 static INLINE int read_delta_q(struct vpx_read_bit_buffer *rb) {
  return vpx_rb_read_bit(rb)
             ? vpx_rb_read_inv_signed_literal(rb, CONFIG_MISC_FIXES ? 6 : 4)
@ -1603,6 +1624,11 @@ static const uint8_t *decode_tiles(VP10Decoder *pbi, const uint8_t *data,
  if (cm->clpf && !cm->skip_loop_filter)
    vp10_clpf_frame(&pbi->cur_buf->buf, cm, &pbi->mb);
 #endif
+#if CONFIG_DERING
+  if (cm->dering_level && !cm->skip_loop_filter) {
+    vp10_dering_frame(&pbi->cur_buf->buf, cm, &pbi->mb, cm->dering_level);
+  }
+#endif  // CONFIG_DERING

  // Get last tile data.
  tile_data = pbi->tile_data + tile_cols * tile_rows - 1;
@ -2088,6 +2114,9 @@ static size_t read_uncompressed_header(VP10Decoder *pbi,
  setup_loopfilter(&cm->lf, rb);
 #if CONFIG_CLPF
  setup_clpf(cm, rb);
+#endif
+#if CONFIG_DERING
+  setup_dering(cm, rb);
 #endif
  setup_quantization(cm, rb);
 #if CONFIG_VPX_HIGHBITDEPTH
--- a/vp10/encoder/bitstream.c
+++ b/vp10/encoder/bitstream.c
@ -22,6 +22,9 @@
 #if CONFIG_CLPF
 #include "vp10/common/clpf.h"
 #endif
+#if CONFIG_DERING
+#include "vp10/common/dering.h"
+#endif  // CONFIG_DERING
 #include "vp10/common/entropy.h"
 #include "vp10/common/entropymode.h"
 #include "vp10/common/entropymv.h"
@ -602,6 +605,15 @@ static void write_modes_sb(VP10_COMP *cpi, const TileInfo *const tile,
  if (bsize >= BLOCK_8X8 &&
      (bsize == BLOCK_8X8 || partition != PARTITION_SPLIT))
    update_partition_context(xd, mi_row, mi_col, subsize, bsize);
+
+#if DERING_REFINEMENT
+  if (bsize == BLOCK_64X64 && cm->dering_level != 0 &&
+      !sb_all_skip(cm, mi_row, mi_col)) {
+    vpx_write_literal(
+        w, cm->mi_grid_visible[mi_row*cm->mi_stride + mi_col]->mbmi.dering_gain,
+        2);
+  }
+#endif
 }

 static void write_modes(VP10_COMP *cpi, const TileInfo *const tile,
@ -848,6 +860,12 @@ static void encode_clpf(const VP10_COMMON *cm,
 }
 #endif

+#if CONFIG_DERING
+/* Writes the frame-level deringing level as a DERING_LEVEL_BITS-bit
+   literal. */
+static void encode_dering(int level, struct vpx_write_bit_buffer *wb) {
+  vpx_wb_write_literal(wb, level, DERING_LEVEL_BITS);
+}
+#endif  // CONFIG_DERING
+
 static void write_delta_q(struct vpx_write_bit_buffer *wb, int delta_q) {
  if (delta_q != 0) {
    vpx_wb_write_bit(wb, 1);
@ -1317,6 +1335,9 @@ static void write_uncompressed_header(VP10_COMP *cpi,
 #if CONFIG_CLPF
  encode_clpf(cm, wb);
 #endif
+#if CONFIG_DERING
+  encode_dering(cm->dering_level, wb);
+#endif  // CONFIG_DERING
  encode_quantization(cm, wb);
  encode_segmentation(cm, xd, wb);
 #if CONFIG_MISC_FIXES
--- a/vp10/encoder/encoder.c
+++ b/vp10/encoder/encoder.c
@ -18,6 +18,9 @@
 #if CONFIG_CLPF
 #include "vp10/common/clpf.h"
 #endif
+#if CONFIG_DERING
+#include "vp10/common/dering.h"
+#endif  // CONFIG_DERING
 #include "vp10/common/filter.h"
 #include "vp10/common/idct.h"
 #include "vp10/common/reconinter.h"
@ -2432,6 +2435,16 @@ static void loopfilter_frame(VP10_COMP *cpi, VP10_COMMON *cm) {
      vp10_loop_filter_frame(cm->frame_to_show, cm, xd, lf->filter_level, 0, 0);
  }

+#if CONFIG_DERING
+  if (is_lossless_requested(&cpi->oxcf)) {
+    cm->dering_level = 0;
+  } else {
+    cm->dering_level = vp10_dering_search(cm->frame_to_show, cpi->Source, cm,
+                                          xd);
+    vp10_dering_frame(cm->frame_to_show, cm, xd, cm->dering_level);
+  }
+#endif  // CONFIG_DERING
+
 #if CONFIG_CLPF
  cm->clpf = 0;
  if (!is_lossless_requested(&cpi->oxcf)) {
--- a/vp10/encoder/pickdering.c
+++ b/vp10/encoder/pickdering.c
@ -0,0 +1,180 @@
+/*
+ *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <string.h>
+
+#include "./vpx_scale_rtcd.h"
+#include "vp10/common/dering.h"
+#include "vp10/common/onyxc_int.h"
+#include "vp10/common/reconinter.h"
+#include "vp10/encoder/encoder.h"
+#include "vpx/vpx_integer.h"
+
+/* Returns the sum of squared differences between the sample blocks |x| and
+ * |y|, divided by 2^(2*coeff_shift).
+ *
+ * The compared area is (nvb*8) rows by (nhb*8) columns; |xstride| and
+ * |ystride| are the row pitches (in samples) of |x| and |y| respectively. */
+static double compute_dist(int16_t *x, int xstride, int16_t *y, int ystride,
+    int nhb, int nvb, int coeff_shift) {
+  const int rows = nvb << 3;
+  const int cols = nhb << 3;
+  double acc = 0;
+  int row, col;
+  for (row = 0; row < rows; row++) {
+    const int16_t *xrow = x + row*xstride;
+    const int16_t *yrow = y + row*ystride;
+    for (col = 0; col < cols; col++) {
+      const double diff = xrow[col] - yrow[col];
+      acc += diff*diff;
+    }
+  }
+  return acc/(double)(1 << 2*coeff_shift);
+}
+
+/* Picks the frame-level deringing level by exhaustive search on the luma
+ * plane.
+ *
+ * |frame| is the reconstructed frame (its planes are installed into
+ * xd->plane via vp10_setup_dst_planes()), |ref| is the frame whose y_buffer
+ * the filtered output is compared against, and |cm| supplies the mode-info
+ * grid, frame dimensions and bit depth.
+ *
+ * For every MI_BLOCK_SIZE x MI_BLOCK_SIZE group of mode-info units and every
+ * level in [0, MAX_DERING_LEVEL), the block is filtered with od_dering() and
+ * its squared error against |ref| is recorded.
+ *
+ * With DERING_REFINEMENT enabled, the global level (searched from 2 upward)
+ * minimizing the total error -- when each block may pick the best of "off" or
+ * one of the three levels from compute_level_from_index() -- is returned, and
+ * the chosen gain index is stored in mbmi.dering_gain of each block's
+ * top-left mode-info entry. Otherwise the level with the smallest total error
+ * is returned.
+ *
+ * Returns 0 (deringing disabled) if a scratch buffer cannot be allocated. */
+int vp10_dering_search(YV12_BUFFER_CONFIG *frame, const YV12_BUFFER_CONFIG *ref,
+                       VP10_COMMON *cm,
+                       MACROBLOCKD *xd) {
+  int r, c;
+  int sbr, sbc;
+  int nhsb, nvsb;
+  dering_in *src;
+  int16_t *ref_coeff;
+  unsigned char *bskip;
+  int dir[OD_DERING_NBLOCKS][OD_DERING_NBLOCKS] = {{0}};
+  int stride;
+  int bsize[3];
+  int dec[3];
+  int pli;
+  int (*mse)[MAX_DERING_LEVEL];
+  int best_count[MAX_DERING_LEVEL] = {0};
+  double tot_mse[MAX_DERING_LEVEL] = {0};
+  int level;
+  int best_level;
+  int global_level;
+  double best_tot_mse = 1e15;
+  int coeff_shift = VPXMAX(cm->bit_depth - 8, 0);
+  /* Number of MI_BLOCK_SIZE-wide blocks, rounded up to cover partial ones. */
+  nvsb = (cm->mi_rows + MI_BLOCK_SIZE - 1)/MI_BLOCK_SIZE;
+  nhsb = (cm->mi_cols + MI_BLOCK_SIZE - 1)/MI_BLOCK_SIZE;
+  /* Allocate every scratch buffer up front so one check covers them all. */
+  src = vpx_malloc(sizeof(*src)*cm->mi_rows*cm->mi_cols*64);
+  ref_coeff = vpx_malloc(sizeof(*ref_coeff)*cm->mi_rows*cm->mi_cols*64);
+  bskip = vpx_malloc(sizeof(*bskip)*cm->mi_rows*cm->mi_cols);
+  mse = vpx_malloc(nvsb*nhsb*sizeof(*mse));
+  if (src == NULL || ref_coeff == NULL || bskip == NULL || mse == NULL) {
+    /* Out of memory: skip the search and report level 0 rather than
+       dereferencing a null buffer. */
+    vpx_free(src);
+    vpx_free(ref_coeff);
+    vpx_free(bskip);
+    vpx_free(mse);
+    return 0;
+  }
+  vp10_setup_dst_planes(xd->plane, frame, 0, 0);
+  for (pli = 0; pli < 3; pli++) {
+    dec[pli] = xd->plane[pli].subsampling_x;
+    bsize[pli] = 8 >> dec[pli];
+  }
+  stride = bsize[0]*cm->mi_cols;
+  /* Copy the luma plane of the reconstruction and of the reference into
+     contiguous 16-bit-style buffers with a common stride. */
+  for (r = 0; r < bsize[0]*cm->mi_rows; ++r) {
+    for (c = 0; c < bsize[0]*cm->mi_cols; ++c) {
+#if CONFIG_VPX_HIGHBITDEPTH
+      if (cm->use_highbitdepth) {
+        src[r * stride + c] =
+            CONVERT_TO_SHORTPTR(xd->plane[0].dst.buf)
+            [r*xd->plane[0].dst.stride + c];
+        ref_coeff[r * stride + c] =
+            CONVERT_TO_SHORTPTR(ref->y_buffer)[r * ref->y_stride + c];
+      } else {
+#endif
+        src[r * stride + c] =
+            xd->plane[0].dst.buf[r*xd->plane[0].dst.stride + c];
+        ref_coeff[r * stride + c] = ref->y_buffer[r * ref->y_stride + c];
+#if CONFIG_VPX_HIGHBITDEPTH
+      }
+#endif
+    }
+  }
+  /* Flatten the per-mode-info skip flags into a dense mi_rows x mi_cols map. */
+  for (r = 0; r < cm->mi_rows; ++r) {
+    for (c = 0; c < cm->mi_cols; ++c) {
+      const MB_MODE_INFO *mbmi =
+          &cm->mi_grid_visible[r * cm->mi_stride + c]->mbmi;
+      bskip[r * cm->mi_cols + c] = mbmi->skip;
+    }
+  }
+  for (sbr = 0; sbr < nvsb; sbr++) {
+    for (sbc = 0; sbc < nhsb; sbc++) {
+      int best_mse = 1000000000;
+      int nvb, nhb;
+      int16_t dst[MI_BLOCK_SIZE*MI_BLOCK_SIZE*8*8];
+      best_level = 0;
+      /* Clip the block size at the right/bottom frame edges. */
+      nhb = VPXMIN(MI_BLOCK_SIZE, cm->mi_cols - MI_BLOCK_SIZE*sbc);
+      nvb = VPXMIN(MI_BLOCK_SIZE, cm->mi_rows - MI_BLOCK_SIZE*sbr);
+      /* The bound must match the second dimension of mse[] and tot_mse[]. */
+      for (level = 0; level < MAX_DERING_LEVEL; level++) {
+        int threshold;
+        threshold = level << coeff_shift;
+        od_dering(
+            &OD_DERING_VTBL_C,
+            dst,
+            MI_BLOCK_SIZE*bsize[0],
+            &src[sbr*stride*bsize[0]*MI_BLOCK_SIZE +
+            sbc*bsize[0]*MI_BLOCK_SIZE],
+            cm->mi_cols*bsize[0], nhb, nvb, sbc, sbr, nhsb, nvsb, 0, dir, 0,
+            &bskip[MI_BLOCK_SIZE*sbr*cm->mi_cols + MI_BLOCK_SIZE*sbc],
+            cm->mi_cols, threshold, OD_DERING_NO_CHECK_OVERLAP, coeff_shift);
+        mse[nhsb*sbr+sbc][level] = compute_dist(
+            dst, MI_BLOCK_SIZE*bsize[0],
+            &ref_coeff[sbr*stride*bsize[0]*MI_BLOCK_SIZE +
+            sbc*bsize[0]*MI_BLOCK_SIZE],
+            stride, nhb, nvb, coeff_shift);
+        tot_mse[level] += mse[nhsb*sbr+sbc][level];
+        if (mse[nhsb*sbr+sbc][level] < best_mse) {
+          best_mse = mse[nhsb*sbr+sbc][level];
+          best_level = level;
+        }
+      }
+      best_count[best_level]++;
+    }
+  }
+#if DERING_REFINEMENT
+  best_level = 0;
+  /* Search for the best global level one value at a time. */
+  for (global_level = 2; global_level < MAX_DERING_LEVEL; global_level++) {
+    /* Total error for this candidate; named distinctly from the per-level
+       tot_mse[] array above to avoid shadowing it. */
+    double global_tot_mse = 0;
+    for (sbr = 0; sbr < nvsb; sbr++) {
+      for (sbc = 0; sbc < nhsb; sbc++) {
+        int gi;
+        /* Gain index 0 means level 0 for this block. */
+        int best_mse = mse[nhsb*sbr+sbc][0];
+        for (gi = 1; gi < 4; gi++) {
+          level = compute_level_from_index(global_level, gi);
+          if (mse[nhsb*sbr+sbc][level] < best_mse) {
+            best_mse = mse[nhsb*sbr+sbc][level];
+          }
+        }
+        global_tot_mse += best_mse;
+      }
+    }
+    if (global_tot_mse < best_tot_mse) {
+      best_level = global_level;
+      best_tot_mse = global_tot_mse;
+    }
+  }
+  /* Record, for each block, the gain index that is best under the chosen
+     global level. */
+  for (sbr = 0; sbr < nvsb; sbr++) {
+    for (sbc = 0; sbc < nhsb; sbc++) {
+      int gi;
+      int best_gi;
+      int best_mse = mse[nhsb*sbr+sbc][0];
+      best_gi = 0;
+      for (gi = 1; gi < 4; gi++) {
+        level = compute_level_from_index(best_level, gi);
+        if (mse[nhsb*sbr+sbc][level] < best_mse) {
+          best_gi = gi;
+          best_mse = mse[nhsb*sbr+sbc][level];
+        }
+      }
+      cm->mi_grid_visible[MI_BLOCK_SIZE*sbr*cm->mi_stride + MI_BLOCK_SIZE*sbc]->
+          mbmi.dering_gain = best_gi;
+    }
+  }
+#else
+  best_level = 0;
+  for (level = 0; level < MAX_DERING_LEVEL; level++) {
+    if (tot_mse[level] < tot_mse[best_level]) best_level = level;
+  }
+#endif
+  vpx_free(src);
+  vpx_free(ref_coeff);
+  vpx_free(bskip);
+  vpx_free(mse);
+  return best_level;
+}
--- a/vp10/vp10_common.mk
+++ b/vp10/vp10_common.mk
@ -62,6 +62,10 @@ VP10_COMMON_SRCS-yes += common/vp10_fwd_txfm.h
 VP10_COMMON_SRCS-yes += common/vp10_fwd_txfm.c
 VP10_COMMON_SRCS-yes += common/clpf.c
 VP10_COMMON_SRCS-yes += common/clpf.h
+VP10_COMMON_SRCS-yes += common/od_dering.c
+VP10_COMMON_SRCS-yes += common/od_dering.h
+VP10_COMMON_SRCS-yes += common/dering.c
+VP10_COMMON_SRCS-yes += common/dering.h

 ifneq ($(CONFIG_VPX_HIGHBITDEPTH),yes)
 VP10_COMMON_SRCS-$(HAVE_DSPR2)  += common/mips/dspr2/itrans4_dspr2.c
--- a/vp10/vp10cx.mk
+++ b/vp10/vp10cx.mk
@ -79,6 +79,7 @@ VP10_CX_SRCS-yes += encoder/temporal_filter.c
 VP10_CX_SRCS-yes += encoder/temporal_filter.h
 VP10_CX_SRCS-yes += encoder/mbgraph.c
 VP10_CX_SRCS-yes += encoder/mbgraph.h
+VP10_CX_SRCS-yes += encoder/pickdering.c

 VP10_CX_SRCS-$(HAVE_SSE2) += encoder/x86/temporal_filter_apply_sse2.asm
 VP10_CX_SRCS-$(HAVE_SSE2) += encoder/x86/quantize_sse2.c