Adds the Daala deringing filter as experimental

The deringing filter documentation is in: J.-M. Valin, The Daala Directional Deringing Filter, arXiv:1602.05975 [cs.MM], 2016. http://arxiv.org/pdf/1602.05975.pdf ntt-short1 results: MEDIUM (%) HIGH (%) PSNR -2.488088 -2.003235 PSNRHVS -1.588932 -0.797850 SSIM -1.522767 -1.287861 FASTSSIM 4.307822 3.983496 subset1 improvement is around 2-3% on PSNR (but the rate is mostly outside the AWCY testing range) Change-Id: Ic02344ce9faa509f5c3a50a0fb7a7b84a7977e72
2016-02-24 19:36:34 -05:00 · 2016-02-24 19:36:34 -05:00 · 51b7a99807
--- a/1
+++ b/1
@ -251,6 +251,7 @@ EXPERIMENT_LIST="
    emulate_hardware
    misc_fixes
    clpf
    dering
 "
 CONFIG_LIST="
    dependency_tracking
--- a/vp10/common/blockd.h
+++ b/vp10/common/blockd.h
@ -88,6 +88,8 @@ typedef struct {
  // TODO(slavarnway): Delete and use bmi[3].as_mv[] instead.
  int_mv mv[2];
  /* deringing gain *per-superblock* */
  int8_t dering_gain;
 } MB_MODE_INFO;
 typedef struct MODE_INFO {
--- a/vp10/common/dering.c
+++ b/vp10/common/dering.c
@ -0,0 +1,153 @@
 /*
 *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */
 #include <string.h>
 #include <math.h>
 #include "./vpx_scale_rtcd.h"
 #include "vpx/vpx_integer.h"
 #include "vp10/common/dering.h"
 #include "vp10/common/onyxc_int.h"
 #include "vp10/common/reconinter.h"
 #include "vp10/common/od_dering.h"
 /* Converts the frame-level deringing level and a per-superblock gain index
    into the level used for that superblock.
    global_level: frame-level strength; 0 always yields 0.
    gi: index into the four-entry gain table below.
    Returns the scaled level, rounded to nearest and clamped to
    [gi, MAX_DERING_LEVEL-1]. */
 int compute_level_from_index(int global_level, int gi) {
  /* Multiplier applied to the frame-level strength for each gain index. */
  static const double dering_gains[4] = {0, .7, 1, 1.4};
  if (global_level != 0) {
    const int scaled = (int)floor(.5 + global_level*dering_gains[gi]);
    return clamp(scaled, gi, MAX_DERING_LEVEL-1);
  }
  return 0;
 }
 /* Reports whether every mode-info unit of the superblock whose top-left
    corner is (mi_row, mi_col) has its skip flag set. The scan is clipped to
    the part of the superblock that lies inside the frame.
    Returns 1 if all visited units are skipped, 0 otherwise. */
 int sb_all_skip(const VP10_COMMON *const cm, int mi_row, int mi_col) {
  int row, col;
  /* Rows/columns of this superblock that are inside the frame. */
  int rows = cm->mi_rows - mi_row;
  int cols = cm->mi_cols - mi_col;
  if (rows > MI_BLOCK_SIZE) rows = MI_BLOCK_SIZE;
  if (cols > MI_BLOCK_SIZE) cols = MI_BLOCK_SIZE;
  for (row = 0; row < rows; row++) {
    const int base = (mi_row + row)*cm->mi_stride + mi_col;
    for (col = 0; col < cols; col++) {
      /* The first non-skipped unit settles the answer. */
      if (!cm->mi_grid_visible[base + col]->mbmi.skip) return 0;
    }
  }
  return 1;
 }
 /* Applies the deringing filter in place to the three planes of `frame`.
  *
  * frame         buffer to filter; it is installed as the destination planes
  *               of `xd` and overwritten with the filtered pixels.
  * cm            common codec state: frame size in mode-info units, bit depth,
  *               per-block skip flags and per-superblock dering gains.
  * xd            macroblock descriptor whose plane pointers are re-targeted
  *               at `frame`.
  * global_level  frame-level deringing strength.
  *
  * NOTE(review): the vpx_malloc() results are used without a NULL check.
  * NOTE(review): only subsampling_x is consulted, so vertical chroma
  * subsampling is assumed to equal the horizontal one -- confirm for 4:2:2.
  */
 void vp10_dering_frame(YV12_BUFFER_CONFIG *frame, VP10_COMMON *cm,
                        MACROBLOCKD *xd, int global_level) {
  int r, c;
  int sbr, sbc;
  int nhsb, nvsb;
  dering_in *src[3];
  unsigned char *bskip;
  int dir[OD_DERING_NBLOCKS][OD_DERING_NBLOCKS] = {{0}};
  int stride;
  int bsize[3];
  int dec[3];
  int pli;
  int coeff_shift = VPXMAX(cm->bit_depth - 8, 0);
  /* Number of superblock rows/columns, rounding partial superblocks up. */
  nvsb = (cm->mi_rows + MI_BLOCK_SIZE - 1)/MI_BLOCK_SIZE;
  nhsb = (cm->mi_cols + MI_BLOCK_SIZE - 1)/MI_BLOCK_SIZE;
  bskip = vpx_malloc(sizeof(*bskip)*cm->mi_rows*cm->mi_cols);
  vp10_setup_dst_planes(xd->plane, frame, 0, 0);
  for (pli = 0; pli < 3; pli++) {
    dec[pli] = xd->plane[pli].subsampling_x;
    /* Side, in pixels, of one mode-info unit in this plane. */
    bsize[pli] = 8 >> dec[pli];
  }
  /* The luma width is used as the row pitch of every plane's copy. */
  stride = bsize[0]*cm->mi_cols;
  /* Take an unfiltered 16-bit copy of each plane; the filter reads from the
     copy while the frame itself is being overwritten. */
  for (pli = 0; pli < 3; pli++) {
    /* Size by the element type (dering_in), not by the pointer type. */
    src[pli] = vpx_malloc(sizeof(*src[pli])*cm->mi_rows*cm->mi_cols*64);
    for (r = 0; r < bsize[pli]*cm->mi_rows; ++r) {
      for (c = 0; c < bsize[pli]*cm->mi_cols; ++c) {
 #if CONFIG_VPX_HIGHBITDEPTH
        if (cm->use_highbitdepth) {
          src[pli][r * stride + c] =
              CONVERT_TO_SHORTPTR(xd->plane[pli].dst.buf)
              [r * xd->plane[pli].dst.stride + c];
        } else {
 #endif
          src[pli][r * stride + c] =
              xd->plane[pli].dst.buf[r * xd->plane[pli].dst.stride + c];
 #if CONFIG_VPX_HIGHBITDEPTH
        }
 #endif
      }
    }
  }
  /* Flatten the per-block skip flags into a dense mi_rows x mi_cols map. */
  for (r = 0; r < cm->mi_rows; ++r) {
    for (c = 0; c < cm->mi_cols; ++c) {
      const MB_MODE_INFO *mbmi =
          &cm->mi_grid_visible[r * cm->mi_stride + c]->mbmi;
      bskip[r * cm->mi_cols + c] = mbmi->skip;
    }
  }
  for (sbr = 0; sbr < nvsb; sbr++) {
    for (sbc = 0; sbc < nhsb; sbc++) {
      int level;
      int nhb, nvb;
      /* Mode-info units of this superblock that lie inside the frame. */
      nhb = VPXMIN(MI_BLOCK_SIZE, cm->mi_cols - MI_BLOCK_SIZE*sbc);
      nvb = VPXMIN(MI_BLOCK_SIZE, cm->mi_rows - MI_BLOCK_SIZE*sbr);
      for (pli = 0; pli < 3; pli++) {
        int16_t dst[MI_BLOCK_SIZE*MI_BLOCK_SIZE*8*8];
        int threshold;
 #if DERING_REFINEMENT
        /* The gain index is stored on the superblock's top-left mode info. */
        level = compute_level_from_index(
            global_level,
            cm->mi_grid_visible[MI_BLOCK_SIZE*sbr*cm->mi_stride +
            MI_BLOCK_SIZE*sbc]->mbmi.dering_gain);
 #else
        level = global_level;
 #endif
        /* FIXME: This is a temporary hack that uses more conservative
           deringing for chroma. */
        if (pli) level = level*2/3;
        if (sb_all_skip(cm, sbr*MI_BLOCK_SIZE, sbc*MI_BLOCK_SIZE)) level = 0;
        /* Scale the threshold to the working bit depth. */
        threshold = level << coeff_shift;
        od_dering(
            &OD_DERING_VTBL_C,
            dst,
            MI_BLOCK_SIZE*bsize[pli],
            &src[pli][sbr*stride*bsize[pli]*MI_BLOCK_SIZE +
            sbc*bsize[pli]*MI_BLOCK_SIZE],
            stride, nhb, nvb, sbc, sbr, nhsb, nvsb, dec[pli], dir, pli,
            &bskip[MI_BLOCK_SIZE*sbr*cm->mi_cols + MI_BLOCK_SIZE*sbc],
            cm->mi_cols, threshold, OD_DERING_NO_CHECK_OVERLAP, coeff_shift);
        /* Write the filtered superblock back into the frame buffer. */
        for (r = 0; r < bsize[pli]*nvb; ++r) {
          for (c = 0; c < bsize[pli]*nhb; ++c) {
 #if CONFIG_VPX_HIGHBITDEPTH
            if (cm->use_highbitdepth) {
              CONVERT_TO_SHORTPTR(xd->plane[pli].dst.buf)
                  [xd->plane[pli].dst.stride*(bsize[pli]*MI_BLOCK_SIZE*sbr + r)
                  + sbc*bsize[pli]*MI_BLOCK_SIZE + c] =
                  dst[r * MI_BLOCK_SIZE * bsize[pli] + c];
            } else {
 #endif
              xd->plane[pli].dst.buf[xd->plane[pli].dst.stride*
                  (bsize[pli]*MI_BLOCK_SIZE*sbr + r) +
                  sbc*bsize[pli]*MI_BLOCK_SIZE + c] =
                  dst[r * MI_BLOCK_SIZE * bsize[pli] + c];
 #if CONFIG_VPX_HIGHBITDEPTH
            }
 #endif
          }
        }
      }
    }
  }
  for (pli = 0; pli < 3; pli++) {
    vpx_free(src[pli]);
  }
  vpx_free(bskip);
 }
--- a/vp10/common/dering.h
+++ b/vp10/common/dering.h
@ -0,0 +1,31 @@
 #ifndef VP10_COMMON_DERING_H_
 #define VP10_COMMON_DERING_H_
 #include "vp10/common/od_dering.h"
 #include "vp10/common/onyxc_int.h"
 #include "vpx/vpx_integer.h"
 #include "./vpx_config.h"
 #include "vpx_ports/mem.h"
 #ifdef __cplusplus
 extern "C" {
 #endif
 /* Width, in bits, of the frame-level deringing level literal. */
 #define DERING_LEVEL_BITS 6
 /* One past the largest value representable in DERING_LEVEL_BITS bits. */
 #define MAX_DERING_LEVEL (1 << DERING_LEVEL_BITS)
 /* When non-zero, a per-superblock gain index refines the frame-level level. */
 #define DERING_REFINEMENT 1
 /* Maps (frame level, gain index) to the level used for one superblock. */
 int compute_level_from_index(int global_level, int gi);
 /* Returns non-zero when every block of the superblock at (mi_row, mi_col)
    has its skip flag set. */
 int sb_all_skip(const VP10_COMMON *const cm, int mi_row, int mi_col);
 /* Applies the deringing filter in place to `frame` at strength
    `global_level`. */
 void vp10_dering_frame(YV12_BUFFER_CONFIG *frame, VP10_COMMON *cm,
                        MACROBLOCKD *xd, int global_level);
 /* Encoder-side search; its return value is stored by the encoder as the
    frame's dering level. */
 int vp10_dering_search(YV12_BUFFER_CONFIG *frame, const YV12_BUFFER_CONFIG *ref,
                       VP10_COMMON *cm,
                       MACROBLOCKD *xd);
 #ifdef __cplusplus
 }  // extern "C"
 #endif
 #endif  // VP10_COMMON_DERING_H_
--- a/vp10/common/od_dering.c
+++ b/vp10/common/od_dering.c
@ -0,0 +1,343 @@
 /*Daala video codec
 Copyright (c) 2014-2016 Daala project contributors.  All rights reserved.
 Redistribution and use in source and binary forms, with or without
 modification, are permitted provided that the following conditions are met:
 - Redistributions of source code must retain the above copyright notice, this
  list of conditions and the following disclaimer.
 - Redistributions in binary form must reproduce the above copyright notice,
  this list of conditions and the following disclaimer in the documentation
  and/or other materials provided with the distribution.
 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
 FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
 SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.*/
 #ifdef HAVE_CONFIG_H
 # include "config.h"
 #endif
 #include <stdlib.h>
 #include <math.h>
 #include "dering.h"
 /* Plain-C filter table: entry 0 of each array is the 4x4 routine and
    entry 1 is the 8x8 routine. */
 const od_dering_opt_vtbl OD_DERING_VTBL_C = {
  {od_filter_dering_direction_4x4_c, od_filter_dering_direction_8x8_c},
  {od_filter_dering_orthogonal_4x4_c, od_filter_dering_orthogonal_8x8_c}
 };
 /* Generated from gen_filter_tables.c. */
 /* Row `dir` holds the three offsets, within a buffer of row pitch
    OD_FILT_BSTRIDE, that the directional filter adds to and subtracts from
    the current pixel position to reach its taps. */
 const int OD_DIRECTION_OFFSETS_TABLE[8][3] = {
  {-1*OD_FILT_BSTRIDE + 1, -2*OD_FILT_BSTRIDE + 2, -3*OD_FILT_BSTRIDE + 3  },
  { 0*OD_FILT_BSTRIDE + 1, -1*OD_FILT_BSTRIDE + 2, -1*OD_FILT_BSTRIDE + 3  },
  { 0*OD_FILT_BSTRIDE + 1,  0*OD_FILT_BSTRIDE + 2,  0*OD_FILT_BSTRIDE + 3  },
  { 0*OD_FILT_BSTRIDE + 1,  1*OD_FILT_BSTRIDE + 2,  1*OD_FILT_BSTRIDE + 3  },
  { 1*OD_FILT_BSTRIDE + 1,  2*OD_FILT_BSTRIDE + 2,  3*OD_FILT_BSTRIDE + 3  },
  { 1*OD_FILT_BSTRIDE + 0,  2*OD_FILT_BSTRIDE + 1,  3*OD_FILT_BSTRIDE + 1  },
  { 1*OD_FILT_BSTRIDE + 0,  2*OD_FILT_BSTRIDE + 0,  3*OD_FILT_BSTRIDE + 0  },
  { 1*OD_FILT_BSTRIDE + 0,  2*OD_FILT_BSTRIDE - 1,  3*OD_FILT_BSTRIDE - 1  },
 };
 /* Gain multipliers, one per level.
    NOTE(review): no reader of this table appears in this file -- confirm
    whether it is still needed. */
 const double OD_DERING_GAIN_TABLE[OD_DERING_LEVELS] = {
  0, 0.5, 0.707, 1, 1.41, 2
 };
 /* Detect direction. 0 means 45-degree up-right, 2 is horizontal, and so on.
   The search minimizes the weighted variance along all the lines in a
   particular direction, i.e. the squared error between the input and a
   "predicted" block where each pixel is replaced by the average along a line
   in a particular direction. Since each direction has the same sum(x^2) term,
   that term is never computed. See Section 2, step 2, of:
   http://jmvalin.ca/notes/intra_paint.pdf
   img points at the top-left pixel of an 8x8 block with row pitch stride;
   every pixel is shifted right by coeff_shift before use. Returns the
   selected direction (0..7) and stores in *var the selected direction's cost
   minus the cost of the direction four steps away. */
 static int od_dir_find8(const dering_in *img, int stride, int32_t *var,
    int coeff_shift) {
  int i;
  int cost[8] = {0};
  /* partial[d][k] is the sum of the pixels on line k of direction d. */
  int partial[8][15] = {{0}};
  int best_cost = 0;
  int best_dir = 0;
  for (i = 0; i < 8; i++) {
    int j;
    for (j = 0; j < 8; j++) {
      int x;
      x = img[i*stride + j] >> coeff_shift;
      partial[0][i + j] += x;
      partial[1][i + j/2] += x;
      partial[2][i] += x;
      partial[3][3 + i - j/2] += x;
      partial[4][7 + i - j] += x;
      partial[5][3 - i/2 + j] += x;
      partial[6][j] += x;
      partial[7][i/2 + j] += x;
    }
  }
  /* Directions 2 and 6: eight lines of eight pixels each, hence the >> 3. */
  for (i = 0; i < 8; i++) {
    cost[2] += partial[2][i]*partial[2][i] >> 3;
    cost[6] += partial[6][i]*partial[6][i] >> 3;
  }
  /* Directions 0 and 4: lines i and 14 - i are paired and each squared sum
     is divided by i + 1; the middle line (index 7) is added below. */
  for (i = 0; i < 7; i++) {
    cost[0] += OD_DIVU_SMALL(partial[0][i]*partial[0][i], i + 1)
     + OD_DIVU_SMALL(partial[0][14 - i]*partial[0][14 - i], i + 1);
    cost[4] += OD_DIVU_SMALL(partial[4][i]*partial[4][i], i + 1)
     + OD_DIVU_SMALL(partial[4][14 - i]*partial[4][14 - i], i + 1);
  }
  cost[0] += partial[0][7]*partial[0][8 - 1] >> 3;
  cost[4] += partial[4][7]*partial[4][8 - 1] >> 3;
  /* Odd directions: lines 3..7 use >> 3; the outer lines j and 10 - j are
     divided by 2*j + 2. */
  for (i = 1; i < 8; i += 2) {
    int j;
    for (j = 0; j < 4 + 1; j++) {
      cost[i] += partial[i][3 + j]*partial[i][3 + j] >> 3;
    }
    for (j = 0; j < 4 - 1; j++) {
      cost[i] += OD_DIVU_SMALL(partial[i][j]*partial[i][j], 2*j + 2)
       + OD_DIVU_SMALL(partial[i][10 - j]*partial[i][10 - j], 2*j + 2);
    }
  }
  /* Pick the direction with the largest cost; ties keep the lowest index. */
  for (i = 0; i < 8; i++) {
    if (cost[i] > best_cost) {
      best_cost = cost[i];
      best_dir = i;
    }
  }
  /* Difference between the optimal variance and the variance along the
     orthogonal direction. Again, the sum(x^2) terms cancel out. */
  *var = best_cost - cost[(best_dir + 4) & 7];
  return best_dir;
 }
 /* Fill value for the padded input buffer before frame pixels are copied in;
    positions that receive no pixel keep this value. */
 #define OD_DERING_VERY_LARGE (30000)
 /* Element count of the padded input buffer: a maximum-size block plus
    OD_FILT_BORDER extra pixels on every side. */
 #define OD_DERING_INBUF_SIZE ((OD_BSIZE_MAX + 2*OD_FILT_BORDER)*\
 (OD_BSIZE_MAX + 2*OD_FILT_BORDER))
 /* Directional pass of the deringing filter.
    For each pixel of a (1 << ln) x (1 << ln) block, looks at three taps on
    each side of the pixel along direction dir, and adds the weighted
    difference of every tap whose absolute difference from the pixel is below
    threshold.
    y/ystride: output block and its row pitch.
    in: input pixels, laid out with row pitch OD_FILT_BSTRIDE. */
 void od_filter_dering_direction_c(int16_t *y, int ystride, const int16_t *in,
 int ln, int threshold, int dir) {
  /* Weight of the k-th tap along the direction. */
  static const int taps[3] = {3, 2, 2};
  const int size = 1 << ln;
  int row;
  int col;
  for (row = 0; row < size; row++) {
    for (col = 0; col < size; col++) {
      const int16_t *center = &in[row*OD_FILT_BSTRIDE + col];
      const int16_t xx = *center;
      int16_t sum = 0;
      int k;
      for (k = 0; k < 3; k++) {
        const int off = OD_DIRECTION_OFFSETS_TABLE[dir][k];
        const int16_t ahead = center[off] - xx;
        const int16_t behind = center[-off] - xx;
        if (abs(ahead) < threshold) sum += taps[k]*ahead;
        if (abs(behind) < threshold) sum += taps[k]*behind;
      }
      /* Rounded division of the weighted sum by 16. */
      y[row*ystride + col] = (int16_t)(xx + ((sum + 8) >> 4));
    }
  }
 }
 /* 4x4 entry point: runs the directional filter with ln = 2. */
 void od_filter_dering_direction_4x4_c(int16_t *y, int ystride,
 const int16_t *in, int threshold, int dir) {
  od_filter_dering_direction_c(y, ystride, in, 2, threshold, dir);
 }
 /* 8x8 entry point: runs the directional filter with ln = 3. */
 void od_filter_dering_direction_8x8_c(int16_t *y, int ystride,
 const int16_t *in, int threshold, int dir) {
  od_filter_dering_direction_c(y, ystride, in, 3, threshold, dir);
 }
 /* Second pass of the deringing filter: smooths across the detected
    direction.
    y/ystride: output block and its row pitch.
    in: output of the directional pass, with row pitch OD_FILT_BSTRIDE.
    x/xstride: the unfiltered pixels of the same block.
    ln: log2 of the block side; threshold: filter strength; dir: direction. */
 void od_filter_dering_orthogonal_c(int16_t *y, int ystride, const int16_t *in,
 const dering_in *x, int xstride, int ln, int threshold, int dir) {
  const int size = 1 << ln;
  /* Neighbours are one row apart for dir <= 4, one column apart otherwise. */
  const int offset = (dir <= 4) ? OD_FILT_BSTRIDE : 1;
  int steps[4];
  int row;
  int col;
  steps[0] = offset;
  steps[1] = -offset;
  steps[2] = 2*offset;
  steps[3] = -2*offset;
  for (row = 0; row < size; row++) {
    for (col = 0; col < size; col++) {
      const int16_t *center = &in[row*OD_FILT_BSTRIDE + col];
      /* The orthogonal pass uses a tighter threshold to stay conservative:
         the directional pass has presumably already removed some ringing,
         so only a small change is expected, and filtering across an edge
         must not blur it. The threshold loosens with the amount the first
         pass already changed this pixel, up to the full threshold. */
      const int16_t athresh = OD_MINI(threshold, threshold/3
       + abs(*center - x[row*xstride + col]));
      const int16_t yy = *center;
      int16_t sum = 0;
      int t;
      for (t = 0; t < 4; t++) {
        const int16_t p = center[steps[t]] - yy;
        if (abs(p) < athresh) sum += p;
      }
      y[row*ystride + col] = yy + ((3*sum + 8) >> 4);
    }
  }
 }
 /* 4x4 entry point: runs the orthogonal filter with ln = 2. */
 void od_filter_dering_orthogonal_4x4_c(int16_t *y, int ystride,
 const int16_t *in, const dering_in *x, int xstride, int threshold, int dir) {
  od_filter_dering_orthogonal_c(y, ystride, in, x, xstride, 2, threshold, dir);
 }
 /* 8x8 entry point: runs the orthogonal filter with ln = 3. */
 void od_filter_dering_orthogonal_8x8_c(int16_t *y, int ystride,
 const int16_t *in, const dering_in *x, int xstride, int threshold, int dir) {
  od_filter_dering_orthogonal_c(y, ystride, in, x, xstride, 3, threshold, dir);
 }
 /* This table approximates x^0.16 with the index being log2(x). It is clamped
   to [.5, 3] (128 to 768 in Q8). The table is computed as:
   round(256*min(3, max(.5, 1.08*(sqrt(2)*2.^([0:17]+8)/256/256).^.16))) */
 static const int16_t OD_THRESH_TABLE_Q8[18] = {
  128, 134, 150, 168, 188, 210, 234, 262,
  292, 327, 365, 408, 455, 509, 569, 635,
  710, 768,
 };
 /* Derives a per-block filter threshold from the directional variance
    difference. A large difference indicates a strongly directional pattern
    (such as a high-contrast edge), which tolerates more deringing; a small
    one indicates a weak edge or non-directional texture, where the filter
    must avoid blurring.
    thresh: output, one entry per block for rows < nvb and columns < nhb.
    threshold: base strength scaled by the Q8 table entry.
    var: per-block variance differences; sb_var: their sum over the
    superblock. */
 static void od_compute_thresh(int thresh[OD_DERING_NBLOCKS][OD_DERING_NBLOCKS],
 int threshold, int32_t var[OD_DERING_NBLOCKS][OD_DERING_NBLOCKS],
 int32_t sb_var, int nhb, int nvb) {
  /* The superblock term does not depend on the block, so compute it once. */
  const int sb_term = OD_MINI(32767, sb_var/(OD_BSIZE_MAX*OD_BSIZE_MAX));
  int row;
  int col;
  for (row = 0; row < nvb; row++) {
    for (col = 0; col < nhb; col++) {
      /* Both the block's own variance and the superblock's variance feed
         the table lookup. */
      const int blk_term = OD_MINI(32767, var[row][col] >> 6);
      const int log_idx = OD_CLAMPI(0, OD_ILOG(blk_term*sb_term) - 9, 17);
      thresh[row][col] = threshold*OD_THRESH_TABLE_Q8[log_idx] >> 8;
    }
  }
 }
 /* Derings one superblock of one plane.
    vtbl: filter implementations, indexed by bsize - OD_LOG_BSIZE0.
    y/ystride: output block and its row pitch.
    x/xstride: unfiltered input, pointing at the superblock's top-left pixel.
    nhb/nvb: number of blocks to process horizontally/vertically.
    sbx/sby, nhsb/nvsb: superblock position and superblock counts; they
    decide on which sides neighbouring pixels are copied into the border.
    xdec: plane decimation; the block side is 1 << (3 - xdec) pixels.
    dir: per-block directions, written when pli == 0 and read otherwise.
    bskip/skip_stride: per-block skip flags and their row pitch.
    threshold: base filter strength.
    overlap: only read inside the DAALA_ODINTRIN branch.
    coeff_shift: right shift applied to pixels during direction search. */
 void od_dering(const od_dering_opt_vtbl *vtbl, int16_t *y, int ystride,
 const dering_in *x, int xstride, int nhb, int nvb, int sbx, int sby, int nhsb,
 int nvsb, int xdec, int dir[OD_DERING_NBLOCKS][OD_DERING_NBLOCKS], int pli,
 unsigned char *bskip, int skip_stride, int threshold, int overlap,
 int coeff_shift) {
  int i;
  int j;
  int bx;
  int by;
  int16_t inbuf[OD_DERING_INBUF_SIZE];
  int16_t *in;
  int bsize;
  int varsum = 0;
  int32_t var[OD_DERING_NBLOCKS][OD_DERING_NBLOCKS];
  int thresh[OD_DERING_NBLOCKS][OD_DERING_NBLOCKS];
  /* log2 of the block side in this plane. */
  bsize = 3 - xdec;
  /* `in` addresses the first pixel inside the border of the padded buffer. */
  in = inbuf + OD_FILT_BORDER*OD_FILT_BSTRIDE + OD_FILT_BORDER;
  /* We avoid filtering the pixels for which some of the pixels to average
     are outside the frame. We could change the filter instead, but it would
     add special cases for any future vectorization. */
  for (i = 0; i < OD_DERING_INBUF_SIZE; i++) inbuf[i] = OD_DERING_VERY_LARGE;
  /* Copy the superblock plus, on each side that has a neighbouring
     superblock, OD_FILT_BORDER extra rows/columns. */
  for (i = -OD_FILT_BORDER*(sby != 0); i < (nvb << bsize)
   + OD_FILT_BORDER*(sby != nvsb - 1); i++) {
    for (j = -OD_FILT_BORDER*(sbx != 0); j < (nhb << bsize)
     + OD_FILT_BORDER*(sbx != nhsb - 1); j++) {
      in[i*OD_FILT_BSTRIDE + j] = x[i*xstride + j];
    }
  }
  if (pli == 0) {
    /* Plane 0: search the direction of every 8x8 block and derive the
       per-block thresholds from the variance differences. */
    for (by = 0; by < nvb; by++) {
      for (bx = 0; bx < nhb; bx++) {
        dir[by][bx] = od_dir_find8(&x[8*by*xstride + 8*bx], xstride,
         &var[by][bx], coeff_shift);
        varsum += var[by][bx];
      }
    }
    od_compute_thresh(thresh, threshold, var, varsum, nhb, nvb);
  }
  else {
    /* Other planes: reuse the directions already in `dir` and apply the base
       threshold to every block. */
    for (by = 0; by < nvb; by++) {
      for (bx = 0; bx < nhb; bx++) {
        thresh[by][bx] = threshold;
      }
    }
  }
  /* Zero the threshold of skipped blocks; with a zero threshold no tap
     passes the filters' `abs(p) < threshold` tests. */
  for (by = 0; by < nvb; by++) {
    for (bx = 0; bx < nhb; bx++) {
      int skip;
 # if defined(DAALA_ODINTRIN)
      int xstart;
      int ystart;
      int xend;
      int yend;
      xstart = ystart = 0;
      xend = yend = (2 >> xdec);
      if (overlap) {
        xstart -= (sbx != 0);
        ystart -= (sby != 0);
        xend += (sbx != nhsb - 1);
        yend += (sby != nvsb - 1);
      }
      skip = 1;
      /* We look at whether the current block and its 4x4 surrounding (due to
         lapping) are skipped to avoid filtering the same content multiple
         times. */
      for (i = ystart; i < yend; i++) {
        for (j = xstart; j < xend; j++) {
          skip = skip && bskip[((by << 1 >> xdec) + i)*skip_stride
           + (bx << 1 >> xdec) + j];
        }
      }
 #else
      skip = bskip[by*skip_stride + bx];
 #endif
      if (skip) thresh[by][bx] = 0;
    }
  }
  /* First pass: filter along the detected direction, from `in` into `y`. */
  for (by = 0; by < nvb; by++) {
    for (bx = 0; bx < nhb; bx++) {
      (vtbl->filter_dering_direction[bsize - OD_LOG_BSIZE0])(
       &y[(by*ystride << bsize) + (bx << bsize)], ystride,
       &in[(by*OD_FILT_BSTRIDE << bsize) + (bx << bsize)],
       thresh[by][bx], dir[by][bx]);
    }
  }
  /* Feed the first-pass result back as the input of the second pass. */
  for (i = 0; i < nvb << bsize; i++) {
    for (j = 0; j < nhb << bsize; j++) {
      in[i*OD_FILT_BSTRIDE + j] = y[i*ystride + j];
    }
  }
  /* Second pass: orthogonal filter, which also reads the unfiltered `x`. */
  for (by = 0; by < nvb; by++) {
    for (bx = 0; bx < nhb; bx++) {
      (vtbl->filter_dering_orthogonal[bsize - OD_LOG_BSIZE0])(
       &y[(by*ystride << bsize) + (bx << bsize)], ystride,
       &in[(by*OD_FILT_BSTRIDE << bsize) + (bx << bsize)],
       &x[(by*xstride << bsize) + (bx << bsize)], xstride,
       thresh[by][bx], dir[by][bx]);
    }
  }
 }
--- a/vp10/common/od_dering.h
+++ b/vp10/common/od_dering.h
@ -0,0 +1,83 @@
 /*Daala video codec
 Copyright (c) 2003-2010 Daala project contributors.  All rights reserved.
 Redistribution and use in source and binary forms, with or without
 modification, are permitted provided that the following conditions are met:
 - Redistributions of source code must retain the above copyright notice, this
  list of conditions and the following disclaimer.
 - Redistributions in binary form must reproduce the above copyright notice,
  this list of conditions and the following disclaimer in the documentation
  and/or other materials provided with the distribution.
 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
 FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
 SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.*/
 #if !defined(_dering_H)
 # define _dering_H (1)
 # include "odintrin.h"
 # if defined(DAALA_ODINTRIN)
 #  include "filter.h"
 typedef int16_t dering_in;
 # endif
 /* Number of entries in each filter array of od_dering_opt_vtbl. */
 #define OD_DERINGSIZES (2)
 /* Values for the `overlap` argument of od_dering(). */
 #define OD_DERING_NO_CHECK_OVERLAP (0)
 #define OD_DERING_CHECK_OVERLAP (1)
 /* Number of entries in OD_DERING_GAIN_TABLE. */
 #define OD_DERING_LEVELS (6)
 extern const double OD_DERING_GAIN_TABLE[OD_DERING_LEVELS];
 /* Number of 8-pixel blocks along one side of a maximum-size block. */
 #define OD_DERING_NBLOCKS (OD_BSIZE_MAX/8)
 /* Padding, in pixels, on each side of the filter input buffer. */
 #define OD_FILT_BORDER (3)
 /* Row pitch of the padded filter input buffer. */
 #define OD_FILT_BSTRIDE (OD_BSIZE_MAX + 2*OD_FILT_BORDER)
 /* Per-direction tap offsets, expressed for a row pitch of OD_FILT_BSTRIDE. */
 extern const int OD_DIRECTION_OFFSETS_TABLE[8][3];
 /* Signature of a fixed-size directional filter: writes block y (row pitch
    ystride) from padded input `in` using the given threshold and direction. */
 typedef void (*od_filter_dering_direction_func)(int16_t *y, int ystride,
 const int16_t *in, int threshold, int dir);
 /* Signature of a fixed-size orthogonal filter; in addition to the padded
    input it receives the unfiltered pixels x (row pitch xstride). */
 typedef void (*od_filter_dering_orthogonal_func)(int16_t *y, int ystride,
 const int16_t *in, const dering_in *x, int xstride, int threshold, int dir);
 /* Table of filter implementations, one entry per supported block size. */
 struct od_dering_opt_vtbl {
  od_filter_dering_direction_func filter_dering_direction[OD_DERINGSIZES];
  od_filter_dering_orthogonal_func filter_dering_orthogonal[OD_DERINGSIZES];
 };
 typedef struct od_dering_opt_vtbl od_dering_opt_vtbl;
 /* Derings one superblock of one plane, reading from x and writing to y.
    nhb and nvb are the horizontal and vertical block counts, in that order;
    the parameter names here match the definition in od_dering.c. */
 void od_dering(const od_dering_opt_vtbl *vtbl, int16_t *y, int ystride,
 const dering_in *x, int xstride, int nhb, int nvb, int sbx, int sby, int nhsb,
 int nvsb, int xdec, int dir[OD_DERING_NBLOCKS][OD_DERING_NBLOCKS], int pli,
 unsigned char *bskip, int skip_stride, int threshold, int overlap,
 int coeff_shift);
 /* Generic directional filter for a (1 << ln) x (1 << ln) block. */
 void od_filter_dering_direction_c(int16_t *y, int ystride, const int16_t *in,
 int ln, int threshold, int dir);
 /* Generic orthogonal filter for a (1 << ln) x (1 << ln) block. */
 void od_filter_dering_orthogonal_c(int16_t *y, int ystride, const int16_t *in,
 const dering_in *x, int xstride, int ln, int threshold, int dir);
 /* Filter table populated with the plain-C routines declared below. */
 extern const od_dering_opt_vtbl OD_DERING_VTBL_C;
 /* Fixed-size wrappers matching od_filter_dering_direction_func. */
 void od_filter_dering_direction_4x4_c(int16_t *y, int ystride,
 const int16_t *in, int threshold, int dir);
 void od_filter_dering_direction_8x8_c(int16_t *y, int ystride,
 const int16_t *in, int threshold, int dir);
 /* Fixed-size wrappers matching od_filter_dering_orthogonal_func. */
 void od_filter_dering_orthogonal_4x4_c(int16_t *y, int ystride,
 const int16_t *in, const dering_in *x, int xstride, int threshold, int dir);
 void od_filter_dering_orthogonal_8x8_c(int16_t *y, int ystride,
 const int16_t *in, const dering_in *x, int xstride, int threshold, int dir);
 #endif
--- a/vp10/common/odintrin.h
+++ b/vp10/common/odintrin.h
@ -0,0 +1,29 @@
 #include "vp10/common/enums.h"
 #include "vpx/vpx_integer.h"
 #include "vpx_dsp/vpx_dsp_common.h"
 #include "vpx_ports/bitops.h"
 /*Smallest blocks are 4x4*/
 # define OD_LOG_BSIZE0 (2)
 /*There are 5 block sizes total (4x4, 8x8, 16x16, 32x32 and 64x64).*/
 # define OD_NBSIZES    (5)
 /*The log of the maximum length of the side of a block.*/
 # define OD_LOG_BSIZE_MAX (OD_LOG_BSIZE0 + OD_NBSIZES - 1)
 /*The maximum length of the side of a block.*/
 # define OD_BSIZE_MAX     (1 << OD_LOG_BSIZE_MAX)
 typedef int od_coeff;
 typedef int16_t dering_in;
 #define OD_DIVU_SMALL(_x, _d) ((_x) / (_d))
 #define OD_MINI VPXMIN
 #define OD_CLAMPI(min, val, max) clamp((val), (min), (max))
 #  define OD_ILOG_NZ(x) get_msb(x)
 /*Note that __builtin_clz is not defined when x == 0, according to the gcc
   documentation (and that of the x86 BSR instruction that implements it), so
   we have to special-case it.
   We define a special version of the macro to use when x can be zero.*/
 #  define OD_ILOG(x) ((x) ? OD_ILOG_NZ(x) : 0)
--- a/vp10/common/onyxc_int.h
+++ b/vp10/common/onyxc_int.h
@ -312,6 +312,9 @@ typedef struct VP10Common {
  // - this is intentionally not placed in FRAME_CONTEXT since it's reset upon
  // each keyframe and not used afterwards
  vpx_prob kf_y_prob[INTRA_MODES][INTRA_MODES][INTRA_MODES - 1];
 #if CONFIG_DERING
  int dering_level;
 #endif
 } VP10_COMMON;
 // TODO(hkuang): Don't need to lock the whole pool after implementing atomic
--- a/vp10/decoder/decodeframe.c
+++ b/vp10/decoder/decodeframe.c
@ -14,6 +14,7 @@
 #include "./vp10_rtcd.h"
 #include "./vpx_dsp_rtcd.h"
 #include "./vpx_scale_rtcd.h"
 #include "./vpx_config.h"
 #include "vpx_dsp/bitreader_buffer.h"
 #include "vpx_dsp/bitreader.h"
@ -29,6 +30,9 @@
 #include "vp10/common/clpf.h"
 #endif
 #include "vp10/common/common.h"
 #if CONFIG_DERING
 #include "vp10/common/dering.h"
 #endif  // CONFIG_DERING
 #include "vp10/common/entropy.h"
 #include "vp10/common/entropymode.h"
 #include "vp10/common/idct.h"
@ -967,6 +971,17 @@ static void decode_partition(VP10Decoder *const pbi, MACROBLOCKD *const xd,
  if (bsize >= BLOCK_8X8 &&
      (bsize == BLOCK_8X8 || partition != PARTITION_SPLIT))
    dec_update_partition_context(xd, mi_row, mi_col, subsize, num_8x8_wh);
 #if DERING_REFINEMENT
  if (bsize == BLOCK_64X64) {
    if (cm->dering_level != 0 && !sb_all_skip(cm, mi_row, mi_col)) {
      cm->mi_grid_visible[mi_row*cm->mi_stride + mi_col]->mbmi.dering_gain =
          vpx_read_literal(r, 2);
    } else {
      cm->mi_grid_visible[mi_row*cm->mi_stride + mi_col]->mbmi.dering_gain = 0;
    }
  }
 #endif
 }
 static void setup_token_decoder(const uint8_t *data, const uint8_t *data_end,
@ -1103,6 +1118,12 @@ static void setup_clpf(VP10_COMMON *cm, struct vpx_read_bit_buffer *rb) {
 }
 #endif
 #if CONFIG_DERING
 /* Reads the frame-level dering level, a DERING_LEVEL_BITS-bit literal, from
    the bit buffer into cm->dering_level. */
 static void setup_dering(VP10_COMMON *cm, struct vpx_read_bit_buffer *rb) {
  cm->dering_level = vpx_rb_read_literal(rb,  DERING_LEVEL_BITS);
 }
 #endif  // CONFIG_DERING
 static INLINE int read_delta_q(struct vpx_read_bit_buffer *rb) {
  return vpx_rb_read_bit(rb)
             ? vpx_rb_read_inv_signed_literal(rb, CONFIG_MISC_FIXES ? 6 : 4)
@ -1603,6 +1624,11 @@ static const uint8_t *decode_tiles(VP10Decoder *pbi, const uint8_t *data,
  if (cm->clpf && !cm->skip_loop_filter)
    vp10_clpf_frame(&pbi->cur_buf->buf, cm, &pbi->mb);
 #endif
 #if CONFIG_DERING
  if (cm->dering_level && !cm->skip_loop_filter) {
    vp10_dering_frame(&pbi->cur_buf->buf, cm, &pbi->mb, cm->dering_level);
  }
 #endif  // CONFIG_DERING
  // Get last tile data.
  tile_data = pbi->tile_data + tile_cols * tile_rows - 1;
@ -2088,6 +2114,9 @@ static size_t read_uncompressed_header(VP10Decoder *pbi,
  setup_loopfilter(&cm->lf, rb);
 #if CONFIG_CLPF
  setup_clpf(cm, rb);
 #endif
 #if CONFIG_DERING
  setup_dering(cm, rb);
 #endif
  setup_quantization(cm, rb);
 #if CONFIG_VPX_HIGHBITDEPTH
--- a/vp10/encoder/bitstream.c
+++ b/vp10/encoder/bitstream.c
@ -22,6 +22,9 @@
 #if CONFIG_CLPF
 #include "vp10/common/clpf.h"
 #endif
 #if CONFIG_DERING
 #include "vp10/common/dering.h"
 #endif  // CONFIG_DERING
 #include "vp10/common/entropy.h"
 #include "vp10/common/entropymode.h"
 #include "vp10/common/entropymv.h"
@ -602,6 +605,15 @@ static void write_modes_sb(VP10_COMP *cpi, const TileInfo *const tile,
  if (bsize >= BLOCK_8X8 &&
      (bsize == BLOCK_8X8 || partition != PARTITION_SPLIT))
    update_partition_context(xd, mi_row, mi_col, subsize, bsize);
 #if DERING_REFINEMENT
  if (bsize == BLOCK_64X64 && cm->dering_level != 0 &&
      !sb_all_skip(cm, mi_row, mi_col)) {
    vpx_write_literal(
        w, cm->mi_grid_visible[mi_row*cm->mi_stride + mi_col]->mbmi.dering_gain,
        2);
  }
 #endif
 }
 static void write_modes(VP10_COMP *cpi, const TileInfo *const tile,
@ -848,6 +860,12 @@ static void encode_clpf(const VP10_COMMON *cm,
 }
 #endif
 #if CONFIG_DERING
 /* Writes the frame-level dering level to the bit buffer as a
    DERING_LEVEL_BITS-bit literal. */
 static void encode_dering(int level, struct vpx_write_bit_buffer *wb) {
  vpx_wb_write_literal(wb, level, DERING_LEVEL_BITS);
 }
 #endif  // CONFIG_DERING
 static void write_delta_q(struct vpx_write_bit_buffer *wb, int delta_q) {
  if (delta_q != 0) {
    vpx_wb_write_bit(wb, 1);
@ -1317,6 +1335,9 @@ static void write_uncompressed_header(VP10_COMP *cpi,
 #if CONFIG_CLPF
  encode_clpf(cm, wb);
 #endif
 #if CONFIG_DERING
  encode_dering(cm->dering_level, wb);
 #endif  // CONFIG_DERING
  encode_quantization(cm, wb);
  encode_segmentation(cm, xd, wb);
 #if CONFIG_MISC_FIXES
--- a/vp10/encoder/encoder.c
+++ b/vp10/encoder/encoder.c
@ -18,6 +18,9 @@
 #if CONFIG_CLPF
 #include "vp10/common/clpf.h"
 #endif
 #if CONFIG_DERING
 #include "vp10/common/dering.h"
 #endif  // CONFIG_DERING
 #include "vp10/common/filter.h"
 #include "vp10/common/idct.h"
 #include "vp10/common/reconinter.h"
@ -2432,6 +2435,16 @@ static void loopfilter_frame(VP10_COMP *cpi, VP10_COMMON *cm) {
      vp10_loop_filter_frame(cm->frame_to_show, cm, xd, lf->filter_level, 0, 0);
  }
 #if CONFIG_DERING
  if (is_lossless_requested(&cpi->oxcf)) {
    cm->dering_level = 0;
  } else {
    cm->dering_level = vp10_dering_search(cm->frame_to_show, cpi->Source, cm,
                                          xd);
    vp10_dering_frame(cm->frame_to_show, cm, xd, cm->dering_level);
  }
 #endif  // CONFIG_DERING
 #if CONFIG_CLPF
  cm->clpf = 0;
  if (!is_lossless_requested(&cpi->oxcf)) {
--- a/vp10/encoder/pickdering.c
+++ b/vp10/encoder/pickdering.c
@ -0,0 +1,180 @@
 /*
 *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */
 #include <string.h>
 #include "./vpx_scale_rtcd.h"
 #include "vp10/common/dering.h"
 #include "vp10/common/onyxc_int.h"
 #include "vp10/common/reconinter.h"
 #include "vp10/encoder/encoder.h"
 #include "vpx/vpx_integer.h"
 /* Sum of squared differences between two blocks of 16-bit samples.
  *
  * The compared area is (nvb*8) rows by (nhb*8) columns.  `x`/`xstride` and
  * `y`/`ystride` are each block's base pointer and row pitch, in samples.
  * The accumulated sum is divided by 2^(2*coeff_shift) before it is
  * returned. */
 static double compute_dist(int16_t *x, int xstride, int16_t *y, int ystride,
    int nhb, int nvb, int coeff_shift) {
  const int height = nvb << 3;
  const int width = nhb << 3;
  double acc = 0;
  int row;
  for (row = 0; row < height; row++) {
    const int xoff = row*xstride;
    const int yoff = row*ystride;
    int col;
    for (col = 0; col < width; col++) {
      const double diff = x[xoff + col] - y[yoff + col];
      acc += diff*diff;
    }
  }
  return acc/(double)(1 << 2*coeff_shift);
 }
 /* Encoder-side search for the frame-level deringing level.
  *
  * The luma plane of `frame` (installed as the destination planes of `xd`)
  * and the luma plane of `ref` are copied into 16-bit scratch buffers.  Every
  * superblock is then filtered with od_dering() at each candidate level in
  * [0, MAX_DERING_LEVEL) and the distortion of the filtered output against
  * `ref` is stored per (superblock, level).
  *
  * With DERING_REFINEMENT enabled, each global level in
  * [2, MAX_DERING_LEVEL) is scored by summing, over all superblocks, the
  * smallest distortion among level 0 and the three levels returned by
  * compute_level_from_index(global_level, 1..3).  The best-scoring global
  * level wins, and the winning gain index of every superblock is written to
  * mbmi.dering_gain of that superblock's top-left mode info.
  * Without DERING_REFINEMENT, the level with the smallest total distortion
  * over the whole frame is chosen.
  *
  * Returns the chosen level.  If any scratch buffer cannot be allocated the
  * function returns 0 without touching any dering_gain value. */
 int vp10_dering_search(YV12_BUFFER_CONFIG *frame, const YV12_BUFFER_CONFIG *ref,
                       VP10_COMMON *cm,
                       MACROBLOCKD *xd) {
  int r, c;
  int sbr, sbc;
  int nhsb, nvsb;
  dering_in *src;
  int16_t *ref_coeff;
  unsigned char *bskip;
  int dir[OD_DERING_NBLOCKS][OD_DERING_NBLOCKS] = {{0}};
  int stride;
  int bsize[3];
  int dec[3];
  int pli;
  int (*mse)[MAX_DERING_LEVEL];
  double tot_mse[MAX_DERING_LEVEL] = {0};
  int level;
  int best_level;
  int global_level;
  double best_tot_mse = 1e15;
  int coeff_shift = VPXMAX(cm->bit_depth - 8, 0);
  /* Number of superblock rows/columns, rounding partial superblocks up. */
  nvsb = (cm->mi_rows + MI_BLOCK_SIZE - 1)/MI_BLOCK_SIZE;
  nhsb = (cm->mi_cols + MI_BLOCK_SIZE - 1)/MI_BLOCK_SIZE;
  /* 64 samples (8x8) per mode-info unit for the two sample buffers. */
  src = vpx_malloc(sizeof(*src)*cm->mi_rows*cm->mi_cols*64);
  ref_coeff = vpx_malloc(sizeof(*ref_coeff)*cm->mi_rows*cm->mi_cols*64);
  bskip = vpx_malloc(sizeof(*bskip)*cm->mi_rows*cm->mi_cols);
  mse = vpx_malloc(nvsb*nhsb*sizeof(*mse));
  if (!src || !ref_coeff || !bskip || !mse) {
    /* Out of memory: release whatever was obtained and report level 0.
       NOTE(review): relies on vpx_free() accepting NULL - confirm. */
    vpx_free(src);
    vpx_free(ref_coeff);
    vpx_free(bskip);
    vpx_free(mse);
    return 0;
  }
  vp10_setup_dst_planes(xd->plane, frame, 0, 0);
  for (pli = 0; pli < 3; pli++) {
    dec[pli] = xd->plane[pli].subsampling_x;
    bsize[pli] = 8 >> dec[pli];
  }
  stride = bsize[0]*cm->mi_cols;
  /* Copy the luma samples of both frames into the 16-bit buffers. */
  for (r = 0; r < bsize[0]*cm->mi_rows; ++r) {
    for (c = 0; c < bsize[0]*cm->mi_cols; ++c) {
 #if CONFIG_VPX_HIGHBITDEPTH
      if (cm->use_highbitdepth) {
        src[r * stride + c] =
            CONVERT_TO_SHORTPTR(xd->plane[0].dst.buf)
            [r*xd->plane[0].dst.stride + c];
        ref_coeff[r * stride + c] =
            CONVERT_TO_SHORTPTR(ref->y_buffer)[r * ref->y_stride + c];
      } else {
 #endif
        src[r * stride + c] =
            xd->plane[0].dst.buf[r*xd->plane[0].dst.stride + c];
        ref_coeff[r * stride + c] = ref->y_buffer[r * ref->y_stride + c];
 #if CONFIG_VPX_HIGHBITDEPTH
      }
 #endif
    }
  }
  /* Flatten the per-mode-info skip flags into a dense mi_rows x mi_cols map. */
  for (r = 0; r < cm->mi_rows; ++r) {
    for (c = 0; c < cm->mi_cols; ++c) {
      const MB_MODE_INFO *mbmi =
          &cm->mi_grid_visible[r * cm->mi_stride + c]->mbmi;
      bskip[r * cm->mi_cols + c] = mbmi->skip;
    }
  }
  for (sbr = 0; sbr < nvsb; sbr++) {
    for (sbc = 0; sbc < nhsb; sbc++) {
      int nvb, nhb;
      int16_t dst[MI_BLOCK_SIZE*MI_BLOCK_SIZE*8*8];
      /* Superblocks on the right/bottom edge may be partial. */
      nhb = VPXMIN(MI_BLOCK_SIZE, cm->mi_cols - MI_BLOCK_SIZE*sbc);
      nvb = VPXMIN(MI_BLOCK_SIZE, cm->mi_rows - MI_BLOCK_SIZE*sbr);
      /* The bound must match the size of tot_mse[] and of each mse row. */
      for (level = 0; level < MAX_DERING_LEVEL; level++) {
        int threshold;
        threshold = level << coeff_shift;
        od_dering(
            &OD_DERING_VTBL_C,
            dst,
            MI_BLOCK_SIZE*bsize[0],
            &src[sbr*stride*bsize[0]*MI_BLOCK_SIZE +
            sbc*bsize[0]*MI_BLOCK_SIZE],
            cm->mi_cols*bsize[0], nhb, nvb, sbc, sbr, nhsb, nvsb, 0, dir, 0,
            &bskip[MI_BLOCK_SIZE*sbr*cm->mi_cols + MI_BLOCK_SIZE*sbc],
            cm->mi_cols, threshold, OD_DERING_NO_CHECK_OVERLAP, coeff_shift);
        /* The double returned by compute_dist() is truncated to int here. */
        mse[nhsb*sbr+sbc][level] = compute_dist(
            dst, MI_BLOCK_SIZE*bsize[0],
            &ref_coeff[sbr*stride*bsize[0]*MI_BLOCK_SIZE +
            sbc*bsize[0]*MI_BLOCK_SIZE],
            stride, nhb, nvb, coeff_shift);
        tot_mse[level] += mse[nhsb*sbr+sbc][level];
      }
    }
  }
 #if DERING_REFINEMENT
  best_level = 0;
  /* Search for the best global level one value at a time. */
  for (global_level = 2; global_level < MAX_DERING_LEVEL; global_level++) {
    double level_tot_mse = 0;
    for (sbr = 0; sbr < nvsb; sbr++) {
      for (sbc = 0; sbc < nhsb; sbc++) {
        int gi;
        /* Gain index 0 corresponds to level 0. */
        int best_mse = mse[nhsb*sbr+sbc][0];
        for (gi = 1; gi < 4; gi++) {
          level = compute_level_from_index(global_level, gi);
          if (mse[nhsb*sbr+sbc][level] < best_mse) {
            best_mse = mse[nhsb*sbr+sbc][level];
          }
        }
        level_tot_mse += best_mse;
      }
    }
    if (level_tot_mse < best_tot_mse) {
      best_level = global_level;
      best_tot_mse = level_tot_mse;
    }
  }
  /* Record, per superblock, the gain index that did best at best_level. */
  for (sbr = 0; sbr < nvsb; sbr++) {
    for (sbc = 0; sbc < nhsb; sbc++) {
      int gi;
      int best_gi;
      int best_mse = mse[nhsb*sbr+sbc][0];
      best_gi = 0;
      for (gi = 1; gi < 4; gi++) {
        level = compute_level_from_index(best_level, gi);
        if (mse[nhsb*sbr+sbc][level] < best_mse) {
          best_gi = gi;
          best_mse = mse[nhsb*sbr+sbc][level];
        }
      }
      cm->mi_grid_visible[MI_BLOCK_SIZE*sbr*cm->mi_stride + MI_BLOCK_SIZE*sbc]->
          mbmi.dering_gain = best_gi;
    }
  }
 #else
  best_level = 0;
  for (level = 0; level < MAX_DERING_LEVEL; level++) {
    if (tot_mse[level] < tot_mse[best_level]) best_level = level;
  }
 #endif
  vpx_free(src);
  vpx_free(ref_coeff);
  vpx_free(bskip);
  vpx_free(mse);
  return best_level;
 }
--- a/vp10/vp10_common.mk
+++ b/vp10/vp10_common.mk
@ -62,6 +62,10 @@ VP10_COMMON_SRCS-yes += common/vp10_fwd_txfm.h
 VP10_COMMON_SRCS-yes += common/vp10_fwd_txfm.c
 VP10_COMMON_SRCS-yes += common/clpf.c
 VP10_COMMON_SRCS-yes += common/clpf.h
 VP10_COMMON_SRCS-yes += common/od_dering.c
 VP10_COMMON_SRCS-yes += common/od_dering.h
 VP10_COMMON_SRCS-yes += common/dering.c
 VP10_COMMON_SRCS-yes += common/dering.h
 ifneq ($(CONFIG_VPX_HIGHBITDEPTH),yes)
 VP10_COMMON_SRCS-$(HAVE_DSPR2)  += common/mips/dspr2/itrans4_dspr2.c
--- a/vp10/vp10cx.mk
+++ b/vp10/vp10cx.mk
@ -79,6 +79,7 @@ VP10_CX_SRCS-yes += encoder/temporal_filter.c
 VP10_CX_SRCS-yes += encoder/temporal_filter.h
 VP10_CX_SRCS-yes += encoder/mbgraph.c
 VP10_CX_SRCS-yes += encoder/mbgraph.h
 VP10_CX_SRCS-yes += encoder/pickdering.c
 VP10_CX_SRCS-$(HAVE_SSE2) += encoder/x86/temporal_filter_apply_sse2.asm
 VP10_CX_SRCS-$(HAVE_SSE2) += encoder/x86/quantize_sse2.c