Remove vp9-postproc from configure

Change-Id: I601464f0b74183daa80730856dfbf33ddfce2cfe
2016-01-20 17:25:22 -08:00 · 2016-01-20 17:25:22 -08:00 · b89861a463
--- a/9
+++ b/9
@ -41,7 +41,6 @@ Advanced options:
  ${toggle_vp10}                  VP10 codec support
  ${toggle_internal_stats}        output of encoder internal stats for debug, if supported (encoders)
  ${toggle_postproc}              postprocessing
  ${toggle_vp9_postproc}          vp9 specific postprocessing
  ${toggle_multithread}           multithreaded encoding and decoding
  ${toggle_spatial_resampling}    spatial sampling (scaling) support
  ${toggle_realtime_only}         enable this option while building for real-time encoding
@ -283,7 +282,6 @@ CONFIG_LIST="
    dc_recon
    runtime_cpu_detect
    postproc
    vp9_postproc
    multithread
    internal_stats
    ${CODECS}
@ -346,7 +344,6 @@ CMDLINE_SELECT="
    dequant_tokens
    dc_recon
    postproc
    vp9_postproc
    multithread
    internal_stats
    ${CODECS}
@ -442,7 +439,7 @@ process_targets() {
    done
    enabled debug_libs && DIST_DIR="${DIST_DIR}-debug"
    enabled codec_srcs && DIST_DIR="${DIST_DIR}-src"
-    ! enabled postproc && ! enabled vp9_postproc && DIST_DIR="${DIST_DIR}-nopost"
+    ! enabled postproc && DIST_DIR="${DIST_DIR}-nopost"
    ! enabled multithread && DIST_DIR="${DIST_DIR}-nomt"
    ! enabled install_docs && DIST_DIR="${DIST_DIR}-nodocs"
    DIST_DIR="${DIST_DIR}-${tgt_isa}-${tgt_os}"
@ -626,10 +623,6 @@ process_toolchain() {
        enable_feature dc_recon
    fi
    if enabled internal_stats; then
        enable_feature vp9_postproc
    fi
    # Enable the postbuild target if building for visual studio.
    case "$tgt_cc" in
        vs*) enable_feature msvs
--- a/vp10/common/alloccommon.c
+++ b/vp10/common/alloccommon.c
@ -81,15 +81,6 @@ void vp10_free_ref_frame_buffers(BufferPool *pool) {
  }
 }
 // Releases the post-processing frame buffers owned by |cm|.
 // When CONFIG_VP9_POSTPROC is enabled both cm->post_proc_buffer and
 // cm->post_proc_buffer_int are passed to vpx_free_frame_buffer(); otherwise
 // the function has nothing to do.
 void vp10_free_postproc_buffers(VP10_COMMON *cm) {
 #if CONFIG_VP9_POSTPROC
  vpx_free_frame_buffer(&cm->post_proc_buffer);
  vpx_free_frame_buffer(&cm->post_proc_buffer_int);
 #else
  // No buffers exist in this configuration; reference |cm| so the parameter
  // is not reported as unused.
  (void)cm;
 #endif
 }
 void vp10_free_context_buffers(VP10_COMMON *cm) {
  cm->free_mi(cm);
  free_seg_map(cm);
--- a/vp10/common/mfqe.c
+++ b/vp10/common/mfqe.c
@ -1,394 +0,0 @@
 /*
 *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */
 #include "./vpx_config.h"
 #include "./vp10_rtcd.h"
 #include "./vpx_dsp_rtcd.h"
 #include "./vpx_scale_rtcd.h"
 #include "vp10/common/onyxc_int.h"
 #include "vp10/common/postproc.h"
 // TODO(jackychen): Replace this function with SSE2 code. There is
 // one SSE2 implementation in vp8, so will consider how to share it
 // between vp8 and vp9.
 // Blends a block_size x block_size region of |src| into |dst| in place.
 // Every destination pixel becomes
 //   (src * src_weight + dst * dst_weight + rounding) >> MFQE_PRECISION
 // where dst_weight = (1 << MFQE_PRECISION) - src_weight, so a src_weight of
 // (1 << MFQE_PRECISION) reproduces |src| and a src_weight of 0 keeps |dst|.
 static void filter_by_weight(const uint8_t *src, int src_stride,
                              uint8_t *dst, int dst_stride,
                              int block_size, int src_weight) {
  const int keep_weight = (1 << MFQE_PRECISION) - src_weight;
  const int round = 1 << (MFQE_PRECISION - 1);
  int row;
  for (row = 0; row < block_size; ++row) {
    const uint8_t *const in = src + row * src_stride;
    uint8_t *const out = dst + row * dst_stride;
    int col;
    for (col = 0; col < block_size; ++col) {
      const int blended =
          in[col] * src_weight + out[col] * keep_weight + round;
      out[col] = blended >> MFQE_PRECISION;
    }
  }
 }
 // C implementation of the 8x8 weighted blend: forwards to filter_by_weight()
 // with a block size of 8. |dst| is updated in place.
 void vp10_filter_by_weight8x8_c(const uint8_t *src, int src_stride,
                                uint8_t *dst, int dst_stride, int src_weight) {
  filter_by_weight(src, src_stride, dst, dst_stride, 8, src_weight);
 }
 // C implementation of the 16x16 weighted blend: forwards to
 // filter_by_weight() with a block size of 16. |dst| is updated in place.
 void vp10_filter_by_weight16x16_c(const uint8_t *src, int src_stride,
                                  uint8_t *dst, int dst_stride,
                                  int src_weight) {
  filter_by_weight(src, src_stride, dst, dst_stride, 16, src_weight);
 }
 // Weighted blend of a 32x32 block, done as four 16x16 blends. The quadrants
 // are visited in raster order: top-left, top-right, bottom-left,
 // bottom-right.
 static void filter_by_weight32x32(const uint8_t *src, int src_stride,
                                   uint8_t *dst, int dst_stride, int weight) {
  int row, col;
  for (row = 0; row < 32; row += 16) {
    for (col = 0; col < 32; col += 16) {
      vp10_filter_by_weight16x16(src + src_stride * row + col, src_stride,
                                 dst + dst_stride * row + col, dst_stride,
                                 weight);
    }
  }
 }
 // Weighted blend of a 64x64 block, done as four 32x32 blends. The quadrants
 // are visited in raster order: top-left, top-right, bottom-left,
 // bottom-right.
 static void filter_by_weight64x64(const uint8_t *src, int src_stride,
                                   uint8_t *dst, int dst_stride, int weight) {
  int row, col;
  for (row = 0; row < 64; row += 32) {
    for (col = 0; col < 64; col += 32) {
      filter_by_weight32x32(src + src_stride * row + col, src_stride,
                            dst + dst_stride * row + col, dst_stride,
                            weight);
    }
  }
 }
 // Blends one luma block and its two chroma blocks into the destination
 // planes using |weight| as the source weight. The chroma blocks are filtered
 // at half the luma dimensions. Block sizes other than 16x16, 32x32 and 64x64
 // are left untouched.
 static void apply_ifactor(const uint8_t *y, int y_stride, uint8_t *yd,
                           int yd_stride, const uint8_t *u, const uint8_t *v,
                           int uv_stride, uint8_t *ud, uint8_t *vd,
                           int uvd_stride, BLOCK_SIZE block_size,
                           int weight) {
  switch (block_size) {
    case BLOCK_16X16:
      vp10_filter_by_weight16x16(y, y_stride, yd, yd_stride, weight);
      vp10_filter_by_weight8x8(u, uv_stride, ud, uvd_stride, weight);
      vp10_filter_by_weight8x8(v, uv_stride, vd, uvd_stride, weight);
      break;
    case BLOCK_32X32:
      filter_by_weight32x32(y, y_stride, yd, yd_stride, weight);
      vp10_filter_by_weight16x16(u, uv_stride, ud, uvd_stride, weight);
      vp10_filter_by_weight16x16(v, uv_stride, vd, uvd_stride, weight);
      break;
    case BLOCK_64X64:
      filter_by_weight64x64(y, y_stride, yd, yd_stride, weight);
      filter_by_weight32x32(u, uv_stride, ud, uvd_stride, weight);
      filter_by_weight32x32(v, uv_stride, vd, uvd_stride, weight);
      break;
    default:
      // Nothing is filtered for any other block size.
      break;
  }
 }
 // TODO(jackychen): Determine whether to replace it with assembly code.
 // Copies an 8x8 block of bytes from |src| to |dst|, one 8-byte row at a
 // time. Consecutive rows are |src_stride| / |dst_stride| bytes apart.
 static void copy_mem8x8(const uint8_t *src, int src_stride,
                         uint8_t *dst, int dst_stride) {
  int row;
  for (row = 0; row < 8; ++row) {
    memcpy(dst + row * dst_stride, src + row * src_stride, 8);
  }
 }
 // Copies a 16x16 block of bytes from |src| to |dst|, one 16-byte row at a
 // time. Consecutive rows are |src_stride| / |dst_stride| bytes apart.
 static void copy_mem16x16(const uint8_t *src, int src_stride,
                           uint8_t *dst, int dst_stride) {
  int row;
  for (row = 0; row < 16; ++row) {
    memcpy(dst + row * dst_stride, src + row * src_stride, 16);
  }
 }
 // Copies a 32x32 block of bytes as four 16x16 copies. The quadrants are
 // visited in raster order: top-left, top-right, bottom-left, bottom-right.
 static void copy_mem32x32(const uint8_t *src, int src_stride,
                           uint8_t *dst, int dst_stride) {
  int row, col;
  for (row = 0; row < 32; row += 16) {
    for (col = 0; col < 32; col += 16) {
      copy_mem16x16(src + src_stride * row + col, src_stride,
                    dst + dst_stride * row + col, dst_stride);
    }
  }
 }
 // Copies a 64x64 block of bytes from |src| to |dst|.
 //
 // Rows of |src| are |src_stride| bytes apart and rows of |dst| are
 // |dst_stride| bytes apart; the two strides may differ. The previous
 // quadrant-based version offset the lower half of |dst| by src_stride * 32,
 // which addressed the wrong destination rows whenever the strides were not
 // equal. Copying row by row keeps each pointer on its own stride.
 //
 // The source and destination regions must not overlap (memcpy is used).
 void copy_mem64x64(const uint8_t *src, int src_stride,
                    uint8_t *dst, int dst_stride) {
  int row;
  for (row = 0; row < 64; ++row) {
    memcpy(dst + row * dst_stride, src + row * src_stride, 64);
  }
 }
 // Copies one luma block and its two half-size chroma blocks from the source
 // planes (y/u/v) to the destination planes (yd/ud/vd). BLOCK_16X16 and
 // BLOCK_32X32 are handled explicitly; every other value of |bs| takes the
 // 64x64 path.
 static void copy_block(const uint8_t *y, const uint8_t *u, const uint8_t *v,
                        int y_stride, int uv_stride, uint8_t *yd, uint8_t *ud,
                        uint8_t *vd, int yd_stride, int uvd_stride,
                        BLOCK_SIZE bs) {
  switch (bs) {
    case BLOCK_16X16:
      copy_mem16x16(y, y_stride, yd, yd_stride);
      copy_mem8x8(u, uv_stride, ud, uvd_stride);
      copy_mem8x8(v, uv_stride, vd, uvd_stride);
      break;
    case BLOCK_32X32:
      copy_mem32x32(y, y_stride, yd, yd_stride);
      copy_mem16x16(u, uv_stride, ud, uvd_stride);
      copy_mem16x16(v, uv_stride, vd, uvd_stride);
      break;
    default:
      copy_mem64x64(y, y_stride, yd, yd_stride);
      copy_mem32x32(u, uv_stride, ud, uvd_stride);
      copy_mem32x32(v, uv_stride, vd, uvd_stride);
      break;
  }
 }
 // Produces the two thresholds used by mfqe_block().
 //   *sad_thr   = base + (qdiff >> MFQE_PRECISION), where base is 7 for
 //                16x16, 6 for 32x32 and 5 for every other block size.
 //   *vdiff_thr = 125 + qdiff.
 static void get_thr(BLOCK_SIZE bs, int qdiff, int *sad_thr, int *vdiff_thr) {
  const int base = (bs == BLOCK_16X16) ? 7 : (bs == BLOCK_32X32) ? 6 : 5;
  *sad_thr = base + (qdiff >> MFQE_PRECISION);
  *vdiff_thr = 125 + qdiff;
 }
 // Processes one square block (luma plus both chroma planes).
 //
 // The luma variance and SAD between the current frame (y) and the
 // destination (yd) are reduced to per-pixel averages with rounding. If the
 // averages pass the checks below, the current frame is blended into the
 // destination with a weight derived from them; otherwise the destination is
 // overwritten with the current frame. Any |bs| other than BLOCK_16X16 or
 // BLOCK_32X32 is measured as a 64x64 block.
 static void mfqe_block(BLOCK_SIZE bs, const uint8_t *y, const uint8_t *u,
                        const uint8_t *v, int y_stride, int uv_stride,
                        uint8_t *yd, uint8_t *ud, uint8_t *vd, int yd_stride,
                        int uvd_stride, int qdiff) {
  const int full_weight = 1 << MFQE_PRECISION;
  int sad_limit, vdiff_limit;
  int sad, vdiff, blend;
  uint32_t sse;

  get_thr(bs, qdiff, &sad_limit, &vdiff_limit);

  // Divide by the luma pixel count (256, 1024 or 4096), rounding to nearest.
  switch (bs) {
    case BLOCK_16X16:
      vdiff = (vpx_variance16x16(y, y_stride, yd, yd_stride, &sse) + 128) >> 8;
      sad = (vpx_sad16x16(y, y_stride, yd, yd_stride) + 128) >> 8;
      break;
    case BLOCK_32X32:
      vdiff =
          (vpx_variance32x32(y, y_stride, yd, yd_stride, &sse) + 512) >> 10;
      sad = (vpx_sad32x32(y, y_stride, yd, yd_stride) + 512) >> 10;
      break;
    default:
      vdiff =
          (vpx_variance64x64(y, y_stride, yd, yd_stride, &sse) + 2048) >> 12;
      sad = (vpx_sad64x64(y, y_stride, yd, yd_stride) + 2048) >> 12;
      break;
  }

  // Requiring vdiff > sad * 3 rejects blocks whose variance is small relative
  // to the SAD: that pattern may be a lighting change in a smooth area, where
  // MFQE is dangerous. Such blocks are copied from the current frame.
  if (sad <= 1 || vdiff <= sad * 3) {
    copy_block(y, u, v, y_stride, uv_stride, yd, ud, vd,
               yd_stride, uvd_stride, bs);
    return;
  }

  // A blend factor equal to full_weight means no MFQE is done, so the factor
  // is capped there.
  blend = full_weight * sad * vdiff / (sad_limit * vdiff_limit);
  if (blend > full_weight) {
    blend = full_weight;
  }
  apply_ifactor(y, y_stride, yd, yd_stride, u, v, uv_stride, ud, vd,
                uvd_stride, bs, blend);
 }
 // Returns 1 when MFQE may be applied to the block described by |mi|, and 0
 // otherwise. All three conditions must hold:
 //   - the prediction mode is at least NEARESTMV (i.e. not an intra block),
 //   - |cur_bs| is at least BLOCK_16X16,
 //   - the squared length of the first motion vector is at most 100.
 // For inter frames |mi| describes the current block; for key frames the
 // caller passes the co-located block of the last frame.
 static int mfqe_decision(MODE_INFO *mi, BLOCK_SIZE cur_bs) {
  const int max_mv_len_sq = 100;
  const int mv_len_sq = mi->mbmi.mv[0].as_mv.row *
                        mi->mbmi.mv[0].as_mv.row +
                        mi->mbmi.mv[0].as_mv.col *
                        mi->mbmi.mv[0].as_mv.col;
  if (mi->mbmi.mode < NEARESTMV) {
    return 0;
  }
  if (cur_bs < BLOCK_16X16) {
    return 0;
  }
  return mv_len_sq <= max_mv_len_sq;
 }
 // Process each partition in a super block, recursively.
 //
 // |bs| is the size of the square region being visited and |mi| points at the
 // mode info of its top-left block. y/u/v are the current-frame planes and
 // yd/ud/vd the destination planes, all already offset to this region.
 // Depending on how the region is partitioned, MFQE is applied to the region
 // as a whole, to its square halves, or the function recurses into the four
 // quadrants.
 static void mfqe_partition(VP10_COMMON *cm, MODE_INFO *mi, BLOCK_SIZE bs,
                            const uint8_t *y, const uint8_t *u,
                            const uint8_t *v, int y_stride, int uv_stride,
                            uint8_t *yd, uint8_t *ud, uint8_t *vd,
                            int yd_stride, int uvd_stride) {
  int mi_offset, y_offset, uv_offset;
  const BLOCK_SIZE cur_bs = mi->mbmi.sb_type;
  // Change in base quantizer index relative to the value stored in the
  // post-processing state.
  const int qdiff = cm->base_qindex - cm->postproc_state.last_base_qindex;
  const int bsl = b_width_log2_lookup[bs];
  PARTITION_TYPE partition = partition_lookup[bsl][cur_bs];
  const BLOCK_SIZE subsize = get_subsize(bs, partition);
  if (cur_bs < BLOCK_8X8) {
    // If there are blocks smaller than 8x8, it must be on the boundary.
    return;
  }
  // No MFQE on blocks smaller than 16x16
  if (bs == BLOCK_16X16) {
    partition = PARTITION_NONE;
  }
  // Offsets from the top-left corner to the second half / quadrant: in mode
  // info units, luma pixels and chroma pixels respectively.
  if (bs == BLOCK_64X64) {
    mi_offset = 4;
    y_offset = 32;
    uv_offset = 16;
  } else {
    mi_offset = 2;
    y_offset = 16;
    uv_offset = 8;
  }
  switch (partition) {
    // mfqe_bs is the rectangular size handed to mfqe_decision(); bs_tmp is
    // the square size actually filtered by mfqe_block().
    BLOCK_SIZE mfqe_bs, bs_tmp;
    case PARTITION_HORZ:
      if (bs == BLOCK_64X64) {
        mfqe_bs = BLOCK_64X32;
        bs_tmp = BLOCK_32X32;
      } else {
        mfqe_bs = BLOCK_32X16;
        bs_tmp = BLOCK_16X16;
      }
      // NOTE(review): when mfqe_decision() fails here (and in PARTITION_VERT)
      // the destination is left as it was; only PARTITION_NONE copies the
      // current frame on failure. Confirm that this is intended.
      // Top half: left square, then right square.
      if (mfqe_decision(mi, mfqe_bs)) {
        // Do mfqe on the first square partition.
        mfqe_block(bs_tmp, y, u, v, y_stride, uv_stride,
                   yd, ud, vd, yd_stride, uvd_stride, qdiff);
        // Do mfqe on the second square partition.
        mfqe_block(bs_tmp, y + y_offset, u + uv_offset, v + uv_offset,
                   y_stride, uv_stride, yd + y_offset, ud + uv_offset,
                   vd + uv_offset, yd_stride, uvd_stride, qdiff);
      }
      // Bottom half: left square, then right square.
      if (mfqe_decision(mi + mi_offset * cm->mi_stride, mfqe_bs)) {
        // Do mfqe on the first square partition.
        mfqe_block(bs_tmp, y + y_offset * y_stride, u + uv_offset * uv_stride,
                   v + uv_offset * uv_stride, y_stride, uv_stride,
                   yd + y_offset * yd_stride, ud + uv_offset * uvd_stride,
                   vd + uv_offset * uvd_stride, yd_stride, uvd_stride, qdiff);
        // Do mfqe on the second square partition.
        mfqe_block(bs_tmp, y + y_offset * y_stride + y_offset,
                   u + uv_offset * uv_stride + uv_offset,
                   v + uv_offset * uv_stride + uv_offset, y_stride,
                   uv_stride, yd + y_offset * yd_stride + y_offset,
                   ud + uv_offset * uvd_stride + uv_offset,
                   vd + uv_offset * uvd_stride + uv_offset,
                   yd_stride, uvd_stride, qdiff);
      }
      break;
    case PARTITION_VERT:
      if (bs == BLOCK_64X64) {
        mfqe_bs = BLOCK_32X64;
        bs_tmp = BLOCK_32X32;
      } else {
        mfqe_bs = BLOCK_16X32;
        bs_tmp = BLOCK_16X16;
      }
      // Left half: top square, then bottom square.
      if (mfqe_decision(mi, mfqe_bs)) {
        // Do mfqe on the first square partition.
        mfqe_block(bs_tmp, y, u, v, y_stride, uv_stride,
                   yd, ud, vd, yd_stride, uvd_stride, qdiff);
        // Do mfqe on the second square partition.
        mfqe_block(bs_tmp, y + y_offset * y_stride, u + uv_offset * uv_stride,
                   v + uv_offset * uv_stride, y_stride, uv_stride,
                   yd + y_offset * yd_stride, ud + uv_offset * uvd_stride,
                   vd + uv_offset * uvd_stride, yd_stride, uvd_stride, qdiff);
      }
      // Right half: top square, then bottom square.
      if (mfqe_decision(mi + mi_offset, mfqe_bs)) {
        // Do mfqe on the first square partition.
        mfqe_block(bs_tmp, y + y_offset, u + uv_offset, v + uv_offset,
                   y_stride, uv_stride, yd + y_offset, ud + uv_offset,
                   vd + uv_offset, yd_stride, uvd_stride, qdiff);
        // Do mfqe on the second square partition.
        mfqe_block(bs_tmp, y + y_offset * y_stride + y_offset,
                   u + uv_offset * uv_stride + uv_offset,
                   v + uv_offset * uv_stride + uv_offset, y_stride,
                   uv_stride, yd + y_offset * yd_stride + y_offset,
                   ud + uv_offset * uvd_stride + uv_offset,
                   vd + uv_offset * uvd_stride + uv_offset,
                   yd_stride, uvd_stride, qdiff);
      }
      break;
    case PARTITION_NONE:
      // NOTE(review): the filter path uses cur_bs while the copy path uses
      // bs; the two can differ when bs == BLOCK_16X16 forced PARTITION_NONE.
      if (mfqe_decision(mi, cur_bs)) {
        // Do mfqe on this partition.
        mfqe_block(cur_bs, y, u, v, y_stride, uv_stride,
                   yd, ud, vd, yd_stride, uvd_stride, qdiff);
      } else {
        // Copy the block from current frame(i.e., no mfqe is done).
        copy_block(y, u, v, y_stride, uv_stride, yd, ud, vd,
                   yd_stride, uvd_stride, bs);
      }
      break;
    case PARTITION_SPLIT:
      // Recursion on four square partitions, e.g. if bs is 64X64,
      // then look into four 32X32 blocks in it.
      // Order: top-left, top-right, bottom-left, bottom-right.
      mfqe_partition(cm, mi, subsize, y, u, v, y_stride, uv_stride, yd, ud, vd,
                     yd_stride, uvd_stride);
      mfqe_partition(cm, mi + mi_offset, subsize, y + y_offset, u + uv_offset,
                     v + uv_offset, y_stride, uv_stride, yd + y_offset,
                     ud + uv_offset, vd + uv_offset, yd_stride, uvd_stride);
      mfqe_partition(cm, mi + mi_offset * cm->mi_stride, subsize,
                     y + y_offset * y_stride, u + uv_offset * uv_stride,
                     v + uv_offset * uv_stride, y_stride, uv_stride,
                     yd + y_offset * yd_stride, ud + uv_offset * uvd_stride,
                     vd + uv_offset * uvd_stride, yd_stride, uvd_stride);
      mfqe_partition(cm, mi + mi_offset * cm->mi_stride + mi_offset,
                     subsize, y + y_offset * y_stride + y_offset,
                     u + uv_offset * uv_stride + uv_offset,
                     v + uv_offset * uv_stride + uv_offset, y_stride,
                     uv_stride, yd + y_offset * yd_stride + y_offset,
                     ud + uv_offset * uvd_stride + uv_offset,
                     vd + uv_offset * uvd_stride + uv_offset,
                     yd_stride, uvd_stride);
      break;
    default:
      // Any other partition type is unexpected.
      assert(0);
  }
 }
 // Applies MFQE to the whole frame, one 64x64 super block at a time.
 //
 // The current decoded frame (cm->frame_to_show) is the source and
 // cm->post_proc_buffer is the destination, which holds the last decoded
 // frame on entry and the MFQE result on return. For intra-only frames the
 // mode info of the previous frame (cm->postproc_state.prev_mi) drives the
 // decisions; otherwise the current frame's mode info (cm->mi) is used.
 void vp10_mfqe(VP10_COMMON *cm) {
  // Current decoded frame.
  const YV12_BUFFER_CONFIG *show = cm->frame_to_show;
  // Last decoded frame and will store the MFQE result.
  YV12_BUFFER_CONFIG *dest = &cm->post_proc_buffer;
  const uint32_t y_stride = show->y_stride;
  const uint32_t uv_stride = show->uv_stride;
  const uint32_t yd_stride = dest->y_stride;
  const uint32_t uvd_stride = dest->uv_stride;
  int mi_row, mi_col;

  // Loop through each super block.
  for (mi_row = 0; mi_row < cm->mi_rows; mi_row += MI_BLOCK_SIZE) {
    // One mode info unit covers 8 luma pixels and 4 chroma pixels.
    const uint32_t row_offset_y = mi_row << 3;
    const uint32_t row_offset_uv = mi_row << 2;
    for (mi_col = 0; mi_col < cm->mi_cols; mi_col += MI_BLOCK_SIZE) {
      const uint32_t col_offset_y = mi_col << 3;
      const uint32_t col_offset_uv = mi_col << 2;
      const int mi_index = mi_row * cm->mi_stride + mi_col;
      // Intra-only frames borrow the motion info of the last frame.
      MODE_INFO *const mi =
          (frame_is_intra_only(cm) ? cm->postproc_state.prev_mi : cm->mi) +
          mi_index;
      const uint8_t *const y =
          show->y_buffer + row_offset_y * y_stride + col_offset_y;
      const uint8_t *const u =
          show->u_buffer + row_offset_uv * uv_stride + col_offset_uv;
      const uint8_t *const v =
          show->v_buffer + row_offset_uv * uv_stride + col_offset_uv;
      uint8_t *const yd =
          dest->y_buffer + row_offset_y * yd_stride + col_offset_y;
      uint8_t *const ud =
          dest->u_buffer + row_offset_uv * uvd_stride + col_offset_uv;
      uint8_t *const vd =
          dest->v_buffer + row_offset_uv * uvd_stride + col_offset_uv;
      mfqe_partition(cm, mi, BLOCK_64X64, y, u, v, y_stride, uv_stride, yd, ud,
                     vd, yd_stride, uvd_stride);
    }
  }
 }
--- a/vp10/common/mfqe.h
+++ b/vp10/common/mfqe.h
@ -1,31 +0,0 @@
 /*
 *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */
 #ifndef VP10_COMMON_MFQE_H_
 #define VP10_COMMON_MFQE_H_
 #ifdef __cplusplus
 extern "C" {
 #endif
 // Multiframe Quality Enhancement.
 // The aim for MFQE is to replace pixel blocks in the current frame with
 // the correlated pixel blocks (with higher quality) in the last frame.
 // The replacement can only be taken in stationary blocks by checking
 // the motion of the blocks and other conditions such as the SAD of
 // the current block and correlated block, the variance of the block
 // difference, etc.
 void vp10_mfqe(struct VP10Common *cm);
 #ifdef __cplusplus
 }  // extern "C"
 #endif
 #endif  // VP10_COMMON_MFQE_H_
--- a/vp10/common/mips/msa/mfqe_msa.c
+++ b/vp10/common/mips/msa/mfqe_msa.c
@ -1,137 +0,0 @@
 /*
 *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */
 #include "./vp10_rtcd.h"
 #include "vp10/common/onyxc_int.h"
 #include "vpx_dsp/mips/macros_msa.h"
 // MSA implementation of the 8x8 weighted blend of |src_ptr| into |dst_ptr|.
 // The destination weight is (1 << MFQE_PRECISION) - src_weight. The loop
 // runs twice and each pass consumes four source rows and writes four
 // destination rows, two at a time.
 // NOTE(review): the exact behaviour of the LD2 / INSERT_D2_SB / UNPCK_UB_SH /
 // SRARI_H2_SH / ST8x2_UB macros is defined in macros_msa.h and is not
 // visible here; the comments below describe the apparent intent only.
 static void filter_by_weight8x8_msa(const uint8_t *src_ptr, int32_t src_stride,
                                     uint8_t *dst_ptr, int32_t dst_stride,
                                     int32_t src_weight) {
  int32_t dst_weight = (1 << MFQE_PRECISION) - src_weight;
  int32_t row;
  uint64_t src0_d, src1_d, dst0_d, dst1_d;
  v16i8 src0 = { 0 };
  v16i8 src1 = { 0 };
  v16i8 dst0 = { 0 };
  v16i8 dst1 = { 0 };
  v8i16 src_wt, dst_wt, res_h_r, res_h_l, src_r, src_l, dst_r, dst_l;
  // Broadcast both weights across 16-bit lanes.
  src_wt = __msa_fill_h(src_weight);
  dst_wt = __msa_fill_h(dst_weight);
  for (row = 2; row--;) {
    // First pair of rows: source and destination packed into src0 / dst0.
    LD2(src_ptr, src_stride, src0_d, src1_d);
    src_ptr += (2 * src_stride);
    LD2(dst_ptr, dst_stride, dst0_d, dst1_d);
    INSERT_D2_SB(src0_d, src1_d, src0);
    INSERT_D2_SB(dst0_d, dst1_d, dst0);
    // Second pair of rows: packed into src1 / dst1. dst_ptr has not advanced
    // yet, hence the explicit 2 * dst_stride offset.
    LD2(src_ptr, src_stride, src0_d, src1_d);
    src_ptr += (2 * src_stride);
    LD2((dst_ptr + 2 * dst_stride), dst_stride, dst0_d, dst1_d);
    INSERT_D2_SB(src0_d, src1_d, src1);
    INSERT_D2_SB(dst0_d, dst1_d, dst1);
    // Blend the first pair: widen, weight, sum, shift by MFQE_PRECISION,
    // pack back and store two rows.
    UNPCK_UB_SH(src0, src_r, src_l);
    UNPCK_UB_SH(dst0, dst_r, dst_l);
    res_h_r = (src_r * src_wt);
    res_h_r += (dst_r * dst_wt);
    res_h_l = (src_l * src_wt);
    res_h_l += (dst_l * dst_wt);
    SRARI_H2_SH(res_h_r, res_h_l, MFQE_PRECISION);
    dst0 = (v16i8)__msa_pckev_b((v16i8)res_h_l, (v16i8)res_h_r);
    ST8x2_UB(dst0, dst_ptr, dst_stride);
    dst_ptr += (2 * dst_stride);
    // Blend the second pair the same way.
    UNPCK_UB_SH(src1, src_r, src_l);
    UNPCK_UB_SH(dst1, dst_r, dst_l);
    res_h_r = (src_r * src_wt);
    res_h_r += (dst_r * dst_wt);
    res_h_l = (src_l * src_wt);
    res_h_l += (dst_l * dst_wt);
    SRARI_H2_SH(res_h_r, res_h_l, MFQE_PRECISION);
    dst1 = (v16i8)__msa_pckev_b((v16i8)res_h_l, (v16i8)res_h_r);
    ST8x2_UB(dst1, dst_ptr, dst_stride);
    dst_ptr += (2 * dst_stride);
  }
 }
 // MSA implementation of the 16x16 weighted blend of |src_ptr| into
 // |dst_ptr|. The destination weight is (1 << MFQE_PRECISION) - src_weight.
 // The loop runs four times; each pass loads four source and four destination
 // rows, then blends and stores them one row at a time.
 // NOTE(review): the exact behaviour of the LD_SB4 / UNPCK_UB_SH /
 // SRARI_H2_SH / PCKEV_ST_SB macros is defined in macros_msa.h and is not
 // visible here; the comments below describe the apparent intent only.
 static void filter_by_weight16x16_msa(const uint8_t *src_ptr,
                                       int32_t src_stride,
                                       uint8_t *dst_ptr,
                                       int32_t dst_stride,
                                       int32_t src_weight) {
  int32_t dst_weight = (1 << MFQE_PRECISION) - src_weight;
  int32_t row;
  v16i8 src0, src1, src2, src3, dst0, dst1, dst2, dst3;
  v8i16 src_wt, dst_wt, res_h_r, res_h_l, src_r, src_l, dst_r, dst_l;
  // Broadcast both weights across 16-bit lanes.
  src_wt = __msa_fill_h(src_weight);
  dst_wt = __msa_fill_h(dst_weight);
  for (row = 4; row--;) {
    // Load four rows from each buffer. Only src_ptr advances here; dst_ptr
    // advances one row after each store below.
    LD_SB4(src_ptr, src_stride, src0, src1, src2, src3);
    src_ptr += (4 * src_stride);
    LD_SB4(dst_ptr, dst_stride, dst0, dst1, dst2, dst3);
    // Row 0: widen, weight, sum, shift by MFQE_PRECISION, pack and store.
    UNPCK_UB_SH(src0, src_r, src_l);
    UNPCK_UB_SH(dst0, dst_r, dst_l);
    res_h_r = (src_r * src_wt);
    res_h_r += (dst_r * dst_wt);
    res_h_l = (src_l * src_wt);
    res_h_l += (dst_l * dst_wt);
    SRARI_H2_SH(res_h_r, res_h_l, MFQE_PRECISION);
    PCKEV_ST_SB(res_h_r, res_h_l, dst_ptr);
    dst_ptr += dst_stride;
    // Row 1.
    UNPCK_UB_SH(src1, src_r, src_l);
    UNPCK_UB_SH(dst1, dst_r, dst_l);
    res_h_r = (src_r * src_wt);
    res_h_r += (dst_r * dst_wt);
    res_h_l = (src_l * src_wt);
    res_h_l += (dst_l * dst_wt);
    SRARI_H2_SH(res_h_r, res_h_l, MFQE_PRECISION);
    PCKEV_ST_SB(res_h_r, res_h_l, dst_ptr);
    dst_ptr += dst_stride;
    // Row 2.
    UNPCK_UB_SH(src2, src_r, src_l);
    UNPCK_UB_SH(dst2, dst_r, dst_l);
    res_h_r = (src_r * src_wt);
    res_h_r += (dst_r * dst_wt);
    res_h_l = (src_l * src_wt);
    res_h_l += (dst_l * dst_wt);
    SRARI_H2_SH(res_h_r, res_h_l, MFQE_PRECISION);
    PCKEV_ST_SB(res_h_r, res_h_l, dst_ptr);
    dst_ptr += dst_stride;
    // Row 3.
    UNPCK_UB_SH(src3, src_r, src_l);
    UNPCK_UB_SH(dst3, dst_r, dst_l);
    res_h_r = (src_r * src_wt);
    res_h_r += (dst_r * dst_wt);
    res_h_l = (src_l * src_wt);
    res_h_l += (dst_l * dst_wt);
    SRARI_H2_SH(res_h_r, res_h_l, MFQE_PRECISION);
    PCKEV_ST_SB(res_h_r, res_h_l, dst_ptr);
    dst_ptr += dst_stride;
  }
 }
 // Public MSA entry point for the 8x8 weighted blend; forwards all arguments
 // unchanged to filter_by_weight8x8_msa().
 void vp10_filter_by_weight8x8_msa(const uint8_t *src, int src_stride,
                                  uint8_t *dst, int dst_stride,
                                  int src_weight) {
  filter_by_weight8x8_msa(src, src_stride, dst, dst_stride, src_weight);
 }
 // Public MSA entry point for the 16x16 weighted blend; forwards all
 // arguments unchanged to filter_by_weight16x16_msa().
 void vp10_filter_by_weight16x16_msa(const uint8_t *src, int src_stride,
                                    uint8_t *dst, int dst_stride,
                                    int src_weight) {
  filter_by_weight16x16_msa(src, src_stride, dst, dst_stride, src_weight);
 }
--- a/vp10/common/onyxc_int.h
+++ b/vp10/common/onyxc_int.h
@ -24,10 +24,6 @@
 #include "vp10/common/quant_common.h"
 #include "vp10/common/tile_common.h"
 #if CONFIG_VP9_POSTPROC
 #include "vp10/common/postproc.h"
 #endif
 #ifdef __cplusplus
 extern "C" {
 #endif
@ -167,11 +163,6 @@ typedef struct VP10Common {
  int new_fb_idx;
 #if CONFIG_VP9_POSTPROC
  YV12_BUFFER_CONFIG post_proc_buffer;
  YV12_BUFFER_CONFIG post_proc_buffer_int;
 #endif
  FRAME_TYPE last_frame_type;  /* last frame's frame type for motion search.*/
  FRAME_TYPE frame_type;
@ -275,10 +266,6 @@ typedef struct VP10Common {
  vpx_bit_depth_t bit_depth;
  vpx_bit_depth_t dequant_bit_depth;  // bit_depth of current dequantizer
 #if CONFIG_VP9_POSTPROC
  struct postproc_state  postproc_state;
 #endif
  int error_resilient_mode;
  int log2_tile_cols, log2_tile_rows;
--- a/vp10/common/postproc.c
+++ b/vp10/common/postproc.c
@ -1,746 +0,0 @@
 /*
 *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */
 #include <math.h>
 #include <stdlib.h>
 #include <stdio.h>
 #include "./vpx_config.h"
 #include "./vpx_scale_rtcd.h"
 #include "./vp10_rtcd.h"
 #include "vpx_dsp/vpx_dsp_common.h"
 #include "vpx_ports/mem.h"
 #include "vpx_ports/system_state.h"
 #include "vpx_scale/vpx_scale.h"
 #include "vpx_scale/yv12config.h"
 #include "vp10/common/onyxc_int.h"
 #include "vp10/common/postproc.h"
 #include "vp10/common/textblit.h"
 #if CONFIG_VP9_POSTPROC
 // 5-tap filter weights used by the down-and-across post-processing filters,
 // indexed as kernel5[2 + i] for tap offsets i = -2..2. The taps sum to 8,
 // matching the ">> 3" normalisation applied there (the accumulator starts at
 // 4 for rounding).
 static const short kernel5[] = {
  1, 1, 4, 1, 1
 };
 // Table of small non-negative values. vp10_mbpost_proc_down_c() selects a
 // starting position inside it with &vp10_rv[63 & rand()].
 // NOTE(review): how the entries are consumed after that is not visible in
 // this part of the file; presumably they act as a pseudo-random dither
 // sequence -- confirm against the rest of vp10_mbpost_proc_down_c().
 const short vp10_rv[] = {
  8, 5, 2, 2, 8, 12, 4, 9, 8, 3,
  0, 3, 9, 0, 0, 0, 8, 3, 14, 4,
  10, 1, 11, 14, 1, 14, 9, 6, 12, 11,
  8, 6, 10, 0, 0, 8, 9, 0, 3, 14,
  8, 11, 13, 4, 2, 9, 0, 3, 9, 6,
  1, 2, 3, 14, 13, 1, 8, 2, 9, 7,
  3, 3, 1, 13, 13, 6, 6, 5, 2, 7,
  11, 9, 11, 8, 7, 3, 2, 0, 13, 13,
  14, 4, 12, 5, 12, 10, 8, 10, 13, 10,
  4, 14, 4, 10, 0, 8, 11, 1, 13, 7,
  7, 14, 6, 14, 13, 2, 13, 5, 4, 4,
  0, 10, 0, 5, 13, 2, 12, 7, 11, 13,
  8, 0, 4, 10, 7, 2, 7, 2, 2, 5,
  3, 4, 7, 3, 3, 14, 14, 5, 9, 13,
  3, 14, 3, 6, 3, 0, 11, 8, 13, 1,
  13, 1, 12, 0, 10, 9, 7, 6, 2, 8,
  5, 2, 13, 7, 1, 13, 14, 7, 6, 7,
  9, 6, 10, 11, 7, 8, 7, 5, 14, 8,
  4, 4, 0, 8, 7, 10, 0, 8, 14, 11,
  3, 12, 5, 7, 14, 3, 14, 5, 2, 6,
  11, 12, 12, 8, 0, 11, 13, 1, 2, 0,
  5, 10, 14, 7, 8, 0, 4, 11, 0, 8,
  0, 3, 10, 5, 8, 0, 11, 6, 7, 8,
  10, 7, 13, 9, 2, 5, 1, 5, 10, 2,
  4, 3, 5, 6, 10, 8, 9, 4, 11, 14,
  0, 10, 0, 5, 13, 2, 12, 7, 11, 13,
  8, 0, 4, 10, 7, 2, 7, 2, 2, 5,
  3, 4, 7, 3, 3, 14, 14, 5, 9, 13,
  3, 14, 3, 6, 3, 0, 11, 8, 13, 1,
  13, 1, 12, 0, 10, 9, 7, 6, 2, 8,
  5, 2, 13, 7, 1, 13, 14, 7, 6, 7,
  9, 6, 10, 11, 7, 8, 7, 5, 14, 8,
  4, 4, 0, 8, 7, 10, 0, 8, 14, 11,
  3, 12, 5, 7, 14, 3, 14, 5, 2, 6,
  11, 12, 12, 8, 0, 11, 13, 1, 2, 0,
  5, 10, 14, 7, 8, 0, 4, 11, 0, 8,
  0, 3, 10, 5, 8, 0, 11, 6, 7, 8,
  10, 7, 13, 9, 2, 5, 1, 5, 10, 2,
  4, 3, 5, 6, 10, 8, 9, 4, 11, 14,
  3, 8, 3, 7, 8, 5, 11, 4, 12, 3,
  11, 9, 14, 8, 14, 13, 4, 3, 1, 2,
  14, 6, 5, 4, 4, 11, 4, 6, 2, 1,
  5, 8, 8, 12, 13, 5, 14, 10, 12, 13,
  0, 9, 5, 5, 11, 10, 13, 9, 10, 13,
 };
 // Quantizer-related thresholds.
 // NOTE(review): neither constant is referenced in the visible part of this
 // file. The names suggest a minimum quantizer difference and a bound on the
 // previous frame's quantizer, presumably gating MFQE -- confirm at the use
 // site.
 static const uint8_t q_diff_thresh = 20;
 static const uint8_t last_q_thresh = 170;
 // Two-pass 5-tap smoothing filter, applied one row at a time.
 //
 // Pass 1 ("down") filters each pixel of the source row vertically, using the
 // two rows above and below it, and writes the result to the destination row.
 // Pass 2 ("across") then filters the destination row horizontally in place.
 // In both passes a pixel is left unfiltered as soon as any tap differs from
 // it by more than |flimit|.
 //
 //   src_ptr / src_pixels_per_line  source plane and its row pitch
 //   dst_ptr / dst_pixels_per_line  destination plane and its row pitch
 //   rows, cols                     size of the region to filter
 //   flimit                         maximum allowed tap difference
 //
 // The filter reads two rows above and below the region in the source, and
 // two pixels to the left and right of each row in the destination, so the
 // caller must provide readable borders. The first eight pixels of each
 // destination row are read even when cols < 8.
 //
 // The destination pointer advances by dst_pixels_per_line, as in the
 // high-bit-depth variant. It previously advanced by the source pitch, which
 // addressed the wrong rows whenever the two pitches differed.
 void vp10_post_proc_down_and_across_c(const uint8_t *src_ptr,
                                      uint8_t *dst_ptr,
                                      int src_pixels_per_line,
                                      int dst_pixels_per_line,
                                      int rows,
                                      int cols,
                                      int flimit) {
  uint8_t const *p_src;
  uint8_t *p_dst;
  int row, col, i, v, kernel;
  int pitch = src_pixels_per_line;
  // Ring buffer that delays horizontal results so that pass 2 never reads a
  // pixel it has already overwritten.
  uint8_t d[8];
  for (row = 0; row < rows; row++) {
    /* post_proc_down for one row */
    p_src = src_ptr;
    p_dst = dst_ptr;
    for (col = 0; col < cols; col++) {
      kernel = 4;  // Rounding term for the >> 3 below.
      v = p_src[col];
      for (i = -2; i <= 2; i++) {
        if (abs(v - p_src[col + i * pitch]) > flimit)
          goto down_skip_convolve;
        kernel += kernel5[2 + i] * p_src[col + i * pitch];
      }
      v = (kernel >> 3);
    down_skip_convolve:
      p_dst[col] = v;
    }
    /* now post_proc_across */
    p_src = dst_ptr;
    p_dst = dst_ptr;
    for (i = 0; i < 8; i++)
      d[i] = p_src[i];
    for (col = 0; col < cols; col++) {
      kernel = 4;
      v = p_src[col];
      d[col & 7] = v;
      for (i = -2; i <= 2; i++) {
        if (abs(v - p_src[col + i]) > flimit)
          goto across_skip_convolve;
        kernel += kernel5[2 + i] * p_src[col + i];
      }
      d[col & 7] = (kernel >> 3);
    across_skip_convolve:
      // Write back two pixels behind the current position; those pixels are
      // no longer needed as filter taps.
      if (col >= 2)
        p_dst[col - 2] = d[(col - 2) & 7];
    }
    /* handle the last two pixels */
    p_dst[col - 2] = d[(col - 2) & 7];
    p_dst[col - 1] = d[(col - 1) & 7];
    /* next row */
    src_ptr += pitch;
    dst_ptr += dst_pixels_per_line;
  }
 }
 #if CONFIG_VPX_HIGHBITDEPTH
 // High-bit-depth (16-bit sample) version of the two-pass 5-tap smoothing
 // filter, applied one row at a time.
 //
 // Pass 1 ("down") filters each pixel of the source row vertically and writes
 // the result to the destination row; pass 2 ("across") filters the
 // destination row horizontally in place. A pixel is left unfiltered as soon
 // as any tap differs from it by more than |flimit|.
 //
 // The source advances by src_pixels_per_line and the destination by
 // dst_pixels_per_line after each row. The filter reads two rows above and
 // below the region in the source and two pixels left and right of each row
 // in the destination, so the caller must provide readable borders.
 void vp10_highbd_post_proc_down_and_across_c(const uint16_t *src_ptr,
                                             uint16_t *dst_ptr,
                                             int src_pixels_per_line,
                                             int dst_pixels_per_line,
                                             int rows,
                                             int cols,
                                             int flimit) {
  uint16_t const *p_src;
  uint16_t *p_dst;
  int row, col, i, v, kernel;
  int pitch = src_pixels_per_line;
  // Ring buffer that delays horizontal results so that pass 2 never reads a
  // pixel it has already overwritten.
  uint16_t d[8];
  for (row = 0; row < rows; row++) {
    // post_proc_down for one row.
    p_src = src_ptr;
    p_dst = dst_ptr;
    for (col = 0; col < cols; col++) {
      // 4 is the rounding term for the >> 3 below.
      kernel = 4;
      v = p_src[col];
      for (i = -2; i <= 2; i++) {
        if (abs(v - p_src[col + i * pitch]) > flimit)
          goto down_skip_convolve;
        kernel += kernel5[2 + i] * p_src[col + i * pitch];
      }
      v = (kernel >> 3);
    down_skip_convolve:
      p_dst[col] = v;
    }
    /* now post_proc_across */
    p_src = dst_ptr;
    p_dst = dst_ptr;
    // Prime the ring buffer; eight pixels are read even when cols < 8.
    for (i = 0; i < 8; i++)
      d[i] = p_src[i];
    for (col = 0; col < cols; col++) {
      kernel = 4;
      v = p_src[col];
      d[col & 7] = v;
      for (i = -2; i <= 2; i++) {
        if (abs(v - p_src[col + i]) > flimit)
          goto across_skip_convolve;
        kernel += kernel5[2 + i] * p_src[col + i];
      }
      d[col & 7] = (kernel >> 3);
    across_skip_convolve:
      // Write back two pixels behind the current position; those pixels are
      // no longer needed as filter taps.
      if (col >= 2)
        p_dst[col - 2] = d[(col - 2) & 7];
    }
    /* handle the last two pixels */
    p_dst[col - 2] = d[(col - 2) & 7];
    p_dst[col - 1] = d[(col - 1) & 7];
    /* next row */
    src_ptr += pitch;
    dst_ptr += dst_pixels_per_line;
  }
 }
 #endif  // CONFIG_VPX_HIGHBITDEPTH
 /* Maps a quantizer-derived level to the variance limit handed to the
  * macroblock post-processing filters.  Inputs below 20 are treated as 20. */
 static int q2mbl(int x) {
  const int floored = (x < 20) ? 20 : x;
  const int scaled = 50 + (floored - 50) * 10 / 8;
  return scaled * scaled / 3;
 }
 /* In-place horizontal low-variance smoothing of an 8-bit plane.
  *
  * For each pixel a running sum and sum of squares over a 15-sample window
  * (s[c - 7] .. s[c + 7]) is maintained.  Where 15 * sumsq - sum * sum is
  * below flimit the pixel is replaced by (8 + sum + pixel) >> 4, otherwise it
  * is kept.
  *
  * src    top-left pixel of the plane
  * pitch  stride in bytes
  * rows   number of rows to process
  * cols   number of columns to process
  * flimit variance threshold
  *
  * Each row reads s[-8] .. s[cols + 14] and writes s[-8] .. s[cols - 1], so
  * that range must be addressable.  The eight pixels left of each row are
  * overwritten with zeros.
  */
 void vp10_mbpost_proc_across_ip_c(uint8_t *src, int pitch,
                                  int rows, int cols, int flimit) {
  int r, c, i;
  uint8_t *s = src;
  /* Delay line: results are written back eight pixels late so the window
   * always reads unfiltered input. */
  uint8_t d[16];
  for (r = 0; r < rows; r++) {
    int sumsq = 0;
    int sum = 0;
    for (i = -8; i <= 6; i++) {
      sumsq += s[i] * s[i];
      sum += s[i];
    }
    /* Clear the whole delay line.  d[15] used to be skipped, which wrote an
     * indeterminate value (first row) or a stale value from the previous row
     * into s[-1]. */
    for (i = 0; i < 16; i++) {
      d[i] = 0;
    }
    for (c = 0; c < cols + 8; c++) {
      int x = s[c + 7] - s[c - 8];
      int y = s[c + 7] + s[c - 8];
      sum += x;
      sumsq += x * y;  /* a^2 - b^2 == (a - b) * (a + b) */
      d[c & 15] = s[c];
      if (sumsq * 15 - sum * sum < flimit) {
        d[c & 15] = (8 + sum + s[c]) >> 4;
      }
      s[c - 8] = d[(c - 8) & 15];
    }
    s += pitch;
  }
 }
 #if CONFIG_VPX_HIGHBITDEPTH
 /* High bit-depth variant of the in-place horizontal low-variance smoother.
  *
  * For each sample a running sum and sum of squares over a 15-sample window
  * (s[c - 7] .. s[c + 7]) is maintained.  Where 15 * sumsq - sum * sum is
  * below flimit the sample is replaced by (8 + sum + sample) >> 4, otherwise
  * it is kept.
  *
  * src    top-left sample of the plane
  * pitch  stride in samples
  * rows   number of rows to process
  * cols   number of columns to process
  * flimit variance threshold
  *
  * Each row reads s[-8] .. s[cols + 14] and writes s[-8] .. s[cols - 1], so
  * that range must be addressable.  The eight samples left of each row are
  * overwritten with zeros.
  */
 void vp10_highbd_mbpost_proc_across_ip_c(uint16_t *src, int pitch,
                                        int rows, int cols, int flimit) {
  int r, c, i;
  uint16_t *s = src;
  /* Delay line: results are written back eight samples late so the window
   * always reads unfiltered input. */
  uint16_t d[16];
  for (r = 0; r < rows; r++) {
    int sumsq = 0;
    int sum   = 0;
    for (i = -8; i <= 6; i++) {
      sumsq += s[i] * s[i];
      sum   += s[i];
    }
    /* Clear the whole delay line.  d[15] used to be skipped, which wrote an
     * indeterminate value (first row) or a stale value from the previous row
     * into s[-1]. */
    for (i = 0; i < 16; i++) {
      d[i] = 0;
    }
    for (c = 0; c < cols + 8; c++) {
      int x = s[c + 7] - s[c - 8];
      int y = s[c + 7] + s[c - 8];
      sum  += x;
      sumsq += x * y;  /* a^2 - b^2 == (a - b) * (a + b) */
      d[c & 15] = s[c];
      /* Compare in 64 bits: with 12-bit samples both products exceed the
       * range of a 32-bit int. */
      if ((int64_t)sumsq * 15 - (int64_t)sum * sum < flimit) {
        d[c & 15] = (8 + sum + s[c]) >> 4;
      }
      s[c - 8] = d[(c - 8) & 15];
    }
    s += pitch;
  }
 }
 #endif  // CONFIG_VPX_HIGHBITDEPTH
 /* In-place vertical low-variance smoothing of an 8-bit plane.
  *
  * Each column is processed top to bottom with a running sum and sum of
  * squares over a 15-row window.  Where 15 * sumsq - sum * sum is below
  * flimit the pixel is replaced by (rounding + sum + pixel) >> 4, where the
  * rounding term is taken from the vp10_rv table at a random starting
  * offset; otherwise the pixel is kept.
  *
  * dst    top-left pixel of the plane
  * pitch  stride in bytes
  * rows   number of rows to process
  * cols   number of columns to process
  * flimit variance threshold
  *
  * Rows -8 .. rows + 14 of every column are read, so they must be
  * addressable.  Only rows 0 .. rows - 1 are written.
  */
 void vp10_mbpost_proc_down_c(uint8_t *dst, int pitch,
                             int rows, int cols, int flimit) {
  int r, c, i;
  const short *rv3 = &vp10_rv[63 & rand()]; // NOLINT
  for (c = 0; c < cols; c++) {
    uint8_t *s = &dst[c];
    int sumsq = 0;
    int sum   = 0;
    /* Delay line: results are written back eight rows late so the window
     * always reads unfiltered input. */
    uint8_t d[16];
    const short *rv2 = rv3 + ((c * 17) & 127);
    for (i = -8; i <= 6; i++) {
      sumsq += s[i * pitch] * s[i * pitch];
      sum   += s[i * pitch];
    }
    for (r = 0; r < rows + 8; r++) {
      sumsq += s[7 * pitch] * s[ 7 * pitch] - s[-8 * pitch] * s[-8 * pitch];
      sum  += s[7 * pitch] - s[-8 * pitch];
      d[r & 15] = s[0];
      if (sumsq * 15 - sum * sum < flimit) {
        d[r & 15] = (rv2[r & 127] + sum + s[0]) >> 4;
      }
      /* The delay line holds no valid result for the first eight
       * iterations; writing it back would store uninitialised bytes in the
       * rows above the plane. */
      if (r >= 8) {
        s[-8 * pitch] = d[(r - 8) & 15];
      }
      s += pitch;
    }
  }
 }
 #if CONFIG_VPX_HIGHBITDEPTH
 /* High bit-depth variant of the in-place vertical low-variance smoother.
  *
  * Each column is processed top to bottom with a running sum and sum of
  * squares over a 15-row window.  Where 15 * sumsq - sum * sum is below
  * flimit the sample is replaced by (rounding + sum + sample) >> 4, where
  * the rounding term is taken from the vp10_rv table at a random starting
  * offset; otherwise the sample is kept.
  *
  * dst    top-left sample of the plane
  * pitch  stride in samples
  * rows   number of rows to process
  * cols   number of columns to process
  * flimit variance threshold
  *
  * Rows -8 .. rows + 14 of every column are read, so they must be
  * addressable.  Only rows 0 .. rows - 1 are written.
  */
 void vp10_highbd_mbpost_proc_down_c(uint16_t *dst, int pitch,
                                    int rows, int cols, int flimit) {
  int r, c, i;
  const int16_t *rv3 = &vp10_rv[63 & rand()];  // NOLINT
  for (c = 0; c < cols; c++) {
    uint16_t *s = &dst[c];
    int sumsq = 0;
    int sum = 0;
    /* Delay line: results are written back eight rows late so the window
     * always reads unfiltered input. */
    uint16_t d[16];
    const int16_t *rv2 = rv3 + ((c * 17) & 127);
    for (i = -8; i <= 6; i++) {
      sumsq += s[i * pitch] * s[i * pitch];
      sum += s[i * pitch];
    }
    for (r = 0; r < rows + 8; r++) {
      sumsq += s[7 * pitch] * s[ 7 * pitch] - s[-8 * pitch] * s[-8 * pitch];
      sum += s[7 * pitch] - s[-8 * pitch];
      d[r & 15] = s[0];
      /* Compare in 64 bits: with 12-bit samples both products exceed the
       * range of a 32-bit int. */
      if ((int64_t)sumsq * 15 - (int64_t)sum * sum < flimit) {
        d[r & 15] = (rv2[r & 127] + sum + s[0]) >> 4;
      }
      /* The delay line holds no valid result for the first eight
       * iterations; writing it back would store uninitialised values in the
       * rows above the plane. */
      if (r >= 8) {
        s[-8 * pitch] = d[(r - 8) & 15];
      }
      s += pitch;
    }
  }
 }
 #endif  // CONFIG_VPX_HIGHBITDEPTH
 /* Deblocks all three planes of source into post and additionally applies
  * the horizontal and vertical macroblock smoothers to the luma plane of
  * post.  The deblocking limit is a cubic polynomial in q; the smoother
  * limit comes from q2mbl(q).  low_var_thresh and flag are unused.
  */
 static void deblock_and_de_macro_block(YV12_BUFFER_CONFIG   *source,
                                        YV12_BUFFER_CONFIG   *post,
                                        int                   q,
                                        int                   low_var_thresh,
                                        int                   flag) {
  const double level = 6.0e-05 * q * q * q - .0067 * q * q + .306 * q + .0065;
  const int deblock_limit = (int)(level + .5);
  const int mb_limit = q2mbl(q);
  (void) low_var_thresh;
  (void) flag;
 #if CONFIG_VPX_HIGHBITDEPTH
  if (source->flags & YV12_FLAG_HIGHBITDEPTH) {
    /* Luma: deblock, then smooth across and down. */
    vp10_highbd_post_proc_down_and_across(CONVERT_TO_SHORTPTR(source->y_buffer),
                                         CONVERT_TO_SHORTPTR(post->y_buffer),
                                         source->y_stride, post->y_stride,
                                         source->y_height, source->y_width,
                                         deblock_limit);
    vp10_highbd_mbpost_proc_across_ip(CONVERT_TO_SHORTPTR(post->y_buffer),
                                     post->y_stride, post->y_height,
                                     post->y_width, mb_limit);
    vp10_highbd_mbpost_proc_down(CONVERT_TO_SHORTPTR(post->y_buffer),
                                post->y_stride, post->y_height,
                                post->y_width, mb_limit);
    /* Chroma: deblock only. */
    vp10_highbd_post_proc_down_and_across(CONVERT_TO_SHORTPTR(source->u_buffer),
                                         CONVERT_TO_SHORTPTR(post->u_buffer),
                                         source->uv_stride, post->uv_stride,
                                         source->uv_height, source->uv_width,
                                         deblock_limit);
    vp10_highbd_post_proc_down_and_across(CONVERT_TO_SHORTPTR(source->v_buffer),
                                         CONVERT_TO_SHORTPTR(post->v_buffer),
                                         source->uv_stride, post->uv_stride,
                                         source->uv_height, source->uv_width,
                                         deblock_limit);
    return;
  }
 #endif  // CONFIG_VPX_HIGHBITDEPTH
  /* 8-bit path.  Luma: deblock, then smooth across and down. */
  vp10_post_proc_down_and_across(source->y_buffer, post->y_buffer,
                                source->y_stride, post->y_stride,
                                source->y_height, source->y_width,
                                deblock_limit);
  vp10_mbpost_proc_across_ip(post->y_buffer, post->y_stride, post->y_height,
                            post->y_width, mb_limit);
  vp10_mbpost_proc_down(post->y_buffer, post->y_stride, post->y_height,
                       post->y_width, mb_limit);
  /* Chroma: deblock only. */
  vp10_post_proc_down_and_across(source->u_buffer, post->u_buffer,
                                source->uv_stride, post->uv_stride,
                                source->uv_height, source->uv_width,
                                deblock_limit);
  vp10_post_proc_down_and_across(source->v_buffer, post->v_buffer,
                                source->uv_stride, post->uv_stride,
                                source->uv_height, source->uv_width,
                                deblock_limit);
 }
 /* Runs the down-and-across deblocking filter over the Y, U and V planes of
  * src and stores the result in dst.  The filter limit is a cubic polynomial
  * in q, rounded to the nearest integer.
  */
 void vp10_deblock(const YV12_BUFFER_CONFIG *src, YV12_BUFFER_CONFIG *dst,
                 int q) {
  const int limit = (int)(6.0e-05 * q * q * q - 0.0067 * q * q + 0.306 * q
                          + 0.0065 + 0.5);
  const uint8_t *const in_planes[3] = {src->y_buffer, src->u_buffer,
                                       src->v_buffer};
  uint8_t *const out_planes[3] = {dst->y_buffer, dst->u_buffer,
                                  dst->v_buffer};
  const int in_strides[3] = {src->y_stride, src->uv_stride, src->uv_stride};
  const int out_strides[3] = {dst->y_stride, dst->uv_stride, dst->uv_stride};
  const int plane_widths[3] = {src->y_width, src->uv_width, src->uv_width};
  const int plane_heights[3] = {src->y_height, src->uv_height,
                                src->uv_height};
  int plane;
  for (plane = 0; plane < MAX_MB_PLANE; ++plane) {
 #if CONFIG_VPX_HIGHBITDEPTH
    /* Source and destination must agree on the sample format. */
    assert((src->flags & YV12_FLAG_HIGHBITDEPTH) ==
           (dst->flags & YV12_FLAG_HIGHBITDEPTH));
    if (src->flags & YV12_FLAG_HIGHBITDEPTH) {
      vp10_highbd_post_proc_down_and_across(
          CONVERT_TO_SHORTPTR(in_planes[plane]),
          CONVERT_TO_SHORTPTR(out_planes[plane]),
          in_strides[plane], out_strides[plane],
          plane_heights[plane], plane_widths[plane], limit);
      continue;
    }
 #endif  // CONFIG_VPX_HIGHBITDEPTH
    vp10_post_proc_down_and_across(in_planes[plane], out_planes[plane],
                                  in_strides[plane], out_strides[plane],
                                  plane_heights[plane], plane_widths[plane],
                                  limit);
  }
 }
 /* Runs the down-and-across filter over the interior of the Y, U and V
  * planes of src and stores the result in dst.  A two-sample margin on every
  * side of each plane is skipped: filtering starts at row 2, column 2 and
  * covers (width - 4) x (height - 4) samples.  The filter limit is a cubic
  * polynomial in q, rounded to the nearest integer.
  */
 void vp10_denoise(const YV12_BUFFER_CONFIG *src, YV12_BUFFER_CONFIG *dst,
                 int q) {
  const int limit = (int)(6.0e-05 * q * q * q - 0.0067 * q * q + 0.306 * q
                          + 0.0065 + 0.5);
  const uint8_t *const in_planes[3] = {src->y_buffer, src->u_buffer,
                                       src->v_buffer};
  uint8_t *const out_planes[3] = {dst->y_buffer, dst->u_buffer,
                                  dst->v_buffer};
  const int in_strides[3] = {src->y_stride, src->uv_stride, src->uv_stride};
  const int out_strides[3] = {dst->y_stride, dst->uv_stride, dst->uv_stride};
  const int plane_widths[3] = {src->y_width, src->uv_width, src->uv_width};
  const int plane_heights[3] = {src->y_height, src->uv_height,
                                src->uv_height};
  int plane;
  for (plane = 0; plane < MAX_MB_PLANE; ++plane) {
    const int in_stride = in_strides[plane];
    const int out_stride = out_strides[plane];
    const int inner_width = plane_widths[plane] - 4;
    const int inner_height = plane_heights[plane] - 4;
 #if CONFIG_VPX_HIGHBITDEPTH
    /* Source and destination must agree on the sample format. */
    assert((src->flags & YV12_FLAG_HIGHBITDEPTH) ==
           (dst->flags & YV12_FLAG_HIGHBITDEPTH));
    if (src->flags & YV12_FLAG_HIGHBITDEPTH) {
      const uint16_t *const in16 = CONVERT_TO_SHORTPTR(
          in_planes[plane] + 2 * in_stride + 2);
      uint16_t *const out16 = CONVERT_TO_SHORTPTR(
          out_planes[plane] + 2 * out_stride + 2);
      vp10_highbd_post_proc_down_and_across(in16, out16, in_stride,
                                           out_stride, inner_height,
                                           inner_width, limit);
      continue;
    }
 #endif
    {
      const uint8_t *const in8 = in_planes[plane] + 2 * in_stride + 2;
      uint8_t *const out8 = out_planes[plane] + 2 * out_stride + 2;
      vp10_post_proc_down_and_across(in8, out8, in_stride, out_stride,
                                    inner_height, inner_width, limit);
    }
  }
 }
 static double gaussian(double sigma, double mu, double x) {
  return 1 / (sigma * sqrt(2.0 * 3.14159265)) *
         (exp(-(x - mu) * (x - mu) / (2 * sigma * sigma)));
 }
 /* Regenerates the random noise table and clamp vectors in state.
  *
  * A 256-entry lookup table approximating a gaussian distribution over the
  * integers -32 .. 31 is built, with sigma derived from q and a.  The 3072
  * noise bytes are then drawn from that table with rand(), the three clamp
  * vectors are set from the table's first entry, and q and a are recorded
  * in state->last_q and state->last_noise.
  */
 static void fillrd(struct postproc_state *state, int q, int a) {
  char char_dist[300];
  double sigma;
  int value, filled, k;
  vpx_clear_system_state();
  sigma = a + .5 + .6 * (63 - q) / 63.0;
  /* Each value in -32 .. 31 occupies a number of table slots proportional
   * to its gaussian weight. */
  filled = 0;
  for (value = -32; value < 32; value++) {
    const int slots = (int)(0.5 + 256 * gaussian(sigma, 0, value));
    for (k = 0; k < slots; k++) {
      char_dist[filled++] = (char) value;
    }
  }
  /* Pad up to 256 entries with zero noise. */
  while (filled < 256) {
    char_dist[filled++] = 0;
  }
  for (k = 0; k < 3072; k++) {
    state->noise[k] = char_dist[rand() & 0xff];  // NOLINT
  }
  for (k = 0; k < 16; k++) {
    state->blackclamp[k] = -char_dist[0];
    state->whiteclamp[k] = -char_dist[0];
    state->bothclamp[k] = -2 * char_dist[0];
  }
  state->last_q = q;
  state->last_noise = a;
 }
 /* Adds pre-generated noise to an 8-bit plane in place.
  *
  * start       top-left pixel of the plane
  * noise       noise table; each row reads width bytes starting at a random
  *             offset in 0 .. 255, so it must hold at least 255 + width
  *             bytes
  * blackclamp  only element 0 is used: pixels below it are raised to it
  * whiteclamp  only element 0 is used: pixels above 255 - whiteclamp[0] are
  *             lowered to that value
  * bothclamp   unused by the C implementation
  * width       pixels per row
  * height      number of rows
  * pitch       stride in bytes
  *
  * Pixels are clamped into [blackclamp[0], 255 - whiteclamp[0]] before the
  * noise is added, so noise whose magnitude does not exceed the clamps
  * cannot wrap a pixel around.
  */
 void vp10_plane_add_noise_c(uint8_t *start, char *noise,
                            char blackclamp[16],
                            char whiteclamp[16],
                            char bothclamp[16],
                            unsigned int width, unsigned int height, int pitch) {
  unsigned int i, j;
  const int low = blackclamp[0];
  /* The upper bound was previously computed as 255 + whiteclamp[0], which
   * can never be exceeded for a non-negative clamp, so bright pixels were
   * left unclamped and could overflow when noise was added. */
  const int high = 255 - whiteclamp[0];
  // TODO(jbb): why does simd code use both but c doesn't,  normalize and
  // fix..
  (void) bothclamp;
  for (i = 0; i < height; i++) {
    uint8_t *pos = start + i * pitch;
    char  *ref = (char *)(noise + (rand() & 0xff));  // NOLINT
    for (j = 0; j < width; j++) {
      if (pos[j] < low)
        pos[j] = (uint8_t)low;
      if (pos[j] > high)
        pos[j] = (uint8_t)high;
      pos[j] += ref[j];
    }
  }
 }
 /* Exchanges the current mode-info array with the post-processor's previous
  * one, so the current frame's array becomes prev_mip for the next frame,
  * then recomputes both "visible" pointers from their array bases.
  */
 static void swap_mi_and_prev_mi(VP10_COMMON *cm) {
  struct postproc_state *const pp = &cm->postproc_state;
  MODE_INFO *const recycled = pp->prev_mip;
  pp->prev_mip = cm->mip;
  cm->mip = recycled;
  // The upper-left visible entry sits one row and one column into the array.
  cm->mi = cm->mip + cm->mi_stride + 1;
  pp->prev_mi = pp->prev_mip + cm->mi_stride + 1;
 }
 /* Post-processes the frame that is about to be shown and describes the
  * result in *dest.
  *
  * cm       codec state; cm->frame_to_show is the input frame
  * dest     receives a shallow copy of the buffer descriptor to display
  * ppflags  requested operations (VP9D_* bits), deblocking and noise levels
  *
  * Returns 0 on success, -1 if there is no frame to show and 1 if the
  * previous mode-info array could not be allocated.
  *
  * With no flags set, *dest simply aliases cm->frame_to_show.  Otherwise the
  * output is produced in cm->post_proc_buffer by, in order: optional MFQE,
  * then demacroblocking or deblocking (or a plain copy), then optional noise
  * on the luma plane.
  */
 int vp10_post_proc_frame(struct VP10Common *cm,
                         YV12_BUFFER_CONFIG *dest, vp10_ppflags_t *ppflags) {
  const int q = VPXMIN(105, cm->lf.filter_level * 2);
  const int flags = ppflags->post_proc_flag;
  YV12_BUFFER_CONFIG *const ppbuf = &cm->post_proc_buffer;
  struct postproc_state *const ppstate = &cm->postproc_state;
  if (!cm->frame_to_show)
    return -1;
  if (!flags) {
    *dest = *cm->frame_to_show;
    return 0;
  }
  vpx_clear_system_state();
  // Alloc memory for prev_mip in the first frame.
  if (cm->current_video_frame == 1) {
    cm->postproc_state.last_base_qindex = cm->base_qindex;
    cm->postproc_state.last_frame_valid = 1;
    ppstate->prev_mip = vpx_calloc(cm->mi_alloc_size, sizeof(*cm->mip));
    if (!ppstate->prev_mip) {
      return 1;
    }
    ppstate->prev_mi = ppstate->prev_mip + cm->mi_stride + 1;
    memset(ppstate->prev_mip, 0,
           cm->mi_stride * (cm->mi_rows + 1) * sizeof(*cm->mip));
  }
  // Allocate post_proc_buffer_int if needed.
  if ((flags & VP9D_MFQE) && !cm->post_proc_buffer_int.buffer_alloc) {
    if ((flags & VP9D_DEMACROBLOCK) || (flags & VP9D_DEBLOCK)) {
      const int width = ALIGN_POWER_OF_TWO(cm->width, 4);
      const int height = ALIGN_POWER_OF_TWO(cm->height, 4);
      if (vpx_alloc_frame_buffer(&cm->post_proc_buffer_int, width, height,
                                 cm->subsampling_x, cm->subsampling_y,
 #if CONFIG_VPX_HIGHBITDEPTH
                                 cm->use_highbitdepth,
 #endif  // CONFIG_VPX_HIGHBITDEPTH
                                 VPX_ENC_BORDER_IN_PIXELS,
                                 cm->byte_alignment) < 0) {
        vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR,
                           "Failed to allocate MFQE framebuffer");
      }
      // Fill the intermediate buffer with mid-grey (128) so that post proc
      // doesn't pull random data in from the edge.  The size must be this
      // buffer's own frame_size (post_proc_buffer may not be allocated yet
      // and has different dimensions), and the buffer is only touched when
      // the allocation above actually produced one.
      if (cm->post_proc_buffer_int.buffer_alloc) {
        memset(cm->post_proc_buffer_int.buffer_alloc, 128,
               cm->post_proc_buffer_int.frame_size);
      }
    }
  }
  if (vpx_realloc_frame_buffer(&cm->post_proc_buffer, cm->width, cm->height,
                               cm->subsampling_x, cm->subsampling_y,
 #if CONFIG_VPX_HIGHBITDEPTH
                               cm->use_highbitdepth,
 #endif
                               VPX_DEC_BORDER_IN_PIXELS, cm->byte_alignment,
                               NULL, NULL, NULL) < 0)
    vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR,
                       "Failed to allocate post-processing buffer");
  if ((flags & VP9D_MFQE) && cm->current_video_frame >= 2 &&
      cm->postproc_state.last_frame_valid && cm->bit_depth == 8 &&
      cm->postproc_state.last_base_qindex <= last_q_thresh &&
      cm->base_qindex - cm->postproc_state.last_base_qindex >= q_diff_thresh) {
    vp10_mfqe(cm);
    // TODO(jackychen): Consider whether enable deblocking by default
    // if mfqe is enabled. Need to take both the quality and the speed
    // into consideration.
    if ((flags & VP9D_DEMACROBLOCK) || (flags & VP9D_DEBLOCK)) {
      vp8_yv12_copy_frame(ppbuf, &cm->post_proc_buffer_int);
    }
    if ((flags & VP9D_DEMACROBLOCK) && cm->post_proc_buffer_int.buffer_alloc) {
      deblock_and_de_macro_block(&cm->post_proc_buffer_int, ppbuf,
                                 q + (ppflags->deblocking_level - 5) * 10,
                                 1, 0);
    } else if (flags & VP9D_DEBLOCK) {
      vp10_deblock(&cm->post_proc_buffer_int, ppbuf, q);
    } else {
      // NOTE(review): post_proc_buffer_int is only allocated above when a
      // deblocking flag is set, so in this branch it may be unallocated --
      // confirm what vp8_yv12_copy_frame does with such a source.
      vp8_yv12_copy_frame(&cm->post_proc_buffer_int, ppbuf);
    }
  } else if (flags & VP9D_DEMACROBLOCK) {
    deblock_and_de_macro_block(cm->frame_to_show, ppbuf,
                               q + (ppflags->deblocking_level - 5) * 10, 1, 0);
  } else if (flags & VP9D_DEBLOCK) {
    vp10_deblock(cm->frame_to_show, ppbuf, q);
  } else {
    vp8_yv12_copy_frame(cm->frame_to_show, ppbuf);
  }
  cm->postproc_state.last_base_qindex = cm->base_qindex;
  cm->postproc_state.last_frame_valid = 1;
  if (flags & VP9D_ADDNOISE) {
    const int noise_level = ppflags->noise_level;
    if (ppstate->last_q != q ||
        ppstate->last_noise != noise_level) {
      fillrd(ppstate, 63 - q, noise_level);
      // fillrd() records the value it was given (63 - q).  Store the q that
      // the check above compares against, otherwise the cached table never
      // matches and is rebuilt on every frame.
      ppstate->last_q = q;
    }
    vp10_plane_add_noise(ppbuf->y_buffer, ppstate->noise, ppstate->blackclamp,
                        ppstate->whiteclamp, ppstate->bothclamp,
                        ppbuf->y_width, ppbuf->y_height, ppbuf->y_stride);
  }
  *dest = *ppbuf;
  /* handle problem with extending borders */
  dest->y_width = cm->width;
  dest->y_height = cm->height;
  dest->uv_width = dest->y_width >> cm->subsampling_x;
  dest->uv_height = dest->y_height >> cm->subsampling_y;
  swap_mi_and_prev_mi(cm);
  return 0;
 }
 #endif  // CONFIG_VP9_POSTPROC
--- a/vp10/common/postproc.h
+++ b/vp10/common/postproc.h
@ -1,53 +0,0 @@
 /*
 *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */
 #ifndef VP10_COMMON_POSTPROC_H_
 #define VP10_COMMON_POSTPROC_H_
 #include "vpx_ports/mem.h"
 #include "vpx_scale/yv12config.h"
 #include "vp10/common/blockd.h"
 #include "vp10/common/mfqe.h"
 #include "vp10/common/ppflags.h"
 #ifdef __cplusplus
 extern "C" {
 #endif
 /* Per-decoder state kept between calls to the post-processor. */
 struct postproc_state {
  /* q and noise level recorded when the noise table was last generated;
   * used to decide whether it has to be rebuilt. */
  int last_q;
  int last_noise;
  /* Pre-generated random noise bytes added to the luma plane. */
  char noise[3072];
  /* base_qindex of the previously post-processed frame and whether such a
   * frame exists; both are consulted before running MFQE. */
  int last_base_qindex;
  int last_frame_valid;
  /* Mode info of the previous frame: base of the allocation and a pointer
   * offset by one row plus one entry into it. */
  MODE_INFO *prev_mip;
  MODE_INFO *prev_mi;
  /* Clamp vectors passed to the add-noise routine; every element holds the
   * same value. */
  DECLARE_ALIGNED(16, char, blackclamp[16]);
  DECLARE_ALIGNED(16, char, whiteclamp[16]);
  DECLARE_ALIGNED(16, char, bothclamp[16]);
 };
 /* Forward declaration; only pointers to the codec state are used here. */
 struct VP10Common;
 /* Fixed-point precision of the MFQE blending weights. */
 #define MFQE_PRECISION 4
 /* Post-processes cm's frame to show according to flags and describes the
  * resulting buffer in *dest.  Returns 0 on success. */
 int vp10_post_proc_frame(struct VP10Common *cm,
                         YV12_BUFFER_CONFIG *dest, vp10_ppflags_t *flags);
 /* Filters the interior of every plane of src into dst; strength depends on
  * q. */
 void vp10_denoise(const YV12_BUFFER_CONFIG *src, YV12_BUFFER_CONFIG *dst, int q);
 /* Deblocks every plane of src into dst; strength depends on q. */
 void vp10_deblock(const YV12_BUFFER_CONFIG *src, YV12_BUFFER_CONFIG *dst, int q);
 #ifdef __cplusplus
 }  // extern "C"
 #endif
 #endif  // VP10_COMMON_POSTPROC_H_
--- a/vp10/common/ppflags.h
+++ b/vp10/common/ppflags.h
@ -1,43 +0,0 @@
 /*
 *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */
 #ifndef VP10_COMMON_PPFLAGS_H_
 #define VP10_COMMON_PPFLAGS_H_
 #ifdef __cplusplus
 extern "C" {
 #endif
 /* Bit flags combined into vp10_ppflags_t.post_proc_flag to select
  * post-processing operations.  Each value occupies its own bit so flags can
  * be OR-ed together. */
 enum {
  VP9D_NOFILTERING            = 0,       /* no post-processing requested */
  VP9D_DEBLOCK                = 1 << 0,  /* deblocking filter */
  VP9D_DEMACROBLOCK           = 1 << 1,  /* deblock plus macroblock smoothing */
  VP9D_ADDNOISE               = 1 << 2,  /* add noise to the luma plane */
  /* Debug-oriented flags. */
  VP9D_DEBUG_TXT_FRAME_INFO   = 1 << 3,
  VP9D_DEBUG_TXT_MBLK_MODES   = 1 << 4,
  VP9D_DEBUG_TXT_DC_DIFF      = 1 << 5,
  VP9D_DEBUG_TXT_RATE_INFO    = 1 << 6,
  VP9D_DEBUG_DRAW_MV          = 1 << 7,
  VP9D_DEBUG_CLR_BLK_MODES    = 1 << 8,
  VP9D_DEBUG_CLR_FRM_REF_BLKS = 1 << 9,
  VP9D_MFQE                   = 1 << 10  /* multi-frame quality enhancement */
 };
 /* Post-processing request passed to vp10_post_proc_frame(). */
 typedef struct {
  int post_proc_flag;    /* bitwise OR of VP9D_* flags */
  int deblocking_level;  /* strength adjustment for demacroblocking */
  int noise_level;       /* strength of the added noise */
 } vp10_ppflags_t;
 #ifdef __cplusplus
 }  // extern "C"
 #endif
 #endif  // VP10_COMMON_PPFLAGS_H_
--- a/vp10/common/vp10_rtcd_defs.pl
+++ b/vp10/common/vp10_rtcd_defs.pl
@ -54,33 +54,6 @@ if ($opts{arch} eq "x86_64") {
  $avx2_x86_64 = 'avx2';
 }
 #
 # post proc
 #
 # Post-processing prototypes and their SIMD specializations.  They are only
 # registered when the build configuration enables CONFIG_VP9_POSTPROC.
 if (vpx_config("CONFIG_VP9_POSTPROC") eq "yes") {
 # Vertical macroblock smoothing filter.
 add_proto qw/void vp10_mbpost_proc_down/, "uint8_t *dst, int pitch, int rows, int cols, int flimit";
 specialize qw/vp10_mbpost_proc_down sse2/;
 # The SSE2 implementation is exported under an _xmm symbol name.
 $vp10_mbpost_proc_down_sse2=vp10_mbpost_proc_down_xmm;
 # Horizontal, in-place macroblock smoothing filter.
 add_proto qw/void vp10_mbpost_proc_across_ip/, "uint8_t *src, int pitch, int rows, int cols, int flimit";
 specialize qw/vp10_mbpost_proc_across_ip sse2/;
 $vp10_mbpost_proc_across_ip_sse2=vp10_mbpost_proc_across_ip_xmm;
 # Combined vertical + horizontal deblocking filter.
 add_proto qw/void vp10_post_proc_down_and_across/, "const uint8_t *src_ptr, uint8_t *dst_ptr, int src_pixels_per_line, int dst_pixels_per_line, int rows, int cols, int flimit";
 specialize qw/vp10_post_proc_down_and_across sse2/;
 $vp10_post_proc_down_and_across_sse2=vp10_post_proc_down_and_across_xmm;
 # Noise addition; the SSE2 implementation is exported under a _wmt name.
 add_proto qw/void vp10_plane_add_noise/, "uint8_t *Start, char *noise, char blackclamp[16], char whiteclamp[16], char bothclamp[16], unsigned int Width, unsigned int Height, int Pitch";
 specialize qw/vp10_plane_add_noise sse2/;
 $vp10_plane_add_noise_sse2=vp10_plane_add_noise_wmt;
 # MFQE weighted blending of a source block into a destination block.
 add_proto qw/void vp10_filter_by_weight16x16/, "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int src_weight";
 specialize qw/vp10_filter_by_weight16x16 sse2 msa/;
 add_proto qw/void vp10_filter_by_weight8x8/, "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int src_weight";
 specialize qw/vp10_filter_by_weight8x8 sse2 msa/;
 }
 #
 # dct
 #
--- a/vp10/common/x86/mfqe_sse2.asm
+++ b/vp10/common/x86/mfqe_sse2.asm
@ -1,287 +0,0 @@
 ;
 ;  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
 ;
 ;  Use of this source code is governed by a BSD-style license
 ;  that can be found in the LICENSE file in the root of the source
 ;  tree. An additional intellectual property rights grant can be found
 ;  in the file PATENTS.  All contributing project authors may
 ;  be found in the AUTHORS file in the root of the source tree.
 ;
 ;  This file is a duplicate of mfqe_sse2.asm in VP8.
 ;  TODO(jackychen): Find a way to fix the duplicate.
 %include "vpx_ports/x86_abi_support.asm"
 ;void vp10_filter_by_weight16x16_sse2
 ;(
 ;    unsigned char *src,
 ;    int            src_stride,
 ;    unsigned char *dst,
 ;    int            dst_stride,
 ;    int            src_weight
 ;)
 ;
 ; Blends a 16x16 block of src into dst, one 16-byte row per iteration:
 ;     dst = (src * src_weight + dst * (16 - src_weight) + 8) >> 4
 ; Rows are loaded and stored with movdqa, so every src and dst row address
 ; must be 16-byte aligned.
 global sym(vp10_filter_by_weight16x16_sse2) PRIVATE
 sym(vp10_filter_by_weight16x16_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 5
    SAVE_XMM 6
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog
    movd        xmm0, arg(4)                ; src_weight
    pshuflw     xmm0, xmm0, 0x0             ; replicate to all low words
    punpcklqdq  xmm0, xmm0                  ; replicate to all hi words
    movdqa      xmm1, [GLOBAL(tMFQE)]
    psubw       xmm1, xmm0                  ; dst_weight
    mov         rax, arg(0)                 ; src
    mov         rsi, arg(1)                 ; src_stride
    mov         rdx, arg(2)                 ; dst
    mov         rdi, arg(3)                 ; dst_stride
    mov         rcx, 16                     ; loop count
    pxor        xmm6, xmm6                  ; zero, used to widen bytes to words
 .combine
    movdqa      xmm2, [rax]
    movdqa      xmm4, [rdx]
    add         rax, rsi
    ; src * src_weight
    movdqa      xmm3, xmm2
    punpcklbw   xmm2, xmm6
    punpckhbw   xmm3, xmm6
    pmullw      xmm2, xmm0
    pmullw      xmm3, xmm0
    ; dst * dst_weight
    movdqa      xmm5, xmm4
    punpcklbw   xmm4, xmm6
    punpckhbw   xmm5, xmm6
    pmullw      xmm4, xmm1
    pmullw      xmm5, xmm1
    ; sum, round and shift
    paddw       xmm2, xmm4
    paddw       xmm3, xmm5
    paddw       xmm2, [GLOBAL(tMFQE_round)]
    paddw       xmm3, [GLOBAL(tMFQE_round)]
    psrlw       xmm2, 4
    psrlw       xmm3, 4
    packuswb    xmm2, xmm3                  ; back to 16 unsigned bytes
    movdqa      [rdx], xmm2
    add         rdx, rdi
    dec         rcx
    jnz         .combine
    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
 ;void vp10_filter_by_weight8x8_sse2
 ;(
 ;    unsigned char *src,
 ;    int            src_stride,
 ;    unsigned char *dst,
 ;    int            dst_stride,
 ;    int            src_weight
 ;)
 ;
 ; Blends an 8x8 block of src into dst, one 8-byte row per iteration:
 ;     dst = (src * src_weight + dst * (16 - src_weight) + 8) >> 4
 ; Rows are accessed with movq (8 bytes at a time).
 global sym(vp10_filter_by_weight8x8_sse2) PRIVATE
 sym(vp10_filter_by_weight8x8_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 5
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog
    movd        xmm0, arg(4)                ; src_weight
    pshuflw     xmm0, xmm0, 0x0             ; replicate to all low words
    punpcklqdq  xmm0, xmm0                  ; replicate to all hi words
    movdqa      xmm1, [GLOBAL(tMFQE)]
    psubw       xmm1, xmm0                  ; dst_weight
    mov         rax, arg(0)                 ; src
    mov         rsi, arg(1)                 ; src_stride
    mov         rdx, arg(2)                 ; dst
    mov         rdi, arg(3)                 ; dst_stride
    mov         rcx, 8                      ; loop count
    pxor        xmm4, xmm4                  ; zero, used to widen bytes to words
 .combine
    movq        xmm2, [rax]
    movq        xmm3, [rdx]
    add         rax, rsi
    ; src * src_weight
    punpcklbw   xmm2, xmm4
    pmullw      xmm2, xmm0
    ; dst * dst_weight
    punpcklbw   xmm3, xmm4
    pmullw      xmm3, xmm1
    ; sum, round and shift
    paddw       xmm2, xmm3
    paddw       xmm2, [GLOBAL(tMFQE_round)]
    psrlw       xmm2, 4
    packuswb    xmm2, xmm4                  ; back to 8 unsigned bytes
    movq        [rdx], xmm2
    add         rdx, rdi
    dec         rcx
    jnz         .combine
    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret
 ;void vp10_variance_and_sad_16x16_sse2 | arg
 ;(
 ;    unsigned char *src1,          0
 ;    int            stride1,       1
 ;    unsigned char *src2,          2
 ;    int            stride2,       3
 ;    unsigned int  *variance,      4
 ;    unsigned int  *sad,           5
 ;)
 ;
 ; Over a 16x16 block computes and stores:
 ;     *sad      = (SAD(src1, src2) + 128) >> 8
 ;     *variance = (sum(src2^2) - (sum(src2)^2 >> 8) + 128) >> 8
 global sym(vp10_variance_and_sad_16x16_sse2) PRIVATE
 sym(vp10_variance_and_sad_16x16_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog
    mov         rax,        arg(0)          ; src1
    mov         rcx,        arg(1)          ; stride1
    mov         rdx,        arg(2)          ; src2
    mov         rdi,        arg(3)          ; stride2
    mov         rsi,        16              ; block height
    ; Prep accumulator registers
    pxor        xmm3, xmm3                  ; SAD
    pxor        xmm4, xmm4                  ; sum of src2
    pxor        xmm5, xmm5                  ; sum of src2^2
    ; Because we're working with the actual output frames
    ; we can't depend on any kind of data alignment.
    ; NOTE(review): the loads below nevertheless use movdqa, which requires
    ; 16-byte aligned addresses -- confirm that callers guarantee alignment.
 .accumulate
    movdqa      xmm0, [rax]                 ; src1
    movdqa      xmm1, [rdx]                 ; src2
    add         rax, rcx                    ; src1 + stride1
    add         rdx, rdi                    ; src2 + stride2
    ; SAD(src1, src2)
    psadbw      xmm0, xmm1
    paddusw     xmm3, xmm0
    ; SUM(src2)
    pxor        xmm2, xmm2
    psadbw      xmm2, xmm1                  ; sum src2 by misusing SAD against 0
    paddusw     xmm4, xmm2
    ; pmaddubsw would be ideal if it took two unsigned values. instead,
    ; it expects a signed and an unsigned value. so instead we zero extend
    ; and operate on words.
    pxor        xmm2, xmm2
    movdqa      xmm0, xmm1
    punpcklbw   xmm0, xmm2
    punpckhbw   xmm1, xmm2
    pmaddwd     xmm0, xmm0
    pmaddwd     xmm1, xmm1
    paddd       xmm5, xmm0
    paddd       xmm5, xmm1
    sub         rsi,        1
    jnz         .accumulate
    ; phaddd only operates on adjacent double words.
    ; Finalize SAD and store
    movdqa      xmm0, xmm3
    psrldq      xmm0, 8
    paddusw     xmm0, xmm3
    paddd       xmm0, [GLOBAL(t128)]
    psrld       xmm0, 8
    mov         rax,  arg(5)
    movd        [rax], xmm0
    ; Accumulate sum of src2
    movdqa      xmm0, xmm4
    psrldq      xmm0, 8
    paddusw     xmm0, xmm4
    ; Square src2. Ignore high value
    pmuludq     xmm0, xmm0
    psrld       xmm0, 8
    ; phaddw could be used to sum adjacent values but we want
    ; all the values summed. promote to doubles, accumulate,
    ; shift and sum
    pxor        xmm2, xmm2
    movdqa      xmm1, xmm5
    punpckldq   xmm1, xmm2
    punpckhdq   xmm5, xmm2
    paddd       xmm1, xmm5
    movdqa      xmm2, xmm1
    psrldq      xmm1, 8
    paddd       xmm1, xmm2
    psubd       xmm1, xmm0
    ; (variance + 128) >> 8
    paddd       xmm1, [GLOBAL(t128)]
    psrld       xmm1, 8
    mov         rax,  arg(4)
    movd        [rax], xmm1
    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret
 ; Read-only constants shared by the routines in this file.
 SECTION_RODATA
 align 16
 ; 128 in the low element: rounding term added before the ">> 8" steps of
 ; vp10_variance_and_sad_16x16_sse2.
 t128:
 %ifndef __NASM_VER__
    ddq 128
 %elif CONFIG_BIG_ENDIAN
    dq  0, 128
 %else
    dq  128, 0
 %endif
 align 16
 ; Total blend weight (16) in every word; dst_weight = tMFQE - src_weight.
 tMFQE: ; 1 << MFQE_PRECISION
    times 8 dw 0x10
 align 16
 ; Rounding term (8) added before the ">> 4" of the weighted blend.
 tMFQE_round: ; 1 << (MFQE_PRECISION - 1)
    times 8 dw 0x08
--- a/vp10/common/x86/postproc_sse2.asm
+++ b/vp10/common/x86/postproc_sse2.asm
@ -1,694 +0,0 @@
 ;
 ;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
 ;
 ;  Use of this source code is governed by a BSD-style license
 ;  that can be found in the LICENSE file in the root of the source
 ;  tree. An additional intellectual property rights grant can be found
 ;  in the file PATENTS.  All contributing project authors may
 ;  be found in the AUTHORS file in the root of the source tree.
 ;
 %include "vpx_ports/x86_abi_support.asm"
 ; Two-pass smoothing filter, one row at a time: a vertical 5-tap pass from
 ; src_ptr to dst_ptr, followed by a horizontal 5-tap pass performed in place
 ; on the dst row that was just written.
 ;
 ; Both passes compute (4*centre + two neighbours on each side + 4) >> 3 and
 ; keep the unfiltered centre pixel wherever the absolute difference between
 ; the centre and any of its four neighbours exceeds flimit.
 ;
 ; Columns are processed 8 at a time, so cols is effectively rounded up to a
 ; multiple of 8. The vertical pass reads two rows above and below the current
 ; src row; the horizontal pass reads/writes a few bytes outside [0, cols) of
 ; the dst row.
 ;
 ; NOTE(review): the inline "mmN" / "p0..p3" comments do not name the xmm
 ; registers actually used (each xmm register holds 8 words here); they look
 ; like they were carried over from an MMX version -- confirm.
 ; NOTE(review): mm0 is used below and no emms is issued in this routine;
 ; presumably the caller clears the MMX state -- confirm.
 ;
 ;void vp10_post_proc_down_and_across_xmm
 ;(
 ;    unsigned char *src_ptr,
 ;    unsigned char *dst_ptr,
 ;    int src_pixels_per_line,
 ;    int dst_pixels_per_line,
 ;    int rows,
 ;    int cols,
 ;    int flimit
 ;)
 global sym(vp10_post_proc_down_and_across_xmm) PRIVATE
 sym(vp10_post_proc_down_and_across_xmm):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 7
    SAVE_XMM 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog
 %if ABI_IS_32BIT=1 && CONFIG_PIC=1
    ALIGN_STACK 16, rax
    ; move the global rd onto the stack, since we don't have enough registers
    ; to do PIC addressing
    movdqa      xmm0, [GLOBAL(rd42)]
    sub         rsp, 16
    movdqa      [rsp], xmm0
 %define RD42 [rsp]
 %else
 %define RD42 [GLOBAL(rd42)]
 %endif
        ; broadcast the low word of flimit to all 8 word lanes of xmm2
        movd        xmm2,       dword ptr arg(6) ;flimit
        punpcklwd   xmm2,       xmm2
        punpckldq   xmm2,       xmm2
        punpcklqdq  xmm2,       xmm2
        mov         rsi,        arg(0) ;src_ptr
        mov         rdi,        arg(1) ;dst_ptr
        movsxd      rcx,        DWORD PTR arg(4) ;rows
        movsxd      rax,        DWORD PTR arg(2) ;src_pixels_per_line ; source pitch
        pxor        xmm0,       xmm0              ; mm0 = 00000000
 .nextrow:
        xor         rdx,        rdx       ; clear out rdx for use as loop counter
 .nextcol:
        ; vertical pass: xmm1 = centre row (r0), xmm3 accumulates the weighted
        ; sum, xmm7 accumulates the "difference > flimit" mask
        movq        xmm3,       QWORD PTR [rsi]         ; mm4 = r0 p0..p7
        punpcklbw   xmm3,       xmm0                    ; mm3 = p0..p3
        movdqa      xmm1,       xmm3                    ; mm1 = p0..p3
        psllw       xmm3,       2                       ;
        movq        xmm5,       QWORD PTR [rsi + rax]   ; mm4 = r1 p0..p7
        punpcklbw   xmm5,       xmm0                    ; mm5 = r1 p0..p3
        paddusw     xmm3,       xmm5                    ; mm3 += mm6
        ; thresholding
        movdqa      xmm7,       xmm1                    ; mm7 = r0 p0..p3
        psubusw     xmm7,       xmm5                    ; mm7 = r0 p0..p3 - r1 p0..p3
        psubusw     xmm5,       xmm1                    ; mm5 = r1 p0..p3 - r0 p0..p3
        paddusw     xmm7,       xmm5                    ; mm7 = abs(r0 p0..p3 - r1 p0..p3)
        pcmpgtw     xmm7,       xmm2
        movq        xmm5,       QWORD PTR [rsi + 2*rax] ; mm4 = r2 p0..p7
        punpcklbw   xmm5,       xmm0                    ; mm5 = r2 p0..p3
        paddusw     xmm3,       xmm5                    ; mm3 += mm5
        ; thresholding
        movdqa      xmm6,       xmm1                    ; mm6 = r0 p0..p3
        psubusw     xmm6,       xmm5                    ; mm6 = r0 p0..p3 - r2 p0..p3
        psubusw     xmm5,       xmm1                    ; mm5 = r2 p0..p3 - r0 p0..p3
        paddusw     xmm6,       xmm5                    ; mm6 = abs(r0 p0..p3 - r2 p0..p3)
        pcmpgtw     xmm6,       xmm2
        por         xmm7,       xmm6                    ; accumulate thresholds
        neg         rax
        movq        xmm5,       QWORD PTR [rsi+2*rax]   ; mm4 = r-2 p0..p7
        punpcklbw   xmm5,       xmm0                    ; mm5 = r-2 p0..p3
        paddusw     xmm3,       xmm5                    ; mm3 += mm5
        ; thresholding
        movdqa      xmm6,       xmm1                    ; mm6 = r0 p0..p3
        psubusw     xmm6,       xmm5                    ; mm6 = p0..p3 - r-2 p0..p3
        psubusw     xmm5,       xmm1                    ; mm5 = r-2 p0..p3 - p0..p3
        paddusw     xmm6,       xmm5                    ; mm6 = abs(r0 p0..p3 - r-2 p0..p3)
        pcmpgtw     xmm6,       xmm2
        por         xmm7,       xmm6                    ; accumulate thresholds
        movq        xmm4,       QWORD PTR [rsi+rax]     ; mm4 = r-1 p0..p7
        punpcklbw   xmm4,       xmm0                    ; mm4 = r-1 p0..p3
        paddusw     xmm3,       xmm4                    ; mm3 += mm5
        ; thresholding
        movdqa      xmm6,       xmm1                    ; mm6 = r0 p0..p3
        psubusw     xmm6,       xmm4                    ; mm6 = p0..p3 - r-1 p0..p3
        psubusw     xmm4,       xmm1                    ; mm4 = r-1 p0..p3 - p0..p3
        paddusw     xmm6,       xmm4                    ; mm6 = abs(r0 p0..p3 - r-1 p0..p3)
        pcmpgtw     xmm6,       xmm2
        por         xmm7,       xmm6                    ; accumulate thresholds
        paddusw     xmm3,       RD42                    ; mm3 += round value
        psraw       xmm3,       3                       ; mm3 /= 8
        pand        xmm1,       xmm7                    ; mm1 select vals > thresh from source
        pandn       xmm7,       xmm3                    ; mm7 select vals < thresh from blurred result
        paddusw     xmm1,       xmm7                    ; combination
        packuswb    xmm1,       xmm0                    ; pack to bytes
        movq        QWORD PTR [rdi], xmm1             ;
        neg         rax                   ; pitch is positive
        add         rsi,        8
        add         rdi,        8
        add         rdx,        8
        cmp         edx,        dword arg(5) ;cols
        jl          .nextcol
        ; done with the all cols, start the across filtering in place
        sub         rsi,        rdx
        sub         rdi,        rdx
        xor         rdx,        rdx
        movq        mm0,        QWORD PTR [rdi-8]; preload so the first delayed store below rewrites these 8 bytes unchanged
 .acrossnextcol:
        ; xmm4 = 12 dst bytes starting at column rdx-2 (8 via movq, 4 via movd)
        movq        xmm7,       QWORD PTR [rdi +rdx -2]
        movd        xmm4,       DWORD PTR [rdi +rdx +6]
        pslldq      xmm4,       8
        por         xmm4,       xmm7
        movdqa      xmm3,       xmm4
        psrldq      xmm3,       2
        punpcklbw   xmm3,       xmm0              ; mm3 = p0..p3
        movdqa      xmm1,       xmm3              ; mm1 = p0..p3
        psllw       xmm3,       2
        movdqa      xmm5,       xmm4
        psrldq      xmm5,       3
        punpcklbw   xmm5,       xmm0              ; mm5 = p1..p4
        paddusw     xmm3,       xmm5              ; mm3 += mm6
        ; thresholding
        movdqa      xmm7,       xmm1              ; mm7 = p0..p3
        psubusw     xmm7,       xmm5              ; mm7 = p0..p3 - p1..p4
        psubusw     xmm5,       xmm1              ; mm5 = p1..p4 - p0..p3
        paddusw     xmm7,       xmm5              ; mm7 = abs(p0..p3 - p1..p4)
        pcmpgtw     xmm7,       xmm2
        movdqa      xmm5,       xmm4
        psrldq      xmm5,       4
        punpcklbw   xmm5,       xmm0              ; mm5 = p2..p5
        paddusw     xmm3,       xmm5              ; mm3 += mm5
        ; thresholding
        movdqa      xmm6,       xmm1              ; mm6 = p0..p3
        psubusw     xmm6,       xmm5              ; mm6 = p0..p3 - p2..p5
        psubusw     xmm5,       xmm1              ; mm5 = p2..p5 - p0..p3
        paddusw     xmm6,       xmm5              ; mm6 = abs(p0..p3 - p2..p5)
        pcmpgtw     xmm6,       xmm2
        por         xmm7,       xmm6              ; accumulate thresholds
        movdqa      xmm5,       xmm4              ; mm5 = p-2..p5
        punpcklbw   xmm5,       xmm0              ; mm5 = p-2..p1
        paddusw     xmm3,       xmm5              ; mm3 += mm5
        ; thresholding
        movdqa      xmm6,       xmm1              ; mm6 = p0..p3
        psubusw     xmm6,       xmm5              ; mm6 = p0..p3 - p-2..p1
        psubusw     xmm5,       xmm1              ; mm5 = p-2..p1 - p0..p3
        paddusw     xmm6,       xmm5              ; mm6 = abs(p0..p3 - p-2..p1)
        pcmpgtw     xmm6,       xmm2
        por         xmm7,       xmm6              ; accumulate thresholds
        psrldq      xmm4,       1                   ; mm4 = p-1..p5
        punpcklbw   xmm4,       xmm0              ; mm4 = p-1..p2
        paddusw     xmm3,       xmm4              ; mm3 += mm5
        ; thresholding
        movdqa      xmm6,       xmm1              ; mm6 = p0..p3
        psubusw     xmm6,       xmm4              ; mm6 = p0..p3 - p-1..p2
        psubusw     xmm4,       xmm1              ; mm4 = p-1..p2 - p0..p3
        paddusw     xmm6,       xmm4              ; mm6 = abs(p0..p3 - p-1..p2)
        pcmpgtw     xmm6,       xmm2
        por         xmm7,       xmm6              ; accumulate thresholds
        paddusw     xmm3,       RD42              ; mm3 += round value
        psraw       xmm3,       3                 ; mm3 /= 8
        pand        xmm1,       xmm7              ; mm1 select vals > thresh from source
        pandn       xmm7,       xmm3              ; mm7 select vals < thresh from blurred result
        paddusw     xmm1,       xmm7              ; combination
        packuswb    xmm1,       xmm0              ; pack to bytes
        ; the store is delayed by one iteration so the next group still reads
        ; unfiltered left neighbours from the row
        movq        QWORD PTR [rdi+rdx-8],  mm0   ; store previous eight bytes
        movdq2q     mm0,        xmm1
        add         rdx,        8
        cmp         edx,        dword arg(5) ;cols
        jl          .acrossnextcol;
        ; last 8 pixels
        movq        QWORD PTR [rdi+rdx-8],  mm0
        ; done with this row
        add         rsi,rax               ; next line
        mov         eax, dword arg(3) ;dst_pixels_per_line ; destination pitch?
        add         rdi,rax               ; next destination
        mov         eax, dword arg(2) ;src_pixels_per_line ; source pitch
        dec         rcx                   ; decrement count
        jnz         .nextrow              ; next row
 %if ABI_IS_32BIT=1 && CONFIG_PIC=1
    ; drop the stack copy of rd42, then restore the pre-ALIGN_STACK rsp
    add rsp,16
    pop rsp
 %endif
    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
 %undef RD42
 ; Vertical, variance-gated smoothing performed in place, one strip of 8
 ; columns at a time.
 ;
 ; For every pixel a running sum (xmm5, words) and running sum of squares
 ; (xmm6/xmm7, dwords) are kept over a 15-row window centred on the current
 ; row. Where 15*sumsq - sum*sum < flimit the pixel is replaced by
 ; (pixel + sum + rv) >> 4, with rv read from the external vp10_rv table;
 ; otherwise the original pixel is kept.
 ;
 ; Results are staged in a 16-entry ring buffer d[16][8] on the stack and
 ; written back 8 rows late, so the window keeps reading unfiltered pixels.
 ; The code reads rows from 8 above dst to 7 below the last row.
 ;
 ; NOTE(review): mm0 is used below and no emms is issued in this routine;
 ; presumably the caller clears the MMX state -- confirm.
 ;
 ;void vp10_mbpost_proc_down_xmm(unsigned char *dst,
 ;                            int pitch, int rows, int cols,int flimit)
 extern sym(vp10_rv)
 global sym(vp10_mbpost_proc_down_xmm) PRIVATE
 sym(vp10_mbpost_proc_down_xmm):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 5
    SAVE_XMM 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog
    ALIGN_STACK 16, rax
    sub         rsp, 128+16
    ; unsigned char d[16][8] at [rsp]
    ; create flimit4 at [rsp+128]
    mov         eax, dword ptr arg(4) ;flimit
    mov         [rsp+128], eax
    mov         [rsp+128+4], eax
    mov         [rsp+128+8], eax
    mov         [rsp+128+12], eax
 %define flimit4 [rsp+128]
 %if ABI_IS_32BIT=0
    lea         r8,       [GLOBAL(sym(vp10_rv))]
 %endif
    ;rows +=8;
    ; (the row loop writes back 8 rows late, so it runs 8 extra iterations)
    add         dword arg(2), 8
    ;for(c=0; c<cols; c+=8)
 .loop_col:
            mov         rsi,        arg(0) ; s
            pxor        xmm0,       xmm0        ;
            movsxd      rax,        dword ptr arg(1) ;pitch       ;
            neg         rax                                     ; rax = -pitch
            lea         rsi,        [rsi + rax*8];              ; rsi = s[-pitch*8]
            neg         rax
            pxor        xmm5,       xmm5
            pxor        xmm6,       xmm6        ;
            pxor        xmm7,       xmm7        ;
            mov         rdi,        rsi
            mov         rcx,        15          ;
 .loop_initvar:
            ; prime sum / sum of squares with the 15 rows s[-8*pitch] .. s[6*pitch];
            ; rdi ends up at s[7*pitch]
            movq        xmm1,       QWORD PTR [rdi];
            punpcklbw   xmm1,       xmm0        ;
            paddw       xmm5,       xmm1        ;
            pmullw      xmm1,       xmm1        ;
            movdqa      xmm2,       xmm1        ;
            punpcklwd   xmm1,       xmm0        ;
            punpckhwd   xmm2,       xmm0        ;
            paddd       xmm6,       xmm1        ;
            paddd       xmm7,       xmm2        ;
            lea         rdi,        [rdi+rax]   ;
            dec         rcx
            jne         .loop_initvar
            ;save the var and sum
            xor         rdx,        rdx
 .loop_row:
            ; slide the window down one row: add the row at rdi, drop the row at rsi
            movq        xmm1,       QWORD PTR [rsi]     ; [s-pitch*8]
            movq        xmm2,       QWORD PTR [rdi]     ; [s+pitch*7]
            punpcklbw   xmm1,       xmm0
            punpcklbw   xmm2,       xmm0
            paddw       xmm5,       xmm2
            psubw       xmm5,       xmm1
            pmullw      xmm2,       xmm2
            movdqa      xmm4,       xmm2
            punpcklwd   xmm2,       xmm0
            punpckhwd   xmm4,       xmm0
            paddd       xmm6,       xmm2
            paddd       xmm7,       xmm4
            pmullw      xmm1,       xmm1
            movdqa      xmm2,       xmm1
            punpcklwd   xmm1,       xmm0
            psubd       xmm6,       xmm1
            punpckhwd   xmm2,       xmm0
            psubd       xmm7,       xmm2
            ; xmm3 = 15 * sumsq (low four columns)
            movdqa      xmm3,       xmm6
            pslld       xmm3,       4
            psubd       xmm3,       xmm6
            ; xmm1/xmm2 = sum * sum as dwords (pmullw/pmulhw halves interleaved)
            movdqa      xmm1,       xmm5
            movdqa      xmm4,       xmm5
            pmullw      xmm1,       xmm1
            pmulhw      xmm4,       xmm4
            movdqa      xmm2,       xmm1
            punpcklwd   xmm1,       xmm4
            punpckhwd   xmm2,       xmm4
            ; xmm4 = 15 * sumsq (high four columns)
            movdqa      xmm4,       xmm7
            pslld       xmm4,       4
            psubd       xmm4,       xmm7
            psubd       xmm3,       xmm1
            psubd       xmm4,       xmm2
            psubd       xmm3,       flimit4
            psubd       xmm4,       flimit4
            ; sign bit -> all-ones mask where 15*sumsq - sum*sum < flimit
            psrad       xmm3,       31
            psrad       xmm4,       31
            packssdw    xmm3,       xmm4
            packsswb    xmm3,       xmm0
            movq        xmm1,       QWORD PTR [rsi+rax*8] ; current row (centre of the window)
            movq        xmm2,       xmm1
            punpcklbw   xmm1,       xmm0
            paddw       xmm1,       xmm5
            mov         rcx,        rdx
            and         rcx,        127
 %if ABI_IS_32BIT=1 && CONFIG_PIC=1
            push        rax
            lea         rax,        [GLOBAL(sym(vp10_rv))]
            movdqu      xmm4,       [rax + rcx*2] ;vp10_rv[rcx*2]
            pop         rax
 %elif ABI_IS_32BIT=0
            movdqu      xmm4,       [r8 + rcx*2] ;vp10_rv[rcx*2]
 %else
            movdqu      xmm4,       [sym(vp10_rv) + rcx*2]
 %endif
            paddw       xmm1,       xmm4
            ;paddw     xmm1,       eight8s
            psraw       xmm1,       4
            packuswb    xmm1,       xmm0
            ; filtered value where the mask is set, original pixel elsewhere
            pand        xmm1,       xmm3
            pandn       xmm3,       xmm2
            por         xmm1,       xmm3
            and         rcx,        15
            movq        QWORD PTR   [rsp + rcx*8], xmm1 ;d[rcx*8]
            ; write back the result staged 8 iterations ago.
            ; NOTE(review): there is no guard for rdx < 8, so the first 8
            ; iterations store whatever d[] currently holds to the 8 rows
            ; above s -- confirm that region is expendable border.
            mov         rcx,        rdx
            sub         rcx,        8
            and         rcx,        15
            movq        mm0,        [rsp + rcx*8] ;d[rcx*8]
            movq        [rsi],      mm0
            lea         rsi,        [rsi+rax]
            lea         rdi,        [rdi+rax]
            add         rdx,        1
            cmp         edx,        dword arg(2) ;rows
            jl          .loop_row
        ; NOTE(review): this is a dword add on the pointer argument, so only
        ; its low 32 bits are updated on 64-bit targets -- confirm intent.
        add         dword arg(0), 8 ; s += 8
        sub         dword arg(3), 8 ; cols -= 8
        cmp         dword arg(3), 0
        jg          .loop_col
    ; release d[] and flimit4, then restore the pre-ALIGN_STACK rsp
    add         rsp, 128+16
    pop         rsp
    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
 %undef flimit4
 ; Horizontal, variance-gated smoothing performed in place, one row at a time
 ; and four pixels per inner iteration.
 ;
 ; For each pixel a sum and a sum of squares are maintained over a 15-pixel
 ; window centred on it. Where 15*sumsq - sum*sum < flimit the pixel is
 ; replaced by (pixel + sum + 8) >> 4; otherwise it is kept.
 ;
 ; Results pass through mm0/mm1 and are stored 8 pixels late so the window
 ; keeps reading unfiltered pixels. Because mm0/mm1 start at zero for each
 ; row, the first two stores write zeros to s[-8..-1]. The code reads from
 ; s[-8] up to a few bytes past s[cols + 8].
 ;
 ; NOTE(review): mm0/mm1 are used below and no emms is issued in this
 ; routine; presumably the caller clears the MMX state -- confirm.
 ;
 ;void vp10_mbpost_proc_across_ip_xmm(unsigned char *src,
 ;                                int pitch, int rows, int cols,int flimit)
 global sym(vp10_mbpost_proc_across_ip_xmm) PRIVATE
 sym(vp10_mbpost_proc_across_ip_xmm):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 5
    SAVE_XMM 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog
    ALIGN_STACK 16, rax
    sub         rsp, 16
    ; create flimit4 at [rsp]
    mov         eax, dword ptr arg(4) ;flimit
    mov         [rsp], eax
    mov         [rsp+4], eax
    mov         [rsp+8], eax
    mov         [rsp+12], eax
 %define flimit4 [rsp]
    ;for(r=0;r<rows;r++)
 .ip_row_loop:
        xor         rdx,    rdx ;sumsq=0;
        xor         rcx,    rcx ;sum=0;
        mov         rsi,    arg(0); s
        mov         rdi,    -8
 .ip_var_loop:
        ;for(i=-8;i<=6;i++)
        ;{
        ;    sumsq += s[i]*s[i];
        ;    sum   += s[i];
        ;}
        movzx       eax, byte [rsi+rdi]
        add         ecx, eax
        mul         al                  ; ax = al * al
        add         edx, eax
        add         rdi, 1
        cmp         rdi, 6
        jle         .ip_var_loop
            ;mov         rax,    sumsq
            ;movd        xmm7,   rax
            movd        xmm7,   edx
            ;mov         rax,    sum
            ;movd        xmm6,   rax
            movd        xmm6,   ecx
            mov         rsi,    arg(0) ;s
            xor         rcx,    rcx
            movsxd      rdx,    dword arg(3) ;cols
            add         rdx,    8       ; 8 extra columns flush the delayed stores
            pxor        mm0,    mm0
            pxor        mm1,    mm1
            pxor        xmm0,   xmm0
 .nextcol4:
            ; xmm2 = per-pixel change in sum, xmm1 = per-pixel change in sumsq
            ; as the window slides right by one
            movd        xmm1,   DWORD PTR [rsi+rcx-8]   ; -8 -7 -6 -5
            movd        xmm2,   DWORD PTR [rsi+rcx+7]   ; +7 +8 +9 +10
            punpcklbw   xmm1,   xmm0                    ; expanding
            punpcklbw   xmm2,   xmm0                    ; expanding
            punpcklwd   xmm1,   xmm0                    ; expanding to dwords
            punpcklwd   xmm2,   xmm0                    ; expanding to dwords
            psubd       xmm2,   xmm1                    ; 7--8   8--7   9--6 10--5
            paddd       xmm1,   xmm1                    ; -8*2   -7*2   -6*2 -5*2
            paddd       xmm1,   xmm2                    ; 7+-8   8+-7   9+-6 10+-5
            pmaddwd     xmm1,   xmm2                    ; (new+old)*(new-old) = new^2 - old^2
            ; prefix-sum the four deltas so lane i of xmm6/xmm7 holds the
            ; window sum / sum of squares for pixel rcx+i
            paddd       xmm6,   xmm2
            paddd       xmm7,   xmm1
            pshufd      xmm6,   xmm6,   0               ; duplicate the last ones
            pshufd      xmm7,   xmm7,   0               ; duplicate the last ones
            psrldq      xmm1,       4                   ; 8--7   9--6 10--5  0000
            psrldq      xmm2,       4                   ; 8--7   9--6 10--5  0000
            pshufd      xmm3,   xmm1,   3               ; lanes 1..3 += second delta (squared)
            pshufd      xmm4,   xmm2,   3               ; lanes 1..3 += second delta
            paddd       xmm6,   xmm4
            paddd       xmm7,   xmm3
            pshufd      xmm3,   xmm1,   01011111b       ; lanes 2..3 += third delta (squared)
            pshufd      xmm4,   xmm2,   01011111b       ; lanes 2..3 += third delta
            paddd       xmm7,   xmm3
            paddd       xmm6,   xmm4
            pshufd      xmm3,   xmm1,   10111111b       ; lane 3 += fourth delta (squared)
            pshufd      xmm4,   xmm2,   10111111b       ; lane 3 += fourth delta
            paddd       xmm7,   xmm3
            paddd       xmm6,   xmm4
            ; xmm5 = all-ones mask where 15*sumsq - sum*sum < flimit
            movdqa      xmm3,   xmm6
            pmaddwd     xmm3,   xmm3
            movdqa      xmm5,   xmm7
            pslld       xmm5,   4
            psubd       xmm5,   xmm7
            psubd       xmm5,   xmm3
            psubd       xmm5,   flimit4
            psrad       xmm5,   31
            packssdw    xmm5,   xmm0
            packsswb    xmm5,   xmm0
            ; xmm1 = (pixel + sum + 8) >> 4, xmm2 = original pixels
            movd        xmm1,   DWORD PTR [rsi+rcx]
            movq        xmm2,   xmm1
            punpcklbw   xmm1,   xmm0
            punpcklwd   xmm1,   xmm0
            paddd       xmm1,   xmm6
            paddd       xmm1,   [GLOBAL(four8s)]
            psrad       xmm1,   4
            packssdw    xmm1,   xmm0
            packuswb    xmm1,   xmm0
            pand        xmm1,   xmm5
            pandn       xmm5,   xmm2
            por         xmm5,   xmm1
            ; two-stage delay: store the result from two iterations ago,
            ; then shift mm1 -> mm0 and queue the new result in mm1
            movd        [rsi+rcx-8],  mm0
            movq        mm0,    mm1
            movdq2q     mm1,    xmm5
            ; keep only lane 3 (the newest running totals) for the next group
            psrldq      xmm7,   12
            psrldq      xmm6,   12
            add         rcx,    4
            cmp         rcx,    rdx
            jl          .nextcol4
        ;s+=pitch;
        movsxd rax, dword arg(1)
        add    arg(0), rax
        sub dword arg(2), 1 ;rows-=1
        cmp dword arg(2), 0
        jg .ip_row_loop
    ; release flimit4, then restore the pre-ALIGN_STACK rsp
    add         rsp, 16
    pop         rsp
    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
 %undef flimit4
 ; Adds noise to a plane in place, row by row.
 ;
 ; For every row LIBVPX_RAND is called and its low 8 bits select a starting
 ; offset (0..255) into the noise buffer. Each group of 16 source bytes is
 ; first clamped (subtract blackclamp, add bothclamp, subtract whiteclamp, all
 ; saturating) and the noise bytes are then added with wrapping paddb.
 ;
 ; The row is processed in 16-byte steps until the offset reaches width, so a
 ; width that is not a multiple of 16 is effectively rounded up.
 ; blackclamp, whiteclamp and bothclamp are addressed as [rdx], [rdx+16] and
 ; [rdx+32], i.e. only the blackclamp pointer is used.
 ;
 ;void vp10_plane_add_noise_wmt (unsigned char *start, unsigned char *noise,
 ;                            unsigned char blackclamp[16],
 ;                            unsigned char whiteclamp[16],
 ;                            unsigned char bothclamp[16],
 ;                            unsigned int width, unsigned int height, int pitch)
 global sym(vp10_plane_add_noise_wmt) PRIVATE
 sym(vp10_plane_add_noise_wmt):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 8
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog
 .addnoise_loop:
    call sym(LIBVPX_RAND) WRT_PLT
    mov     rcx, arg(1) ;noise
    and     rax, 0xff           ; random start offset into the noise buffer
    add     rcx, rax
    ; we rely on the fact that the clamping vectors are stored contiguously
    ; in black/white/both order. Note that we have to reload this here because
    ; rdx could be trashed by rand()
    mov     rdx, arg(2) ; blackclamp
            mov     rdi, rcx
            movsxd  rcx, dword arg(5) ;[Width]
            mov     rsi, arg(0) ;Pos
            xor         rax,rax
 .addnoise_nextset:
            movdqu      xmm1,[rsi+rax]         ; get the source
            psubusb     xmm1, [rdx]    ;blackclamp        ; clamp both sides so we don't outrange adding noise
            paddusb     xmm1, [rdx+32] ;bothclamp
            psubusb     xmm1, [rdx+16] ;whiteclamp
            movdqu      xmm2,[rdi+rax]         ; get the noise for this line
            paddb       xmm1,xmm2              ; add it in
            movdqu      [rsi+rax],xmm1         ; store the result
            add         rax,16                 ; advance to the next 16 pixels of this row
            cmp         rax, rcx
            jl          .addnoise_nextset
    movsxd  rax, dword arg(7) ; Pitch
    add     arg(0), rax ; Start += Pitch
    sub     dword arg(6), 1   ; Height -= 1
    jg      .addnoise_loop    ; loop while rows remain (flags from the sub above)
    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret
 SECTION_RODATA
 align 16
 rd42:
    times 8 dw 0x04
 four8s:
    times 4 dd 8
--- a/vp10/decoder/decoder.c
+++ b/vp10/decoder/decoder.c
@ -26,9 +26,6 @@
 #include "vp10/common/alloccommon.h"
 #include "vp10/common/loopfilter.h"
 #include "vp10/common/onyxc_int.h"
 #if CONFIG_VP9_POSTPROC
 #include "vp10/common/postproc.h"
 #endif
 #include "vp10/common/quant_common.h"
 #include "vp10/common/reconintra.h"
@ -413,13 +410,9 @@ int vp10_receive_compressed_data(VP10Decoder *pbi,
  return retcode;
 }
-int vp10_get_raw_frame(VP10Decoder *pbi, YV12_BUFFER_CONFIG *sd,
+int vp10_get_raw_frame(VP10Decoder *pbi, YV12_BUFFER_CONFIG *sd) {
                      vp10_ppflags_t *flags) {
  VP10_COMMON *const cm = &pbi->common;
  int ret = -1;
 #if !CONFIG_VP9_POSTPROC
  (void)*flags;
 #endif
  if (pbi->ready_for_new_data == 1)
    return ret;
@ -432,17 +425,8 @@ int vp10_get_raw_frame(VP10Decoder *pbi, YV12_BUFFER_CONFIG *sd,
  pbi->ready_for_new_data = 1;
 #if CONFIG_VP9_POSTPROC
  if (!cm->show_existing_frame) {
    ret = vp10_post_proc_frame(cm, sd, flags);
  } else {
  *sd = *cm->frame_to_show;
  ret = 0;
  }
 #else
  *sd = *cm->frame_to_show;
  ret = 0;
 #endif /*!CONFIG_POSTPROC*/
  vpx_clear_system_state();
  return ret;
 }
--- a/vp10/decoder/decoder.h
+++ b/vp10/decoder/decoder.h
@ -20,7 +20,6 @@
 #include "vp10/common/thread_common.h"
 #include "vp10/common/onyxc_int.h"
 #include "vp10/common/ppflags.h"
 #include "vp10/decoder/dthread.h"
 #ifdef __cplusplus
@ -85,8 +84,7 @@ typedef struct VP10Decoder {
 int vp10_receive_compressed_data(struct VP10Decoder *pbi,
                                size_t size, const uint8_t **dest);
-int vp10_get_raw_frame(struct VP10Decoder *pbi, YV12_BUFFER_CONFIG *sd,
+int vp10_get_raw_frame(struct VP10Decoder *pbi, YV12_BUFFER_CONFIG *sd);
                      vp10_ppflags_t *flags);
 vpx_codec_err_t vp10_copy_reference_dec(struct VP10Decoder *pbi,
                                       VP9_REFFRAME ref_frame_flag,
--- a/vp10/encoder/encoder.c
+++ b/vp10/encoder/encoder.c
@ -17,9 +17,6 @@
 #include "vp10/common/alloccommon.h"
 #include "vp10/common/filter.h"
 #include "vp10/common/idct.h"
 #if CONFIG_VP9_POSTPROC
 #include "vp10/common/postproc.h"
 #endif
 #include "vp10/common/reconinter.h"
 #include "vp10/common/reconintra.h"
 #include "vp10/common/tile_common.h"
@ -375,9 +372,6 @@ static void dealloc_compressor_data(VP10_COMP *cpi) {
  cpi->active_map.map = NULL;
  vp10_free_ref_frame_buffers(cm->buffer_pool);
 #if CONFIG_VP9_POSTPROC
  vp10_free_postproc_buffers(cm);
 #endif
  vp10_free_context_buffers(cm);
  vpx_free_frame_buffer(&cpi->last_frame_uf);
@ -1969,9 +1963,6 @@ void vp10_remove_compressor(VP10_COMP *cpi) {
  vp10_remove_common(cm);
  vp10_free_ref_frame_buffers(cm->buffer_pool);
 #if CONFIG_VP9_POSTPROC
  vp10_free_postproc_buffers(cm);
 #endif
  vpx_free(cpi);
 #if CONFIG_VP9_TEMPORAL_DENOISING
@ -2961,31 +2952,6 @@ static void set_size_dependent_vars(VP10_COMP *cpi, int *q,
  // lagged coding, and if the relevant speed feature flag is set.
  if (oxcf->pass == 2 && cpi->sf.static_segmentation)
    configure_static_seg_features(cpi);
 #if CONFIG_VP9_POSTPROC
  if (oxcf->noise_sensitivity > 0) {
    int l = 0;
    switch (oxcf->noise_sensitivity) {
      case 1:
        l = 20;
        break;
      case 2:
        l = 40;
        break;
      case 3:
        l = 60;
        break;
      case 4:
      case 5:
        l = 100;
        break;
      case 6:
        l = 150;
        break;
    }
    vp10_denoise(cpi->Source, cpi->Source, l);
  }
 #endif  // CONFIG_VP9_POSTPROC
 }
 static void init_motion_estimation(VP10_COMP *cpi) {
@ -4169,22 +4135,6 @@ int vp10_get_compressed_data(VP10_COMP *cpi, unsigned int *frame_flags,
        {
          PSNR_STATS psnr2;
          double frame_ssim2 = 0, weight = 0;
 #if CONFIG_VP9_POSTPROC
          if (vpx_alloc_frame_buffer(&cm->post_proc_buffer,
                                     recon->y_crop_width, recon->y_crop_height,
                                     cm->subsampling_x, cm->subsampling_y,
 #if CONFIG_VPX_HIGHBITDEPTH
                                     cm->use_highbitdepth,
 #endif
                                     VPX_ENC_BORDER_IN_PIXELS,
                                     cm->byte_alignment) < 0) {
            vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR,
                               "Failed to allocate post processing buffer");
          }
          vp10_deblock(cm->frame_to_show, &cm->post_proc_buffer,
                      cm->lf.filter_level * 10 / 6);
 #endif
          vpx_clear_system_state();
 #if CONFIG_VPX_HIGHBITDEPTH
@ -4315,20 +4265,13 @@ int vp10_get_compressed_data(VP10_COMP *cpi, unsigned int *frame_flags,
  return 0;
 }
-int vp10_get_preview_raw_frame(VP10_COMP *cpi, YV12_BUFFER_CONFIG *dest,
+int vp10_get_preview_raw_frame(VP10_COMP *cpi, YV12_BUFFER_CONFIG *dest) {
                              vp10_ppflags_t *flags) {
  VP10_COMMON *cm = &cpi->common;
 #if !CONFIG_VP9_POSTPROC
  (void)flags;
 #endif
  if (!cm->show_frame) {
    return -1;
  } else {
    int ret;
 #if CONFIG_VP9_POSTPROC
    ret = vp10_post_proc_frame(cm, dest, flags);
 #else
    if (cm->frame_to_show) {
      *dest = *cm->frame_to_show;
      dest->y_width = cm->width;
@ -4339,7 +4282,6 @@ int vp10_get_preview_raw_frame(VP10_COMP *cpi, YV12_BUFFER_CONFIG *dest,
    } else {
      ret = -1;
    }
 #endif  // !CONFIG_VP9_POSTPROC
    vpx_clear_system_state();
    return ret;
  }
--- a/vp10/encoder/encoder.h
+++ b/vp10/encoder/encoder.h
@ -17,7 +17,6 @@
 #include "vpx/vp8cx.h"
 #include "vp10/common/alloccommon.h"
 #include "vp10/common/ppflags.h"
 #include "vp10/common/entropymode.h"
 #include "vp10/common/thread_common.h"
 #include "vp10/common/onyxc_int.h"
@ -514,8 +513,7 @@ int vp10_get_compressed_data(VP10_COMP *cpi, unsigned int *frame_flags,
                            size_t *size, uint8_t *dest,
                            int64_t *time_stamp, int64_t *time_end, int flush);
-int vp10_get_preview_raw_frame(VP10_COMP *cpi, YV12_BUFFER_CONFIG *dest,
+int vp10_get_preview_raw_frame(VP10_COMP *cpi, YV12_BUFFER_CONFIG *dest);
                              vp10_ppflags_t *flags);
 int vp10_use_as_reference(VP10_COMP *cpi, int ref_frame_flags);
--- a/vp10/encoder/temporal_filter.c
+++ b/vp10/encoder/temporal_filter.c
@ -313,7 +313,7 @@ static void temporal_filter_iterate_c(VP10_COMP *cpi,
  for (mb_row = 0; mb_row < mb_rows; mb_row++) {
    // Source frames are extended to 16 pixels. This is different than
-    //  L/A/G reference frames that have a border of 32 (VP9ENCBORDERINPIXELS)
+    //  L/A/G reference frames that have a border of 32 (VPXENCBORDERINPIXELS)
    // A 6/8 tap filter is used for motion search.  This requires 2 pixels
    //  before and 3 pixels after.  So the largest Y mv on a border would
    //  then be 16 - VPX_INTERP_EXTEND. The UV blocks are half the size of the
--- a/vp10/vp10_common.mk
+++ b/vp10/vp10_common.mk
@ -10,7 +10,6 @@
 VP10_COMMON_SRCS-yes += vp10_common.mk
 VP10_COMMON_SRCS-yes += vp10_iface_common.h
 VP10_COMMON_SRCS-yes += common/ppflags.h
 VP10_COMMON_SRCS-yes += common/alloccommon.c
 VP10_COMMON_SRCS-yes += common/blockd.c
 VP10_COMMON_SRCS-yes += common/debugmodes.c
@ -64,15 +63,6 @@ VP10_COMMON_SRCS-yes += common/scan.h
 VP10_COMMON_SRCS-yes += common/vp10_fwd_txfm.h
 VP10_COMMON_SRCS-yes += common/vp10_fwd_txfm.c
 VP10_COMMON_SRCS-$(CONFIG_VP9_POSTPROC) += common/postproc.h
 VP10_COMMON_SRCS-$(CONFIG_VP9_POSTPROC) += common/postproc.c
 VP10_COMMON_SRCS-$(CONFIG_VP9_POSTPROC) += common/mfqe.h
 VP10_COMMON_SRCS-$(CONFIG_VP9_POSTPROC) += common/mfqe.c
 ifeq ($(CONFIG_VP9_POSTPROC),yes)
 VP10_COMMON_SRCS-$(HAVE_SSE2) += common/x86/mfqe_sse2.asm
 VP10_COMMON_SRCS-$(HAVE_SSE2) += common/x86/postproc_sse2.asm
 endif
 ifneq ($(CONFIG_VPX_HIGHBITDEPTH),yes)
 VP10_COMMON_SRCS-$(HAVE_DSPR2)  += common/mips/dspr2/itrans4_dspr2.c
 VP10_COMMON_SRCS-$(HAVE_DSPR2)  += common/mips/dspr2/itrans8_dspr2.c
@ -84,10 +74,6 @@ VP10_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/idct4x4_msa.c
 VP10_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/idct8x8_msa.c
 VP10_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/idct16x16_msa.c
 ifeq ($(CONFIG_VP9_POSTPROC),yes)
 VP10_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/mfqe_msa.c
 endif
 VP10_COMMON_SRCS-$(HAVE_SSE2) += common/x86/idct_intrin_sse2.c
 VP10_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp10_fwd_txfm_sse2.c
 VP10_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp10_fwd_dct32x32_impl_sse2.h
--- a/vp10/vp10_cx_iface.c
+++ b/vp10/vp10_cx_iface.c
@ -1121,34 +1121,16 @@ static vpx_codec_err_t ctrl_get_reference(vpx_codec_alg_priv_t *ctx,
 static vpx_codec_err_t ctrl_set_previewpp(vpx_codec_alg_priv_t *ctx,
                                          va_list args) {
 #if CONFIG_VP9_POSTPROC
  vp8_postproc_cfg_t *config = va_arg(args, vp8_postproc_cfg_t *);
  if (config != NULL) {
    ctx->preview_ppcfg = *config;
    return VPX_CODEC_OK;
  } else {
    return VPX_CODEC_INVALID_PARAM;
  }
 #else
  (void)ctx;
  (void)args;
  return VPX_CODEC_INCAPABLE;
 #endif
 }
 static vpx_image_t *encoder_get_preview(vpx_codec_alg_priv_t *ctx) {
  YV12_BUFFER_CONFIG sd;
  vp10_ppflags_t flags;
  vp10_zero(flags);
-  if (ctx->preview_ppcfg.post_proc_flag) {
+  if (vp10_get_preview_raw_frame(ctx->cpi, &sd) == 0) {
    flags.post_proc_flag   = ctx->preview_ppcfg.post_proc_flag;
    flags.deblocking_level = ctx->preview_ppcfg.deblocking_level;
    flags.noise_level      = ctx->preview_ppcfg.noise_level;
  }
  if (vp10_get_preview_raw_frame(ctx->cpi, &sd, &flags) == 0) {
    yuvconfig2image(&ctx->preview_img, &sd, NULL);
    return &ctx->preview_img;
  } else {
--- a/vp10/vp10_dx_iface.c
+++ b/vp10/vp10_dx_iface.c
@ -29,7 +29,7 @@
 #include "vp10/vp10_iface_common.h"
-#define VP9_CAP_POSTPROC (CONFIG_VP9_POSTPROC ? VPX_CODEC_CAP_POSTPROC : 0)
+#define VP9_CAP_POSTPROC 0
 typedef vpx_codec_stream_info_t vp10_stream_info_t;
@ -119,9 +119,6 @@ static vpx_codec_err_t decoder_destroy(vpx_codec_alg_priv_t *ctx) {
          (FrameWorkerData *)worker->data1;
      vpx_get_worker_interface()->end(worker);
      vp10_remove_common(&frame_worker_data->pbi->common);
 #if CONFIG_VP9_POSTPROC
      vp10_free_postproc_buffers(&frame_worker_data->pbi->common);
 #endif
      vp10_decoder_remove(frame_worker_data->pbi);
      vpx_free(frame_worker_data->scratch_buffer);
 #if CONFIG_MULTITHREAD
@ -313,15 +310,6 @@ static void set_default_ppflags(vp8_postproc_cfg_t *cfg) {
  cfg->noise_level = 0;
 }
 /* Copies the postprocessing settings stored in ctx->postproc_cfg
  * (post_proc_flag, deblocking_level, noise_level) into the corresponding
  * fields of `flags`. `ctx` is only read; `flags` is written.
  */
 static void set_ppflags(const vpx_codec_alg_priv_t *ctx,
                         vp10_ppflags_t *flags) {
  flags->post_proc_flag =
      ctx->postproc_cfg.post_proc_flag;
  flags->deblocking_level = ctx->postproc_cfg.deblocking_level;
  flags->noise_level = ctx->postproc_cfg.noise_level;
 }
 static int frame_worker_hook(void *arg1, void *arg2) {
  FrameWorkerData *const frame_worker_data = (FrameWorkerData *)arg1;
  const uint8_t *data = frame_worker_data->data;
@ -554,7 +542,6 @@ static vpx_codec_err_t decode_one(vpx_codec_alg_priv_t *ctx,
 static void wait_worker_and_cache_frame(vpx_codec_alg_priv_t *ctx) {
  YV12_BUFFER_CONFIG sd;
  vp10_ppflags_t flags = {0, 0, 0};
  const VPxWorkerInterface *const winterface = vpx_get_worker_interface();
  VPxWorker *const worker = &ctx->frame_workers[ctx->next_output_worker_id];
  FrameWorkerData *const frame_worker_data = (FrameWorkerData *)worker->data1;
@ -567,7 +554,7 @@ static void wait_worker_and_cache_frame(vpx_codec_alg_priv_t *ctx) {
  check_resync(ctx, frame_worker_data->pbi);
-  if (vp10_get_raw_frame(frame_worker_data->pbi, &sd, &flags) == 0) {
+  if (vp10_get_raw_frame(frame_worker_data->pbi, &sd) == 0) {
    VP10_COMMON *const cm = &frame_worker_data->pbi->common;
    RefCntBuffer *const frame_bufs = cm->buffer_pool->frame_bufs;
    ctx->frame_cache[ctx->frame_cache_write].fb_idx = cm->new_fb_idx;
@ -746,7 +733,6 @@ static vpx_image_t *decoder_get_frame(vpx_codec_alg_priv_t *ctx,
  if (*iter == NULL && ctx->frame_workers != NULL) {
    do {
      YV12_BUFFER_CONFIG sd;
      vp10_ppflags_t flags = {0, 0, 0};
      const VPxWorkerInterface *const winterface = vpx_get_worker_interface();
      VPxWorker *const worker =
          &ctx->frame_workers[ctx->next_output_worker_id];
@ -754,8 +740,6 @@ static vpx_image_t *decoder_get_frame(vpx_codec_alg_priv_t *ctx,
          (FrameWorkerData *)worker->data1;
      ctx->next_output_worker_id =
          (ctx->next_output_worker_id + 1) % ctx->num_frame_workers;
      if (ctx->base.init_flags & VPX_CODEC_USE_POSTPROC)
        set_ppflags(ctx, &flags);
      // Wait for the frame from worker thread.
      if (winterface->sync(worker)) {
        // Check if worker has received any frames.
@ -764,7 +748,7 @@ static vpx_image_t *decoder_get_frame(vpx_codec_alg_priv_t *ctx,
          frame_worker_data->received_frame = 0;
          check_resync(ctx, frame_worker_data->pbi);
        }
-        if (vp10_get_raw_frame(frame_worker_data->pbi, &sd, &flags) == 0) {
+        if (vp10_get_raw_frame(frame_worker_data->pbi, &sd) == 0) {
          VP10_COMMON *const cm = &frame_worker_data->pbi->common;
          RefCntBuffer *const frame_bufs = cm->buffer_pool->frame_bufs;
          release_last_output_frame(ctx);
@ -878,21 +862,9 @@ static vpx_codec_err_t ctrl_get_reference(vpx_codec_alg_priv_t *ctx,
 /* Control handler that stores the decoder postprocessing configuration.
  *
  * With CONFIG_VP9_POSTPROC enabled, one `vp8_postproc_cfg_t *` is read from
  * `args`:
  *   - non-NULL: ctx->postproc_cfg_set is set to 1, the pointed-to config is
  *     copied into ctx->postproc_cfg, and VPX_CODEC_OK is returned;
  *   - NULL: nothing is stored and VPX_CODEC_INVALID_PARAM is returned.
  * With CONFIG_VP9_POSTPROC disabled, neither argument is used and
  * VPX_CODEC_INCAPABLE is returned.
  */
 static vpx_codec_err_t ctrl_set_postproc(vpx_codec_alg_priv_t *ctx,
                                          va_list args) {
 #if CONFIG_VP9_POSTPROC
  vp8_postproc_cfg_t *data = va_arg(args, vp8_postproc_cfg_t *);
  if (data) {
    /* Record that a configuration was explicitly supplied. */
    ctx->postproc_cfg_set = 1;
    /* `data` already has type vp8_postproc_cfg_t *, so the cast below does
     * not change the type; this is a plain struct copy. */
    ctx->postproc_cfg = *((vp8_postproc_cfg_t *)data);
    return VPX_CODEC_OK;
  } else {
    return VPX_CODEC_INVALID_PARAM;
  }
 #else
  /* ctx and args are intentionally unused in this configuration. */
  (void)ctx;
  (void)args;
  return VPX_CODEC_INCAPABLE;
 #endif
 }
 static vpx_codec_err_t ctrl_set_dbg_options(vpx_codec_alg_priv_t *ctx,
--- a/vp10/vp10cx.mk
+++ b/vp10/vp10cx.mk
@ -77,10 +77,6 @@ VP10_CX_SRCS-yes += encoder/aq_complexity.c
 VP10_CX_SRCS-yes += encoder/aq_complexity.h
 VP10_CX_SRCS-yes += encoder/skin_detection.c
 VP10_CX_SRCS-yes += encoder/skin_detection.h
 ifeq ($(CONFIG_VP9_POSTPROC),yes)
 VP10_CX_SRCS-$(CONFIG_INTERNAL_STATS) += common/postproc.h
 VP10_CX_SRCS-$(CONFIG_INTERNAL_STATS) += common/postproc.c
 endif
 VP10_CX_SRCS-yes += encoder/temporal_filter.c
 VP10_CX_SRCS-yes += encoder/temporal_filter.h
 VP10_CX_SRCS-yes += encoder/mbgraph.c
--- a/vpx_ports/x86_abi_support.asm
+++ b/vpx_ports/x86_abi_support.asm
@ -393,7 +393,7 @@ section .text
 ; On Android platforms use lrand48 when building postproc routines. Prior to L
 ; rand() was not available.
-%if CONFIG_POSTPROC=1 || CONFIG_VP9_POSTPROC=1
+%if CONFIG_POSTPROC=1
 %ifdef __ANDROID__
 extern sym(lrand48)
 %define LIBVPX_RAND lrand48