Move sub pixel variance to vpx_dsp

Change-Id: I66bf6720c396c89aa2d1fd26d5d52bf5d5e3dff1
Johann 2015-06-05 09:54:19 -07:00
Parent 155b9416b3
Commit 6a82f0d7fb
67 changed files with 5171 additions and 9177 deletions

File diff not shown because it is too large.

File diff not shown because it is too large.

View File

@@ -1,137 +0,0 @@
/*
* Copyright (c) 2010 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include "vpx_config.h"
#include "./vp8_rtcd.h"
#include "./vpx_dsp_rtcd.h"
#include "vp8/common/variance.h"
#include "vp8/common/filter.h"
// TODO(johannkoenig): Move this to vpx_dsp or vp8/encoder
#if CONFIG_VP8_ENCODER
#if HAVE_MEDIA
#include "vp8/common/arm/bilinearfilter_arm.h"
unsigned int vp8_sub_pixel_variance8x8_armv6
(
const unsigned char *src_ptr,
int src_pixels_per_line,
int xoffset,
int yoffset,
const unsigned char *dst_ptr,
int dst_pixels_per_line,
unsigned int *sse
)
{
unsigned short first_pass[10*8];
unsigned char second_pass[8*8];
const short *HFilter, *VFilter;
HFilter = vp8_bilinear_filters[xoffset];
VFilter = vp8_bilinear_filters[yoffset];
vp8_filter_block2d_bil_first_pass_armv6(src_ptr, first_pass,
src_pixels_per_line,
9, 8, HFilter);
vp8_filter_block2d_bil_second_pass_armv6(first_pass, second_pass,
8, 8, 8, VFilter);
return vpx_variance8x8_media(second_pass, 8, dst_ptr,
dst_pixels_per_line, sse);
}
unsigned int vp8_sub_pixel_variance16x16_armv6
(
const unsigned char *src_ptr,
int src_pixels_per_line,
int xoffset,
int yoffset,
const unsigned char *dst_ptr,
int dst_pixels_per_line,
unsigned int *sse
)
{
unsigned short first_pass[36*16];
unsigned char second_pass[20*16];
const short *HFilter, *VFilter;
unsigned int var;
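/* Offsets are in eighth-pel units, so an offset of 4 is exactly half a pixel;
 * those cases are routed to the dedicated half-pixel averaging kernels
 * instead of the general bilinear filter. */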
if (xoffset == 4 && yoffset == 0)
{
var = vp8_variance_halfpixvar16x16_h_armv6(src_ptr, src_pixels_per_line,
dst_ptr, dst_pixels_per_line, sse);
}
else if (xoffset == 0 && yoffset == 4)
{
var = vp8_variance_halfpixvar16x16_v_armv6(src_ptr, src_pixels_per_line,
dst_ptr, dst_pixels_per_line, sse);
}
else if (xoffset == 4 && yoffset == 4)
{
var = vp8_variance_halfpixvar16x16_hv_armv6(src_ptr, src_pixels_per_line,
dst_ptr, dst_pixels_per_line, sse);
}
else
{
HFilter = vp8_bilinear_filters[xoffset];
VFilter = vp8_bilinear_filters[yoffset];
vp8_filter_block2d_bil_first_pass_armv6(src_ptr, first_pass,
src_pixels_per_line,
17, 16, HFilter);
vp8_filter_block2d_bil_second_pass_armv6(first_pass, second_pass,
16, 16, 16, VFilter);
var = vpx_variance16x16_media(second_pass, 16, dst_ptr,
dst_pixels_per_line, sse);
}
return var;
}
#endif // HAVE_MEDIA
#if HAVE_NEON
extern unsigned int vp8_sub_pixel_variance16x16_neon_func
(
const unsigned char *src_ptr,
int src_pixels_per_line,
int xoffset,
int yoffset,
const unsigned char *dst_ptr,
int dst_pixels_per_line,
unsigned int *sse
);
unsigned int vp8_sub_pixel_variance16x16_neon
(
const unsigned char *src_ptr,
int src_pixels_per_line,
int xoffset,
int yoffset,
const unsigned char *dst_ptr,
int dst_pixels_per_line,
unsigned int *sse
)
{
if (xoffset == 4 && yoffset == 0)
return vp8_variance_halfpixvar16x16_h_neon(src_ptr, src_pixels_per_line, dst_ptr, dst_pixels_per_line, sse);
else if (xoffset == 0 && yoffset == 4)
return vp8_variance_halfpixvar16x16_v_neon(src_ptr, src_pixels_per_line, dst_ptr, dst_pixels_per_line, sse);
else if (xoffset == 4 && yoffset == 4)
return vp8_variance_halfpixvar16x16_hv_neon(src_ptr, src_pixels_per_line, dst_ptr, dst_pixels_per_line, sse);
else
return vp8_sub_pixel_variance16x16_neon_func(src_ptr, src_pixels_per_line, xoffset, yoffset, dst_ptr, dst_pixels_per_line, sse);
}
#endif // HAVE_NEON
#endif // CONFIG_VP8_ENCODER

View File

@@ -20,7 +20,7 @@
#include "./vp8_rtcd.h"
#include "./vpx_dsp_rtcd.h"
#include "vp8/common/postproc.h"
#include "vp8/common/variance.h"
#include "vpx_dsp/variance.h"
#include "vpx_mem/vpx_mem.h"
#include "vpx_scale/yv12config.h"

View File

@@ -237,47 +237,6 @@ add_proto qw/void vp8_bilinear_predict4x4/, "unsigned char *src, int src_pitch,
specialize qw/vp8_bilinear_predict4x4 mmx media/;
$vp8_bilinear_predict4x4_media=vp8_bilinear_predict4x4_armv6;
#
# Sub-pixel Variance
#
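# add_proto declares the RTCD prototype, specialize lists the per-ISA
# implementations, and assignments such as
# $vp8_sub_pixel_variance8x8_sse2=vp8_sub_pixel_variance8x8_wmt map the
# generated name onto a differently named symbol (the SSE2 versions keep the
# historical _wmt suffix).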
add_proto qw/unsigned int vp8_sub_pixel_variance4x4/, "const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse";
specialize qw/vp8_sub_pixel_variance4x4 mmx sse2/;
$vp8_sub_pixel_variance4x4_sse2=vp8_sub_pixel_variance4x4_wmt;
add_proto qw/unsigned int vp8_sub_pixel_variance8x8/, "const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse";
specialize qw/vp8_sub_pixel_variance8x8 mmx sse2 media/;
$vp8_sub_pixel_variance8x8_sse2=vp8_sub_pixel_variance8x8_wmt;
$vp8_sub_pixel_variance8x8_media=vp8_sub_pixel_variance8x8_armv6;
add_proto qw/unsigned int vp8_sub_pixel_variance8x16/, "const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse";
specialize qw/vp8_sub_pixel_variance8x16 mmx sse2/;
$vp8_sub_pixel_variance8x16_sse2=vp8_sub_pixel_variance8x16_wmt;
add_proto qw/unsigned int vp8_sub_pixel_variance16x8/, "const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse";
specialize qw/vp8_sub_pixel_variance16x8 mmx sse2 ssse3/;
$vp8_sub_pixel_variance16x8_sse2=vp8_sub_pixel_variance16x8_wmt;
add_proto qw/unsigned int vp8_sub_pixel_variance16x16/, "const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse";
specialize qw/vp8_sub_pixel_variance16x16 mmx sse2 ssse3 media neon_asm/;
$vp8_sub_pixel_variance16x16_sse2=vp8_sub_pixel_variance16x16_wmt;
$vp8_sub_pixel_variance16x16_media=vp8_sub_pixel_variance16x16_armv6;
$vp8_sub_pixel_variance16x16_neon_asm=vp8_sub_pixel_variance16x16_neon;
add_proto qw/unsigned int vp8_variance_halfpixvar16x16_h/, "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp8_variance_halfpixvar16x16_h mmx sse2 media neon/;
$vp8_variance_halfpixvar16x16_h_sse2=vp8_variance_halfpixvar16x16_h_wmt;
$vp8_variance_halfpixvar16x16_h_media=vp8_variance_halfpixvar16x16_h_armv6;
add_proto qw/unsigned int vp8_variance_halfpixvar16x16_v/, "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp8_variance_halfpixvar16x16_v mmx sse2 media neon/;
$vp8_variance_halfpixvar16x16_v_sse2=vp8_variance_halfpixvar16x16_v_wmt;
$vp8_variance_halfpixvar16x16_v_media=vp8_variance_halfpixvar16x16_v_armv6;
add_proto qw/unsigned int vp8_variance_halfpixvar16x16_hv/, "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp8_variance_halfpixvar16x16_hv mmx sse2 media neon/;
$vp8_variance_halfpixvar16x16_hv_sse2=vp8_variance_halfpixvar16x16_hv_wmt;
$vp8_variance_halfpixvar16x16_hv_media=vp8_variance_halfpixvar16x16_hv_armv6;
#
# Encoder functions below this point.
#

View File

@@ -1,92 +0,0 @@
/*
* Copyright (c) 2010 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#ifndef VP8_COMMON_VARIANCE_H_
#define VP8_COMMON_VARIANCE_H_
#include "vpx_config.h"
#include "vpx/vpx_integer.h"
#ifdef __cplusplus
extern "C" {
#endif
typedef unsigned int(*vpx_sad_fn_t)(
const uint8_t *src_ptr,
int source_stride,
const uint8_t *ref_ptr,
int ref_stride);
typedef void (*vp8_copy32xn_fn_t)(
const unsigned char *src_ptr,
int source_stride,
unsigned char *ref_ptr,
int ref_stride,
int n);
typedef void (*vpx_sad_multi_fn_t)(
const unsigned char *src_ptr,
int source_stride,
const unsigned char *ref_array,
int ref_stride,
unsigned int *sad_array);
typedef void (*vpx_sad_multi_d_fn_t)
(
const unsigned char *src_ptr,
int source_stride,
const unsigned char * const ref_array[],
int ref_stride,
unsigned int *sad_array
);
typedef unsigned int (*vpx_variance_fn_t)
(
const unsigned char *src_ptr,
int source_stride,
const unsigned char *ref_ptr,
int ref_stride,
unsigned int *sse
);
typedef unsigned int (*vp8_subpixvariance_fn_t)
(
const unsigned char *src_ptr,
int source_stride,
int xoffset,
int yoffset,
const unsigned char *ref_ptr,
int Refstride,
unsigned int *sse
);
typedef struct variance_vtable
{
vpx_sad_fn_t sdf;
vpx_variance_fn_t vf;
vp8_subpixvariance_fn_t svf;
vpx_variance_fn_t svf_halfpix_h;
vpx_variance_fn_t svf_halfpix_v;
vpx_variance_fn_t svf_halfpix_hv;
vpx_sad_multi_fn_t sdx3f;
vpx_sad_multi_fn_t sdx8f;
vpx_sad_multi_d_fn_t sdx4df;
#if ARCH_X86 || ARCH_X86_64
vp8_copy32xn_fn_t copymem;
#endif
} vp8_variance_fn_ptr_t;
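/* One vp8_variance_fn_ptr_t is filled in per block size and handed to the
 * encoder's motion search, which calls the SAD, variance and sub-pixel
 * variance kernels through these pointers. */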
#ifdef __cplusplus
} // extern "C"
#endif
#endif // VP8_COMMON_VARIANCE_H_

View File

@@ -1,337 +0,0 @@
/*
* Copyright (c) 2010 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include "./vp8_rtcd.h"
#include "filter.h"
#include "variance.h"
/* This is a bad idea.
* ctz = count trailing zeros */
static int ctz(int a) {
int b = 0;
while (a != 1) {
a >>= 1;
b++;
}
return b;
}
static unsigned int variance(
const unsigned char *src_ptr,
int source_stride,
const unsigned char *ref_ptr,
int recon_stride,
int w,
int h,
unsigned int *sse)
{
int i, j;
int diff, sum;
sum = 0;
*sse = 0;
for (i = 0; i < h; i++)
{
for (j = 0; j < w; j++)
{
diff = src_ptr[j] - ref_ptr[j];
sum += diff;
*sse += diff * diff;
}
src_ptr += source_stride;
ref_ptr += recon_stride;
}
return (*sse - (((unsigned int)sum * sum) >> (int)((ctz(w) + ctz(h)))));
}
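/* The return value is the textbook identity
 *   variance = sse - sum^2 / (w * h)
 * with the division done as a shift: for the power-of-two block sizes used
 * here, ctz(w) + ctz(h) == log2(w * h), e.g. a shift by 8 for a 16x16 block. */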
/****************************************************************************
*
* ROUTINE : filter_block2d_bil_first_pass
*
* INPUTS : UINT8 *src_ptr : Pointer to source block.
* UINT32 src_pixels_per_line : Stride of input block.
* UINT32 pixel_step : Offset between filter input samples (see notes).
* UINT32 output_height : Input block height.
* UINT32 output_width : Input block width.
* INT32 *vp8_filter : Array of 2 bi-linear filter taps.
*
* OUTPUTS : INT32 *output_ptr : Pointer to filtered block.
*
* RETURNS : void
*
* FUNCTION : Applies a 1-D 2-tap bi-linear filter to the source block in
* either horizontal or vertical direction to produce the
* filtered output block. Used to implement first-pass
* of 2-D separable filter.
*
* SPECIAL NOTES : Produces INT32 output to retain precision for next pass.
* Two filter taps should sum to VP8_FILTER_WEIGHT.
* pixel_step defines whether the filter is applied
* horizontally (pixel_step=1) or vertically (pixel_step=stride).
* It defines the offset required to move from one input
* to the next.
*
****************************************************************************/
static void var_filter_block2d_bil_first_pass
(
const unsigned char *src_ptr,
unsigned short *output_ptr,
unsigned int src_pixels_per_line,
int pixel_step,
unsigned int output_height,
unsigned int output_width,
const short *vp8_filter
)
{
unsigned int i, j;
for (i = 0; i < output_height; i++)
{
for (j = 0; j < output_width; j++)
{
/* Apply bilinear filter */
output_ptr[j] = (((int)src_ptr[0] * vp8_filter[0]) +
((int)src_ptr[pixel_step] * vp8_filter[1]) +
(VP8_FILTER_WEIGHT / 2)) >> VP8_FILTER_SHIFT;
src_ptr++;
}
/* Next row... */
src_ptr += src_pixels_per_line - output_width;
output_ptr += output_width;
}
}
/****************************************************************************
*
* ROUTINE : filter_block2d_bil_second_pass
*
* INPUTS : INT32 *src_ptr : Pointer to source block.
* UINT32 src_pixels_per_line : Stride of input block.
* UINT32 pixel_step : Offset between filter input samples (see notes).
* UINT32 output_height : Input block height.
* UINT32 output_width : Input block width.
* INT32 *vp8_filter : Array of 2 bi-linear filter taps.
*
* OUTPUTS : UINT16 *output_ptr : Pointer to filtered block.
*
* RETURNS : void
*
* FUNCTION : Applies a 1-D 2-tap bi-linear filter to the source block in
* either horizontal or vertical direction to produce the
* filtered output block. Used to implement second-pass
* of 2-D separable filter.
*
* SPECIAL NOTES : Requires 32-bit input as produced by filter_block2d_bil_first_pass.
* Two filter taps should sum to VP8_FILTER_WEIGHT.
* pixel_step defines whether the filter is applied
* horizontally (pixel_step=1) or vertically (pixel_step=stride).
* It defines the offset required to move from one input
* to the next.
*
****************************************************************************/
static void var_filter_block2d_bil_second_pass
(
const unsigned short *src_ptr,
unsigned char *output_ptr,
unsigned int src_pixels_per_line,
unsigned int pixel_step,
unsigned int output_height,
unsigned int output_width,
const short *vp8_filter
)
{
unsigned int i, j;
int Temp;
for (i = 0; i < output_height; i++)
{
for (j = 0; j < output_width; j++)
{
/* Apply filter */
Temp = ((int)src_ptr[0] * vp8_filter[0]) +
((int)src_ptr[pixel_step] * vp8_filter[1]) +
(VP8_FILTER_WEIGHT / 2);
output_ptr[j] = (unsigned int)(Temp >> VP8_FILTER_SHIFT);
src_ptr++;
}
/* Next row... */
src_ptr += src_pixels_per_line - output_width;
output_ptr += output_width;
}
}
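/* Together the two passes form a separable 2-tap bilinear interpolator: the
 * horizontal pass is run on (height + 1) rows so that the vertical pass,
 * which reads samples pixel_step apart, has the extra row it needs (for
 * example a 9x8 intermediate for the 8x8 functions below). */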
unsigned int vp8_sub_pixel_variance4x4_c
(
const unsigned char *src_ptr,
int src_pixels_per_line,
int xoffset,
int yoffset,
const unsigned char *dst_ptr,
int dst_pixels_per_line,
unsigned int *sse
)
{
unsigned char temp2[20*16];
const short *HFilter, *VFilter;
unsigned short FData3[5*4]; /* Temp data buffer used in filtering */
HFilter = vp8_bilinear_filters[xoffset];
VFilter = vp8_bilinear_filters[yoffset];
/* First filter 1d Horizontal */
var_filter_block2d_bil_first_pass(src_ptr, FData3, src_pixels_per_line, 1, 5, 4, HFilter);
/* Now filter Vertically */
var_filter_block2d_bil_second_pass(FData3, temp2, 4, 4, 4, 4, VFilter);
return variance(temp2, 4, dst_ptr, dst_pixels_per_line, 4, 4, sse);
}
unsigned int vp8_sub_pixel_variance8x8_c
(
const unsigned char *src_ptr,
int src_pixels_per_line,
int xoffset,
int yoffset,
const unsigned char *dst_ptr,
int dst_pixels_per_line,
unsigned int *sse
)
{
unsigned short FData3[9*8]; /* Temp data buffer used in filtering */
unsigned char temp2[20*16];
const short *HFilter, *VFilter;
HFilter = vp8_bilinear_filters[xoffset];
VFilter = vp8_bilinear_filters[yoffset];
var_filter_block2d_bil_first_pass(src_ptr, FData3, src_pixels_per_line, 1, 9, 8, HFilter);
var_filter_block2d_bil_second_pass(FData3, temp2, 8, 8, 8, 8, VFilter);
return variance(temp2, 8, dst_ptr, dst_pixels_per_line, 8, 8, sse);
}
unsigned int vp8_sub_pixel_variance16x16_c
(
const unsigned char *src_ptr,
int src_pixels_per_line,
int xoffset,
int yoffset,
const unsigned char *dst_ptr,
int dst_pixels_per_line,
unsigned int *sse
)
{
unsigned short FData3[17*16]; /* Temp data buffer used in filtering */
unsigned char temp2[20*16];
const short *HFilter, *VFilter;
HFilter = vp8_bilinear_filters[xoffset];
VFilter = vp8_bilinear_filters[yoffset];
var_filter_block2d_bil_first_pass(src_ptr, FData3, src_pixels_per_line, 1, 17, 16, HFilter);
var_filter_block2d_bil_second_pass(FData3, temp2, 16, 16, 16, 16, VFilter);
return variance(temp2, 16, dst_ptr, dst_pixels_per_line, 16, 16, sse);
}
unsigned int vp8_variance_halfpixvar16x16_h_c(
const unsigned char *src_ptr,
int source_stride,
const unsigned char *ref_ptr,
int recon_stride,
unsigned int *sse)
{
return vp8_sub_pixel_variance16x16_c(src_ptr, source_stride, 4, 0,
ref_ptr, recon_stride, sse);
}
unsigned int vp8_variance_halfpixvar16x16_v_c(
const unsigned char *src_ptr,
int source_stride,
const unsigned char *ref_ptr,
int recon_stride,
unsigned int *sse)
{
return vp8_sub_pixel_variance16x16_c(src_ptr, source_stride, 0, 4,
ref_ptr, recon_stride, sse);
}
unsigned int vp8_variance_halfpixvar16x16_hv_c(
const unsigned char *src_ptr,
int source_stride,
const unsigned char *ref_ptr,
int recon_stride,
unsigned int *sse)
{
return vp8_sub_pixel_variance16x16_c(src_ptr, source_stride, 4, 4,
ref_ptr, recon_stride, sse);
}
unsigned int vp8_sub_pixel_variance16x8_c
(
const unsigned char *src_ptr,
int src_pixels_per_line,
int xoffset,
int yoffset,
const unsigned char *dst_ptr,
int dst_pixels_per_line,
unsigned int *sse
)
{
unsigned short FData3[16*9]; /* Temp data buffer used in filtering */
unsigned char temp2[20*16];
const short *HFilter, *VFilter;
HFilter = vp8_bilinear_filters[xoffset];
VFilter = vp8_bilinear_filters[yoffset];
var_filter_block2d_bil_first_pass(src_ptr, FData3, src_pixels_per_line, 1, 9, 16, HFilter);
var_filter_block2d_bil_second_pass(FData3, temp2, 16, 16, 8, 16, VFilter);
return variance(temp2, 16, dst_ptr, dst_pixels_per_line, 16, 8, sse);
}
unsigned int vp8_sub_pixel_variance8x16_c
(
const unsigned char *src_ptr,
int src_pixels_per_line,
int xoffset,
int yoffset,
const unsigned char *dst_ptr,
int dst_pixels_per_line,
unsigned int *sse
)
{
unsigned short FData3[9*16]; /* Temp data buffer used in filtering */
unsigned char temp2[20*16];
const short *HFilter, *VFilter;
HFilter = vp8_bilinear_filters[xoffset];
VFilter = vp8_bilinear_filters[yoffset];
var_filter_block2d_bil_first_pass(src_ptr, FData3, src_pixels_per_line, 1, 17, 8, HFilter);
var_filter_block2d_bil_second_pass(FData3, temp2, 8, 8, 16, 8, VFilter);
return variance(temp2, 8, dst_ptr, dst_pixels_per_line, 8, 16, sse);
}

View File

@@ -1,972 +0,0 @@
;
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;
%include "vpx_ports/x86_abi_support.asm"
%define xmm_filter_shift 7
;void vp8_filter_block2d_bil_var_sse2
;(
; unsigned char *ref_ptr,
; int ref_pixels_per_line,
; unsigned char *src_ptr,
; int src_pixels_per_line,
; unsigned int Height,
; int xoffset,
; int yoffset,
; int *sum,
; unsigned int *sumsquared;;
;
;)
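;The routine dispatches on the offsets: xoffset == 0 jumps to the
;second-pass-only path, yoffset == 0 to the first-pass-only path, and both
;zero to a plain full-pixel difference, so the bilinear taps are applied only
;when they are actually needed.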
global sym(vp8_filter_block2d_bil_var_sse2) PRIVATE
sym(vp8_filter_block2d_bil_var_sse2):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 9
SAVE_XMM 7
GET_GOT rbx
push rsi
push rdi
push rbx
; end prolog
pxor xmm6, xmm6 ;
pxor xmm7, xmm7 ;
lea rsi, [GLOBAL(xmm_bi_rd)] ; rounding
movdqa xmm4, XMMWORD PTR [rsi]
lea rcx, [GLOBAL(vp8_bilinear_filters_sse2)]
movsxd rax, dword ptr arg(5) ; xoffset
cmp rax, 0 ; skip first_pass filter if xoffset=0
je filter_block2d_bil_var_sse2_sp_only
shl rax, 5 ; point to filter coeff with xoffset
lea rax, [rax + rcx] ; HFilter
movsxd rdx, dword ptr arg(6) ; yoffset
cmp rdx, 0 ; skip second_pass filter if yoffset=0
je filter_block2d_bil_var_sse2_fp_only
shl rdx, 5
lea rdx, [rdx + rcx] ; VFilter
mov rsi, arg(0) ;ref_ptr
mov rdi, arg(2) ;src_ptr
movsxd rcx, dword ptr arg(4) ;Height
pxor xmm0, xmm0 ;
movq xmm1, QWORD PTR [rsi] ;
movq xmm3, QWORD PTR [rsi+1] ;
punpcklbw xmm1, xmm0 ;
pmullw xmm1, [rax] ;
punpcklbw xmm3, xmm0
pmullw xmm3, [rax+16] ;
paddw xmm1, xmm3 ;
paddw xmm1, xmm4 ;
psraw xmm1, xmm_filter_shift ;
movdqa xmm5, xmm1
movsxd rbx, dword ptr arg(1) ;ref_pixels_per_line
lea rsi, [rsi + rbx]
%if ABI_IS_32BIT=0
movsxd r9, dword ptr arg(3) ;src_pixels_per_line
%endif
filter_block2d_bil_var_sse2_loop:
movq xmm1, QWORD PTR [rsi] ;
movq xmm3, QWORD PTR [rsi+1] ;
punpcklbw xmm1, xmm0 ;
pmullw xmm1, [rax] ;
punpcklbw xmm3, xmm0 ;
pmullw xmm3, [rax+16] ;
paddw xmm1, xmm3 ;
paddw xmm1, xmm4 ;
psraw xmm1, xmm_filter_shift ;
movdqa xmm3, xmm5 ;
movdqa xmm5, xmm1 ;
pmullw xmm3, [rdx] ;
pmullw xmm1, [rdx+16] ;
paddw xmm1, xmm3 ;
paddw xmm1, xmm4 ;
psraw xmm1, xmm_filter_shift ;
movq xmm3, QWORD PTR [rdi] ;
punpcklbw xmm3, xmm0 ;
psubw xmm1, xmm3 ;
paddw xmm6, xmm1 ;
pmaddwd xmm1, xmm1 ;
paddd xmm7, xmm1 ;
lea rsi, [rsi + rbx] ;ref_pixels_per_line
%if ABI_IS_32BIT
add rdi, dword ptr arg(3) ;src_pixels_per_line
%else
lea rdi, [rdi + r9]
%endif
sub rcx, 1 ;
jnz filter_block2d_bil_var_sse2_loop ;
jmp filter_block2d_bil_variance
filter_block2d_bil_var_sse2_sp_only:
movsxd rdx, dword ptr arg(6) ; yoffset
cmp rdx, 0 ; skip all if both xoffset=0 and yoffset=0
je filter_block2d_bil_var_sse2_full_pixel
shl rdx, 5
lea rdx, [rdx + rcx] ; VFilter
mov rsi, arg(0) ;ref_ptr
mov rdi, arg(2) ;src_ptr
movsxd rcx, dword ptr arg(4) ;Height
movsxd rax, dword ptr arg(1) ;ref_pixels_per_line
pxor xmm0, xmm0 ;
movq xmm1, QWORD PTR [rsi] ;
punpcklbw xmm1, xmm0 ;
movsxd rbx, dword ptr arg(3) ;src_pixels_per_line
lea rsi, [rsi + rax]
filter_block2d_bil_sp_only_loop:
movq xmm3, QWORD PTR [rsi] ;
punpcklbw xmm3, xmm0 ;
movdqa xmm5, xmm3
pmullw xmm1, [rdx] ;
pmullw xmm3, [rdx+16] ;
paddw xmm1, xmm3 ;
paddw xmm1, xmm4 ;
psraw xmm1, xmm_filter_shift ;
movq xmm3, QWORD PTR [rdi] ;
punpcklbw xmm3, xmm0 ;
psubw xmm1, xmm3 ;
paddw xmm6, xmm1 ;
pmaddwd xmm1, xmm1 ;
paddd xmm7, xmm1 ;
movdqa xmm1, xmm5 ;
lea rsi, [rsi + rax] ;ref_pixels_per_line
lea rdi, [rdi + rbx] ;src_pixels_per_line
sub rcx, 1 ;
jnz filter_block2d_bil_sp_only_loop ;
jmp filter_block2d_bil_variance
filter_block2d_bil_var_sse2_full_pixel:
mov rsi, arg(0) ;ref_ptr
mov rdi, arg(2) ;src_ptr
movsxd rcx, dword ptr arg(4) ;Height
movsxd rax, dword ptr arg(1) ;ref_pixels_per_line
movsxd rbx, dword ptr arg(3) ;src_pixels_per_line
pxor xmm0, xmm0 ;
filter_block2d_bil_full_pixel_loop:
movq xmm1, QWORD PTR [rsi] ;
punpcklbw xmm1, xmm0 ;
movq xmm2, QWORD PTR [rdi] ;
punpcklbw xmm2, xmm0 ;
psubw xmm1, xmm2 ;
paddw xmm6, xmm1 ;
pmaddwd xmm1, xmm1 ;
paddd xmm7, xmm1 ;
lea rsi, [rsi + rax] ;ref_pixels_per_line
lea rdi, [rdi + rbx] ;src_pixels_per_line
sub rcx, 1 ;
jnz filter_block2d_bil_full_pixel_loop ;
jmp filter_block2d_bil_variance
filter_block2d_bil_var_sse2_fp_only:
mov rsi, arg(0) ;ref_ptr
mov rdi, arg(2) ;src_ptr
movsxd rcx, dword ptr arg(4) ;Height
movsxd rdx, dword ptr arg(1) ;ref_pixels_per_line
pxor xmm0, xmm0 ;
movsxd rbx, dword ptr arg(3) ;src_pixels_per_line
filter_block2d_bil_fp_only_loop:
movq xmm1, QWORD PTR [rsi] ;
movq xmm3, QWORD PTR [rsi+1] ;
punpcklbw xmm1, xmm0 ;
pmullw xmm1, [rax] ;
punpcklbw xmm3, xmm0 ;
pmullw xmm3, [rax+16] ;
paddw xmm1, xmm3 ;
paddw xmm1, xmm4 ;
psraw xmm1, xmm_filter_shift ;
movq xmm3, QWORD PTR [rdi] ;
punpcklbw xmm3, xmm0 ;
psubw xmm1, xmm3 ;
paddw xmm6, xmm1 ;
pmaddwd xmm1, xmm1 ;
paddd xmm7, xmm1 ;
lea rsi, [rsi + rdx]
lea rdi, [rdi + rbx] ;src_pixels_per_line
sub rcx, 1 ;
jnz filter_block2d_bil_fp_only_loop ;
jmp filter_block2d_bil_variance
filter_block2d_bil_variance:
movdq2q mm6, xmm6 ;
movdq2q mm7, xmm7 ;
psrldq xmm6, 8
psrldq xmm7, 8
movdq2q mm2, xmm6
movdq2q mm3, xmm7
paddw mm6, mm2
paddd mm7, mm3
pxor mm3, mm3 ;
pxor mm2, mm2 ;
punpcklwd mm2, mm6 ;
punpckhwd mm3, mm6 ;
paddd mm2, mm3 ;
movq mm6, mm2 ;
psrlq mm6, 32 ;
paddd mm2, mm6 ;
psrad mm2, 16 ;
movq mm4, mm7 ;
psrlq mm4, 32 ;
paddd mm4, mm7 ;
mov rsi, arg(7) ; sum
mov rdi, arg(8) ; sumsquared
movd [rsi], mm2 ; xsum
movd [rdi], mm4 ; xxsum
; begin epilog
pop rbx
pop rdi
pop rsi
RESTORE_GOT
RESTORE_XMM
UNSHADOW_ARGS
pop rbp
ret
;void vp8_half_horiz_vert_variance8x_h_sse2
;(
; unsigned char *ref_ptr,
; int ref_pixels_per_line,
; unsigned char *src_ptr,
; int src_pixels_per_line,
; unsigned int Height,
; int *sum,
; unsigned int *sumsquared
;)
global sym(vp8_half_horiz_vert_variance8x_h_sse2) PRIVATE
sym(vp8_half_horiz_vert_variance8x_h_sse2):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 7
SAVE_XMM 7
GET_GOT rbx
push rsi
push rdi
; end prolog
%if ABI_IS_32BIT=0
movsxd r8, dword ptr arg(1) ;ref_pixels_per_line
movsxd r9, dword ptr arg(3) ;src_pixels_per_line
%endif
pxor xmm6, xmm6 ; error accumulator
pxor xmm7, xmm7 ; sse accumulator
mov rsi, arg(0) ;ref_ptr ;
mov rdi, arg(2) ;src_ptr ;
movsxd rcx, dword ptr arg(4) ;Height ;
movsxd rax, dword ptr arg(1) ;ref_pixels_per_line
pxor xmm0, xmm0 ;
movq xmm5, QWORD PTR [rsi] ; xmm5 = s0,s1,s2..s8
movq xmm3, QWORD PTR [rsi+1] ; xmm3 = s1,s2,s3..s9
pavgb xmm5, xmm3 ; xmm5 = avg(xmm1,xmm3) horizontal line 1
%if ABI_IS_32BIT
add rsi, dword ptr arg(1) ;ref_pixels_per_line ; next source
%else
add rsi, r8
%endif
vp8_half_horiz_vert_variance8x_h_1:
movq xmm1, QWORD PTR [rsi] ;
movq xmm2, QWORD PTR [rsi+1] ;
pavgb xmm1, xmm2 ; xmm1 = avg(xmm1,xmm3) horizontal line i+1
pavgb xmm5, xmm1 ; xmm = vertical average of the above
punpcklbw xmm5, xmm0 ; xmm5 = words of above
movq xmm3, QWORD PTR [rdi] ; xmm3 = d0,d1,d2..d8
punpcklbw xmm3, xmm0 ; xmm3 = words of above
psubw xmm5, xmm3 ; xmm5 -= xmm3
paddw xmm6, xmm5 ; xmm6 += accumulated column differences
pmaddwd xmm5, xmm5 ; xmm5 *= xmm5
paddd xmm7, xmm5 ; xmm7 += accumulated square column differences
movdqa xmm5, xmm1 ; save xmm1 for use on the next row
%if ABI_IS_32BIT
add esi, dword ptr arg(1) ;ref_pixels_per_line ; next source
add edi, dword ptr arg(3) ;src_pixels_per_line ; next destination
%else
add rsi, r8
add rdi, r9
%endif
sub rcx, 1 ;
jnz vp8_half_horiz_vert_variance8x_h_1 ;
movdq2q mm6, xmm6 ;
movdq2q mm7, xmm7 ;
psrldq xmm6, 8
psrldq xmm7, 8
movdq2q mm2, xmm6
movdq2q mm3, xmm7
paddw mm6, mm2
paddd mm7, mm3
pxor mm3, mm3 ;
pxor mm2, mm2 ;
punpcklwd mm2, mm6 ;
punpckhwd mm3, mm6 ;
paddd mm2, mm3 ;
movq mm6, mm2 ;
psrlq mm6, 32 ;
paddd mm2, mm6 ;
psrad mm2, 16 ;
movq mm4, mm7 ;
psrlq mm4, 32 ;
paddd mm4, mm7 ;
mov rsi, arg(5) ; sum
mov rdi, arg(6) ; sumsquared
movd [rsi], mm2 ;
movd [rdi], mm4 ;
; begin epilog
pop rdi
pop rsi
RESTORE_GOT
RESTORE_XMM
UNSHADOW_ARGS
pop rbp
ret
;void vp8_half_horiz_vert_variance16x_h_sse2
;(
; unsigned char *ref_ptr,
; int ref_pixels_per_line,
; unsigned char *src_ptr,
; int src_pixels_per_line,
; unsigned int Height,
; int *sum,
; unsigned int *sumsquared
;)
global sym(vp8_half_horiz_vert_variance16x_h_sse2) PRIVATE
sym(vp8_half_horiz_vert_variance16x_h_sse2):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 7
SAVE_XMM 7
GET_GOT rbx
push rsi
push rdi
; end prolog
pxor xmm6, xmm6 ; error accumulator
pxor xmm7, xmm7 ; sse accumulator
mov rsi, arg(0) ;ref_ptr ;
mov rdi, arg(2) ;src_ptr ;
movsxd rcx, dword ptr arg(4) ;Height ;
movsxd rax, dword ptr arg(1) ;ref_pixels_per_line
movsxd rdx, dword ptr arg(3) ;src_pixels_per_line
pxor xmm0, xmm0 ;
movdqu xmm5, XMMWORD PTR [rsi]
movdqu xmm3, XMMWORD PTR [rsi+1]
pavgb xmm5, xmm3 ; xmm5 = avg(xmm1,xmm3) horizontal line 1
lea rsi, [rsi + rax]
vp8_half_horiz_vert_variance16x_h_1:
movdqu xmm1, XMMWORD PTR [rsi] ;
movdqu xmm2, XMMWORD PTR [rsi+1] ;
pavgb xmm1, xmm2 ; xmm1 = avg(xmm1,xmm3) horizontal line i+1
pavgb xmm5, xmm1 ; xmm = vertical average of the above
movdqa xmm4, xmm5
punpcklbw xmm5, xmm0 ; xmm5 = words of above
punpckhbw xmm4, xmm0
movq xmm3, QWORD PTR [rdi] ; xmm3 = d0,d1,d2..d7
punpcklbw xmm3, xmm0 ; xmm3 = words of above
psubw xmm5, xmm3 ; xmm5 -= xmm3
movq xmm3, QWORD PTR [rdi+8]
punpcklbw xmm3, xmm0
psubw xmm4, xmm3
paddw xmm6, xmm5 ; xmm6 += accumulated column differences
paddw xmm6, xmm4
pmaddwd xmm5, xmm5 ; xmm5 *= xmm5
pmaddwd xmm4, xmm4
paddd xmm7, xmm5 ; xmm7 += accumulated square column differences
paddd xmm7, xmm4
movdqa xmm5, xmm1 ; save xmm1 for use on the next row
lea rsi, [rsi + rax]
lea rdi, [rdi + rdx]
sub rcx, 1 ;
jnz vp8_half_horiz_vert_variance16x_h_1 ;
pxor xmm1, xmm1
pxor xmm5, xmm5
punpcklwd xmm0, xmm6
punpckhwd xmm1, xmm6
psrad xmm0, 16
psrad xmm1, 16
paddd xmm0, xmm1
movdqa xmm1, xmm0
movdqa xmm6, xmm7
punpckldq xmm6, xmm5
punpckhdq xmm7, xmm5
paddd xmm6, xmm7
punpckldq xmm0, xmm5
punpckhdq xmm1, xmm5
paddd xmm0, xmm1
movdqa xmm7, xmm6
movdqa xmm1, xmm0
psrldq xmm7, 8
psrldq xmm1, 8
paddd xmm6, xmm7
paddd xmm0, xmm1
mov rsi, arg(5) ;[Sum]
mov rdi, arg(6) ;[SSE]
movd [rsi], xmm0
movd [rdi], xmm6
; begin epilog
pop rdi
pop rsi
RESTORE_GOT
RESTORE_XMM
UNSHADOW_ARGS
pop rbp
ret
;void vp8_half_vert_variance8x_h_sse2
;(
; unsigned char *ref_ptr,
; int ref_pixels_per_line,
; unsigned char *src_ptr,
; int src_pixels_per_line,
; unsigned int Height,
; int *sum,
; unsigned int *sumsquared
;)
global sym(vp8_half_vert_variance8x_h_sse2) PRIVATE
sym(vp8_half_vert_variance8x_h_sse2):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 7
SAVE_XMM 7
GET_GOT rbx
push rsi
push rdi
; end prolog
%if ABI_IS_32BIT=0
movsxd r8, dword ptr arg(1) ;ref_pixels_per_line
movsxd r9, dword ptr arg(3) ;src_pixels_per_line
%endif
pxor xmm6, xmm6 ; error accumulator
pxor xmm7, xmm7 ; sse accumulator
mov rsi, arg(0) ;ref_ptr ;
mov rdi, arg(2) ;src_ptr ;
movsxd rcx, dword ptr arg(4) ;Height ;
movsxd rax, dword ptr arg(1) ;ref_pixels_per_line
pxor xmm0, xmm0 ;
vp8_half_vert_variance8x_h_1:
movq xmm5, QWORD PTR [rsi] ; xmm5 = s0,s1,s2..s8
movq xmm3, QWORD PTR [rsi+rax] ; xmm3 = s1,s2,s3..s9
pavgb xmm5, xmm3 ; xmm5 = avg(xmm1,xmm3)
punpcklbw xmm5, xmm0 ; xmm5 = words of above
movq xmm3, QWORD PTR [rdi] ; xmm3 = d0,d1,d2..d8
punpcklbw xmm3, xmm0 ; xmm3 = words of above
psubw xmm5, xmm3 ; xmm5 -= xmm3
paddw xmm6, xmm5 ; xmm6 += accumulated column differences
pmaddwd xmm5, xmm5 ; xmm5 *= xmm5
paddd xmm7, xmm5 ; xmm7 += accumulated square column differences
%if ABI_IS_32BIT
add esi, dword ptr arg(1) ;ref_pixels_per_line ; next source
add edi, dword ptr arg(3) ;src_pixels_per_line ; next destination
%else
add rsi, r8
add rdi, r9
%endif
sub rcx, 1 ;
jnz vp8_half_vert_variance8x_h_1 ;
movdq2q mm6, xmm6 ;
movdq2q mm7, xmm7 ;
psrldq xmm6, 8
psrldq xmm7, 8
movdq2q mm2, xmm6
movdq2q mm3, xmm7
paddw mm6, mm2
paddd mm7, mm3
pxor mm3, mm3 ;
pxor mm2, mm2 ;
punpcklwd mm2, mm6 ;
punpckhwd mm3, mm6 ;
paddd mm2, mm3 ;
movq mm6, mm2 ;
psrlq mm6, 32 ;
paddd mm2, mm6 ;
psrad mm2, 16 ;
movq mm4, mm7 ;
psrlq mm4, 32 ;
paddd mm4, mm7 ;
mov rsi, arg(5) ; sum
mov rdi, arg(6) ; sumsquared
movd [rsi], mm2 ;
movd [rdi], mm4 ;
; begin epilog
pop rdi
pop rsi
RESTORE_GOT
RESTORE_XMM
UNSHADOW_ARGS
pop rbp
ret
;void vp8_half_vert_variance16x_h_sse2
;(
; unsigned char *ref_ptr,
; int ref_pixels_per_line,
; unsigned char *src_ptr,
; int src_pixels_per_line,
; unsigned int Height,
; int *sum,
; unsigned int *sumsquared
;)
global sym(vp8_half_vert_variance16x_h_sse2) PRIVATE
sym(vp8_half_vert_variance16x_h_sse2):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 7
SAVE_XMM 7
GET_GOT rbx
push rsi
push rdi
; end prolog
pxor xmm6, xmm6 ; error accumulator
pxor xmm7, xmm7 ; sse accumulator
mov rsi, arg(0) ;ref_ptr
mov rdi, arg(2) ;src_ptr
movsxd rcx, dword ptr arg(4) ;Height
movsxd rax, dword ptr arg(1) ;ref_pixels_per_line
movsxd rdx, dword ptr arg(3) ;src_pixels_per_line
movdqu xmm5, XMMWORD PTR [rsi]
lea rsi, [rsi + rax ]
pxor xmm0, xmm0
vp8_half_vert_variance16x_h_1:
movdqu xmm3, XMMWORD PTR [rsi]
pavgb xmm5, xmm3 ; xmm5 = avg(xmm1,xmm3)
movdqa xmm4, xmm5
punpcklbw xmm5, xmm0
punpckhbw xmm4, xmm0
movq xmm2, QWORD PTR [rdi]
punpcklbw xmm2, xmm0
psubw xmm5, xmm2
movq xmm2, QWORD PTR [rdi+8]
punpcklbw xmm2, xmm0
psubw xmm4, xmm2
paddw xmm6, xmm5 ; xmm6 += accumulated column differences
paddw xmm6, xmm4
pmaddwd xmm5, xmm5 ; xmm5 *= xmm5
pmaddwd xmm4, xmm4
paddd xmm7, xmm5 ; xmm7 += accumulated square column differences
paddd xmm7, xmm4
movdqa xmm5, xmm3
lea rsi, [rsi + rax]
lea rdi, [rdi + rdx]
sub rcx, 1
jnz vp8_half_vert_variance16x_h_1
pxor xmm1, xmm1
pxor xmm5, xmm5
punpcklwd xmm0, xmm6
punpckhwd xmm1, xmm6
psrad xmm0, 16
psrad xmm1, 16
paddd xmm0, xmm1
movdqa xmm1, xmm0
movdqa xmm6, xmm7
punpckldq xmm6, xmm5
punpckhdq xmm7, xmm5
paddd xmm6, xmm7
punpckldq xmm0, xmm5
punpckhdq xmm1, xmm5
paddd xmm0, xmm1
movdqa xmm7, xmm6
movdqa xmm1, xmm0
psrldq xmm7, 8
psrldq xmm1, 8
paddd xmm6, xmm7
paddd xmm0, xmm1
mov rsi, arg(5) ;[Sum]
mov rdi, arg(6) ;[SSE]
movd [rsi], xmm0
movd [rdi], xmm6
; begin epilog
pop rdi
pop rsi
RESTORE_GOT
RESTORE_XMM
UNSHADOW_ARGS
pop rbp
ret
;void vp8_half_horiz_variance8x_h_sse2
;(
; unsigned char *ref_ptr,
; int ref_pixels_per_line,
; unsigned char *src_ptr,
; int src_pixels_per_line,
; unsigned int Height,
; int *sum,
; unsigned int *sumsquared
;)
global sym(vp8_half_horiz_variance8x_h_sse2) PRIVATE
sym(vp8_half_horiz_variance8x_h_sse2):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 7
SAVE_XMM 7
GET_GOT rbx
push rsi
push rdi
; end prolog
%if ABI_IS_32BIT=0
movsxd r8, dword ptr arg(1) ;ref_pixels_per_line
movsxd r9, dword ptr arg(3) ;src_pixels_per_line
%endif
pxor xmm6, xmm6 ; error accumulator
pxor xmm7, xmm7 ; sse eaccumulator
mov rsi, arg(0) ;ref_ptr ;
mov rdi, arg(2) ;src_ptr ;
movsxd rcx, dword ptr arg(4) ;Height ;
pxor xmm0, xmm0 ;
vp8_half_horiz_variance8x_h_1:
movq xmm5, QWORD PTR [rsi] ; xmm5 = s0,s1,s2..s8
movq xmm3, QWORD PTR [rsi+1] ; xmm3 = s1,s2,s3..s9
pavgb xmm5, xmm3 ; xmm5 = avg(xmm1,xmm3)
punpcklbw xmm5, xmm0 ; xmm5 = words of above
movq xmm3, QWORD PTR [rdi] ; xmm3 = d0,d1,d2..d8
punpcklbw xmm3, xmm0 ; xmm3 = words of above
psubw xmm5, xmm3 ; xmm5 -= xmm3
paddw xmm6, xmm5 ; xmm6 += accumulated column differences
pmaddwd xmm5, xmm5 ; xmm5 *= xmm5
paddd xmm7, xmm5 ; xmm7 += accumulated square column differences
%if ABI_IS_32BIT
add esi, dword ptr arg(1) ;ref_pixels_per_line ; next source
add edi, dword ptr arg(3) ;src_pixels_per_line ; next destination
%else
add rsi, r8
add rdi, r9
%endif
sub rcx, 1 ;
jnz vp8_half_horiz_variance8x_h_1 ;
movdq2q mm6, xmm6 ;
movdq2q mm7, xmm7 ;
psrldq xmm6, 8
psrldq xmm7, 8
movdq2q mm2, xmm6
movdq2q mm3, xmm7
paddw mm6, mm2
paddd mm7, mm3
pxor mm3, mm3 ;
pxor mm2, mm2 ;
punpcklwd mm2, mm6 ;
punpckhwd mm3, mm6 ;
paddd mm2, mm3 ;
movq mm6, mm2 ;
psrlq mm6, 32 ;
paddd mm2, mm6 ;
psrad mm2, 16 ;
movq mm4, mm7 ;
psrlq mm4, 32 ;
paddd mm4, mm7 ;
mov rsi, arg(5) ; sum
mov rdi, arg(6) ; sumsquared
movd [rsi], mm2 ;
movd [rdi], mm4 ;
; begin epilog
pop rdi
pop rsi
RESTORE_GOT
RESTORE_XMM
UNSHADOW_ARGS
pop rbp
ret
;void vp8_half_horiz_variance16x_h_sse2
;(
; unsigned char *ref_ptr,
; int ref_pixels_per_line,
; unsigned char *src_ptr,
; int src_pixels_per_line,
; unsigned int Height,
; int *sum,
; unsigned int *sumsquared
;)
global sym(vp8_half_horiz_variance16x_h_sse2) PRIVATE
sym(vp8_half_horiz_variance16x_h_sse2):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 7
SAVE_XMM 7
GET_GOT rbx
push rsi
push rdi
; end prolog
pxor xmm6, xmm6 ; error accumulator
pxor xmm7, xmm7 ; sse accumulator
mov rsi, arg(0) ;ref_ptr ;
mov rdi, arg(2) ;src_ptr ;
movsxd rcx, dword ptr arg(4) ;Height ;
movsxd rax, dword ptr arg(1) ;ref_pixels_per_line
movsxd rdx, dword ptr arg(3) ;src_pixels_per_line
pxor xmm0, xmm0 ;
vp8_half_horiz_variance16x_h_1:
movdqu xmm5, XMMWORD PTR [rsi] ; xmm5 = s0,s1,s2..s15
movdqu xmm3, XMMWORD PTR [rsi+1] ; xmm3 = s1,s2,s3..s16
pavgb xmm5, xmm3 ; xmm5 = avg(xmm1,xmm3)
movdqa xmm1, xmm5
punpcklbw xmm5, xmm0 ; xmm5 = words of above
punpckhbw xmm1, xmm0
movq xmm3, QWORD PTR [rdi] ; xmm3 = d0,d1,d2..d7
punpcklbw xmm3, xmm0 ; xmm3 = words of above
movq xmm2, QWORD PTR [rdi+8]
punpcklbw xmm2, xmm0
psubw xmm5, xmm3 ; xmm5 -= xmm3
psubw xmm1, xmm2
paddw xmm6, xmm5 ; xmm6 += accumulated column differences
paddw xmm6, xmm1
pmaddwd xmm5, xmm5 ; xmm5 *= xmm5
pmaddwd xmm1, xmm1
paddd xmm7, xmm5 ; xmm7 += accumulated square column differences
paddd xmm7, xmm1
lea rsi, [rsi + rax]
lea rdi, [rdi + rdx]
sub rcx, 1 ;
jnz vp8_half_horiz_variance16x_h_1 ;
pxor xmm1, xmm1
pxor xmm5, xmm5
punpcklwd xmm0, xmm6
punpckhwd xmm1, xmm6
psrad xmm0, 16
psrad xmm1, 16
paddd xmm0, xmm1
movdqa xmm1, xmm0
movdqa xmm6, xmm7
punpckldq xmm6, xmm5
punpckhdq xmm7, xmm5
paddd xmm6, xmm7
punpckldq xmm0, xmm5
punpckhdq xmm1, xmm5
paddd xmm0, xmm1
movdqa xmm7, xmm6
movdqa xmm1, xmm0
psrldq xmm7, 8
psrldq xmm1, 8
paddd xmm6, xmm7
paddd xmm0, xmm1
mov rsi, arg(5) ;[Sum]
mov rdi, arg(6) ;[SSE]
movd [rsi], xmm0
movd [rdi], xmm6
; begin epilog
pop rdi
pop rsi
RESTORE_GOT
RESTORE_XMM
UNSHADOW_ARGS
pop rbp
ret
SECTION_RODATA
; short xmm_bi_rd[8] = { 64, 64, 64, 64,64, 64, 64, 64};
align 16
xmm_bi_rd:
times 8 dw 64
align 16
vp8_bilinear_filters_sse2:
dw 128, 128, 128, 128, 128, 128, 128, 128, 0, 0, 0, 0, 0, 0, 0, 0
dw 112, 112, 112, 112, 112, 112, 112, 112, 16, 16, 16, 16, 16, 16, 16, 16
dw 96, 96, 96, 96, 96, 96, 96, 96, 32, 32, 32, 32, 32, 32, 32, 32
dw 80, 80, 80, 80, 80, 80, 80, 80, 48, 48, 48, 48, 48, 48, 48, 48
dw 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64
dw 48, 48, 48, 48, 48, 48, 48, 48, 80, 80, 80, 80, 80, 80, 80, 80
dw 32, 32, 32, 32, 32, 32, 32, 32, 96, 96, 96, 96, 96, 96, 96, 96
dw 16, 16, 16, 16, 16, 16, 16, 16, 112, 112, 112, 112, 112, 112, 112, 112
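;Each row of vp8_bilinear_filters_sse2 stores the first tap in its first eight
;words and the second tap in the next eight; the taps sum to 128
;(VP8_FILTER_WEIGHT), so pmullw followed by the xmm_bi_rd rounding constant
;and a shift by 7 produces the bilinear blend.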

View File

@@ -1,364 +0,0 @@
;
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;
%include "vpx_ports/x86_abi_support.asm"
%define xmm_filter_shift 7
;void vp8_filter_block2d_bil_var_ssse3
;(
; unsigned char *ref_ptr,
; int ref_pixels_per_line,
; unsigned char *src_ptr,
; int src_pixels_per_line,
; unsigned int Height,
; int xoffset,
; int yoffset,
; int *sum,
; unsigned int *sumsquared;;
;
;)
;Note: The filter coefficient at offset=0 is 128. Since the second register
;for Pmaddubsw is signed bytes, we must calculate zero offset separately.
global sym(vp8_filter_block2d_bil_var_ssse3) PRIVATE
sym(vp8_filter_block2d_bil_var_ssse3):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 9
SAVE_XMM 7
GET_GOT rbx
push rsi
push rdi
; end prolog
pxor xmm6, xmm6
pxor xmm7, xmm7
lea rcx, [GLOBAL(vp8_bilinear_filters_ssse3)]
movsxd rax, dword ptr arg(5) ; xoffset
cmp rax, 0 ; skip first_pass filter if xoffset=0
je .filter_block2d_bil_var_ssse3_sp_only
shl rax, 4 ; point to filter coeff with xoffset
lea rax, [rax + rcx] ; HFilter
movsxd rdx, dword ptr arg(6) ; yoffset
cmp rdx, 0 ; skip second_pass filter if yoffset=0
je .filter_block2d_bil_var_ssse3_fp_only
shl rdx, 4
lea rdx, [rdx + rcx] ; VFilter
mov rsi, arg(0) ;ref_ptr
mov rdi, arg(2) ;src_ptr
movsxd rcx, dword ptr arg(4) ;Height
movdqu xmm0, XMMWORD PTR [rsi]
movdqu xmm1, XMMWORD PTR [rsi+1]
movdqa xmm2, xmm0
punpcklbw xmm0, xmm1
punpckhbw xmm2, xmm1
pmaddubsw xmm0, [rax]
pmaddubsw xmm2, [rax]
paddw xmm0, [GLOBAL(xmm_bi_rd)]
paddw xmm2, [GLOBAL(xmm_bi_rd)]
psraw xmm0, xmm_filter_shift
psraw xmm2, xmm_filter_shift
packuswb xmm0, xmm2
%if ABI_IS_32BIT
add rsi, dword ptr arg(1) ;ref_pixels_per_line
%else
movsxd r8, dword ptr arg(1) ;ref_pixels_per_line
movsxd r9, dword ptr arg(3) ;src_pixels_per_line
lea rsi, [rsi + r8]
%endif
.filter_block2d_bil_var_ssse3_loop:
movdqu xmm1, XMMWORD PTR [rsi]
movdqu xmm2, XMMWORD PTR [rsi+1]
movdqa xmm3, xmm1
punpcklbw xmm1, xmm2
punpckhbw xmm3, xmm2
pmaddubsw xmm1, [rax]
pmaddubsw xmm3, [rax]
paddw xmm1, [GLOBAL(xmm_bi_rd)]
paddw xmm3, [GLOBAL(xmm_bi_rd)]
psraw xmm1, xmm_filter_shift
psraw xmm3, xmm_filter_shift
packuswb xmm1, xmm3
movdqa xmm2, xmm0
movdqa xmm0, xmm1
movdqa xmm3, xmm2
punpcklbw xmm2, xmm1
punpckhbw xmm3, xmm1
pmaddubsw xmm2, [rdx]
pmaddubsw xmm3, [rdx]
paddw xmm2, [GLOBAL(xmm_bi_rd)]
paddw xmm3, [GLOBAL(xmm_bi_rd)]
psraw xmm2, xmm_filter_shift
psraw xmm3, xmm_filter_shift
movq xmm1, QWORD PTR [rdi]
pxor xmm4, xmm4
punpcklbw xmm1, xmm4
movq xmm5, QWORD PTR [rdi+8]
punpcklbw xmm5, xmm4
psubw xmm2, xmm1
psubw xmm3, xmm5
paddw xmm6, xmm2
paddw xmm6, xmm3
pmaddwd xmm2, xmm2
pmaddwd xmm3, xmm3
paddd xmm7, xmm2
paddd xmm7, xmm3
%if ABI_IS_32BIT
add rsi, dword ptr arg(1) ;ref_pixels_per_line
add rdi, dword ptr arg(3) ;src_pixels_per_line
%else
lea rsi, [rsi + r8]
lea rdi, [rdi + r9]
%endif
sub rcx, 1
jnz .filter_block2d_bil_var_ssse3_loop
jmp .filter_block2d_bil_variance
.filter_block2d_bil_var_ssse3_sp_only:
movsxd rdx, dword ptr arg(6) ; yoffset
cmp rdx, 0 ; Both xoffset =0 and yoffset=0
je .filter_block2d_bil_var_ssse3_full_pixel
shl rdx, 4
lea rdx, [rdx + rcx] ; VFilter
mov rsi, arg(0) ;ref_ptr
mov rdi, arg(2) ;src_ptr
movsxd rcx, dword ptr arg(4) ;Height
movsxd rax, dword ptr arg(1) ;ref_pixels_per_line
movdqu xmm1, XMMWORD PTR [rsi]
movdqa xmm0, xmm1
%if ABI_IS_32BIT=0
movsxd r9, dword ptr arg(3) ;src_pixels_per_line
%endif
lea rsi, [rsi + rax]
.filter_block2d_bil_sp_only_loop:
movdqu xmm3, XMMWORD PTR [rsi]
movdqa xmm2, xmm1
movdqa xmm0, xmm3
punpcklbw xmm1, xmm3
punpckhbw xmm2, xmm3
pmaddubsw xmm1, [rdx]
pmaddubsw xmm2, [rdx]
paddw xmm1, [GLOBAL(xmm_bi_rd)]
paddw xmm2, [GLOBAL(xmm_bi_rd)]
psraw xmm1, xmm_filter_shift
psraw xmm2, xmm_filter_shift
movq xmm3, QWORD PTR [rdi]
pxor xmm4, xmm4
punpcklbw xmm3, xmm4
movq xmm5, QWORD PTR [rdi+8]
punpcklbw xmm5, xmm4
psubw xmm1, xmm3
psubw xmm2, xmm5
paddw xmm6, xmm1
paddw xmm6, xmm2
pmaddwd xmm1, xmm1
pmaddwd xmm2, xmm2
paddd xmm7, xmm1
paddd xmm7, xmm2
movdqa xmm1, xmm0
lea rsi, [rsi + rax] ;ref_pixels_per_line
%if ABI_IS_32BIT
add rdi, dword ptr arg(3) ;src_pixels_per_line
%else
lea rdi, [rdi + r9]
%endif
sub rcx, 1
jnz .filter_block2d_bil_sp_only_loop
jmp .filter_block2d_bil_variance
.filter_block2d_bil_var_ssse3_full_pixel:
mov rsi, arg(0) ;ref_ptr
mov rdi, arg(2) ;src_ptr
movsxd rcx, dword ptr arg(4) ;Height
movsxd rax, dword ptr arg(1) ;ref_pixels_per_line
movsxd rdx, dword ptr arg(3) ;src_pixels_per_line
pxor xmm0, xmm0
.filter_block2d_bil_full_pixel_loop:
movq xmm1, QWORD PTR [rsi]
punpcklbw xmm1, xmm0
movq xmm2, QWORD PTR [rsi+8]
punpcklbw xmm2, xmm0
movq xmm3, QWORD PTR [rdi]
punpcklbw xmm3, xmm0
movq xmm4, QWORD PTR [rdi+8]
punpcklbw xmm4, xmm0
psubw xmm1, xmm3
psubw xmm2, xmm4
paddw xmm6, xmm1
paddw xmm6, xmm2
pmaddwd xmm1, xmm1
pmaddwd xmm2, xmm2
paddd xmm7, xmm1
paddd xmm7, xmm2
lea rsi, [rsi + rax] ;ref_pixels_per_line
lea rdi, [rdi + rdx] ;src_pixels_per_line
sub rcx, 1
jnz .filter_block2d_bil_full_pixel_loop
jmp .filter_block2d_bil_variance
.filter_block2d_bil_var_ssse3_fp_only:
mov rsi, arg(0) ;ref_ptr
mov rdi, arg(2) ;src_ptr
movsxd rcx, dword ptr arg(4) ;Height
movsxd rdx, dword ptr arg(1) ;ref_pixels_per_line
pxor xmm0, xmm0
%if ABI_IS_32BIT=0
movsxd r9, dword ptr arg(3) ;src_pixels_per_line
%endif
.filter_block2d_bil_fp_only_loop:
movdqu xmm1, XMMWORD PTR [rsi]
movdqu xmm2, XMMWORD PTR [rsi+1]
movdqa xmm3, xmm1
punpcklbw xmm1, xmm2
punpckhbw xmm3, xmm2
pmaddubsw xmm1, [rax]
pmaddubsw xmm3, [rax]
paddw xmm1, [GLOBAL(xmm_bi_rd)]
paddw xmm3, [GLOBAL(xmm_bi_rd)]
psraw xmm1, xmm_filter_shift
psraw xmm3, xmm_filter_shift
movq xmm2, XMMWORD PTR [rdi]
pxor xmm4, xmm4
punpcklbw xmm2, xmm4
movq xmm5, QWORD PTR [rdi+8]
punpcklbw xmm5, xmm4
psubw xmm1, xmm2
psubw xmm3, xmm5
paddw xmm6, xmm1
paddw xmm6, xmm3
pmaddwd xmm1, xmm1
pmaddwd xmm3, xmm3
paddd xmm7, xmm1
paddd xmm7, xmm3
lea rsi, [rsi + rdx]
%if ABI_IS_32BIT
add rdi, dword ptr arg(3) ;src_pixels_per_line
%else
lea rdi, [rdi + r9]
%endif
sub rcx, 1
jnz .filter_block2d_bil_fp_only_loop
jmp .filter_block2d_bil_variance
.filter_block2d_bil_variance:
pxor xmm0, xmm0
pxor xmm1, xmm1
pxor xmm5, xmm5
punpcklwd xmm0, xmm6
punpckhwd xmm1, xmm6
psrad xmm0, 16
psrad xmm1, 16
paddd xmm0, xmm1
movdqa xmm1, xmm0
movdqa xmm6, xmm7
punpckldq xmm6, xmm5
punpckhdq xmm7, xmm5
paddd xmm6, xmm7
punpckldq xmm0, xmm5
punpckhdq xmm1, xmm5
paddd xmm0, xmm1
movdqa xmm7, xmm6
movdqa xmm1, xmm0
psrldq xmm7, 8
psrldq xmm1, 8
paddd xmm6, xmm7
paddd xmm0, xmm1
mov rsi, arg(7) ;[Sum]
mov rdi, arg(8) ;[SSE]
movd [rsi], xmm0
movd [rdi], xmm6
; begin epilog
pop rdi
pop rsi
RESTORE_GOT
RESTORE_XMM
UNSHADOW_ARGS
pop rbp
ret
SECTION_RODATA
align 16
xmm_bi_rd:
times 8 dw 64
align 16
vp8_bilinear_filters_ssse3:
times 8 db 128, 0
times 8 db 112, 16
times 8 db 96, 32
times 8 db 80, 48
times 8 db 64, 64
times 8 db 48, 80
times 8 db 32, 96
times 8 db 16, 112
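;Here each row interleaves eight (tap0, tap1) byte pairs for pmaddubsw; the
;offset-0 row {128, 0} is never multiplied because 128 does not fit in a
;signed byte, which is why the zero-offset cases above take separate paths.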

View File

@@ -1,157 +0,0 @@
/*
* Copyright (c) 2010 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include "./vp8_rtcd.h"
#include "vpx_config.h"
#include "vp8/common/variance.h"
#include "vpx_ports/mem.h"
extern void vp8_half_horiz_vert_variance16x_h_sse2
(
const unsigned char *ref_ptr,
int ref_pixels_per_line,
const unsigned char *src_ptr,
int src_pixels_per_line,
unsigned int Height,
int *sum,
unsigned int *sumsquared
);
extern void vp8_half_horiz_variance16x_h_sse2
(
const unsigned char *ref_ptr,
int ref_pixels_per_line,
const unsigned char *src_ptr,
int src_pixels_per_line,
unsigned int Height,
int *sum,
unsigned int *sumsquared
);
extern void vp8_half_vert_variance16x_h_sse2
(
const unsigned char *ref_ptr,
int ref_pixels_per_line,
const unsigned char *src_ptr,
int src_pixels_per_line,
unsigned int Height,
int *sum,
unsigned int *sumsquared
);
extern void vp8_filter_block2d_bil_var_ssse3
(
const unsigned char *ref_ptr,
int ref_pixels_per_line,
const unsigned char *src_ptr,
int src_pixels_per_line,
unsigned int Height,
int xoffset,
int yoffset,
int *sum,
unsigned int *sumsquared
);
unsigned int vp8_sub_pixel_variance16x16_ssse3
(
const unsigned char *src_ptr,
int src_pixels_per_line,
int xoffset,
int yoffset,
const unsigned char *dst_ptr,
int dst_pixels_per_line,
unsigned int *sse
)
{
int xsum0;
unsigned int xxsum0;
/* note we could avoid these if statements if the calling function
* just called the appropriate functions inside.
*/
if (xoffset == 4 && yoffset == 0)
{
vp8_half_horiz_variance16x_h_sse2(
src_ptr, src_pixels_per_line,
dst_ptr, dst_pixels_per_line, 16,
&xsum0, &xxsum0);
}
else if (xoffset == 0 && yoffset == 4)
{
vp8_half_vert_variance16x_h_sse2(
src_ptr, src_pixels_per_line,
dst_ptr, dst_pixels_per_line, 16,
&xsum0, &xxsum0);
}
else if (xoffset == 4 && yoffset == 4)
{
vp8_half_horiz_vert_variance16x_h_sse2(
src_ptr, src_pixels_per_line,
dst_ptr, dst_pixels_per_line, 16,
&xsum0, &xxsum0);
}
else
{
vp8_filter_block2d_bil_var_ssse3(
src_ptr, src_pixels_per_line,
dst_ptr, dst_pixels_per_line, 16,
xoffset, yoffset,
&xsum0, &xxsum0);
}
*sse = xxsum0;
return (xxsum0 - (((unsigned int)xsum0 * xsum0) >> 8));
}
unsigned int vp8_sub_pixel_variance16x8_ssse3
(
const unsigned char *src_ptr,
int src_pixels_per_line,
int xoffset,
int yoffset,
const unsigned char *dst_ptr,
int dst_pixels_per_line,
unsigned int *sse
)
{
int xsum0;
unsigned int xxsum0;
if (xoffset == 4 && yoffset == 0)
{
vp8_half_horiz_variance16x_h_sse2(
src_ptr, src_pixels_per_line,
dst_ptr, dst_pixels_per_line, 8,
&xsum0, &xxsum0);
}
else if (xoffset == 0 && yoffset == 4)
{
vp8_half_vert_variance16x_h_sse2(
src_ptr, src_pixels_per_line,
dst_ptr, dst_pixels_per_line, 8,
&xsum0, &xxsum0);
}
else if (xoffset == 4 && yoffset == 4)
{
vp8_half_horiz_vert_variance16x_h_sse2(
src_ptr, src_pixels_per_line,
dst_ptr, dst_pixels_per_line, 8,
&xsum0, &xxsum0);
}
else
{
vp8_filter_block2d_bil_var_ssse3(
src_ptr, src_pixels_per_line,
dst_ptr, dst_pixels_per_line, 8,
xoffset, yoffset,
&xsum0, &xxsum0);
}
*sse = xxsum0;
return (xxsum0 - (((unsigned int)xsum0 * xsum0) >> 7));
}

View File

@@ -1,353 +0,0 @@
;
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;
%include "vpx_ports/x86_abi_support.asm"
%define mmx_filter_shift 7
;void vp8_filter_block2d_bil4x4_var_mmx
;(
; unsigned char *ref_ptr,
; int ref_pixels_per_line,
; unsigned char *src_ptr,
; int src_pixels_per_line,
; unsigned short *HFilter,
; unsigned short *VFilter,
; int *sum,
; unsigned int *sumsquared
;)
global sym(vp8_filter_block2d_bil4x4_var_mmx) PRIVATE
sym(vp8_filter_block2d_bil4x4_var_mmx):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 8
GET_GOT rbx
push rsi
push rdi
sub rsp, 16
; end prolog
pxor mm6, mm6 ;
pxor mm7, mm7 ;
mov rax, arg(4) ;HFilter ;
mov rdx, arg(5) ;VFilter ;
mov rsi, arg(0) ;ref_ptr ;
mov rdi, arg(2) ;src_ptr ;
mov rcx, 4 ;
pxor mm0, mm0 ;
movd mm1, [rsi] ;
movd mm3, [rsi+1] ;
punpcklbw mm1, mm0 ;
pmullw mm1, [rax] ;
punpcklbw mm3, mm0 ;
pmullw mm3, [rax+8] ;
paddw mm1, mm3 ;
paddw mm1, [GLOBAL(mmx_bi_rd)] ;
psraw mm1, mmx_filter_shift ;
movq mm5, mm1
%if ABI_IS_32BIT
add rsi, dword ptr arg(1) ;ref_pixels_per_line ;
%else
movsxd r8, dword ptr arg(1) ;ref_pixels_per_line ;
add rsi, r8
%endif
.filter_block2d_bil4x4_var_mmx_loop:
movd mm1, [rsi] ;
movd mm3, [rsi+1] ;
punpcklbw mm1, mm0 ;
pmullw mm1, [rax] ;
punpcklbw mm3, mm0 ;
pmullw mm3, [rax+8] ;
paddw mm1, mm3 ;
paddw mm1, [GLOBAL(mmx_bi_rd)] ;
psraw mm1, mmx_filter_shift ;
movq mm3, mm5 ;
movq mm5, mm1 ;
pmullw mm3, [rdx] ;
pmullw mm1, [rdx+8] ;
paddw mm1, mm3 ;
paddw mm1, [GLOBAL(mmx_bi_rd)] ;
psraw mm1, mmx_filter_shift ;
movd mm3, [rdi] ;
punpcklbw mm3, mm0 ;
psubw mm1, mm3 ;
paddw mm6, mm1 ;
pmaddwd mm1, mm1 ;
paddd mm7, mm1 ;
%if ABI_IS_32BIT
add rsi, dword ptr arg(1) ;ref_pixels_per_line ;
add rdi, dword ptr arg(3) ;src_pixels_per_line ;
%else
movsxd r8, dword ptr arg(1) ;ref_pixels_per_line
movsxd r9, dword ptr arg(3) ;src_pixels_per_line
add rsi, r8
add rdi, r9
%endif
sub rcx, 1 ;
jnz .filter_block2d_bil4x4_var_mmx_loop ;
pxor mm3, mm3 ;
pxor mm2, mm2 ;
punpcklwd mm2, mm6 ;
punpckhwd mm3, mm6 ;
paddd mm2, mm3 ;
movq mm6, mm2 ;
psrlq mm6, 32 ;
paddd mm2, mm6 ;
psrad mm2, 16 ;
movq mm4, mm7 ;
psrlq mm4, 32 ;
paddd mm4, mm7 ;
mov rdi, arg(6) ;sum
mov rsi, arg(7) ;sumsquared
movd dword ptr [rdi], mm2 ;
movd dword ptr [rsi], mm4 ;
; begin epilog
add rsp, 16
pop rdi
pop rsi
RESTORE_GOT
UNSHADOW_ARGS
pop rbp
ret
;void vp8_filter_block2d_bil_var_mmx
;(
; unsigned char *ref_ptr,
; int ref_pixels_per_line,
; unsigned char *src_ptr,
; int src_pixels_per_line,
; unsigned int Height,
; unsigned short *HFilter,
; unsigned short *VFilter,
; int *sum,
; unsigned int *sumsquared
;)
global sym(vp8_filter_block2d_bil_var_mmx) PRIVATE
sym(vp8_filter_block2d_bil_var_mmx):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 9
GET_GOT rbx
push rsi
push rdi
sub rsp, 16
; end prolog
pxor mm6, mm6 ;
pxor mm7, mm7 ;
mov rax, arg(5) ;HFilter ;
mov rdx, arg(6) ;VFilter ;
mov rsi, arg(0) ;ref_ptr ;
mov rdi, arg(2) ;src_ptr ;
movsxd rcx, dword ptr arg(4) ;Height ;
pxor mm0, mm0 ;
movq mm1, [rsi] ;
movq mm3, [rsi+1] ;
movq mm2, mm1 ;
movq mm4, mm3 ;
punpcklbw mm1, mm0 ;
punpckhbw mm2, mm0 ;
pmullw mm1, [rax] ;
pmullw mm2, [rax] ;
punpcklbw mm3, mm0 ;
punpckhbw mm4, mm0 ;
pmullw mm3, [rax+8] ;
pmullw mm4, [rax+8] ;
paddw mm1, mm3 ;
paddw mm2, mm4 ;
paddw mm1, [GLOBAL(mmx_bi_rd)] ;
psraw mm1, mmx_filter_shift ;
paddw mm2, [GLOBAL(mmx_bi_rd)] ;
psraw mm2, mmx_filter_shift ;
movq mm5, mm1
packuswb mm5, mm2 ;
%if ABI_IS_32BIT
add rsi, dword ptr arg(1) ;ref_pixels_per_line
%else
movsxd r8, dword ptr arg(1) ;ref_pixels_per_line
add rsi, r8
%endif
.filter_block2d_bil_var_mmx_loop:
movq mm1, [rsi] ;
movq mm3, [rsi+1] ;
movq mm2, mm1 ;
movq mm4, mm3 ;
punpcklbw mm1, mm0 ;
punpckhbw mm2, mm0 ;
pmullw mm1, [rax] ;
pmullw mm2, [rax] ;
punpcklbw mm3, mm0 ;
punpckhbw mm4, mm0 ;
pmullw mm3, [rax+8] ;
pmullw mm4, [rax+8] ;
paddw mm1, mm3 ;
paddw mm2, mm4 ;
paddw mm1, [GLOBAL(mmx_bi_rd)] ;
psraw mm1, mmx_filter_shift ;
paddw mm2, [GLOBAL(mmx_bi_rd)] ;
psraw mm2, mmx_filter_shift ;
movq mm3, mm5 ;
movq mm4, mm5 ;
punpcklbw mm3, mm0 ;
punpckhbw mm4, mm0 ;
movq mm5, mm1 ;
packuswb mm5, mm2 ;
pmullw mm3, [rdx] ;
pmullw mm4, [rdx] ;
pmullw mm1, [rdx+8] ;
pmullw mm2, [rdx+8] ;
paddw mm1, mm3 ;
paddw mm2, mm4 ;
paddw mm1, [GLOBAL(mmx_bi_rd)] ;
paddw mm2, [GLOBAL(mmx_bi_rd)] ;
psraw mm1, mmx_filter_shift ;
psraw mm2, mmx_filter_shift ;
movq mm3, [rdi] ;
movq mm4, mm3 ;
punpcklbw mm3, mm0 ;
punpckhbw mm4, mm0 ;
psubw mm1, mm3 ;
psubw mm2, mm4 ;
paddw mm6, mm1 ;
pmaddwd mm1, mm1 ;
paddw mm6, mm2 ;
pmaddwd mm2, mm2 ;
paddd mm7, mm1 ;
paddd mm7, mm2 ;
%if ABI_IS_32BIT
add rsi, dword ptr arg(1) ;ref_pixels_per_line ;
add rdi, dword ptr arg(3) ;src_pixels_per_line ;
%else
movsxd r8, dword ptr arg(1) ;ref_pixels_per_line ;
movsxd r9, dword ptr arg(3) ;src_pixels_per_line ;
add rsi, r8
add rdi, r9
%endif
sub rcx, 1 ;
jnz .filter_block2d_bil_var_mmx_loop ;
pxor mm3, mm3 ;
pxor mm2, mm2 ;
punpcklwd mm2, mm6 ;
punpckhwd mm3, mm6 ;
paddd mm2, mm3 ;
movq mm6, mm2 ;
psrlq mm6, 32 ;
paddd mm2, mm6 ;
psrad mm2, 16 ;
movq mm4, mm7 ;
psrlq mm4, 32 ;
paddd mm4, mm7 ;
mov rdi, arg(7) ;sum
mov rsi, arg(8) ;sumsquared
movd dword ptr [rdi], mm2 ;
movd dword ptr [rsi], mm4 ;
; begin epilog
add rsp, 16
pop rdi
pop rsi
RESTORE_GOT
UNSHADOW_ARGS
pop rbp
ret
SECTION_RODATA
;short mmx_bi_rd[4] = { 64, 64, 64, 64};
align 16
mmx_bi_rd:
times 4 dw 64

View File

@@ -1,244 +0,0 @@
/*
* Copyright (c) 2010 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include "./vp8_rtcd.h"
#include "vpx_config.h"
#include "vp8/common/variance.h"
#include "vpx_ports/mem.h"
#include "vp8/common/x86/filter_x86.h"
extern void filter_block1d_h6_mmx
(
const unsigned char *src_ptr,
unsigned short *output_ptr,
unsigned int src_pixels_per_line,
unsigned int pixel_step,
unsigned int output_height,
unsigned int output_width,
short *filter
);
extern void filter_block1d_v6_mmx
(
const short *src_ptr,
unsigned char *output_ptr,
unsigned int pixels_per_line,
unsigned int pixel_step,
unsigned int output_height,
unsigned int output_width,
short *filter
);
extern void vp8_filter_block2d_bil4x4_var_mmx
(
const unsigned char *ref_ptr,
int ref_pixels_per_line,
const unsigned char *src_ptr,
int src_pixels_per_line,
const short *HFilter,
const short *VFilter,
int *sum,
unsigned int *sumsquared
);
extern void vp8_filter_block2d_bil_var_mmx
(
const unsigned char *ref_ptr,
int ref_pixels_per_line,
const unsigned char *src_ptr,
int src_pixels_per_line,
unsigned int Height,
const short *HFilter,
const short *VFilter,
int *sum,
unsigned int *sumsquared
);
unsigned int vp8_sub_pixel_variance4x4_mmx
(
const unsigned char *src_ptr,
int src_pixels_per_line,
int xoffset,
int yoffset,
const unsigned char *dst_ptr,
int dst_pixels_per_line,
unsigned int *sse)
{
int xsum;
unsigned int xxsum;
vp8_filter_block2d_bil4x4_var_mmx(
src_ptr, src_pixels_per_line,
dst_ptr, dst_pixels_per_line,
vp8_bilinear_filters_x86_4[xoffset], vp8_bilinear_filters_x86_4[yoffset],
&xsum, &xxsum
);
*sse = xxsum;
return (xxsum - (((unsigned int)xsum * xsum) >> 4));
}
unsigned int vp8_sub_pixel_variance8x8_mmx
(
const unsigned char *src_ptr,
int src_pixels_per_line,
int xoffset,
int yoffset,
const unsigned char *dst_ptr,
int dst_pixels_per_line,
unsigned int *sse
)
{
int xsum;
unsigned int xxsum;
vp8_filter_block2d_bil_var_mmx(
src_ptr, src_pixels_per_line,
dst_ptr, dst_pixels_per_line, 8,
vp8_bilinear_filters_x86_4[xoffset], vp8_bilinear_filters_x86_4[yoffset],
&xsum, &xxsum
);
*sse = xxsum;
return (xxsum - (((unsigned int)xsum * xsum) >> 6));
}
unsigned int vp8_sub_pixel_variance16x16_mmx
(
const unsigned char *src_ptr,
int src_pixels_per_line,
int xoffset,
int yoffset,
const unsigned char *dst_ptr,
int dst_pixels_per_line,
unsigned int *sse
)
{
int xsum0, xsum1;
unsigned int xxsum0, xxsum1;
vp8_filter_block2d_bil_var_mmx(
src_ptr, src_pixels_per_line,
dst_ptr, dst_pixels_per_line, 16,
vp8_bilinear_filters_x86_4[xoffset], vp8_bilinear_filters_x86_4[yoffset],
&xsum0, &xxsum0
);
vp8_filter_block2d_bil_var_mmx(
src_ptr + 8, src_pixels_per_line,
dst_ptr + 8, dst_pixels_per_line, 16,
vp8_bilinear_filters_x86_4[xoffset], vp8_bilinear_filters_x86_4[yoffset],
&xsum1, &xxsum1
);
xsum0 += xsum1;
xxsum0 += xxsum1;
*sse = xxsum0;
return (xxsum0 - (((unsigned int)xsum0 * xsum0) >> 8));
}
unsigned int vp8_sub_pixel_variance16x8_mmx
(
const unsigned char *src_ptr,
int src_pixels_per_line,
int xoffset,
int yoffset,
const unsigned char *dst_ptr,
int dst_pixels_per_line,
unsigned int *sse
)
{
int xsum0, xsum1;
unsigned int xxsum0, xxsum1;
vp8_filter_block2d_bil_var_mmx(
src_ptr, src_pixels_per_line,
dst_ptr, dst_pixels_per_line, 8,
vp8_bilinear_filters_x86_4[xoffset], vp8_bilinear_filters_x86_4[yoffset],
&xsum0, &xxsum0
);
vp8_filter_block2d_bil_var_mmx(
src_ptr + 8, src_pixels_per_line,
dst_ptr + 8, dst_pixels_per_line, 8,
vp8_bilinear_filters_x86_4[xoffset], vp8_bilinear_filters_x86_4[yoffset],
&xsum1, &xxsum1
);
xsum0 += xsum1;
xxsum0 += xxsum1;
*sse = xxsum0;
return (xxsum0 - (((unsigned int)xsum0 * xsum0) >> 7));
}
unsigned int vp8_sub_pixel_variance8x16_mmx
(
const unsigned char *src_ptr,
int src_pixels_per_line,
int xoffset,
int yoffset,
const unsigned char *dst_ptr,
int dst_pixels_per_line,
unsigned int *sse
)
{
int xsum;
unsigned int xxsum;
vp8_filter_block2d_bil_var_mmx(
src_ptr, src_pixels_per_line,
dst_ptr, dst_pixels_per_line, 16,
vp8_bilinear_filters_x86_4[xoffset], vp8_bilinear_filters_x86_4[yoffset],
&xsum, &xxsum
);
*sse = xxsum;
return (xxsum - (((unsigned int)xsum * xsum) >> 7));
}
unsigned int vp8_variance_halfpixvar16x16_h_mmx(
const unsigned char *src_ptr,
int source_stride,
const unsigned char *ref_ptr,
int recon_stride,
unsigned int *sse)
{
return vp8_sub_pixel_variance16x16_mmx(src_ptr, source_stride, 4, 0,
ref_ptr, recon_stride, sse);
}
unsigned int vp8_variance_halfpixvar16x16_v_mmx(
const unsigned char *src_ptr,
int source_stride,
const unsigned char *ref_ptr,
int recon_stride,
unsigned int *sse)
{
return vp8_sub_pixel_variance16x16_mmx(src_ptr, source_stride, 0, 4,
ref_ptr, recon_stride, sse);
}
unsigned int vp8_variance_halfpixvar16x16_hv_mmx(
const unsigned char *src_ptr,
int source_stride,
const unsigned char *ref_ptr,
int recon_stride,
unsigned int *sse)
{
return vp8_sub_pixel_variance16x16_mmx(src_ptr, source_stride, 4, 4,
ref_ptr, recon_stride, sse);
}
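
Every kernel in this file reduces to the same identity, variance = SSE - sum^2 / N with N the block's pixel count, which is why the final shift is log2(N): 4 for 4x4, 6 for 8x8, 7 for 16x8 and 8x16, and 8 for 16x16. A minimal scalar sketch of that reduction (names are illustrative, not libvpx API):

#include <stdint.h>
/* Scalar reference for the reduction used above: variance = SSE - sum^2 / N.
 * The MMX code performs the division as a right shift because every
 * supported block size has a power-of-two pixel count. */
static unsigned int block_variance_ref(const unsigned char *src, int src_stride,
                                       const unsigned char *ref, int ref_stride,
                                       int w, int h, unsigned int *sse)
{
    int64_t sum = 0;
    uint64_t sum_sq = 0;
    int r, c;
    for (r = 0; r < h; ++r)
    {
        for (c = 0; c < w; ++c)
        {
            const int diff = src[c] - ref[c];
            sum += diff;
            sum_sq += (uint64_t)(diff * diff);
        }
        src += src_stride;
        ref += ref_stride;
    }
    *sse = (unsigned int)sum_sq;
    /* Equivalent to the >> 4 / >> 6 / >> 7 / >> 8 shifts used above. */
    return (unsigned int)(sum_sq - (uint64_t)((sum * sum) / (w * h)));
}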

View file

@ -1,403 +0,0 @@
/*
* Copyright (c) 2010 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include "./vp8_rtcd.h"
#include "vpx_config.h"
#include "vp8/common/variance.h"
#include "vpx_ports/mem.h"
#include "vp8/common/x86/filter_x86.h"
extern void filter_block1d_h6_mmx(const unsigned char *src_ptr, unsigned short *output_ptr, unsigned int src_pixels_per_line, unsigned int pixel_step, unsigned int output_height, unsigned int output_width, short *filter);
extern void filter_block1d_v6_mmx(const short *src_ptr, unsigned char *output_ptr, unsigned int pixels_per_line, unsigned int pixel_step, unsigned int output_height, unsigned int output_width, short *filter);
extern void filter_block1d8_h6_sse2(const unsigned char *src_ptr, unsigned short *output_ptr, unsigned int src_pixels_per_line, unsigned int pixel_step, unsigned int output_height, unsigned int output_width, short *filter);
extern void filter_block1d8_v6_sse2(const short *src_ptr, unsigned char *output_ptr, unsigned int pixels_per_line, unsigned int pixel_step, unsigned int output_height, unsigned int output_width, short *filter);
extern void vp8_filter_block2d_bil4x4_var_mmx
(
const unsigned char *ref_ptr,
int ref_pixels_per_line,
const unsigned char *src_ptr,
int src_pixels_per_line,
const short *HFilter,
const short *VFilter,
int *sum,
unsigned int *sumsquared
);
void vp8_filter_block2d_bil_var_sse2
(
const unsigned char *ref_ptr,
int ref_pixels_per_line,
const unsigned char *src_ptr,
int src_pixels_per_line,
unsigned int Height,
int xoffset,
int yoffset,
int *sum,
unsigned int *sumsquared
);
void vp8_half_horiz_vert_variance8x_h_sse2
(
const unsigned char *ref_ptr,
int ref_pixels_per_line,
const unsigned char *src_ptr,
int src_pixels_per_line,
unsigned int Height,
int *sum,
unsigned int *sumsquared
);
void vp8_half_horiz_vert_variance16x_h_sse2
(
const unsigned char *ref_ptr,
int ref_pixels_per_line,
const unsigned char *src_ptr,
int src_pixels_per_line,
unsigned int Height,
int *sum,
unsigned int *sumsquared
);
void vp8_half_horiz_variance8x_h_sse2
(
const unsigned char *ref_ptr,
int ref_pixels_per_line,
const unsigned char *src_ptr,
int src_pixels_per_line,
unsigned int Height,
int *sum,
unsigned int *sumsquared
);
void vp8_half_horiz_variance16x_h_sse2
(
const unsigned char *ref_ptr,
int ref_pixels_per_line,
const unsigned char *src_ptr,
int src_pixels_per_line,
unsigned int Height,
int *sum,
unsigned int *sumsquared
);
void vp8_half_vert_variance8x_h_sse2
(
const unsigned char *ref_ptr,
int ref_pixels_per_line,
const unsigned char *src_ptr,
int src_pixels_per_line,
unsigned int Height,
int *sum,
unsigned int *sumsquared
);
void vp8_half_vert_variance16x_h_sse2
(
const unsigned char *ref_ptr,
int ref_pixels_per_line,
const unsigned char *src_ptr,
int src_pixels_per_line,
unsigned int Height,
int *sum,
unsigned int *sumsquared
);
unsigned int vp8_sub_pixel_variance4x4_wmt
(
const unsigned char *src_ptr,
int src_pixels_per_line,
int xoffset,
int yoffset,
const unsigned char *dst_ptr,
int dst_pixels_per_line,
unsigned int *sse
)
{
int xsum;
unsigned int xxsum;
vp8_filter_block2d_bil4x4_var_mmx(
src_ptr, src_pixels_per_line,
dst_ptr, dst_pixels_per_line,
vp8_bilinear_filters_x86_4[xoffset], vp8_bilinear_filters_x86_4[yoffset],
&xsum, &xxsum
);
*sse = xxsum;
return (xxsum - (((unsigned int)xsum * xsum) >> 4));
}
unsigned int vp8_sub_pixel_variance8x8_wmt
(
const unsigned char *src_ptr,
int src_pixels_per_line,
int xoffset,
int yoffset,
const unsigned char *dst_ptr,
int dst_pixels_per_line,
unsigned int *sse
)
{
int xsum;
unsigned int xxsum;
if (xoffset == 4 && yoffset == 0)
{
vp8_half_horiz_variance8x_h_sse2(
src_ptr, src_pixels_per_line,
dst_ptr, dst_pixels_per_line, 8,
&xsum, &xxsum);
}
else if (xoffset == 0 && yoffset == 4)
{
vp8_half_vert_variance8x_h_sse2(
src_ptr, src_pixels_per_line,
dst_ptr, dst_pixels_per_line, 8,
&xsum, &xxsum);
}
else if (xoffset == 4 && yoffset == 4)
{
vp8_half_horiz_vert_variance8x_h_sse2(
src_ptr, src_pixels_per_line,
dst_ptr, dst_pixels_per_line, 8,
&xsum, &xxsum);
}
else
{
vp8_filter_block2d_bil_var_sse2(
src_ptr, src_pixels_per_line,
dst_ptr, dst_pixels_per_line, 8,
xoffset, yoffset,
&xsum, &xxsum);
}
*sse = xxsum;
return (xxsum - (((unsigned int)xsum * xsum) >> 6));
}
unsigned int vp8_sub_pixel_variance16x16_wmt
(
const unsigned char *src_ptr,
int src_pixels_per_line,
int xoffset,
int yoffset,
const unsigned char *dst_ptr,
int dst_pixels_per_line,
unsigned int *sse
)
{
int xsum0, xsum1;
unsigned int xxsum0, xxsum1;
/* note we could avoid these if statements if the calling function
* just called the appropriate functions inside.
*/
if (xoffset == 4 && yoffset == 0)
{
vp8_half_horiz_variance16x_h_sse2(
src_ptr, src_pixels_per_line,
dst_ptr, dst_pixels_per_line, 16,
&xsum0, &xxsum0);
}
else if (xoffset == 0 && yoffset == 4)
{
vp8_half_vert_variance16x_h_sse2(
src_ptr, src_pixels_per_line,
dst_ptr, dst_pixels_per_line, 16,
&xsum0, &xxsum0);
}
else if (xoffset == 4 && yoffset == 4)
{
vp8_half_horiz_vert_variance16x_h_sse2(
src_ptr, src_pixels_per_line,
dst_ptr, dst_pixels_per_line, 16,
&xsum0, &xxsum0);
}
else
{
vp8_filter_block2d_bil_var_sse2(
src_ptr, src_pixels_per_line,
dst_ptr, dst_pixels_per_line, 16,
xoffset, yoffset,
&xsum0, &xxsum0
);
vp8_filter_block2d_bil_var_sse2(
src_ptr + 8, src_pixels_per_line,
dst_ptr + 8, dst_pixels_per_line, 16,
xoffset, yoffset,
&xsum1, &xxsum1
);
xsum0 += xsum1;
xxsum0 += xxsum1;
}
*sse = xxsum0;
return (xxsum0 - (((unsigned int)xsum0 * xsum0) >> 8));
}
unsigned int vp8_sub_pixel_variance16x8_wmt
(
const unsigned char *src_ptr,
int src_pixels_per_line,
int xoffset,
int yoffset,
const unsigned char *dst_ptr,
int dst_pixels_per_line,
unsigned int *sse
)
{
int xsum0, xsum1;
unsigned int xxsum0, xxsum1;
if (xoffset == 4 && yoffset == 0)
{
vp8_half_horiz_variance16x_h_sse2(
src_ptr, src_pixels_per_line,
dst_ptr, dst_pixels_per_line, 8,
&xsum0, &xxsum0);
}
else if (xoffset == 0 && yoffset == 4)
{
vp8_half_vert_variance16x_h_sse2(
src_ptr, src_pixels_per_line,
dst_ptr, dst_pixels_per_line, 8,
&xsum0, &xxsum0);
}
else if (xoffset == 4 && yoffset == 4)
{
vp8_half_horiz_vert_variance16x_h_sse2(
src_ptr, src_pixels_per_line,
dst_ptr, dst_pixels_per_line, 8,
&xsum0, &xxsum0);
}
else
{
vp8_filter_block2d_bil_var_sse2(
src_ptr, src_pixels_per_line,
dst_ptr, dst_pixels_per_line, 8,
xoffset, yoffset,
&xsum0, &xxsum0);
vp8_filter_block2d_bil_var_sse2(
src_ptr + 8, src_pixels_per_line,
dst_ptr + 8, dst_pixels_per_line, 8,
xoffset, yoffset,
&xsum1, &xxsum1);
xsum0 += xsum1;
xxsum0 += xxsum1;
}
*sse = xxsum0;
return (xxsum0 - (((unsigned int)xsum0 * xsum0) >> 7));
}
unsigned int vp8_sub_pixel_variance8x16_wmt
(
const unsigned char *src_ptr,
int src_pixels_per_line,
int xoffset,
int yoffset,
const unsigned char *dst_ptr,
int dst_pixels_per_line,
unsigned int *sse
)
{
int xsum;
unsigned int xxsum;
if (xoffset == 4 && yoffset == 0)
{
vp8_half_horiz_variance8x_h_sse2(
src_ptr, src_pixels_per_line,
dst_ptr, dst_pixels_per_line, 16,
&xsum, &xxsum);
}
else if (xoffset == 0 && yoffset == 4)
{
vp8_half_vert_variance8x_h_sse2(
src_ptr, src_pixels_per_line,
dst_ptr, dst_pixels_per_line, 16,
&xsum, &xxsum);
}
else if (xoffset == 4 && yoffset == 4)
{
vp8_half_horiz_vert_variance8x_h_sse2(
src_ptr, src_pixels_per_line,
dst_ptr, dst_pixels_per_line, 16,
&xsum, &xxsum);
}
else
{
vp8_filter_block2d_bil_var_sse2(
src_ptr, src_pixels_per_line,
dst_ptr, dst_pixels_per_line, 16,
xoffset, yoffset,
&xsum, &xxsum);
}
*sse = xxsum;
return (xxsum - (((unsigned int)xsum * xsum) >> 7));
}
unsigned int vp8_variance_halfpixvar16x16_h_wmt(
const unsigned char *src_ptr,
int src_pixels_per_line,
const unsigned char *dst_ptr,
int dst_pixels_per_line,
unsigned int *sse)
{
int xsum0;
unsigned int xxsum0;
vp8_half_horiz_variance16x_h_sse2(
src_ptr, src_pixels_per_line,
dst_ptr, dst_pixels_per_line, 16,
&xsum0, &xxsum0);
*sse = xxsum0;
return (xxsum0 - (((unsigned int)xsum0 * xsum0) >> 8));
}
unsigned int vp8_variance_halfpixvar16x16_v_wmt(
const unsigned char *src_ptr,
int src_pixels_per_line,
const unsigned char *dst_ptr,
int dst_pixels_per_line,
unsigned int *sse)
{
int xsum0;
unsigned int xxsum0;
vp8_half_vert_variance16x_h_sse2(
src_ptr, src_pixels_per_line,
dst_ptr, dst_pixels_per_line, 16,
&xsum0, &xxsum0);
*sse = xxsum0;
return (xxsum0 - (((unsigned int)xsum0 * xsum0) >> 8));
}
unsigned int vp8_variance_halfpixvar16x16_hv_wmt(
const unsigned char *src_ptr,
int src_pixels_per_line,
const unsigned char *dst_ptr,
int dst_pixels_per_line,
unsigned int *sse)
{
int xsum0;
unsigned int xxsum0;
vp8_half_horiz_vert_variance16x_h_sse2(
src_ptr, src_pixels_per_line,
dst_ptr, dst_pixels_per_line, 16,
&xsum0, &xxsum0);
*sse = xxsum0;
return (xxsum0 - (((unsigned int)xsum0 * xsum0) >> 8));
}
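
The xoffset/yoffset == 4 branches above pay off because the bilinear filter table has eight phases and phase 4 weights both taps equally, so the interpolation collapses into a rounded average of two neighbouring pixels; the dedicated half-pixel kernels exploit exactly that. A scalar sketch of the horizontal half-pel step (illustrative names, not libvpx API):

/* Horizontal half-pixel prediction: each output pixel is the rounded mean of
 * two horizontally adjacent source pixels. The vertical and diagonal half-pel
 * kernels average vertically / over four neighbours in the same way. */
static void half_pel_horiz_ref(const unsigned char *src, int src_stride,
                               unsigned char *dst, int dst_stride,
                               int w, int h)
{
    int r, c;
    for (r = 0; r < h; ++r)
    {
        for (c = 0; c < w; ++c)
            dst[c] = (unsigned char)((src[c] + src[c + 1] + 1) >> 1);
        src += src_stride;
        dst += dst_stride;
    }
}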

View file

@ -16,7 +16,7 @@
#include "./vpx_scale_rtcd.h"
#include "block.h"
#include "onyx_int.h"
#include "vp8/common/variance.h"
#include "vpx_dsp/variance.h"
#include "encodeintra.h"
#include "vp8/common/setupintrarecon.h"
#include "vp8/common/systemdependent.h"

View file

@ -13,7 +13,7 @@
#define VP8_ENCODER_MCOMP_H_
#include "block.h"
#include "vp8/common/variance.h"
#include "vpx_dsp/variance.h"
#ifdef __cplusplus
extern "C" {

View file

@ -2132,17 +2132,17 @@ struct VP8_COMP* vp8_create_compressor(VP8_CONFIG *oxcf)
cpi->fn_ptr[BLOCK_16X16].sdf = vpx_sad16x16;
cpi->fn_ptr[BLOCK_16X16].vf = vpx_variance16x16;
cpi->fn_ptr[BLOCK_16X16].svf = vp8_sub_pixel_variance16x16;
cpi->fn_ptr[BLOCK_16X16].svf_halfpix_h = vp8_variance_halfpixvar16x16_h;
cpi->fn_ptr[BLOCK_16X16].svf_halfpix_v = vp8_variance_halfpixvar16x16_v;
cpi->fn_ptr[BLOCK_16X16].svf_halfpix_hv = vp8_variance_halfpixvar16x16_hv;
cpi->fn_ptr[BLOCK_16X16].svf = vpx_sub_pixel_variance16x16;
cpi->fn_ptr[BLOCK_16X16].svf_halfpix_h = vpx_variance_halfpixvar16x16_h;
cpi->fn_ptr[BLOCK_16X16].svf_halfpix_v = vpx_variance_halfpixvar16x16_v;
cpi->fn_ptr[BLOCK_16X16].svf_halfpix_hv = vpx_variance_halfpixvar16x16_hv;
cpi->fn_ptr[BLOCK_16X16].sdx3f = vpx_sad16x16x3;
cpi->fn_ptr[BLOCK_16X16].sdx8f = vpx_sad16x16x8;
cpi->fn_ptr[BLOCK_16X16].sdx4df = vpx_sad16x16x4d;
cpi->fn_ptr[BLOCK_16X8].sdf = vpx_sad16x8;
cpi->fn_ptr[BLOCK_16X8].vf = vpx_variance16x8;
cpi->fn_ptr[BLOCK_16X8].svf = vp8_sub_pixel_variance16x8;
cpi->fn_ptr[BLOCK_16X8].svf = vpx_sub_pixel_variance16x8;
cpi->fn_ptr[BLOCK_16X8].svf_halfpix_h = NULL;
cpi->fn_ptr[BLOCK_16X8].svf_halfpix_v = NULL;
cpi->fn_ptr[BLOCK_16X8].svf_halfpix_hv = NULL;
@ -2152,7 +2152,7 @@ struct VP8_COMP* vp8_create_compressor(VP8_CONFIG *oxcf)
cpi->fn_ptr[BLOCK_8X16].sdf = vpx_sad8x16;
cpi->fn_ptr[BLOCK_8X16].vf = vpx_variance8x16;
cpi->fn_ptr[BLOCK_8X16].svf = vp8_sub_pixel_variance8x16;
cpi->fn_ptr[BLOCK_8X16].svf = vpx_sub_pixel_variance8x16;
cpi->fn_ptr[BLOCK_8X16].svf_halfpix_h = NULL;
cpi->fn_ptr[BLOCK_8X16].svf_halfpix_v = NULL;
cpi->fn_ptr[BLOCK_8X16].svf_halfpix_hv = NULL;
@ -2162,7 +2162,7 @@ struct VP8_COMP* vp8_create_compressor(VP8_CONFIG *oxcf)
cpi->fn_ptr[BLOCK_8X8].sdf = vpx_sad8x8;
cpi->fn_ptr[BLOCK_8X8].vf = vpx_variance8x8;
cpi->fn_ptr[BLOCK_8X8].svf = vp8_sub_pixel_variance8x8;
cpi->fn_ptr[BLOCK_8X8].svf = vpx_sub_pixel_variance8x8;
cpi->fn_ptr[BLOCK_8X8].svf_halfpix_h = NULL;
cpi->fn_ptr[BLOCK_8X8].svf_halfpix_v = NULL;
cpi->fn_ptr[BLOCK_8X8].svf_halfpix_hv = NULL;
@ -2172,7 +2172,7 @@ struct VP8_COMP* vp8_create_compressor(VP8_CONFIG *oxcf)
cpi->fn_ptr[BLOCK_4X4].sdf = vpx_sad4x4;
cpi->fn_ptr[BLOCK_4X4].vf = vpx_variance4x4;
cpi->fn_ptr[BLOCK_4X4].svf = vp8_sub_pixel_variance4x4;
cpi->fn_ptr[BLOCK_4X4].svf = vpx_sub_pixel_variance4x4;
cpi->fn_ptr[BLOCK_4X4].svf_halfpix_h = NULL;
cpi->fn_ptr[BLOCK_4X4].svf_halfpix_v = NULL;
cpi->fn_ptr[BLOCK_4X4].svf_halfpix_hv = NULL;
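
Only the table entries change in this hunk; the motion search keeps calling through the same fn_ptr slots, so the switch from vp8_ to vpx_ sub-pixel variance is transparent at the call sites. A hedged sketch of how an svf slot is consumed (the typedef and helper below are illustrative, not the actual libvpx declarations):

/* Illustrative function-pointer shape for the svf slot and one call through
 * the table; the real typedef lives in the variance header. */
typedef unsigned int (*subpix_var_fn)(const unsigned char *src, int src_stride,
                                      int xoffset, int yoffset,
                                      const unsigned char *ref, int ref_stride,
                                      unsigned int *sse);

static unsigned int subpel_block_error(subpix_var_fn svf,
                                       const unsigned char *src, int src_stride,
                                       const unsigned char *ref, int ref_stride,
                                       int xoffset, int yoffset)
{
    unsigned int sse;
    /* After this commit svf would point at e.g. vpx_sub_pixel_variance16x16. */
    return svf(src, src_stride, xoffset, yoffset, ref, ref_stride, &sse);
}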

View file

@ -18,7 +18,7 @@
#include "treewriter.h"
#include "tokenize.h"
#include "vp8/common/onyxc_int.h"
#include "vp8/common/variance.h"
#include "vpx_dsp/variance.h"
#include "encodemb.h"
#include "quantize.h"
#include "vp8/common/entropy.h"

View file

@ -22,7 +22,7 @@
#include "encodemb.h"
#include "vp8/common/reconinter.h"
#include "vp8/common/reconintra4x4.h"
#include "vp8/common/variance.h"
#include "vpx_dsp/variance.h"
#include "mcomp.h"
#include "rdopt.h"
#include "vpx_mem/vpx_mem.h"

View file

@ -29,7 +29,7 @@
#include "vp8/common/quant_common.h"
#include "encodemb.h"
#include "quantize.h"
#include "vp8/common/variance.h"
#include "vpx_dsp/variance.h"
#include "mcomp.h"
#include "rdopt.h"
#include "vpx_mem/vpx_mem.h"
@ -500,9 +500,9 @@ int VP8_UVSSE(MACROBLOCK *x)
if ((mv_row | mv_col) & 7)
{
vp8_sub_pixel_variance8x8(uptr, pre_stride,
vpx_sub_pixel_variance8x8(uptr, pre_stride,
mv_col & 7, mv_row & 7, upred_ptr, uv_stride, &sse2);
vp8_sub_pixel_variance8x8(vptr, pre_stride,
vpx_sub_pixel_variance8x8(vptr, pre_stride,
mv_col & 7, mv_row & 7, vpred_ptr, uv_stride, &sse1);
sse2 += sse1;
}
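
The (mv_row | mv_col) & 7 test asks whether either chroma component carries a fractional part: the chroma motion vectors here have three fractional (eighth-pel) bits, and those low bits select the sub-pixel filter phase passed to vpx_sub_pixel_variance8x8. A tiny illustrative decomposition (not a libvpx helper):

/* Split a chroma motion-vector component into its integer-pel offset and
 * eighth-pel phase (3 fractional bits). */
static void split_chroma_mv(int mv_comp, int *int_pel, int *subpel_phase)
{
    *int_pel = mv_comp >> 3;
    *subpel_phase = mv_comp & 7;
}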

View file

@ -63,8 +63,6 @@ VP8_COMMON_SRCS-yes += common/reconintra.c
VP8_COMMON_SRCS-yes += common/reconintra4x4.c
VP8_COMMON_SRCS-yes += common/setupintrarecon.c
VP8_COMMON_SRCS-yes += common/swapyv12buffer.c
VP8_COMMON_SRCS-yes += common/variance_c.c
VP8_COMMON_SRCS-yes += common/variance.h
VP8_COMMON_SRCS-yes += common/vp8_entropymodedata.h
@ -86,8 +84,6 @@ VP8_COMMON_SRCS-$(HAVE_MMX) += common/x86/iwalsh_mmx.asm
VP8_COMMON_SRCS-$(HAVE_MMX) += common/x86/loopfilter_mmx.asm
VP8_COMMON_SRCS-$(HAVE_MMX) += common/x86/recon_mmx.asm
VP8_COMMON_SRCS-$(HAVE_MMX) += common/x86/subpixel_mmx.asm
VP8_COMMON_SRCS-$(HAVE_MMX) += common/x86/vp8_variance_mmx.c
VP8_COMMON_SRCS-$(HAVE_MMX) += common/x86/vp8_variance_impl_mmx.asm
VP8_COMMON_SRCS-$(HAVE_SSE2) += common/x86/copy_sse2.asm
VP8_COMMON_SRCS-$(HAVE_SSE2) += common/x86/idct_blk_sse2.c
VP8_COMMON_SRCS-$(HAVE_SSE2) += common/x86/idctllm_sse2.asm
@ -96,12 +92,8 @@ VP8_COMMON_SRCS-$(HAVE_SSE2) += common/x86/recon_wrapper_sse2.c
VP8_COMMON_SRCS-$(HAVE_SSE2) += common/x86/subpixel_sse2.asm
VP8_COMMON_SRCS-$(HAVE_SSE2) += common/x86/loopfilter_sse2.asm
VP8_COMMON_SRCS-$(HAVE_SSE2) += common/x86/iwalsh_sse2.asm
VP8_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp8_variance_sse2.c
VP8_COMMON_SRCS-$(HAVE_SSE2) += common/x86/variance_impl_sse2.asm
VP8_COMMON_SRCS-$(HAVE_SSE3) += common/x86/copy_sse3.asm
VP8_COMMON_SRCS-$(HAVE_SSSE3) += common/x86/subpixel_ssse3.asm
VP8_COMMON_SRCS-$(HAVE_SSSE3) += common/x86/variance_ssse3.c
VP8_COMMON_SRCS-$(HAVE_SSSE3) += common/x86/variance_impl_ssse3.asm
ifeq ($(CONFIG_POSTPROC),yes)
VP8_COMMON_SRCS-$(HAVE_MMX) += common/x86/postproc_mmx.asm
@ -129,7 +121,6 @@ VP8_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/vp8_macros_msa.h
VP8_COMMON_SRCS-$(ARCH_ARM) += common/arm/filter_arm.c
VP8_COMMON_SRCS-$(ARCH_ARM) += common/arm/loopfilter_arm.c
VP8_COMMON_SRCS-$(ARCH_ARM) += common/arm/dequantize_arm.c
VP8_COMMON_SRCS-$(ARCH_ARM) += common/arm/variance_arm.c
# common (media)
VP8_COMMON_SRCS-$(HAVE_MEDIA) += common/arm/bilinearfilter_arm.c
@ -149,9 +140,6 @@ VP8_COMMON_SRCS-$(HAVE_MEDIA) += common/arm/armv6/intra4x4_predict_v6$(ASM)
VP8_COMMON_SRCS-$(HAVE_MEDIA) += common/arm/armv6/dequant_idct_v6$(ASM)
VP8_COMMON_SRCS-$(HAVE_MEDIA) += common/arm/armv6/dequantize_v6$(ASM)
VP8_COMMON_SRCS-$(HAVE_MEDIA) += common/arm/armv6/idct_blk_v6.c
VP8_COMMON_SRCS-$(HAVE_MEDIA) += common/arm/armv6/vp8_variance_halfpixvar16x16_h_armv6$(ASM)
VP8_COMMON_SRCS-$(HAVE_MEDIA) += common/arm/armv6/vp8_variance_halfpixvar16x16_v_armv6$(ASM)
VP8_COMMON_SRCS-$(HAVE_MEDIA) += common/arm/armv6/vp8_variance_halfpixvar16x16_hv_armv6$(ASM)
# common (neon intrinsics)
VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/bilinearpredict_neon.c
@ -170,6 +158,5 @@ VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/mbloopfilter_neon.c
VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/reconintra_neon.c
VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/shortidct4x4llm_neon.c
VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/sixtappredict_neon.c
VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp8_subpixelvariance_neon.c
$(eval $(call rtcd_h_template,vp8_rtcd,vp8/common/rtcd_defs.pl))

View file

@ -8,7 +8,7 @@
* be found in the AUTHORS file in the root of the source tree.
*/
#include "vp9/common/mips/msa/vp9_macros_msa.h"
#include "vpx_dsp/mips/macros_msa.h"
static void avg_width4_msa(const uint8_t *src, int32_t src_stride,
uint8_t *dst, int32_t dst_stride, int32_t height) {

View file

@ -9,7 +9,7 @@
*/
#include <string.h>
#include "vp9/common/mips/msa/vp9_macros_msa.h"
#include "vpx_dsp/mips/macros_msa.h"
static void copy_width8_msa(const uint8_t *src, int32_t src_stride,
uint8_t *dst, int32_t dst_stride, int32_t height) {

View file

@ -12,7 +12,7 @@
#define VP9_COMMON_MIPS_MSA_VP9_CONVOLVE_MSA_H_
#include "vp9/common/vp9_filter.h"
#include "vp9/common/mips/msa/vp9_macros_msa.h"
#include "vpx_dsp/mips/macros_msa.h"
extern const uint8_t mc_filt_mask_arr[16 * 3];

View file

@ -13,7 +13,7 @@
#include "vpx_ports/mem.h"
#include "vp9/common/vp9_idct.h"
#include "vp9/common/mips/msa/vp9_macros_msa.h"
#include "vpx_dsp/mips/macros_msa.h"
#define VP9_DOTP_CONST_PAIR(reg0, reg1, cnst0, cnst1, out0, out1) { \
v8i16 k0_m = __msa_fill_h(cnst0); \

View file

@ -9,7 +9,7 @@
*/
#include "./vp9_rtcd.h"
#include "vp9/common/mips/msa/vp9_macros_msa.h"
#include "vpx_dsp/mips/macros_msa.h"
#define IPRED_SUBS_UH2_UH(in0, in1, out0, out1) { \
out0 = __msa_subs_u_h(out0, in0); \

View file

@ -11,7 +11,7 @@
#ifndef VP9_COMMON_MIPS_MSA_VP9_LOOPFILTER_MSA_H_
#define VP9_COMMON_MIPS_MSA_VP9_LOOPFILTER_MSA_H_
#include "vp9/common/mips/msa/vp9_macros_msa.h"
#include "vpx_dsp/mips/macros_msa.h"
#define VP9_LPF_FILTER4_8W(p1_in, p0_in, q0_in, q1_in, mask_in, hev_in, \
p1_out, p0_out, q0_out, q1_out) { \

File diff not shown because of its large size. Load diff

View file

@ -10,7 +10,7 @@
#include "./vp9_rtcd.h"
#include "vp9/common/vp9_onyxc_int.h"
#include "vp9/common/mips/msa/vp9_macros_msa.h"
#include "vpx_dsp/mips/macros_msa.h"
static void filter_by_weight8x8_msa(const uint8_t *src_ptr, int32_t src_stride,
uint8_t *dst_ptr, int32_t dst_stride,

View file

@ -802,88 +802,6 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
#
if (vpx_config("CONFIG_VP9_ENCODER") eq "yes") {
# variance
add_proto qw/unsigned int vp9_sub_pixel_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_sub_pixel_variance64x64 avx2 neon msa/, "$sse2_x86inc", "$ssse3_x86inc";
add_proto qw/unsigned int vp9_sub_pixel_avg_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
specialize qw/vp9_sub_pixel_avg_variance64x64 avx2/, "$sse2_x86inc", "$ssse3_x86inc";
add_proto qw/unsigned int vp9_sub_pixel_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_sub_pixel_variance32x64 msa/, "$sse2_x86inc", "$ssse3_x86inc";
add_proto qw/unsigned int vp9_sub_pixel_avg_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
specialize qw/vp9_sub_pixel_avg_variance32x64/, "$sse2_x86inc", "$ssse3_x86inc";
add_proto qw/unsigned int vp9_sub_pixel_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_sub_pixel_variance64x32 msa/, "$sse2_x86inc", "$ssse3_x86inc";
add_proto qw/unsigned int vp9_sub_pixel_avg_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
specialize qw/vp9_sub_pixel_avg_variance64x32/, "$sse2_x86inc", "$ssse3_x86inc";
add_proto qw/unsigned int vp9_sub_pixel_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_sub_pixel_variance32x16 msa/, "$sse2_x86inc", "$ssse3_x86inc";
add_proto qw/unsigned int vp9_sub_pixel_avg_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
specialize qw/vp9_sub_pixel_avg_variance32x16/, "$sse2_x86inc", "$ssse3_x86inc";
add_proto qw/unsigned int vp9_sub_pixel_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_sub_pixel_variance16x32 msa/, "$sse2_x86inc", "$ssse3_x86inc";
add_proto qw/unsigned int vp9_sub_pixel_avg_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
specialize qw/vp9_sub_pixel_avg_variance16x32/, "$sse2_x86inc", "$ssse3_x86inc";
add_proto qw/unsigned int vp9_sub_pixel_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_sub_pixel_variance32x32 avx2 neon msa/, "$sse2_x86inc", "$ssse3_x86inc";
add_proto qw/unsigned int vp9_sub_pixel_avg_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
specialize qw/vp9_sub_pixel_avg_variance32x32 avx2/, "$sse2_x86inc", "$ssse3_x86inc";
add_proto qw/unsigned int vp9_sub_pixel_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_sub_pixel_variance16x16 neon msa/, "$sse2_x86inc", "$ssse3_x86inc";
add_proto qw/unsigned int vp9_sub_pixel_avg_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
specialize qw/vp9_sub_pixel_avg_variance16x16/, "$sse2_x86inc", "$ssse3_x86inc";
add_proto qw/unsigned int vp9_sub_pixel_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_sub_pixel_variance8x16 msa/, "$sse2_x86inc", "$ssse3_x86inc";
add_proto qw/unsigned int vp9_sub_pixel_avg_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
specialize qw/vp9_sub_pixel_avg_variance8x16/, "$sse2_x86inc", "$ssse3_x86inc";
add_proto qw/unsigned int vp9_sub_pixel_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_sub_pixel_variance16x8 msa/, "$sse2_x86inc", "$ssse3_x86inc";
add_proto qw/unsigned int vp9_sub_pixel_avg_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
specialize qw/vp9_sub_pixel_avg_variance16x8/, "$sse2_x86inc", "$ssse3_x86inc";
add_proto qw/unsigned int vp9_sub_pixel_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_sub_pixel_variance8x8 neon msa/, "$sse2_x86inc", "$ssse3_x86inc";
add_proto qw/unsigned int vp9_sub_pixel_avg_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
specialize qw/vp9_sub_pixel_avg_variance8x8/, "$sse2_x86inc", "$ssse3_x86inc";
# TODO(jingning): need to convert 8x4/4x8 functions into mmx/sse form
add_proto qw/unsigned int vp9_sub_pixel_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_sub_pixel_variance8x4 msa/, "$sse2_x86inc", "$ssse3_x86inc";
add_proto qw/unsigned int vp9_sub_pixel_avg_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
specialize qw/vp9_sub_pixel_avg_variance8x4/, "$sse2_x86inc", "$ssse3_x86inc";
add_proto qw/unsigned int vp9_sub_pixel_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_sub_pixel_variance4x8 msa/, "$sse_x86inc", "$ssse3_x86inc";
add_proto qw/unsigned int vp9_sub_pixel_avg_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
specialize qw/vp9_sub_pixel_avg_variance4x8/, "$sse_x86inc", "$ssse3_x86inc";
add_proto qw/unsigned int vp9_sub_pixel_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_sub_pixel_variance4x4 msa/, "$sse_x86inc", "$ssse3_x86inc";
#vp9_sub_pixel_variance4x4_sse2=vp9_sub_pixel_variance4x4_wmt
add_proto qw/unsigned int vp9_sub_pixel_avg_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
specialize qw/vp9_sub_pixel_avg_variance4x4/, "$sse_x86inc", "$ssse3_x86inc";
add_proto qw/unsigned int vp9_avg_8x8/, "const uint8_t *, int p";
specialize qw/vp9_avg_8x8 sse2 neon msa/;
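
Each add_proto line removed above declared a run-time-dispatched prototype, and the paired specialize line listed the SIMD flavours allowed to back it; equivalent entries now live with vpx_dsp. In C terms the generated dispatch amounts to a function pointer that setup code repoints to the best available kernel, roughly as in this self-contained sketch (stub kernels and names are illustrative, not generated code):

typedef unsigned int (*subpel_var16x16_fn)(const unsigned char *, int, int, int,
                                           const unsigned char *, int,
                                           unsigned int *);

static unsigned int var16x16_c(const unsigned char *s, int ss, int xo, int yo,
                               const unsigned char *r, int rs, unsigned int *sse)
{
    (void)s; (void)ss; (void)xo; (void)yo; (void)r; (void)rs;
    *sse = 0;
    return 0; /* stand-in for the portable C kernel */
}

static unsigned int var16x16_sse2(const unsigned char *s, int ss, int xo, int yo,
                                  const unsigned char *r, int rs, unsigned int *sse)
{
    (void)s; (void)ss; (void)xo; (void)yo; (void)r; (void)rs;
    *sse = 0;
    return 0; /* stand-in for the SSE2 kernel */
}

/* One such pointer is emitted per prototype; rtcd setup picks the flavour. */
static subpel_var16x16_fn sub_pixel_variance16x16 = var16x16_c;

static void rtcd_setup_sketch(int cpu_has_sse2)
{
    if (cpu_has_sse2) sub_pixel_variance16x16 = var16x16_sse2;
}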
@ -1085,241 +1003,6 @@ specialize qw/vp9_temporal_filter_apply sse2 msa/;
if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
add_proto qw/unsigned int vp9_highbd_sub_pixel_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_sub_pixel_variance64x64/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_sub_pixel_avg_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
specialize qw/vp9_highbd_sub_pixel_avg_variance64x64/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_sub_pixel_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_sub_pixel_variance32x64/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_sub_pixel_avg_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
specialize qw/vp9_highbd_sub_pixel_avg_variance32x64/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_sub_pixel_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_sub_pixel_variance64x32/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_sub_pixel_avg_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
specialize qw/vp9_highbd_sub_pixel_avg_variance64x32/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_sub_pixel_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_sub_pixel_variance32x16/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_sub_pixel_avg_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
specialize qw/vp9_highbd_sub_pixel_avg_variance32x16/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_sub_pixel_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_sub_pixel_variance16x32/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_sub_pixel_avg_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
specialize qw/vp9_highbd_sub_pixel_avg_variance16x32/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_sub_pixel_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_sub_pixel_variance32x32/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_sub_pixel_avg_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
specialize qw/vp9_highbd_sub_pixel_avg_variance32x32/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_sub_pixel_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_sub_pixel_variance16x16/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_sub_pixel_avg_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
specialize qw/vp9_highbd_sub_pixel_avg_variance16x16/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_sub_pixel_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_sub_pixel_variance8x16/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_sub_pixel_avg_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
specialize qw/vp9_highbd_sub_pixel_avg_variance8x16/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_sub_pixel_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_sub_pixel_variance16x8/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_sub_pixel_avg_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
specialize qw/vp9_highbd_sub_pixel_avg_variance16x8/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_sub_pixel_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_sub_pixel_variance8x8/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_sub_pixel_avg_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
specialize qw/vp9_highbd_sub_pixel_avg_variance8x8/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_sub_pixel_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_sub_pixel_variance8x4/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_sub_pixel_avg_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
specialize qw/vp9_highbd_sub_pixel_avg_variance8x4/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_sub_pixel_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_sub_pixel_variance4x8/;
add_proto qw/unsigned int vp9_highbd_sub_pixel_avg_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
specialize qw/vp9_highbd_sub_pixel_avg_variance4x8/;
add_proto qw/unsigned int vp9_highbd_sub_pixel_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_sub_pixel_variance4x4/;
add_proto qw/unsigned int vp9_highbd_sub_pixel_avg_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
specialize qw/vp9_highbd_sub_pixel_avg_variance4x4/;
add_proto qw/unsigned int vp9_highbd_10_sub_pixel_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_10_sub_pixel_variance64x64/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_10_sub_pixel_avg_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
specialize qw/vp9_highbd_10_sub_pixel_avg_variance64x64/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_10_sub_pixel_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_10_sub_pixel_variance32x64/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_10_sub_pixel_avg_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
specialize qw/vp9_highbd_10_sub_pixel_avg_variance32x64/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_10_sub_pixel_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_10_sub_pixel_variance64x32/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_10_sub_pixel_avg_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
specialize qw/vp9_highbd_10_sub_pixel_avg_variance64x32/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_10_sub_pixel_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_10_sub_pixel_variance32x16/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_10_sub_pixel_avg_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
specialize qw/vp9_highbd_10_sub_pixel_avg_variance32x16/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_10_sub_pixel_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_10_sub_pixel_variance16x32/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_10_sub_pixel_avg_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
specialize qw/vp9_highbd_10_sub_pixel_avg_variance16x32/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_10_sub_pixel_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_10_sub_pixel_variance32x32/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_10_sub_pixel_avg_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
specialize qw/vp9_highbd_10_sub_pixel_avg_variance32x32/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_10_sub_pixel_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_10_sub_pixel_variance16x16/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_10_sub_pixel_avg_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
specialize qw/vp9_highbd_10_sub_pixel_avg_variance16x16/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_10_sub_pixel_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_10_sub_pixel_variance8x16/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_10_sub_pixel_avg_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
specialize qw/vp9_highbd_10_sub_pixel_avg_variance8x16/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_10_sub_pixel_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_10_sub_pixel_variance16x8/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_10_sub_pixel_avg_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
specialize qw/vp9_highbd_10_sub_pixel_avg_variance16x8/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_10_sub_pixel_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_10_sub_pixel_variance8x8/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_10_sub_pixel_avg_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
specialize qw/vp9_highbd_10_sub_pixel_avg_variance8x8/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_10_sub_pixel_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_10_sub_pixel_variance8x4/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_10_sub_pixel_avg_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
specialize qw/vp9_highbd_10_sub_pixel_avg_variance8x4/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_10_sub_pixel_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_10_sub_pixel_variance4x8/;
add_proto qw/unsigned int vp9_highbd_10_sub_pixel_avg_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
specialize qw/vp9_highbd_10_sub_pixel_avg_variance4x8/;
add_proto qw/unsigned int vp9_highbd_10_sub_pixel_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_10_sub_pixel_variance4x4/;
add_proto qw/unsigned int vp9_highbd_10_sub_pixel_avg_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
specialize qw/vp9_highbd_10_sub_pixel_avg_variance4x4/;
add_proto qw/unsigned int vp9_highbd_12_sub_pixel_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_12_sub_pixel_variance64x64/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_12_sub_pixel_avg_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
specialize qw/vp9_highbd_12_sub_pixel_avg_variance64x64/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_12_sub_pixel_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_12_sub_pixel_variance32x64/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_12_sub_pixel_avg_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
specialize qw/vp9_highbd_12_sub_pixel_avg_variance32x64/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_12_sub_pixel_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_12_sub_pixel_variance64x32/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_12_sub_pixel_avg_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
specialize qw/vp9_highbd_12_sub_pixel_avg_variance64x32/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_12_sub_pixel_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_12_sub_pixel_variance32x16/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_12_sub_pixel_avg_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
specialize qw/vp9_highbd_12_sub_pixel_avg_variance32x16/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_12_sub_pixel_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_12_sub_pixel_variance16x32/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_12_sub_pixel_avg_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
specialize qw/vp9_highbd_12_sub_pixel_avg_variance16x32/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_12_sub_pixel_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_12_sub_pixel_variance32x32/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_12_sub_pixel_avg_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
specialize qw/vp9_highbd_12_sub_pixel_avg_variance32x32/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_12_sub_pixel_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_12_sub_pixel_variance16x16/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_12_sub_pixel_avg_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
specialize qw/vp9_highbd_12_sub_pixel_avg_variance16x16/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_12_sub_pixel_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_12_sub_pixel_variance8x16/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_12_sub_pixel_avg_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
specialize qw/vp9_highbd_12_sub_pixel_avg_variance8x16/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_12_sub_pixel_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_12_sub_pixel_variance16x8/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_12_sub_pixel_avg_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
specialize qw/vp9_highbd_12_sub_pixel_avg_variance16x8/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_12_sub_pixel_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_12_sub_pixel_variance8x8/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_12_sub_pixel_avg_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
specialize qw/vp9_highbd_12_sub_pixel_avg_variance8x8/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_12_sub_pixel_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_12_sub_pixel_variance8x4/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_12_sub_pixel_avg_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
specialize qw/vp9_highbd_12_sub_pixel_avg_variance8x4/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_12_sub_pixel_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_12_sub_pixel_variance4x8/;
add_proto qw/unsigned int vp9_highbd_12_sub_pixel_avg_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
specialize qw/vp9_highbd_12_sub_pixel_avg_variance4x8/;
add_proto qw/unsigned int vp9_highbd_12_sub_pixel_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_12_sub_pixel_variance4x4/;
add_proto qw/unsigned int vp9_highbd_12_sub_pixel_avg_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
specialize qw/vp9_highbd_12_sub_pixel_avg_variance4x4/;
# ENCODEMB INVOKE
add_proto qw/int64_t vp9_highbd_block_error/, "const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz, int bd";

View file

@ -9,7 +9,7 @@
*/
#include "./vp9_rtcd.h"
#include "vp9/common/mips/msa/vp9_macros_msa.h"
#include "vpx_dsp/mips/macros_msa.h"
uint32_t vp9_avg_8x8_msa(const uint8_t *src, int32_t src_stride) {
uint32_t sum_out;

View file

@ -9,7 +9,7 @@
*/
#include "./vp9_rtcd.h"
#include "vp9/common/mips/msa/vp9_macros_msa.h"
#include "vpx_dsp/mips/macros_msa.h"
#define BLOCK_ERROR_BLOCKSIZE_MSA(BSize) \
static int64_t block_error_##BSize##size_msa(const int16_t *coeff_ptr, \

View file

@ -13,7 +13,7 @@
#include "vpx_ports/mem.h"
#include "vp9/common/vp9_idct.h"
#include "vp9/common/mips/msa/vp9_macros_msa.h"
#include "vpx_dsp/mips/macros_msa.h"
#define VP9_DOTP_CONST_PAIR(reg0, reg1, cnst0, cnst1, out0, out1) { \
v8i16 k0_m = __msa_fill_h(cnst0); \

View file

@ -9,7 +9,7 @@
*/
#include "./vp9_rtcd.h"
#include "vp9/common/mips/msa/vp9_macros_msa.h"
#include "vpx_dsp/mips/macros_msa.h"
static void temporal_filter_apply_8size_msa(uint8_t *frm1_ptr,
uint32_t stride,

View file

@ -1023,8 +1023,8 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) {
vpx_highbd_sad32x16_bits8,
vpx_highbd_sad32x16_avg_bits8,
vpx_highbd_8_variance32x16,
vp9_highbd_sub_pixel_variance32x16,
vp9_highbd_sub_pixel_avg_variance32x16,
vpx_highbd_8_sub_pixel_variance32x16,
vpx_highbd_8_sub_pixel_avg_variance32x16,
NULL,
NULL,
vpx_highbd_sad32x16x4d_bits8)
@ -1033,8 +1033,8 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) {
vpx_highbd_sad16x32_bits8,
vpx_highbd_sad16x32_avg_bits8,
vpx_highbd_8_variance16x32,
vp9_highbd_sub_pixel_variance16x32,
vp9_highbd_sub_pixel_avg_variance16x32,
vpx_highbd_8_sub_pixel_variance16x32,
vpx_highbd_8_sub_pixel_avg_variance16x32,
NULL,
NULL,
vpx_highbd_sad16x32x4d_bits8)
@ -1043,8 +1043,8 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) {
vpx_highbd_sad64x32_bits8,
vpx_highbd_sad64x32_avg_bits8,
vpx_highbd_8_variance64x32,
vp9_highbd_sub_pixel_variance64x32,
vp9_highbd_sub_pixel_avg_variance64x32,
vpx_highbd_8_sub_pixel_variance64x32,
vpx_highbd_8_sub_pixel_avg_variance64x32,
NULL,
NULL,
vpx_highbd_sad64x32x4d_bits8)
@ -1053,8 +1053,8 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) {
vpx_highbd_sad32x64_bits8,
vpx_highbd_sad32x64_avg_bits8,
vpx_highbd_8_variance32x64,
vp9_highbd_sub_pixel_variance32x64,
vp9_highbd_sub_pixel_avg_variance32x64,
vpx_highbd_8_sub_pixel_variance32x64,
vpx_highbd_8_sub_pixel_avg_variance32x64,
NULL,
NULL,
vpx_highbd_sad32x64x4d_bits8)
@ -1063,8 +1063,8 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) {
vpx_highbd_sad32x32_bits8,
vpx_highbd_sad32x32_avg_bits8,
vpx_highbd_8_variance32x32,
vp9_highbd_sub_pixel_variance32x32,
vp9_highbd_sub_pixel_avg_variance32x32,
vpx_highbd_8_sub_pixel_variance32x32,
vpx_highbd_8_sub_pixel_avg_variance32x32,
vpx_highbd_sad32x32x3_bits8,
vpx_highbd_sad32x32x8_bits8,
vpx_highbd_sad32x32x4d_bits8)
@ -1073,8 +1073,8 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) {
vpx_highbd_sad64x64_bits8,
vpx_highbd_sad64x64_avg_bits8,
vpx_highbd_8_variance64x64,
vp9_highbd_sub_pixel_variance64x64,
vp9_highbd_sub_pixel_avg_variance64x64,
vpx_highbd_8_sub_pixel_variance64x64,
vpx_highbd_8_sub_pixel_avg_variance64x64,
vpx_highbd_sad64x64x3_bits8,
vpx_highbd_sad64x64x8_bits8,
vpx_highbd_sad64x64x4d_bits8)
@ -1083,8 +1083,8 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) {
vpx_highbd_sad16x16_bits8,
vpx_highbd_sad16x16_avg_bits8,
vpx_highbd_8_variance16x16,
vp9_highbd_sub_pixel_variance16x16,
vp9_highbd_sub_pixel_avg_variance16x16,
vpx_highbd_8_sub_pixel_variance16x16,
vpx_highbd_8_sub_pixel_avg_variance16x16,
vpx_highbd_sad16x16x3_bits8,
vpx_highbd_sad16x16x8_bits8,
vpx_highbd_sad16x16x4d_bits8)
@ -1093,8 +1093,8 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) {
vpx_highbd_sad16x8_bits8,
vpx_highbd_sad16x8_avg_bits8,
vpx_highbd_8_variance16x8,
vp9_highbd_sub_pixel_variance16x8,
vp9_highbd_sub_pixel_avg_variance16x8,
vpx_highbd_8_sub_pixel_variance16x8,
vpx_highbd_8_sub_pixel_avg_variance16x8,
vpx_highbd_sad16x8x3_bits8,
vpx_highbd_sad16x8x8_bits8,
vpx_highbd_sad16x8x4d_bits8)
@ -1103,8 +1103,8 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) {
vpx_highbd_sad8x16_bits8,
vpx_highbd_sad8x16_avg_bits8,
vpx_highbd_8_variance8x16,
vp9_highbd_sub_pixel_variance8x16,
vp9_highbd_sub_pixel_avg_variance8x16,
vpx_highbd_8_sub_pixel_variance8x16,
vpx_highbd_8_sub_pixel_avg_variance8x16,
vpx_highbd_sad8x16x3_bits8,
vpx_highbd_sad8x16x8_bits8,
vpx_highbd_sad8x16x4d_bits8)
@ -1113,8 +1113,8 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) {
vpx_highbd_sad8x8_bits8,
vpx_highbd_sad8x8_avg_bits8,
vpx_highbd_8_variance8x8,
vp9_highbd_sub_pixel_variance8x8,
vp9_highbd_sub_pixel_avg_variance8x8,
vpx_highbd_8_sub_pixel_variance8x8,
vpx_highbd_8_sub_pixel_avg_variance8x8,
vpx_highbd_sad8x8x3_bits8,
vpx_highbd_sad8x8x8_bits8,
vpx_highbd_sad8x8x4d_bits8)
@ -1123,8 +1123,8 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) {
vpx_highbd_sad8x4_bits8,
vpx_highbd_sad8x4_avg_bits8,
vpx_highbd_8_variance8x4,
vp9_highbd_sub_pixel_variance8x4,
vp9_highbd_sub_pixel_avg_variance8x4,
vpx_highbd_8_sub_pixel_variance8x4,
vpx_highbd_8_sub_pixel_avg_variance8x4,
NULL,
vpx_highbd_sad8x4x8_bits8,
vpx_highbd_sad8x4x4d_bits8)
@ -1133,8 +1133,8 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) {
vpx_highbd_sad4x8_bits8,
vpx_highbd_sad4x8_avg_bits8,
vpx_highbd_8_variance4x8,
vp9_highbd_sub_pixel_variance4x8,
vp9_highbd_sub_pixel_avg_variance4x8,
vpx_highbd_8_sub_pixel_variance4x8,
vpx_highbd_8_sub_pixel_avg_variance4x8,
NULL,
vpx_highbd_sad4x8x8_bits8,
vpx_highbd_sad4x8x4d_bits8)
@ -1143,8 +1143,8 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) {
vpx_highbd_sad4x4_bits8,
vpx_highbd_sad4x4_avg_bits8,
vpx_highbd_8_variance4x4,
vp9_highbd_sub_pixel_variance4x4,
vp9_highbd_sub_pixel_avg_variance4x4,
vpx_highbd_8_sub_pixel_variance4x4,
vpx_highbd_8_sub_pixel_avg_variance4x4,
vpx_highbd_sad4x4x3_bits8,
vpx_highbd_sad4x4x8_bits8,
vpx_highbd_sad4x4x4d_bits8)
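
The vpx_highbd_8_/_10_/_12_ replacements encode the bit depth in the name; the deeper variants normalize their accumulated sums back toward an 8-bit scale so the encoder's thresholds stay comparable across bit depths. A hedged sketch of that normalization for the 10-bit case, with the shift amounts treated as assumptions rather than read from vpx_dsp:

#include <stdint.h>
/* Assumed normalization for a 10-bit block: scale SSE down by 16 and the
 * signed sum by 4 before forming variance, mapping the result onto an
 * 8-bit-equivalent range. The shifts are illustrative assumptions. */
static unsigned int highbd_10_variance_sketch(uint64_t sse_long, int64_t sum_long,
                                              int num_pixels, unsigned int *sse)
{
    const uint64_t sse8 = (sse_long + 8) >> 4;
    const int64_t sum8 = sum_long >> 2;
    *sse = (unsigned int)sse8;
    return (unsigned int)(sse8 - (uint64_t)((sum8 * sum8) / num_pixels));
}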
@ -1155,8 +1155,8 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) {
vpx_highbd_sad32x16_bits10,
vpx_highbd_sad32x16_avg_bits10,
vpx_highbd_10_variance32x16,
vp9_highbd_10_sub_pixel_variance32x16,
vp9_highbd_10_sub_pixel_avg_variance32x16,
vpx_highbd_10_sub_pixel_variance32x16,
vpx_highbd_10_sub_pixel_avg_variance32x16,
NULL,
NULL,
vpx_highbd_sad32x16x4d_bits10)
@ -1165,8 +1165,8 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) {
vpx_highbd_sad16x32_bits10,
vpx_highbd_sad16x32_avg_bits10,
vpx_highbd_10_variance16x32,
vp9_highbd_10_sub_pixel_variance16x32,
vp9_highbd_10_sub_pixel_avg_variance16x32,
vpx_highbd_10_sub_pixel_variance16x32,
vpx_highbd_10_sub_pixel_avg_variance16x32,
NULL,
NULL,
vpx_highbd_sad16x32x4d_bits10)
@ -1175,8 +1175,8 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) {
vpx_highbd_sad64x32_bits10,
vpx_highbd_sad64x32_avg_bits10,
vpx_highbd_10_variance64x32,
vp9_highbd_10_sub_pixel_variance64x32,
vp9_highbd_10_sub_pixel_avg_variance64x32,
vpx_highbd_10_sub_pixel_variance64x32,
vpx_highbd_10_sub_pixel_avg_variance64x32,
NULL,
NULL,
vpx_highbd_sad64x32x4d_bits10)
@ -1185,8 +1185,8 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) {
vpx_highbd_sad32x64_bits10,
vpx_highbd_sad32x64_avg_bits10,
vpx_highbd_10_variance32x64,
vp9_highbd_10_sub_pixel_variance32x64,
vp9_highbd_10_sub_pixel_avg_variance32x64,
vpx_highbd_10_sub_pixel_variance32x64,
vpx_highbd_10_sub_pixel_avg_variance32x64,
NULL,
NULL,
vpx_highbd_sad32x64x4d_bits10)
@ -1195,8 +1195,8 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) {
vpx_highbd_sad32x32_bits10,
vpx_highbd_sad32x32_avg_bits10,
vpx_highbd_10_variance32x32,
vp9_highbd_10_sub_pixel_variance32x32,
vp9_highbd_10_sub_pixel_avg_variance32x32,
vpx_highbd_10_sub_pixel_variance32x32,
vpx_highbd_10_sub_pixel_avg_variance32x32,
vpx_highbd_sad32x32x3_bits10,
vpx_highbd_sad32x32x8_bits10,
vpx_highbd_sad32x32x4d_bits10)
@ -1205,8 +1205,8 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) {
vpx_highbd_sad64x64_bits10,
vpx_highbd_sad64x64_avg_bits10,
vpx_highbd_10_variance64x64,
vp9_highbd_10_sub_pixel_variance64x64,
vp9_highbd_10_sub_pixel_avg_variance64x64,
vpx_highbd_10_sub_pixel_variance64x64,
vpx_highbd_10_sub_pixel_avg_variance64x64,
vpx_highbd_sad64x64x3_bits10,
vpx_highbd_sad64x64x8_bits10,
vpx_highbd_sad64x64x4d_bits10)
@ -1215,8 +1215,8 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) {
vpx_highbd_sad16x16_bits10,
vpx_highbd_sad16x16_avg_bits10,
vpx_highbd_10_variance16x16,
vp9_highbd_10_sub_pixel_variance16x16,
vp9_highbd_10_sub_pixel_avg_variance16x16,
vpx_highbd_10_sub_pixel_variance16x16,
vpx_highbd_10_sub_pixel_avg_variance16x16,
vpx_highbd_sad16x16x3_bits10,
vpx_highbd_sad16x16x8_bits10,
vpx_highbd_sad16x16x4d_bits10)
@ -1225,8 +1225,8 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) {
vpx_highbd_sad16x8_bits10,
vpx_highbd_sad16x8_avg_bits10,
vpx_highbd_10_variance16x8,
vp9_highbd_10_sub_pixel_variance16x8,
vp9_highbd_10_sub_pixel_avg_variance16x8,
vpx_highbd_10_sub_pixel_variance16x8,
vpx_highbd_10_sub_pixel_avg_variance16x8,
vpx_highbd_sad16x8x3_bits10,
vpx_highbd_sad16x8x8_bits10,
vpx_highbd_sad16x8x4d_bits10)
@ -1235,8 +1235,8 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) {
vpx_highbd_sad8x16_bits10,
vpx_highbd_sad8x16_avg_bits10,
vpx_highbd_10_variance8x16,
vp9_highbd_10_sub_pixel_variance8x16,
vp9_highbd_10_sub_pixel_avg_variance8x16,
vpx_highbd_10_sub_pixel_variance8x16,
vpx_highbd_10_sub_pixel_avg_variance8x16,
vpx_highbd_sad8x16x3_bits10,
vpx_highbd_sad8x16x8_bits10,
vpx_highbd_sad8x16x4d_bits10)
@ -1245,8 +1245,8 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) {
vpx_highbd_sad8x8_bits10,
vpx_highbd_sad8x8_avg_bits10,
vpx_highbd_10_variance8x8,
vp9_highbd_10_sub_pixel_variance8x8,
vp9_highbd_10_sub_pixel_avg_variance8x8,
vpx_highbd_10_sub_pixel_variance8x8,
vpx_highbd_10_sub_pixel_avg_variance8x8,
vpx_highbd_sad8x8x3_bits10,
vpx_highbd_sad8x8x8_bits10,
vpx_highbd_sad8x8x4d_bits10)
@ -1255,8 +1255,8 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) {
vpx_highbd_sad8x4_bits10,
vpx_highbd_sad8x4_avg_bits10,
vpx_highbd_10_variance8x4,
vp9_highbd_10_sub_pixel_variance8x4,
vp9_highbd_10_sub_pixel_avg_variance8x4,
vpx_highbd_10_sub_pixel_variance8x4,
vpx_highbd_10_sub_pixel_avg_variance8x4,
NULL,
vpx_highbd_sad8x4x8_bits10,
vpx_highbd_sad8x4x4d_bits10)
@ -1265,8 +1265,8 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) {
vpx_highbd_sad4x8_bits10,
vpx_highbd_sad4x8_avg_bits10,
vpx_highbd_10_variance4x8,
vp9_highbd_10_sub_pixel_variance4x8,
vp9_highbd_10_sub_pixel_avg_variance4x8,
vpx_highbd_10_sub_pixel_variance4x8,
vpx_highbd_10_sub_pixel_avg_variance4x8,
NULL,
vpx_highbd_sad4x8x8_bits10,
vpx_highbd_sad4x8x4d_bits10)
@ -1275,8 +1275,8 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) {
vpx_highbd_sad4x4_bits10,
vpx_highbd_sad4x4_avg_bits10,
vpx_highbd_10_variance4x4,
vp9_highbd_10_sub_pixel_variance4x4,
vp9_highbd_10_sub_pixel_avg_variance4x4,
vpx_highbd_10_sub_pixel_variance4x4,
vpx_highbd_10_sub_pixel_avg_variance4x4,
vpx_highbd_sad4x4x3_bits10,
vpx_highbd_sad4x4x8_bits10,
vpx_highbd_sad4x4x4d_bits10)
@ -1287,8 +1287,8 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) {
vpx_highbd_sad32x16_bits12,
vpx_highbd_sad32x16_avg_bits12,
vpx_highbd_12_variance32x16,
vp9_highbd_12_sub_pixel_variance32x16,
vp9_highbd_12_sub_pixel_avg_variance32x16,
vpx_highbd_12_sub_pixel_variance32x16,
vpx_highbd_12_sub_pixel_avg_variance32x16,
NULL,
NULL,
vpx_highbd_sad32x16x4d_bits12)
@ -1297,8 +1297,8 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) {
vpx_highbd_sad16x32_bits12,
vpx_highbd_sad16x32_avg_bits12,
vpx_highbd_12_variance16x32,
vp9_highbd_12_sub_pixel_variance16x32,
vp9_highbd_12_sub_pixel_avg_variance16x32,
vpx_highbd_12_sub_pixel_variance16x32,
vpx_highbd_12_sub_pixel_avg_variance16x32,
NULL,
NULL,
vpx_highbd_sad16x32x4d_bits12)
@ -1307,8 +1307,8 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) {
vpx_highbd_sad64x32_bits12,
vpx_highbd_sad64x32_avg_bits12,
vpx_highbd_12_variance64x32,
vp9_highbd_12_sub_pixel_variance64x32,
vp9_highbd_12_sub_pixel_avg_variance64x32,
vpx_highbd_12_sub_pixel_variance64x32,
vpx_highbd_12_sub_pixel_avg_variance64x32,
NULL,
NULL,
vpx_highbd_sad64x32x4d_bits12)
@ -1317,8 +1317,8 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) {
vpx_highbd_sad32x64_bits12,
vpx_highbd_sad32x64_avg_bits12,
vpx_highbd_12_variance32x64,
vp9_highbd_12_sub_pixel_variance32x64,
vp9_highbd_12_sub_pixel_avg_variance32x64,
vpx_highbd_12_sub_pixel_variance32x64,
vpx_highbd_12_sub_pixel_avg_variance32x64,
NULL,
NULL,
vpx_highbd_sad32x64x4d_bits12)
@ -1327,8 +1327,8 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) {
vpx_highbd_sad32x32_bits12,
vpx_highbd_sad32x32_avg_bits12,
vpx_highbd_12_variance32x32,
vp9_highbd_12_sub_pixel_variance32x32,
vp9_highbd_12_sub_pixel_avg_variance32x32,
vpx_highbd_12_sub_pixel_variance32x32,
vpx_highbd_12_sub_pixel_avg_variance32x32,
vpx_highbd_sad32x32x3_bits12,
vpx_highbd_sad32x32x8_bits12,
vpx_highbd_sad32x32x4d_bits12)
@ -1337,8 +1337,8 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) {
vpx_highbd_sad64x64_bits12,
vpx_highbd_sad64x64_avg_bits12,
vpx_highbd_12_variance64x64,
vp9_highbd_12_sub_pixel_variance64x64,
vp9_highbd_12_sub_pixel_avg_variance64x64,
vpx_highbd_12_sub_pixel_variance64x64,
vpx_highbd_12_sub_pixel_avg_variance64x64,
vpx_highbd_sad64x64x3_bits12,
vpx_highbd_sad64x64x8_bits12,
vpx_highbd_sad64x64x4d_bits12)
@ -1347,8 +1347,8 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) {
vpx_highbd_sad16x16_bits12,
vpx_highbd_sad16x16_avg_bits12,
vpx_highbd_12_variance16x16,
vp9_highbd_12_sub_pixel_variance16x16,
vp9_highbd_12_sub_pixel_avg_variance16x16,
vpx_highbd_12_sub_pixel_variance16x16,
vpx_highbd_12_sub_pixel_avg_variance16x16,
vpx_highbd_sad16x16x3_bits12,
vpx_highbd_sad16x16x8_bits12,
vpx_highbd_sad16x16x4d_bits12)
@ -1357,8 +1357,8 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) {
vpx_highbd_sad16x8_bits12,
vpx_highbd_sad16x8_avg_bits12,
vpx_highbd_12_variance16x8,
vp9_highbd_12_sub_pixel_variance16x8,
vp9_highbd_12_sub_pixel_avg_variance16x8,
vpx_highbd_12_sub_pixel_variance16x8,
vpx_highbd_12_sub_pixel_avg_variance16x8,
vpx_highbd_sad16x8x3_bits12,
vpx_highbd_sad16x8x8_bits12,
vpx_highbd_sad16x8x4d_bits12)
@ -1367,8 +1367,8 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) {
vpx_highbd_sad8x16_bits12,
vpx_highbd_sad8x16_avg_bits12,
vpx_highbd_12_variance8x16,
vp9_highbd_12_sub_pixel_variance8x16,
vp9_highbd_12_sub_pixel_avg_variance8x16,
vpx_highbd_12_sub_pixel_variance8x16,
vpx_highbd_12_sub_pixel_avg_variance8x16,
vpx_highbd_sad8x16x3_bits12,
vpx_highbd_sad8x16x8_bits12,
vpx_highbd_sad8x16x4d_bits12)
@ -1377,8 +1377,8 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) {
vpx_highbd_sad8x8_bits12,
vpx_highbd_sad8x8_avg_bits12,
vpx_highbd_12_variance8x8,
vp9_highbd_12_sub_pixel_variance8x8,
vp9_highbd_12_sub_pixel_avg_variance8x8,
vpx_highbd_12_sub_pixel_variance8x8,
vpx_highbd_12_sub_pixel_avg_variance8x8,
vpx_highbd_sad8x8x3_bits12,
vpx_highbd_sad8x8x8_bits12,
vpx_highbd_sad8x8x4d_bits12)
@ -1387,8 +1387,8 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) {
vpx_highbd_sad8x4_bits12,
vpx_highbd_sad8x4_avg_bits12,
vpx_highbd_12_variance8x4,
vp9_highbd_12_sub_pixel_variance8x4,
vp9_highbd_12_sub_pixel_avg_variance8x4,
vpx_highbd_12_sub_pixel_variance8x4,
vpx_highbd_12_sub_pixel_avg_variance8x4,
NULL,
vpx_highbd_sad8x4x8_bits12,
vpx_highbd_sad8x4x4d_bits12)
@ -1397,8 +1397,8 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) {
vpx_highbd_sad4x8_bits12,
vpx_highbd_sad4x8_avg_bits12,
vpx_highbd_12_variance4x8,
vp9_highbd_12_sub_pixel_variance4x8,
vp9_highbd_12_sub_pixel_avg_variance4x8,
vpx_highbd_12_sub_pixel_variance4x8,
vpx_highbd_12_sub_pixel_avg_variance4x8,
NULL,
vpx_highbd_sad4x8x8_bits12,
vpx_highbd_sad4x8x4d_bits12)
@ -1407,8 +1407,8 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) {
vpx_highbd_sad4x4_bits12,
vpx_highbd_sad4x4_avg_bits12,
vpx_highbd_12_variance4x4,
vp9_highbd_12_sub_pixel_variance4x4,
vp9_highbd_12_sub_pixel_avg_variance4x4,
vpx_highbd_12_sub_pixel_variance4x4,
vpx_highbd_12_sub_pixel_avg_variance4x4,
vpx_highbd_sad4x4x3_bits12,
vpx_highbd_sad4x4x8_bits12,
vpx_highbd_sad4x4x4d_bits12)
@ -1832,62 +1832,62 @@ VP9_COMP *vp9_create_compressor(VP9EncoderConfig *oxcf,
cpi->fn_ptr[BT].sdx4df = SDX4DF;
BFP(BLOCK_32X16, vpx_sad32x16, vpx_sad32x16_avg,
vpx_variance32x16, vp9_sub_pixel_variance32x16,
vp9_sub_pixel_avg_variance32x16, NULL, NULL, vpx_sad32x16x4d)
vpx_variance32x16, vpx_sub_pixel_variance32x16,
vpx_sub_pixel_avg_variance32x16, NULL, NULL, vpx_sad32x16x4d)
BFP(BLOCK_16X32, vpx_sad16x32, vpx_sad16x32_avg,
vpx_variance16x32, vp9_sub_pixel_variance16x32,
vp9_sub_pixel_avg_variance16x32, NULL, NULL, vpx_sad16x32x4d)
vpx_variance16x32, vpx_sub_pixel_variance16x32,
vpx_sub_pixel_avg_variance16x32, NULL, NULL, vpx_sad16x32x4d)
BFP(BLOCK_64X32, vpx_sad64x32, vpx_sad64x32_avg,
vpx_variance64x32, vp9_sub_pixel_variance64x32,
vp9_sub_pixel_avg_variance64x32, NULL, NULL, vpx_sad64x32x4d)
vpx_variance64x32, vpx_sub_pixel_variance64x32,
vpx_sub_pixel_avg_variance64x32, NULL, NULL, vpx_sad64x32x4d)
BFP(BLOCK_32X64, vpx_sad32x64, vpx_sad32x64_avg,
vpx_variance32x64, vp9_sub_pixel_variance32x64,
vp9_sub_pixel_avg_variance32x64, NULL, NULL, vpx_sad32x64x4d)
vpx_variance32x64, vpx_sub_pixel_variance32x64,
vpx_sub_pixel_avg_variance32x64, NULL, NULL, vpx_sad32x64x4d)
BFP(BLOCK_32X32, vpx_sad32x32, vpx_sad32x32_avg,
vpx_variance32x32, vp9_sub_pixel_variance32x32,
vp9_sub_pixel_avg_variance32x32, vpx_sad32x32x3, vpx_sad32x32x8,
vpx_variance32x32, vpx_sub_pixel_variance32x32,
vpx_sub_pixel_avg_variance32x32, vpx_sad32x32x3, vpx_sad32x32x8,
vpx_sad32x32x4d)
BFP(BLOCK_64X64, vpx_sad64x64, vpx_sad64x64_avg,
vpx_variance64x64, vp9_sub_pixel_variance64x64,
vp9_sub_pixel_avg_variance64x64, vpx_sad64x64x3, vpx_sad64x64x8,
vpx_variance64x64, vpx_sub_pixel_variance64x64,
vpx_sub_pixel_avg_variance64x64, vpx_sad64x64x3, vpx_sad64x64x8,
vpx_sad64x64x4d)
BFP(BLOCK_16X16, vpx_sad16x16, vpx_sad16x16_avg,
vpx_variance16x16, vp9_sub_pixel_variance16x16,
vp9_sub_pixel_avg_variance16x16, vpx_sad16x16x3, vpx_sad16x16x8,
vpx_variance16x16, vpx_sub_pixel_variance16x16,
vpx_sub_pixel_avg_variance16x16, vpx_sad16x16x3, vpx_sad16x16x8,
vpx_sad16x16x4d)
BFP(BLOCK_16X8, vpx_sad16x8, vpx_sad16x8_avg,
vpx_variance16x8, vp9_sub_pixel_variance16x8,
vp9_sub_pixel_avg_variance16x8,
vpx_variance16x8, vpx_sub_pixel_variance16x8,
vpx_sub_pixel_avg_variance16x8,
vpx_sad16x8x3, vpx_sad16x8x8, vpx_sad16x8x4d)
BFP(BLOCK_8X16, vpx_sad8x16, vpx_sad8x16_avg,
vpx_variance8x16, vp9_sub_pixel_variance8x16,
vp9_sub_pixel_avg_variance8x16,
vpx_variance8x16, vpx_sub_pixel_variance8x16,
vpx_sub_pixel_avg_variance8x16,
vpx_sad8x16x3, vpx_sad8x16x8, vpx_sad8x16x4d)
BFP(BLOCK_8X8, vpx_sad8x8, vpx_sad8x8_avg,
vpx_variance8x8, vp9_sub_pixel_variance8x8,
vp9_sub_pixel_avg_variance8x8,
vpx_variance8x8, vpx_sub_pixel_variance8x8,
vpx_sub_pixel_avg_variance8x8,
vpx_sad8x8x3, vpx_sad8x8x8, vpx_sad8x8x4d)
BFP(BLOCK_8X4, vpx_sad8x4, vpx_sad8x4_avg,
vpx_variance8x4, vp9_sub_pixel_variance8x4,
vp9_sub_pixel_avg_variance8x4, NULL, vpx_sad8x4x8, vpx_sad8x4x4d)
vpx_variance8x4, vpx_sub_pixel_variance8x4,
vpx_sub_pixel_avg_variance8x4, NULL, vpx_sad8x4x8, vpx_sad8x4x4d)
BFP(BLOCK_4X8, vpx_sad4x8, vpx_sad4x8_avg,
vpx_variance4x8, vp9_sub_pixel_variance4x8,
vp9_sub_pixel_avg_variance4x8, NULL, vpx_sad4x8x8, vpx_sad4x8x4d)
vpx_variance4x8, vpx_sub_pixel_variance4x8,
vpx_sub_pixel_avg_variance4x8, NULL, vpx_sad4x8x8, vpx_sad4x8x4d)
BFP(BLOCK_4X4, vpx_sad4x4, vpx_sad4x4_avg,
vpx_variance4x4, vp9_sub_pixel_variance4x4,
vp9_sub_pixel_avg_variance4x4,
vpx_variance4x4, vpx_sub_pixel_variance4x4,
vpx_sub_pixel_avg_variance4x4,
vpx_sad4x4x3, vpx_sad4x4x8, vpx_sad4x4x4d)
#if CONFIG_VP9_HIGHBITDEPTH

View file

@ -40,7 +40,7 @@
#include "vp9/encoder/vp9_speed_features.h"
#include "vp9/encoder/vp9_svc_layercontext.h"
#include "vp9/encoder/vp9_tokenize.h"
#include "vp9/encoder/vp9_variance.h"
#include "vpx_dsp/variance.h"
#if CONFIG_VP9_TEMPORAL_DENOISING
#include "vp9/encoder/vp9_denoiser.h"

View file

@ -35,7 +35,7 @@
#include "vp9/encoder/vp9_mcomp.h"
#include "vp9/encoder/vp9_quantize.h"
#include "vp9/encoder/vp9_rd.h"
#include "vp9/encoder/vp9_variance.h"
#include "vpx_dsp/variance.h"
#define OUTPUT_FPF 0
#define ARF_STATS_OUTPUT 0
@ -298,7 +298,7 @@ void vp9_end_first_pass(VP9_COMP *cpi) {
}
}
static vp9_variance_fn_t get_block_variance_fn(BLOCK_SIZE bsize) {
static vpx_variance_fn_t get_block_variance_fn(BLOCK_SIZE bsize) {
switch (bsize) {
case BLOCK_8X8:
return vpx_mse8x8;
@ -315,13 +315,13 @@ static unsigned int get_prediction_error(BLOCK_SIZE bsize,
const struct buf_2d *src,
const struct buf_2d *ref) {
unsigned int sse;
const vp9_variance_fn_t fn = get_block_variance_fn(bsize);
const vpx_variance_fn_t fn = get_block_variance_fn(bsize);
fn(src->buf, src->stride, ref->buf, ref->stride, &sse);
return sse;
}
#if CONFIG_VP9_HIGHBITDEPTH
static vp9_variance_fn_t highbd_get_block_variance_fn(BLOCK_SIZE bsize,
static vpx_variance_fn_t highbd_get_block_variance_fn(BLOCK_SIZE bsize,
int bd) {
switch (bd) {
default:
@ -368,7 +368,7 @@ static unsigned int highbd_get_prediction_error(BLOCK_SIZE bsize,
const struct buf_2d *ref,
int bd) {
unsigned int sse;
const vp9_variance_fn_t fn = highbd_get_block_variance_fn(bsize, bd);
const vpx_variance_fn_t fn = highbd_get_block_variance_fn(bsize, bd);
fn(src->buf, src->stride, ref->buf, ref->stride, &sse);
return sse;
}

View file

@ -13,7 +13,7 @@
#define VP9_ENCODER_VP9_MCOMP_H_
#include "vp9/encoder/vp9_block.h"
#include "vp9/encoder/vp9_variance.h"
#include "vpx_dsp/variance.h"
#ifdef __cplusplus
extern "C" {

View file

@ -37,7 +37,6 @@
#include "vp9/encoder/vp9_ratectrl.h"
#include "vp9/encoder/vp9_rd.h"
#include "vp9/encoder/vp9_tokenize.h"
#include "vp9/encoder/vp9_variance.h"
#define RD_THRESH_POW 1.25
#define RD_MULT_EPB_RATIO 64

View file

@ -39,7 +39,6 @@
#include "vp9/encoder/vp9_ratectrl.h"
#include "vp9/encoder/vp9_rd.h"
#include "vp9/encoder/vp9_rdopt.h"
#include "vp9/encoder/vp9_variance.h"
#include "vp9/encoder/vp9_aq_variance.h"
#define LAST_FRAME_MODE_MASK ((1 << GOLDEN_FRAME) | (1 << ALTREF_FRAME) | \

View file

@ -1,380 +0,0 @@
/*
* Copyright (c) 2010 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include "./vp9_rtcd.h"
#include "./vpx_dsp_rtcd.h"
#include "vpx_ports/mem.h"
#include "vpx/vpx_integer.h"
#include "vp9/common/vp9_common.h"
#include "vp9/common/vp9_filter.h"
#include "vp9/encoder/vp9_variance.h"
static const uint8_t bilinear_filters[8][2] = {
{ 128, 0, },
{ 112, 16, },
{ 96, 32, },
{ 80, 48, },
{ 64, 64, },
{ 48, 80, },
{ 32, 96, },
{ 16, 112, },
};
// Applies a 1-D 2-tap bi-linear filter to the source block in either horizontal
// or vertical direction to produce the filtered output block. Used to implement
// the first pass of a 2-D separable filter.
//
// Produces uint16_t output to retain precision for the next pass. Two filter taps
// should sum to VP9_FILTER_WEIGHT. pixel_step defines whether the filter is
// applied horizontally (pixel_step=1) or vertically (pixel_step=stride). It
// defines the offset required to move from one input to the next.
static void var_filter_block2d_bil_first_pass(const uint8_t *src_ptr,
uint16_t *output_ptr,
unsigned int src_pixels_per_line,
int pixel_step,
unsigned int output_height,
unsigned int output_width,
const uint8_t *vp9_filter) {
unsigned int i, j;
for (i = 0; i < output_height; i++) {
for (j = 0; j < output_width; j++) {
output_ptr[j] = ROUND_POWER_OF_TWO((int)src_ptr[0] * vp9_filter[0] +
(int)src_ptr[pixel_step] * vp9_filter[1],
FILTER_BITS);
src_ptr++;
}
// Next row...
src_ptr += src_pixels_per_line - output_width;
output_ptr += output_width;
}
}
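// Example: with the half-pel offset the taps are {64, 64}, so each output is
// ROUND_POWER_OF_TWO(64 * a + 64 * b, FILTER_BITS) == (a + b + 1) >> 1, i.e. a
// rounded average of two neighbouring input pixels.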
// Applies a 1-D 2-tap bi-linear filter to the source block in either horizontal
// or vertical direction to produce the filtered output block. Used to implement
// the second pass of a 2-D separable filter.
//
// Requires 16-bit input as produced by var_filter_block2d_bil_first_pass. Two
// filter taps should sum to VP9_FILTER_WEIGHT. pixel_step defines whether the
// filter is applied horizontally (pixel_step=1) or vertically (pixel_step=
// stride). It defines the offset required to move from one input to the next.
static void var_filter_block2d_bil_second_pass(const uint16_t *src_ptr,
uint8_t *output_ptr,
unsigned int src_pixels_per_line,
unsigned int pixel_step,
unsigned int output_height,
unsigned int output_width,
const uint8_t *vp9_filter) {
unsigned int i, j;
for (i = 0; i < output_height; i++) {
for (j = 0; j < output_width; j++) {
output_ptr[j] = ROUND_POWER_OF_TWO((int)src_ptr[0] * vp9_filter[0] +
(int)src_ptr[pixel_step] * vp9_filter[1],
FILTER_BITS);
src_ptr++;
}
src_ptr += src_pixels_per_line - output_width;
output_ptr += output_width;
}
}
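// SUBPIX_VAR/SUBPIX_AVG_VAR generate the per-block-size C functions: filter
// (H + 1) rows horizontally, then H rows vertically, and pass the result to the
// matching vpx_variance##W##x##H##_c (after averaging with the second predictor
// in the _avg_ case).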
#define SUBPIX_VAR(W, H) \
unsigned int vp9_sub_pixel_variance##W##x##H##_c( \
const uint8_t *src, int src_stride, \
int xoffset, int yoffset, \
const uint8_t *dst, int dst_stride, \
unsigned int *sse) { \
uint16_t fdata3[(H + 1) * W]; \
uint8_t temp2[H * W]; \
\
var_filter_block2d_bil_first_pass(src, fdata3, src_stride, 1, H + 1, W, \
bilinear_filters[xoffset]); \
var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \
bilinear_filters[yoffset]); \
\
return vpx_variance##W##x##H##_c(temp2, W, dst, dst_stride, sse); \
}
#define SUBPIX_AVG_VAR(W, H) \
unsigned int vp9_sub_pixel_avg_variance##W##x##H##_c( \
const uint8_t *src, int src_stride, \
int xoffset, int yoffset, \
const uint8_t *dst, int dst_stride, \
unsigned int *sse, \
const uint8_t *second_pred) { \
uint16_t fdata3[(H + 1) * W]; \
uint8_t temp2[H * W]; \
DECLARE_ALIGNED(16, uint8_t, temp3[H * W]); \
\
var_filter_block2d_bil_first_pass(src, fdata3, src_stride, 1, H + 1, W, \
bilinear_filters[xoffset]); \
var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \
bilinear_filters[yoffset]); \
\
vpx_comp_avg_pred(temp3, second_pred, W, H, temp2, W); \
\
return vpx_variance##W##x##H##_c(temp3, W, dst, dst_stride, sse); \
}
SUBPIX_VAR(4, 4)
SUBPIX_AVG_VAR(4, 4)
SUBPIX_VAR(4, 8)
SUBPIX_AVG_VAR(4, 8)
SUBPIX_VAR(8, 4)
SUBPIX_AVG_VAR(8, 4)
SUBPIX_VAR(8, 8)
SUBPIX_AVG_VAR(8, 8)
SUBPIX_VAR(8, 16)
SUBPIX_AVG_VAR(8, 16)
SUBPIX_VAR(16, 8)
SUBPIX_AVG_VAR(16, 8)
SUBPIX_VAR(16, 16)
SUBPIX_AVG_VAR(16, 16)
SUBPIX_VAR(16, 32)
SUBPIX_AVG_VAR(16, 32)
SUBPIX_VAR(32, 16)
SUBPIX_AVG_VAR(32, 16)
SUBPIX_VAR(32, 32)
SUBPIX_AVG_VAR(32, 32)
SUBPIX_VAR(32, 64)
SUBPIX_AVG_VAR(32, 64)
SUBPIX_VAR(64, 32)
SUBPIX_AVG_VAR(64, 32)
SUBPIX_VAR(64, 64)
SUBPIX_AVG_VAR(64, 64)
#if CONFIG_VP9_HIGHBITDEPTH
static void highbd_var_filter_block2d_bil_first_pass(
const uint8_t *src_ptr8,
uint16_t *output_ptr,
unsigned int src_pixels_per_line,
int pixel_step,
unsigned int output_height,
unsigned int output_width,
const uint8_t *vp9_filter) {
unsigned int i, j;
uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src_ptr8);
for (i = 0; i < output_height; i++) {
for (j = 0; j < output_width; j++) {
output_ptr[j] =
ROUND_POWER_OF_TWO((int)src_ptr[0] * vp9_filter[0] +
(int)src_ptr[pixel_step] * vp9_filter[1],
FILTER_BITS);
src_ptr++;
}
// Next row...
src_ptr += src_pixels_per_line - output_width;
output_ptr += output_width;
}
}
static void highbd_var_filter_block2d_bil_second_pass(
const uint16_t *src_ptr,
uint16_t *output_ptr,
unsigned int src_pixels_per_line,
unsigned int pixel_step,
unsigned int output_height,
unsigned int output_width,
const uint8_t *vp9_filter) {
unsigned int i, j;
for (i = 0; i < output_height; i++) {
for (j = 0; j < output_width; j++) {
output_ptr[j] =
ROUND_POWER_OF_TWO((int)src_ptr[0] * vp9_filter[0] +
(int)src_ptr[pixel_step] * vp9_filter[1],
FILTER_BITS);
src_ptr++;
}
src_ptr += src_pixels_per_line - output_width;
output_ptr += output_width;
}
}
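// The high-bitdepth variants below mirror the 8-bit macros but keep 16-bit
// pixels throughout and dispatch to the 8-, 10- and 12-bit vpx_highbd variance
// kernels.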
#define HIGHBD_SUBPIX_VAR(W, H) \
unsigned int vp9_highbd_sub_pixel_variance##W##x##H##_c( \
const uint8_t *src, int src_stride, \
int xoffset, int yoffset, \
const uint8_t *dst, int dst_stride, \
unsigned int *sse) { \
uint16_t fdata3[(H + 1) * W]; \
uint16_t temp2[H * W]; \
\
highbd_var_filter_block2d_bil_first_pass(src, fdata3, src_stride, 1, H + 1, \
W, bilinear_filters[xoffset]); \
highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \
bilinear_filters[yoffset]); \
\
return vpx_highbd_8_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp2), W, dst, \
dst_stride, sse); \
} \
\
unsigned int vp9_highbd_10_sub_pixel_variance##W##x##H##_c( \
const uint8_t *src, int src_stride, \
int xoffset, int yoffset, \
const uint8_t *dst, int dst_stride, \
unsigned int *sse) { \
uint16_t fdata3[(H + 1) * W]; \
uint16_t temp2[H * W]; \
\
highbd_var_filter_block2d_bil_first_pass(src, fdata3, src_stride, 1, H + 1, \
W, bilinear_filters[xoffset]); \
highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \
bilinear_filters[yoffset]); \
\
return vpx_highbd_10_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp2), \
W, dst, dst_stride, sse); \
} \
\
unsigned int vp9_highbd_12_sub_pixel_variance##W##x##H##_c( \
const uint8_t *src, int src_stride, \
int xoffset, int yoffset, \
const uint8_t *dst, int dst_stride, \
unsigned int *sse) { \
uint16_t fdata3[(H + 1) * W]; \
uint16_t temp2[H * W]; \
\
highbd_var_filter_block2d_bil_first_pass(src, fdata3, src_stride, 1, H + 1, \
W, bilinear_filters[xoffset]); \
highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \
bilinear_filters[yoffset]); \
\
return vpx_highbd_12_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp2), \
W, dst, dst_stride, sse); \
}
#define HIGHBD_SUBPIX_AVG_VAR(W, H) \
unsigned int vp9_highbd_sub_pixel_avg_variance##W##x##H##_c( \
const uint8_t *src, int src_stride, \
int xoffset, int yoffset, \
const uint8_t *dst, int dst_stride, \
unsigned int *sse, \
const uint8_t *second_pred) { \
uint16_t fdata3[(H + 1) * W]; \
uint16_t temp2[H * W]; \
DECLARE_ALIGNED(16, uint16_t, temp3[H * W]); \
\
highbd_var_filter_block2d_bil_first_pass(src, fdata3, src_stride, 1, H + 1, \
W, bilinear_filters[xoffset]); \
highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \
bilinear_filters[yoffset]); \
\
vpx_highbd_comp_avg_pred(temp3, second_pred, W, H, \
CONVERT_TO_BYTEPTR(temp2), W); \
\
return vpx_highbd_8_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), W, dst, \
dst_stride, sse); \
} \
\
unsigned int vp9_highbd_10_sub_pixel_avg_variance##W##x##H##_c( \
const uint8_t *src, int src_stride, \
int xoffset, int yoffset, \
const uint8_t *dst, int dst_stride, \
unsigned int *sse, \
const uint8_t *second_pred) { \
uint16_t fdata3[(H + 1) * W]; \
uint16_t temp2[H * W]; \
DECLARE_ALIGNED(16, uint16_t, temp3[H * W]); \
\
highbd_var_filter_block2d_bil_first_pass(src, fdata3, src_stride, 1, H + 1, \
W, bilinear_filters[xoffset]); \
highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \
bilinear_filters[yoffset]); \
\
vpx_highbd_comp_avg_pred(temp3, second_pred, W, H, \
CONVERT_TO_BYTEPTR(temp2), W); \
\
return vpx_highbd_10_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), \
W, dst, dst_stride, sse); \
} \
\
unsigned int vp9_highbd_12_sub_pixel_avg_variance##W##x##H##_c( \
const uint8_t *src, int src_stride, \
int xoffset, int yoffset, \
const uint8_t *dst, int dst_stride, \
unsigned int *sse, \
const uint8_t *second_pred) { \
uint16_t fdata3[(H + 1) * W]; \
uint16_t temp2[H * W]; \
DECLARE_ALIGNED(16, uint16_t, temp3[H * W]); \
\
highbd_var_filter_block2d_bil_first_pass(src, fdata3, src_stride, 1, H + 1, \
W, bilinear_filters[xoffset]); \
highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \
bilinear_filters[yoffset]); \
\
vpx_highbd_comp_avg_pred(temp3, second_pred, W, H, \
CONVERT_TO_BYTEPTR(temp2), W); \
\
return vpx_highbd_12_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), \
W, dst, dst_stride, sse); \
}
HIGHBD_SUBPIX_VAR(4, 4)
HIGHBD_SUBPIX_AVG_VAR(4, 4)
HIGHBD_SUBPIX_VAR(4, 8)
HIGHBD_SUBPIX_AVG_VAR(4, 8)
HIGHBD_SUBPIX_VAR(8, 4)
HIGHBD_SUBPIX_AVG_VAR(8, 4)
HIGHBD_SUBPIX_VAR(8, 8)
HIGHBD_SUBPIX_AVG_VAR(8, 8)
HIGHBD_SUBPIX_VAR(8, 16)
HIGHBD_SUBPIX_AVG_VAR(8, 16)
HIGHBD_SUBPIX_VAR(16, 8)
HIGHBD_SUBPIX_AVG_VAR(16, 8)
HIGHBD_SUBPIX_VAR(16, 16)
HIGHBD_SUBPIX_AVG_VAR(16, 16)
HIGHBD_SUBPIX_VAR(16, 32)
HIGHBD_SUBPIX_AVG_VAR(16, 32)
HIGHBD_SUBPIX_VAR(32, 16)
HIGHBD_SUBPIX_AVG_VAR(32, 16)
HIGHBD_SUBPIX_VAR(32, 32)
HIGHBD_SUBPIX_AVG_VAR(32, 32)
HIGHBD_SUBPIX_VAR(32, 64)
HIGHBD_SUBPIX_AVG_VAR(32, 64)
HIGHBD_SUBPIX_VAR(64, 32)
HIGHBD_SUBPIX_AVG_VAR(64, 32)
HIGHBD_SUBPIX_VAR(64, 64)
HIGHBD_SUBPIX_AVG_VAR(64, 64)
#endif // CONFIG_VP9_HIGHBITDEPTH

View file

@ -1,81 +0,0 @@
/*
* Copyright (c) 2010 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#ifndef VP9_ENCODER_VP9_VARIANCE_H_
#define VP9_ENCODER_VP9_VARIANCE_H_
#include "vpx/vpx_integer.h"
#include "vpx_ports/mem.h"
#ifdef __cplusplus
extern "C" {
#endif
typedef unsigned int(*vp9_sad_fn_t)(const uint8_t *src_ptr,
int source_stride,
const uint8_t *ref_ptr,
int ref_stride);
typedef unsigned int(*vp9_sad_avg_fn_t)(const uint8_t *src_ptr,
int source_stride,
const uint8_t *ref_ptr,
int ref_stride,
const uint8_t *second_pred);
typedef void (*vp9_sad_multi_fn_t)(const uint8_t *src_ptr,
int source_stride,
const uint8_t *ref_ptr,
int ref_stride,
unsigned int *sad_array);
typedef void (*vp9_sad_multi_d_fn_t)(const uint8_t *src_ptr,
int source_stride,
const uint8_t* const ref_ptr[],
int ref_stride, unsigned int *sad_array);
typedef unsigned int (*vp9_variance_fn_t)(const uint8_t *src_ptr,
int source_stride,
const uint8_t *ref_ptr,
int ref_stride,
unsigned int *sse);
typedef unsigned int (*vp9_subpixvariance_fn_t)(const uint8_t *src_ptr,
int source_stride,
int xoffset,
int yoffset,
const uint8_t *ref_ptr,
int ref_stride,
unsigned int *sse);
typedef unsigned int (*vp9_subp_avg_variance_fn_t)(const uint8_t *src_ptr,
int source_stride,
int xoffset,
int yoffset,
const uint8_t *ref_ptr,
int ref_stride,
unsigned int *sse,
const uint8_t *second_pred);
typedef struct vp9_variance_vtable {
vp9_sad_fn_t sdf;
vp9_sad_avg_fn_t sdaf;
vp9_variance_fn_t vf;
vp9_subpixvariance_fn_t svf;
vp9_subp_avg_variance_fn_t svaf;
vp9_sad_multi_fn_t sdx3f;
vp9_sad_multi_fn_t sdx8f;
vp9_sad_multi_d_fn_t sdx4df;
} vp9_variance_fn_ptr_t;
#ifdef __cplusplus
} // extern "C"
#endif
#endif // VP9_ENCODER_VP9_VARIANCE_H_

View file

@ -1,349 +0,0 @@
/*
* Copyright (c) 2014 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include "./vpx_config.h"
#include "vp9/common/vp9_common.h"
#include "vp9/encoder/vp9_variance.h"
#include "vpx_ports/mem.h"
#define DECL(w, opt) \
int vp9_highbd_sub_pixel_variance##w##xh_##opt(const uint16_t *src, \
ptrdiff_t src_stride, \
int x_offset, int y_offset, \
const uint16_t *dst, \
ptrdiff_t dst_stride, \
int height, unsigned int *sse);
#define DECLS(opt1, opt2) \
DECL(8, opt1); \
DECL(16, opt1)
DECLS(sse2, sse);
// DECLS(ssse3, ssse3);
#undef DECLS
#undef DECL
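// Each FN() expansion computes variance = sse - sum^2 / (w * h); the
// (wlog2 + hlog2) shift implements the division by the block area. The 10- and
// 12-bit versions first scale sum and sse back towards the 8-bit range
// (>> 2 / >> 4 and >> 4 / >> 8 respectively).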
#define FN(w, h, wf, wlog2, hlog2, opt, cast) \
uint32_t vp9_highbd_sub_pixel_variance##w##x##h##_##opt(const uint8_t *src8, \
int src_stride, \
int x_offset, \
int y_offset, \
const uint8_t *dst8, \
int dst_stride, \
uint32_t *sse_ptr) { \
uint32_t sse; \
uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \
int se = vp9_highbd_sub_pixel_variance##wf##xh_##opt(src, src_stride, \
x_offset, y_offset, \
dst, dst_stride, h, \
&sse); \
if (w > wf) { \
unsigned int sse2; \
int se2 = vp9_highbd_sub_pixel_variance##wf##xh_##opt(src + 16, \
src_stride, \
x_offset, y_offset, \
dst + 16, \
dst_stride, \
h, &sse2); \
se += se2; \
sse += sse2; \
if (w > wf * 2) { \
se2 = vp9_highbd_sub_pixel_variance##wf##xh_##opt(src + 32, src_stride, \
x_offset, y_offset, \
dst + 32, dst_stride, \
h, &sse2); \
se += se2; \
sse += sse2; \
se2 = vp9_highbd_sub_pixel_variance##wf##xh_##opt( \
src + 48, src_stride, x_offset, y_offset, \
dst + 48, dst_stride, h, &sse2); \
se += se2; \
sse += sse2; \
} \
} \
*sse_ptr = sse; \
return sse - ((cast se * se) >> (wlog2 + hlog2)); \
} \
\
uint32_t vp9_highbd_10_sub_pixel_variance##w##x##h##_##opt( \
const uint8_t *src8, int src_stride, int x_offset, int y_offset, \
const uint8_t *dst8, int dst_stride, uint32_t *sse_ptr) { \
uint32_t sse; \
uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \
int se = vp9_highbd_sub_pixel_variance##wf##xh_##opt(src, src_stride, \
x_offset, y_offset, \
dst, dst_stride, \
h, &sse); \
if (w > wf) { \
uint32_t sse2; \
int se2 = vp9_highbd_sub_pixel_variance##wf##xh_##opt(src + 16, \
src_stride, \
x_offset, y_offset, \
dst + 16, \
dst_stride, \
h, &sse2); \
se += se2; \
sse += sse2; \
if (w > wf * 2) { \
se2 = vp9_highbd_sub_pixel_variance##wf##xh_##opt(src + 32, src_stride, \
x_offset, y_offset, \
dst + 32, dst_stride, \
h, &sse2); \
se += se2; \
sse += sse2; \
se2 = vp9_highbd_sub_pixel_variance##wf##xh_##opt(src + 48, src_stride, \
x_offset, y_offset, \
dst + 48, dst_stride, \
h, &sse2); \
se += se2; \
sse += sse2; \
} \
} \
se = ROUND_POWER_OF_TWO(se, 2); \
sse = ROUND_POWER_OF_TWO(sse, 4); \
*sse_ptr = sse; \
return sse - ((cast se * se) >> (wlog2 + hlog2)); \
} \
\
uint32_t vp9_highbd_12_sub_pixel_variance##w##x##h##_##opt( \
const uint8_t *src8, int src_stride, int x_offset, int y_offset, \
const uint8_t *dst8, int dst_stride, uint32_t *sse_ptr) { \
int start_row; \
uint32_t sse; \
int se = 0; \
uint64_t long_sse = 0; \
uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \
for (start_row = 0; start_row < h; start_row +=16) { \
uint32_t sse2; \
int height = h - start_row < 16 ? h - start_row : 16; \
int se2 = vp9_highbd_sub_pixel_variance##wf##xh_##opt( \
src + (start_row * src_stride), src_stride, \
x_offset, y_offset, dst + (start_row * dst_stride), \
dst_stride, height, &sse2); \
se += se2; \
long_sse += sse2; \
if (w > wf) { \
se2 = vp9_highbd_sub_pixel_variance##wf##xh_##opt( \
src + 16 + (start_row * src_stride), src_stride, \
x_offset, y_offset, dst + 16 + (start_row * dst_stride), \
dst_stride, height, &sse2); \
se += se2; \
long_sse += sse2; \
if (w > wf * 2) { \
se2 = vp9_highbd_sub_pixel_variance##wf##xh_##opt( \
src + 32 + (start_row * src_stride), src_stride, \
x_offset, y_offset, dst + 32 + (start_row * dst_stride), \
dst_stride, height, &sse2); \
se += se2; \
long_sse += sse2; \
se2 = vp9_highbd_sub_pixel_variance##wf##xh_##opt( \
src + 48 + (start_row * src_stride), src_stride, \
x_offset, y_offset, dst + 48 + (start_row * dst_stride), \
dst_stride, height, &sse2); \
se += se2; \
long_sse += sse2; \
}\
} \
} \
se = ROUND_POWER_OF_TWO(se, 4); \
sse = ROUND_POWER_OF_TWO(long_sse, 8); \
*sse_ptr = sse; \
return sse - ((cast se * se) >> (wlog2 + hlog2)); \
}
#define FNS(opt1, opt2) \
FN(64, 64, 16, 6, 6, opt1, (int64_t)); \
FN(64, 32, 16, 6, 5, opt1, (int64_t)); \
FN(32, 64, 16, 5, 6, opt1, (int64_t)); \
FN(32, 32, 16, 5, 5, opt1, (int64_t)); \
FN(32, 16, 16, 5, 4, opt1, (int64_t)); \
FN(16, 32, 16, 4, 5, opt1, (int64_t)); \
FN(16, 16, 16, 4, 4, opt1, (int64_t)); \
FN(16, 8, 16, 4, 3, opt1, (int64_t)); \
FN(8, 16, 8, 3, 4, opt1, (int64_t)); \
FN(8, 8, 8, 3, 3, opt1, (int64_t)); \
FN(8, 4, 8, 3, 2, opt1, (int64_t));
FNS(sse2, sse);
#undef FNS
#undef FN
#define DECL(w, opt) \
int vp9_highbd_sub_pixel_avg_variance##w##xh_##opt(const uint16_t *src, \
ptrdiff_t src_stride, \
int x_offset, int y_offset, \
const uint16_t *dst, \
ptrdiff_t dst_stride, \
const uint16_t *sec, \
ptrdiff_t sec_stride, \
int height, \
unsigned int *sse);
#define DECLS(opt1) \
DECL(16, opt1) \
DECL(8, opt1)
DECLS(sse2);
#undef DECL
#undef DECLS
#define FN(w, h, wf, wlog2, hlog2, opt, cast) \
uint32_t vp9_highbd_sub_pixel_avg_variance##w##x##h##_##opt( \
const uint8_t *src8, int src_stride, int x_offset, int y_offset, \
const uint8_t *dst8, int dst_stride, uint32_t *sse_ptr, \
const uint8_t *sec8) { \
uint32_t sse; \
uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \
uint16_t *sec = CONVERT_TO_SHORTPTR(sec8); \
int se = vp9_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
src, src_stride, x_offset, \
y_offset, dst, dst_stride, sec, w, h, &sse); \
if (w > wf) { \
uint32_t sse2; \
int se2 = vp9_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
src + 16, src_stride, x_offset, y_offset, \
dst + 16, dst_stride, sec + 16, w, h, &sse2); \
se += se2; \
sse += sse2; \
if (w > wf * 2) { \
se2 = vp9_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
src + 32, src_stride, x_offset, y_offset, \
dst + 32, dst_stride, sec + 32, w, h, &sse2); \
se += se2; \
sse += sse2; \
se2 = vp9_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
src + 48, src_stride, x_offset, y_offset, \
dst + 48, dst_stride, sec + 48, w, h, &sse2); \
se += se2; \
sse += sse2; \
} \
} \
*sse_ptr = sse; \
return sse - ((cast se * se) >> (wlog2 + hlog2)); \
} \
\
uint32_t vp9_highbd_10_sub_pixel_avg_variance##w##x##h##_##opt( \
const uint8_t *src8, int src_stride, int x_offset, int y_offset, \
const uint8_t *dst8, int dst_stride, uint32_t *sse_ptr, \
const uint8_t *sec8) { \
uint32_t sse; \
uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \
uint16_t *sec = CONVERT_TO_SHORTPTR(sec8); \
int se = vp9_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
src, src_stride, x_offset, \
y_offset, dst, dst_stride, \
sec, w, h, &sse); \
if (w > wf) { \
uint32_t sse2; \
int se2 = vp9_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
src + 16, src_stride, \
x_offset, y_offset, \
dst + 16, dst_stride, \
sec + 16, w, h, &sse2); \
se += se2; \
sse += sse2; \
if (w > wf * 2) { \
se2 = vp9_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
src + 32, src_stride, \
x_offset, y_offset, \
dst + 32, dst_stride, \
sec + 32, w, h, &sse2); \
se += se2; \
sse += sse2; \
se2 = vp9_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
src + 48, src_stride, \
x_offset, y_offset, \
dst + 48, dst_stride, \
sec + 48, w, h, &sse2); \
se += se2; \
sse += sse2; \
} \
} \
se = ROUND_POWER_OF_TWO(se, 2); \
sse = ROUND_POWER_OF_TWO(sse, 4); \
*sse_ptr = sse; \
return sse - ((cast se * se) >> (wlog2 + hlog2)); \
} \
\
uint32_t vp9_highbd_12_sub_pixel_avg_variance##w##x##h##_##opt( \
const uint8_t *src8, int src_stride, int x_offset, int y_offset, \
const uint8_t *dst8, int dst_stride, uint32_t *sse_ptr, \
const uint8_t *sec8) { \
int start_row; \
uint32_t sse; \
int se = 0; \
uint64_t long_sse = 0; \
uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \
uint16_t *sec = CONVERT_TO_SHORTPTR(sec8); \
for (start_row = 0; start_row < h; start_row +=16) { \
uint32_t sse2; \
int height = h - start_row < 16 ? h - start_row : 16; \
int se2 = vp9_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
src + (start_row * src_stride), src_stride, x_offset, \
y_offset, dst + (start_row * dst_stride), dst_stride, \
sec + (start_row * w), w, height, &sse2); \
se += se2; \
long_sse += sse2; \
if (w > wf) { \
se2 = vp9_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
src + 16 + (start_row * src_stride), src_stride, \
x_offset, y_offset, \
dst + 16 + (start_row * dst_stride), dst_stride, \
sec + 16 + (start_row * w), w, height, &sse2); \
se += se2; \
long_sse += sse2; \
if (w > wf * 2) { \
se2 = vp9_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
src + 32 + (start_row * src_stride), src_stride, \
x_offset, y_offset, \
dst + 32 + (start_row * dst_stride), dst_stride, \
sec + 32 + (start_row * w), w, height, &sse2); \
se += se2; \
long_sse += sse2; \
se2 = vp9_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
src + 48 + (start_row * src_stride), src_stride, \
x_offset, y_offset, \
dst + 48 + (start_row * dst_stride), dst_stride, \
sec + 48 + (start_row * w), w, height, &sse2); \
se += se2; \
long_sse += sse2; \
} \
} \
} \
se = ROUND_POWER_OF_TWO(se, 4); \
sse = ROUND_POWER_OF_TWO(long_sse, 8); \
*sse_ptr = sse; \
return sse - ((cast se * se) >> (wlog2 + hlog2)); \
}
#define FNS(opt1) \
FN(64, 64, 16, 6, 6, opt1, (int64_t)); \
FN(64, 32, 16, 6, 5, opt1, (int64_t)); \
FN(32, 64, 16, 5, 6, opt1, (int64_t)); \
FN(32, 32, 16, 5, 5, opt1, (int64_t)); \
FN(32, 16, 16, 5, 4, opt1, (int64_t)); \
FN(16, 32, 16, 4, 5, opt1, (int64_t)); \
FN(16, 16, 16, 4, 4, opt1, (int64_t)); \
FN(16, 8, 16, 4, 3, opt1, (int64_t)); \
FN(8, 16, 8, 3, 4, opt1, (int64_t)); \
FN(8, 8, 8, 3, 3, opt1, (int64_t)); \
FN(8, 4, 8, 3, 2, opt1, (int64_t));
FNS(sse2);
#undef FNS
#undef FN

View file

@ -1,525 +0,0 @@
/*
* Copyright (c) 2012 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include <immintrin.h> // AVX2
#include "./vp9_rtcd.h"
#include "vpx_ports/mem.h"
#include "vp9/encoder/vp9_variance.h"
DECLARE_ALIGNED(32, static const uint8_t, bilinear_filters_avx2[512]) = {
16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0,
16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0,
14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2,
14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2,
12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4,
12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4,
10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6,
10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6,
8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10,
6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10,
4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12,
4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12,
2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14,
2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14,
};
#define FILTER_SRC(filter) \
/* filter the source */ \
exp_src_lo = _mm256_maddubs_epi16(exp_src_lo, filter); \
exp_src_hi = _mm256_maddubs_epi16(exp_src_hi, filter); \
\
/* add 8 to source */ \
exp_src_lo = _mm256_add_epi16(exp_src_lo, pw8); \
exp_src_hi = _mm256_add_epi16(exp_src_hi, pw8); \
\
/* divide source by 16 */ \
exp_src_lo = _mm256_srai_epi16(exp_src_lo, 4); \
exp_src_hi = _mm256_srai_epi16(exp_src_hi, 4);
#define MERGE_WITH_SRC(src_reg, reg) \
exp_src_lo = _mm256_unpacklo_epi8(src_reg, reg); \
exp_src_hi = _mm256_unpackhi_epi8(src_reg, reg);
#define LOAD_SRC_DST \
/* load source and destination */ \
src_reg = _mm256_loadu_si256((__m256i const *) (src)); \
dst_reg = _mm256_loadu_si256((__m256i const *) (dst));
#define AVG_NEXT_SRC(src_reg, size_stride) \
src_next_reg = _mm256_loadu_si256((__m256i const *) \
(src + size_stride)); \
/* average the current source with the source size_stride bytes ahead */ \
src_reg = _mm256_avg_epu8(src_reg, src_next_reg);
#define MERGE_NEXT_SRC(src_reg, size_stride) \
src_next_reg = _mm256_loadu_si256((__m256i const *) \
(src + size_stride)); \
MERGE_WITH_SRC(src_reg, src_next_reg)
#define CALC_SUM_SSE_INSIDE_LOOP \
/* expand each byte to 2 bytes */ \
exp_dst_lo = _mm256_unpacklo_epi8(dst_reg, zero_reg); \
exp_dst_hi = _mm256_unpackhi_epi8(dst_reg, zero_reg); \
/* source - dest */ \
exp_src_lo = _mm256_sub_epi16(exp_src_lo, exp_dst_lo); \
exp_src_hi = _mm256_sub_epi16(exp_src_hi, exp_dst_hi); \
/* calculate sum */ \
sum_reg = _mm256_add_epi16(sum_reg, exp_src_lo); \
exp_src_lo = _mm256_madd_epi16(exp_src_lo, exp_src_lo); \
sum_reg = _mm256_add_epi16(sum_reg, exp_src_hi); \
exp_src_hi = _mm256_madd_epi16(exp_src_hi, exp_src_hi); \
/* calculate sse */ \
sse_reg = _mm256_add_epi32(sse_reg, exp_src_lo); \
sse_reg = _mm256_add_epi32(sse_reg, exp_src_hi);
// final reduction of the sum and sse accumulators
#define CALC_SUM_AND_SSE \
res_cmp = _mm256_cmpgt_epi16(zero_reg, sum_reg); \
sse_reg_hi = _mm256_srli_si256(sse_reg, 8); \
sum_reg_lo = _mm256_unpacklo_epi16(sum_reg, res_cmp); \
sum_reg_hi = _mm256_unpackhi_epi16(sum_reg, res_cmp); \
sse_reg = _mm256_add_epi32(sse_reg, sse_reg_hi); \
sum_reg = _mm256_add_epi32(sum_reg_lo, sum_reg_hi); \
\
sse_reg_hi = _mm256_srli_si256(sse_reg, 4); \
sum_reg_hi = _mm256_srli_si256(sum_reg, 8); \
\
sse_reg = _mm256_add_epi32(sse_reg, sse_reg_hi); \
sum_reg = _mm256_add_epi32(sum_reg, sum_reg_hi); \
*((int*)sse)= _mm_cvtsi128_si32(_mm256_castsi256_si128(sse_reg)) + \
_mm_cvtsi128_si32(_mm256_extractf128_si256(sse_reg, 1)); \
sum_reg_hi = _mm256_srli_si256(sum_reg, 4); \
sum_reg = _mm256_add_epi32(sum_reg, sum_reg_hi); \
sum = _mm_cvtsi128_si32(_mm256_castsi256_si128(sum_reg)) + \
_mm_cvtsi128_si32(_mm256_extractf128_si256(sum_reg, 1));
unsigned int vp9_sub_pixel_variance32xh_avx2(const uint8_t *src,
int src_stride,
int x_offset,
int y_offset,
const uint8_t *dst,
int dst_stride,
int height,
unsigned int *sse) {
__m256i src_reg, dst_reg, exp_src_lo, exp_src_hi, exp_dst_lo, exp_dst_hi;
__m256i sse_reg, sum_reg, sse_reg_hi, res_cmp, sum_reg_lo, sum_reg_hi;
__m256i zero_reg;
int i, sum;
sum_reg = _mm256_set1_epi16(0);
sse_reg = _mm256_set1_epi16(0);
zero_reg = _mm256_set1_epi16(0);
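// The nested if/else below specializes three cases per axis: zero offset
// (copy), offset 8 (a plain byte average of adjacent samples) and the general
// bilinear offsets (2-tap filter), to avoid unnecessary filtering work.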
// x_offset = 0 and y_offset = 0
if (x_offset == 0) {
if (y_offset == 0) {
for (i = 0; i < height ; i++) {
LOAD_SRC_DST
// expand each byte to 2 bytes
MERGE_WITH_SRC(src_reg, zero_reg)
CALC_SUM_SSE_INSIDE_LOOP
src+= src_stride;
dst+= dst_stride;
}
// x_offset = 0 and y_offset = 8
} else if (y_offset == 8) {
__m256i src_next_reg;
for (i = 0; i < height ; i++) {
LOAD_SRC_DST
AVG_NEXT_SRC(src_reg, src_stride)
// expand each byte to 2 bytes
MERGE_WITH_SRC(src_reg, zero_reg)
CALC_SUM_SSE_INSIDE_LOOP
src+= src_stride;
dst+= dst_stride;
}
// x_offset = 0 and y_offset = bilin interpolation
} else {
__m256i filter, pw8, src_next_reg;
y_offset <<= 5;
filter = _mm256_load_si256((__m256i const *)
(bilinear_filters_avx2 + y_offset));
pw8 = _mm256_set1_epi16(8);
for (i = 0; i < height ; i++) {
LOAD_SRC_DST
MERGE_NEXT_SRC(src_reg, src_stride)
FILTER_SRC(filter)
CALC_SUM_SSE_INSIDE_LOOP
src+= src_stride;
dst+= dst_stride;
}
}
// x_offset = 8 and y_offset = 0
} else if (x_offset == 8) {
if (y_offset == 0) {
__m256i src_next_reg;
for (i = 0; i < height ; i++) {
LOAD_SRC_DST
AVG_NEXT_SRC(src_reg, 1)
// expand each byte to 2 bytes
MERGE_WITH_SRC(src_reg, zero_reg)
CALC_SUM_SSE_INSIDE_LOOP
src+= src_stride;
dst+= dst_stride;
}
// x_offset = 8 and y_offset = 8
} else if (y_offset == 8) {
__m256i src_next_reg, src_avg;
// load the source and a second source offset by one byte
src_reg = _mm256_loadu_si256((__m256i const *) (src));
AVG_NEXT_SRC(src_reg, 1)
for (i = 0; i < height ; i++) {
src_avg = src_reg;
src+= src_stride;
LOAD_SRC_DST
AVG_NEXT_SRC(src_reg, 1)
// average the previous and current horizontal averages
src_avg = _mm256_avg_epu8(src_avg, src_reg);
// expand each byte to 2 bytes
MERGE_WITH_SRC(src_avg, zero_reg)
// save current source average
CALC_SUM_SSE_INSIDE_LOOP
dst+= dst_stride;
}
// x_offset = 8 and y_offset = bilin interpolation
} else {
__m256i filter, pw8, src_next_reg, src_avg;
y_offset <<= 5;
filter = _mm256_load_si256((__m256i const *)
(bilinear_filters_avx2 + y_offset));
pw8 = _mm256_set1_epi16(8);
// load the source and a second source offset by one byte
src_reg = _mm256_loadu_si256((__m256i const *) (src));
AVG_NEXT_SRC(src_reg, 1)
for (i = 0; i < height ; i++) {
// save current source average
src_avg = src_reg;
src+= src_stride;
LOAD_SRC_DST
AVG_NEXT_SRC(src_reg, 1)
MERGE_WITH_SRC(src_avg, src_reg)
FILTER_SRC(filter)
CALC_SUM_SSE_INSIDE_LOOP
dst+= dst_stride;
}
}
// x_offset = bilin interpolation and y_offset = 0
} else {
if (y_offset == 0) {
__m256i filter, pw8, src_next_reg;
x_offset <<= 5;
filter = _mm256_load_si256((__m256i const *)
(bilinear_filters_avx2 + x_offset));
pw8 = _mm256_set1_epi16(8);
for (i = 0; i < height ; i++) {
LOAD_SRC_DST
MERGE_NEXT_SRC(src_reg, 1)
FILTER_SRC(filter)
CALC_SUM_SSE_INSIDE_LOOP
src+= src_stride;
dst+= dst_stride;
}
// x_offset = bilin interpolation and y_offset = 8
} else if (y_offset == 8) {
__m256i filter, pw8, src_next_reg, src_pack;
x_offset <<= 5;
filter = _mm256_load_si256((__m256i const *)
(bilinear_filters_avx2 + x_offset));
pw8 = _mm256_set1_epi16(8);
src_reg = _mm256_loadu_si256((__m256i const *) (src));
MERGE_NEXT_SRC(src_reg, 1)
FILTER_SRC(filter)
// pack the 16-bit results back to 8 bits within each 128-bit lane
src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
for (i = 0; i < height ; i++) {
src+= src_stride;
LOAD_SRC_DST
MERGE_NEXT_SRC(src_reg, 1)
FILTER_SRC(filter)
src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
// average the previous and current packed rows
src_pack = _mm256_avg_epu8(src_pack, src_reg);
MERGE_WITH_SRC(src_pack, zero_reg)
CALC_SUM_SSE_INSIDE_LOOP
src_pack = src_reg;
dst+= dst_stride;
}
// x_offset = bilin interpolation and y_offset = bilin interpolation
} else {
__m256i xfilter, yfilter, pw8, src_next_reg, src_pack;
x_offset <<= 5;
xfilter = _mm256_load_si256((__m256i const *)
(bilinear_filters_avx2 + x_offset));
y_offset <<= 5;
yfilter = _mm256_load_si256((__m256i const *)
(bilinear_filters_avx2 + y_offset));
pw8 = _mm256_set1_epi16(8);
// load the source and a second source offset by one byte
src_reg = _mm256_loadu_si256((__m256i const *) (src));
MERGE_NEXT_SRC(src_reg, 1)
FILTER_SRC(xfilter)
// pack the 16-bit results back to 8 bits within each 128-bit lane
src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
for (i = 0; i < height ; i++) {
src+= src_stride;
LOAD_SRC_DST
MERGE_NEXT_SRC(src_reg, 1)
FILTER_SRC(xfilter)
src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
// merge previous pack to current pack source
MERGE_WITH_SRC(src_pack, src_reg)
// filter the source
FILTER_SRC(yfilter)
src_pack = src_reg;
CALC_SUM_SSE_INSIDE_LOOP
dst+= dst_stride;
}
}
}
CALC_SUM_AND_SSE
return sum;
}
unsigned int vp9_sub_pixel_avg_variance32xh_avx2(const uint8_t *src,
int src_stride,
int x_offset,
int y_offset,
const uint8_t *dst,
int dst_stride,
const uint8_t *sec,
int sec_stride,
int height,
unsigned int *sse) {
__m256i sec_reg;
__m256i src_reg, dst_reg, exp_src_lo, exp_src_hi, exp_dst_lo, exp_dst_hi;
__m256i sse_reg, sum_reg, sse_reg_hi, res_cmp, sum_reg_lo, sum_reg_hi;
__m256i zero_reg;
int i, sum;
sum_reg = _mm256_set1_epi16(0);
sse_reg = _mm256_set1_epi16(0);
zero_reg = _mm256_set1_epi16(0);
// x_offset = 0 and y_offset = 0
if (x_offset == 0) {
if (y_offset == 0) {
for (i = 0; i < height ; i++) {
LOAD_SRC_DST
sec_reg = _mm256_loadu_si256((__m256i const *) (sec));
src_reg = _mm256_avg_epu8(src_reg, sec_reg);
sec+= sec_stride;
// expand each byte to 2 bytes
MERGE_WITH_SRC(src_reg, zero_reg)
CALC_SUM_SSE_INSIDE_LOOP
src+= src_stride;
dst+= dst_stride;
}
} else if (y_offset == 8) {
__m256i src_next_reg;
for (i = 0; i < height ; i++) {
LOAD_SRC_DST
AVG_NEXT_SRC(src_reg, src_stride)
sec_reg = _mm256_loadu_si256((__m256i const *) (sec));
src_reg = _mm256_avg_epu8(src_reg, sec_reg);
sec+= sec_stride;
// expand each byte to 2 bytes
MERGE_WITH_SRC(src_reg, zero_reg)
CALC_SUM_SSE_INSIDE_LOOP
src+= src_stride;
dst+= dst_stride;
}
// x_offset = 0 and y_offset = bilin interpolation
} else {
__m256i filter, pw8, src_next_reg;
y_offset <<= 5;
filter = _mm256_load_si256((__m256i const *)
(bilinear_filters_avx2 + y_offset));
pw8 = _mm256_set1_epi16(8);
for (i = 0; i < height ; i++) {
LOAD_SRC_DST
MERGE_NEXT_SRC(src_reg, src_stride)
FILTER_SRC(filter)
src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
sec_reg = _mm256_loadu_si256((__m256i const *) (sec));
src_reg = _mm256_avg_epu8(src_reg, sec_reg);
sec+= sec_stride;
MERGE_WITH_SRC(src_reg, zero_reg)
CALC_SUM_SSE_INSIDE_LOOP
src+= src_stride;
dst+= dst_stride;
}
}
// x_offset = 8 and y_offset = 0
} else if (x_offset == 8) {
if (y_offset == 0) {
__m256i src_next_reg;
for (i = 0; i < height ; i++) {
LOAD_SRC_DST
AVG_NEXT_SRC(src_reg, 1)
sec_reg = _mm256_loadu_si256((__m256i const *) (sec));
src_reg = _mm256_avg_epu8(src_reg, sec_reg);
sec+= sec_stride;
// expand each byte to 2 bytes
MERGE_WITH_SRC(src_reg, zero_reg)
CALC_SUM_SSE_INSIDE_LOOP
src+= src_stride;
dst+= dst_stride;
}
// x_offset = 8 and y_offset = 8
} else if (y_offset == 8) {
__m256i src_next_reg, src_avg;
// load the source and a second source offset by one byte
src_reg = _mm256_loadu_si256((__m256i const *) (src));
AVG_NEXT_SRC(src_reg, 1)
for (i = 0; i < height ; i++) {
// save current source average
src_avg = src_reg;
src+= src_stride;
LOAD_SRC_DST
AVG_NEXT_SRC(src_reg, 1)
// average the previous and current horizontal averages
src_avg = _mm256_avg_epu8(src_avg, src_reg);
sec_reg = _mm256_loadu_si256((__m256i const *) (sec));
src_avg = _mm256_avg_epu8(src_avg, sec_reg);
sec+= sec_stride;
// expand each byte to 2 bytes
MERGE_WITH_SRC(src_avg, zero_reg)
CALC_SUM_SSE_INSIDE_LOOP
dst+= dst_stride;
}
// x_offset = 8 and y_offset = bilin interpolation
} else {
__m256i filter, pw8, src_next_reg, src_avg;
y_offset <<= 5;
filter = _mm256_load_si256((__m256i const *)
(bilinear_filters_avx2 + y_offset));
pw8 = _mm256_set1_epi16(8);
// load the source and a second source offset by one byte
src_reg = _mm256_loadu_si256((__m256i const *) (src));
AVG_NEXT_SRC(src_reg, 1)
for (i = 0; i < height ; i++) {
// save current source average
src_avg = src_reg;
src+= src_stride;
LOAD_SRC_DST
AVG_NEXT_SRC(src_reg, 1)
MERGE_WITH_SRC(src_avg, src_reg)
FILTER_SRC(filter)
src_avg = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
sec_reg = _mm256_loadu_si256((__m256i const *) (sec));
src_avg = _mm256_avg_epu8(src_avg, sec_reg);
// expand each byte to 2 bytes
MERGE_WITH_SRC(src_avg, zero_reg)
sec+= sec_stride;
CALC_SUM_SSE_INSIDE_LOOP
dst+= dst_stride;
}
}
// x_offset = bilin interpolation and y_offset = 0
} else {
if (y_offset == 0) {
__m256i filter, pw8, src_next_reg;
x_offset <<= 5;
filter = _mm256_load_si256((__m256i const *)
(bilinear_filters_avx2 + x_offset));
pw8 = _mm256_set1_epi16(8);
for (i = 0; i < height ; i++) {
LOAD_SRC_DST
MERGE_NEXT_SRC(src_reg, 1)
FILTER_SRC(filter)
src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
sec_reg = _mm256_loadu_si256((__m256i const *) (sec));
src_reg = _mm256_avg_epu8(src_reg, sec_reg);
MERGE_WITH_SRC(src_reg, zero_reg)
sec+= sec_stride;
CALC_SUM_SSE_INSIDE_LOOP
src+= src_stride;
dst+= dst_stride;
}
// x_offset = bilin interpolation and y_offset = 8
} else if (y_offset == 8) {
__m256i filter, pw8, src_next_reg, src_pack;
x_offset <<= 5;
filter = _mm256_load_si256((__m256i const *)
(bilinear_filters_avx2 + x_offset));
pw8 = _mm256_set1_epi16(8);
src_reg = _mm256_loadu_si256((__m256i const *) (src));
MERGE_NEXT_SRC(src_reg, 1)
FILTER_SRC(filter)
// pack the 16-bit results back to 8 bits within each 128-bit lane
src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
for (i = 0; i < height ; i++) {
src+= src_stride;
LOAD_SRC_DST
MERGE_NEXT_SRC(src_reg, 1)
FILTER_SRC(filter)
src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
// average the previous and current packed rows
src_pack = _mm256_avg_epu8(src_pack, src_reg);
sec_reg = _mm256_loadu_si256((__m256i const *) (sec));
src_pack = _mm256_avg_epu8(src_pack, sec_reg);
sec+= sec_stride;
MERGE_WITH_SRC(src_pack, zero_reg)
src_pack = src_reg;
CALC_SUM_SSE_INSIDE_LOOP
dst+= dst_stride;
}
// x_offset = bilin interpolation and y_offset = bilin interpolation
} else {
__m256i xfilter, yfilter, pw8, src_next_reg, src_pack;
x_offset <<= 5;
xfilter = _mm256_load_si256((__m256i const *)
(bilinear_filters_avx2 + x_offset));
y_offset <<= 5;
yfilter = _mm256_load_si256((__m256i const *)
(bilinear_filters_avx2 + y_offset));
pw8 = _mm256_set1_epi16(8);
// load the source and a second source offset by one byte
src_reg = _mm256_loadu_si256((__m256i const *) (src));
MERGE_NEXT_SRC(src_reg, 1)
FILTER_SRC(xfilter)
// pack the 16-bit results back to 8 bits within each 128-bit lane
src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
for (i = 0; i < height ; i++) {
src+= src_stride;
LOAD_SRC_DST
MERGE_NEXT_SRC(src_reg, 1)
FILTER_SRC(xfilter)
src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
// merge previous pack to current pack source
MERGE_WITH_SRC(src_pack, src_reg)
// filter the source
FILTER_SRC(yfilter)
src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
sec_reg = _mm256_loadu_si256((__m256i const *) (sec));
src_pack = _mm256_avg_epu8(src_pack, sec_reg);
MERGE_WITH_SRC(src_pack, zero_reg)
src_pack = src_reg;
sec+= sec_stride;
CALC_SUM_SSE_INSIDE_LOOP
dst+= dst_stride;
}
}
}
CALC_SUM_AND_SSE
return sum;
}

View file

@ -1,104 +0,0 @@
/*
* Copyright (c) 2012 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include "./vp9_rtcd.h"
#include "./vpx_config.h"
#include "vp9/encoder/vp9_variance.h"
#include "vpx_ports/mem.h"
unsigned int vp9_sub_pixel_variance32xh_avx2(const uint8_t *src, int src_stride,
int x_offset, int y_offset,
const uint8_t *dst, int dst_stride,
int height,
unsigned int *sse);
unsigned int vp9_sub_pixel_avg_variance32xh_avx2(const uint8_t *src,
int src_stride,
int x_offset,
int y_offset,
const uint8_t *dst,
int dst_stride,
const uint8_t *sec,
int sec_stride,
int height,
unsigned int *sseptr);
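// The 64x64 and 32x32 wrappers below run the 32-wide kernel over the block
// (twice side-by-side for 64-wide blocks), then form
// variance = sse - sum^2 / (w * h); the >> 12 and >> 10 shifts divide by
// 4096 and 1024 respectively.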
unsigned int vp9_sub_pixel_variance64x64_avx2(const uint8_t *src,
int src_stride,
int x_offset,
int y_offset,
const uint8_t *dst,
int dst_stride,
unsigned int *sse) {
unsigned int sse1;
const int se1 = vp9_sub_pixel_variance32xh_avx2(src, src_stride, x_offset,
y_offset, dst, dst_stride,
64, &sse1);
unsigned int sse2;
const int se2 = vp9_sub_pixel_variance32xh_avx2(src + 32, src_stride,
x_offset, y_offset,
dst + 32, dst_stride,
64, &sse2);
const int se = se1 + se2;
*sse = sse1 + sse2;
return *sse - (((int64_t)se * se) >> 12);
}
unsigned int vp9_sub_pixel_variance32x32_avx2(const uint8_t *src,
int src_stride,
int x_offset,
int y_offset,
const uint8_t *dst,
int dst_stride,
unsigned int *sse) {
const int se = vp9_sub_pixel_variance32xh_avx2(src, src_stride, x_offset,
y_offset, dst, dst_stride,
32, sse);
return *sse - (((int64_t)se * se) >> 10);
}
unsigned int vp9_sub_pixel_avg_variance64x64_avx2(const uint8_t *src,
int src_stride,
int x_offset,
int y_offset,
const uint8_t *dst,
int dst_stride,
unsigned int *sse,
const uint8_t *sec) {
unsigned int sse1;
const int se1 = vp9_sub_pixel_avg_variance32xh_avx2(src, src_stride, x_offset,
y_offset, dst, dst_stride,
sec, 64, 64, &sse1);
unsigned int sse2;
const int se2 =
vp9_sub_pixel_avg_variance32xh_avx2(src + 32, src_stride, x_offset,
y_offset, dst + 32, dst_stride,
sec + 32, 64, 64, &sse2);
const int se = se1 + se2;
*sse = sse1 + sse2;
return *sse - (((int64_t)se * se) >> 12);
}
unsigned int vp9_sub_pixel_avg_variance32x32_avx2(const uint8_t *src,
int src_stride,
int x_offset,
int y_offset,
const uint8_t *dst,
int dst_stride,
unsigned int *sse,
const uint8_t *sec) {
// processing 32 elements in parallel
const int se = vp9_sub_pixel_avg_variance32xh_avx2(src, src_stride, x_offset,
y_offset, dst, dst_stride,
sec, 32, 32, sse);
return *sse - (((int64_t)se * se) >> 10);
}

View file

@ -1,182 +0,0 @@
/*
* Copyright (c) 2010 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include <emmintrin.h> // SSE2
#include "./vp9_rtcd.h"
#include "./vpx_config.h"
#include "vp9/encoder/vp9_variance.h"
#include "vpx_ports/mem.h"
// The 2 unused parameters are placeholders for the PIC-enabled build.
#define DECL(w, opt) \
int vp9_sub_pixel_variance##w##xh_##opt(const uint8_t *src, \
ptrdiff_t src_stride, \
int x_offset, int y_offset, \
const uint8_t *dst, \
ptrdiff_t dst_stride, \
int height, unsigned int *sse, \
void *unused0, void *unused)
#define DECLS(opt1, opt2) \
DECL(4, opt2); \
DECL(8, opt1); \
DECL(16, opt1)
DECLS(sse2, sse);
DECLS(ssse3, ssse3);
#undef DECLS
#undef DECL
#define FN(w, h, wf, wlog2, hlog2, opt, cast) \
unsigned int vp9_sub_pixel_variance##w##x##h##_##opt(const uint8_t *src, \
int src_stride, \
int x_offset, \
int y_offset, \
const uint8_t *dst, \
int dst_stride, \
unsigned int *sse_ptr) { \
unsigned int sse; \
int se = vp9_sub_pixel_variance##wf##xh_##opt(src, src_stride, x_offset, \
y_offset, dst, dst_stride, \
h, &sse, NULL, NULL); \
if (w > wf) { \
unsigned int sse2; \
int se2 = vp9_sub_pixel_variance##wf##xh_##opt(src + 16, src_stride, \
x_offset, y_offset, \
dst + 16, dst_stride, \
h, &sse2, NULL, NULL); \
se += se2; \
sse += sse2; \
if (w > wf * 2) { \
se2 = vp9_sub_pixel_variance##wf##xh_##opt(src + 32, src_stride, \
x_offset, y_offset, \
dst + 32, dst_stride, \
h, &sse2, NULL, NULL); \
se += se2; \
sse += sse2; \
se2 = vp9_sub_pixel_variance##wf##xh_##opt(src + 48, src_stride, \
x_offset, y_offset, \
dst + 48, dst_stride, \
h, &sse2, NULL, NULL); \
se += se2; \
sse += sse2; \
} \
} \
*sse_ptr = sse; \
return sse - ((cast se * se) >> (wlog2 + hlog2)); \
}
#define FNS(opt1, opt2) \
FN(64, 64, 16, 6, 6, opt1, (int64_t)); \
FN(64, 32, 16, 6, 5, opt1, (int64_t)); \
FN(32, 64, 16, 5, 6, opt1, (int64_t)); \
FN(32, 32, 16, 5, 5, opt1, (int64_t)); \
FN(32, 16, 16, 5, 4, opt1, (int64_t)); \
FN(16, 32, 16, 4, 5, opt1, (int64_t)); \
FN(16, 16, 16, 4, 4, opt1, (unsigned int)); \
FN(16, 8, 16, 4, 3, opt1, (unsigned int)); \
FN(8, 16, 8, 3, 4, opt1, (unsigned int)); \
FN(8, 8, 8, 3, 3, opt1, (unsigned int)); \
FN(8, 4, 8, 3, 2, opt1, (unsigned int)); \
FN(4, 8, 4, 2, 3, opt2, (unsigned int)); \
FN(4, 4, 4, 2, 2, opt2, (unsigned int))
FNS(sse2, sse);
FNS(ssse3, ssse3);
#undef FNS
#undef FN
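The FN macro above builds each w x h function from the wf-wide column kernels declared by DECLS: when w exceeds wf, the block is covered by adjacent 16-pixel columns at x offsets 0, 16, 32 and 48, the partial se/sse values are accumulated, and the variance identity is applied with log2(N) = wlog2 + hlog2. As a rough, loop-form sketch (illustration only; the real macro unrolls the columns and this wrapper name is made up), FN(64, 64, 16, 6, 6, sse2, (int64_t)) amounts to:
unsigned int sketch_sub_pixel_variance64x64_sse2(const uint8_t *src, int src_stride,
                                                 int x_offset, int y_offset,
                                                 const uint8_t *dst, int dst_stride,
                                                 unsigned int *sse_ptr) {
  unsigned int sse = 0, sse_part;
  int se = 0, col;
  for (col = 0; col < 64; col += 16) {  /* four adjacent 16-wide columns */
    se += vp9_sub_pixel_variance16xh_sse2(src + col, src_stride, x_offset, y_offset,
                                          dst + col, dst_stride, 64, &sse_part,
                                          NULL, NULL);
    sse += sse_part;
  }
  *sse_ptr = sse;
  return sse - (unsigned int)(((int64_t)se * se) >> (6 + 6));  /* N = 64 * 64 */
}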
// The 2 unused parameters are placeholders for the PIC-enabled build.
#define DECL(w, opt) \
int vp9_sub_pixel_avg_variance##w##xh_##opt(const uint8_t *src, \
ptrdiff_t src_stride, \
int x_offset, int y_offset, \
const uint8_t *dst, \
ptrdiff_t dst_stride, \
const uint8_t *sec, \
ptrdiff_t sec_stride, \
int height, unsigned int *sse, \
void *unused0, void *unused)
#define DECLS(opt1, opt2) \
DECL(4, opt2); \
DECL(8, opt1); \
DECL(16, opt1)
DECLS(sse2, sse);
DECLS(ssse3, ssse3);
#undef DECL
#undef DECLS
#define FN(w, h, wf, wlog2, hlog2, opt, cast) \
unsigned int vp9_sub_pixel_avg_variance##w##x##h##_##opt(const uint8_t *src, \
int src_stride, \
int x_offset, \
int y_offset, \
const uint8_t *dst, \
int dst_stride, \
unsigned int *sseptr, \
const uint8_t *sec) { \
unsigned int sse; \
int se = vp9_sub_pixel_avg_variance##wf##xh_##opt(src, src_stride, x_offset, \
y_offset, dst, dst_stride, \
sec, w, h, &sse, NULL, \
NULL); \
if (w > wf) { \
unsigned int sse2; \
int se2 = vp9_sub_pixel_avg_variance##wf##xh_##opt(src + 16, src_stride, \
x_offset, y_offset, \
dst + 16, dst_stride, \
sec + 16, w, h, &sse2, \
NULL, NULL); \
se += se2; \
sse += sse2; \
if (w > wf * 2) { \
se2 = vp9_sub_pixel_avg_variance##wf##xh_##opt(src + 32, src_stride, \
x_offset, y_offset, \
dst + 32, dst_stride, \
sec + 32, w, h, &sse2, \
NULL, NULL); \
se += se2; \
sse += sse2; \
se2 = vp9_sub_pixel_avg_variance##wf##xh_##opt(src + 48, src_stride, \
x_offset, y_offset, \
dst + 48, dst_stride, \
sec + 48, w, h, &sse2, \
NULL, NULL); \
se += se2; \
sse += sse2; \
} \
} \
*sseptr = sse; \
return sse - ((cast se * se) >> (wlog2 + hlog2)); \
}
#define FNS(opt1, opt2) \
FN(64, 64, 16, 6, 6, opt1, (int64_t)); \
FN(64, 32, 16, 6, 5, opt1, (int64_t)); \
FN(32, 64, 16, 5, 6, opt1, (int64_t)); \
FN(32, 32, 16, 5, 5, opt1, (int64_t)); \
FN(32, 16, 16, 5, 4, opt1, (int64_t)); \
FN(16, 32, 16, 4, 5, opt1, (int64_t)); \
FN(16, 16, 16, 4, 4, opt1, (unsigned int)); \
FN(16, 8, 16, 4, 3, opt1, (unsigned int)); \
FN(8, 16, 8, 3, 4, opt1, (unsigned int)); \
FN(8, 8, 8, 3, 3, opt1, (unsigned int)); \
FN(8, 4, 8, 3, 2, opt1, (unsigned int)); \
FN(4, 8, 4, 2, 3, opt2, (unsigned int)); \
FN(4, 4, 4, 2, 2, opt2, (unsigned int))
FNS(sse2, sse);
FNS(ssse3, ssse3);
#undef FNS
#undef FN

View file

@ -131,7 +131,6 @@ VP9_COMMON_SRCS-$(HAVE_DSPR2) += common/mips/dspr2/vp9_mblpf_horiz_loopfilter_d
VP9_COMMON_SRCS-$(HAVE_DSPR2) += common/mips/dspr2/vp9_mblpf_vert_loopfilter_dspr2.c
# common (msa)
VP9_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/vp9_macros_msa.h
VP9_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/vp9_convolve8_avg_horiz_msa.c
VP9_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/vp9_convolve8_avg_msa.c
VP9_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/vp9_convolve8_avg_vert_msa.c

View file

@ -58,7 +58,6 @@ VP9_CX_SRCS-yes += encoder/vp9_pickmode.h
VP9_CX_SRCS-yes += encoder/vp9_svc_layercontext.h
VP9_CX_SRCS-yes += encoder/vp9_tokenize.h
VP9_CX_SRCS-yes += encoder/vp9_treewriter.h
VP9_CX_SRCS-yes += encoder/vp9_variance.h
VP9_CX_SRCS-yes += encoder/vp9_mcomp.c
VP9_CX_SRCS-yes += encoder/vp9_encoder.c
VP9_CX_SRCS-yes += encoder/vp9_picklpf.c
@ -84,7 +83,6 @@ VP9_CX_SRCS-$(CONFIG_INTERNAL_STATS) += encoder/vp9_blockiness.c
VP9_CX_SRCS-yes += encoder/vp9_tokenize.c
VP9_CX_SRCS-yes += encoder/vp9_treewriter.c
VP9_CX_SRCS-yes += encoder/vp9_variance.c
VP9_CX_SRCS-yes += encoder/vp9_aq_variance.c
VP9_CX_SRCS-yes += encoder/vp9_aq_variance.h
VP9_CX_SRCS-yes += encoder/vp9_aq_cyclicrefresh.c
@ -103,7 +101,6 @@ VP9_CX_SRCS-yes += encoder/vp9_mbgraph.c
VP9_CX_SRCS-yes += encoder/vp9_mbgraph.h
VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_avg_intrin_sse2.c
VP9_CX_SRCS-$(HAVE_AVX2) += encoder/x86/vp9_subpel_variance_impl_intrin_avx2.c
VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_temporal_filter_apply_sse2.asm
VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_quantize_sse2.c
ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes)
@ -114,12 +111,6 @@ endif
ifeq ($(CONFIG_USE_X86INC),yes)
VP9_CX_SRCS-$(HAVE_MMX) += encoder/x86/vp9_dct_mmx.asm
VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_error_sse2.asm
VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_variance_sse2.c
VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_subpel_variance.asm
ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes)
VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_highbd_variance_sse2.c
VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_highbd_subpel_variance.asm
endif
endif
ifeq ($(ARCH_X86_64),yes)
@ -143,14 +134,12 @@ endif
VP9_CX_SRCS-$(HAVE_AVX2) += encoder/x86/vp9_dct32x32_avx2_impl.h
VP9_CX_SRCS-$(HAVE_AVX2) += encoder/x86/vp9_dct_avx2.c
VP9_CX_SRCS-$(HAVE_AVX2) += encoder/x86/vp9_error_intrin_avx2.c
VP9_CX_SRCS-$(HAVE_AVX2) += encoder/x86/vp9_variance_avx2.c
ifneq ($(CONFIG_VP9_HIGHBITDEPTH),yes)
VP9_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp9_dct_neon.c
endif
VP9_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp9_avg_neon.c
VP9_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp9_quantize_neon.c
VP9_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp9_variance_neon.c
VP9_CX_SRCS-$(HAVE_MSA) += encoder/mips/msa/vp9_avg_msa.c
VP9_CX_SRCS-$(HAVE_MSA) += encoder/mips/msa/vp9_error_msa.c
@ -160,6 +149,5 @@ VP9_CX_SRCS-$(HAVE_MSA) += encoder/mips/msa/vp9_fdct16x16_msa.c
VP9_CX_SRCS-$(HAVE_MSA) += encoder/mips/msa/vp9_fdct32x32_msa.c
VP9_CX_SRCS-$(HAVE_MSA) += encoder/mips/msa/vp9_fdct_msa.h
VP9_CX_SRCS-$(HAVE_MSA) += encoder/mips/msa/vp9_temporal_filter_msa.c
VP9_CX_SRCS-$(HAVE_MSA) += encoder/mips/msa/vp9_variance_msa.c
VP9_CX_SRCS-yes := $(filter-out $(VP9_CX_SRCS_REMOVE-yes),$(VP9_CX_SRCS-yes))

View file

@ -0,0 +1,237 @@
;
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;
EXPORT |vpx_filter_block2d_bil_first_pass_media|
EXPORT |vpx_filter_block2d_bil_second_pass_media|
AREA |.text|, CODE, READONLY ; name this block of code
;-------------------------------------
; r0 unsigned char *src_ptr,
; r1 unsigned short *dst_ptr,
; r2 unsigned int src_pitch,
; r3 unsigned int height,
; stack unsigned int width,
; stack const short *vpx_filter
;-------------------------------------
; The output is transposed and stored in the output array to make it easy for second pass filtering.
|vpx_filter_block2d_bil_first_pass_media| PROC
stmdb sp!, {r4 - r11, lr}
ldr r11, [sp, #40] ; vpx_filter address
ldr r4, [sp, #36] ; width
mov r12, r3 ; outer-loop counter
add r7, r2, r4 ; preload next row
pld [r0, r7]
sub r2, r2, r4 ; src increment for height loop
ldr r5, [r11] ; load up filter coefficients
mov r3, r3, lsl #1 ; height*2
add r3, r3, #2 ; plus 2 so the transposed column stride (2*height bytes) is 4-byte aligned
mov r11, r1 ; save dst_ptr for each row
cmp r5, #128 ; if filter coef = 128, then skip the filter
beq bil_null_1st_filter
|bil_height_loop_1st_v6|
ldrb r6, [r0] ; load source data
ldrb r7, [r0, #1]
ldrb r8, [r0, #2]
mov lr, r4, lsr #2 ; 4-in-parallel loop counter
|bil_width_loop_1st_v6|
ldrb r9, [r0, #3]
ldrb r10, [r0, #4]
pkhbt r6, r6, r7, lsl #16 ; src[1] | src[0]
pkhbt r7, r7, r8, lsl #16 ; src[2] | src[1]
smuad r6, r6, r5 ; apply the filter
pkhbt r8, r8, r9, lsl #16 ; src[3] | src[2]
smuad r7, r7, r5
pkhbt r9, r9, r10, lsl #16 ; src[4] | src[3]
smuad r8, r8, r5
smuad r9, r9, r5
add r0, r0, #4
subs lr, lr, #1
add r6, r6, #0x40 ; round_shift_and_clamp
add r7, r7, #0x40
usat r6, #16, r6, asr #7
usat r7, #16, r7, asr #7
strh r6, [r1], r3 ; result is transposed and stored
add r8, r8, #0x40 ; round_shift_and_clamp
strh r7, [r1], r3
add r9, r9, #0x40
usat r8, #16, r8, asr #7
usat r9, #16, r9, asr #7
strh r8, [r1], r3 ; result is transposed and stored
ldrneb r6, [r0] ; load source data
strh r9, [r1], r3
ldrneb r7, [r0, #1]
ldrneb r8, [r0, #2]
bne bil_width_loop_1st_v6
add r0, r0, r2 ; move to next input row
subs r12, r12, #1
add r9, r2, r4, lsl #1 ; adding back block width
pld [r0, r9] ; preload next row
add r11, r11, #2 ; move over to next column
mov r1, r11
bne bil_height_loop_1st_v6
ldmia sp!, {r4 - r11, pc}
|bil_null_1st_filter|
|bil_height_loop_null_1st|
mov lr, r4, lsr #2 ; loop counter
|bil_width_loop_null_1st|
ldrb r6, [r0] ; load data
ldrb r7, [r0, #1]
ldrb r8, [r0, #2]
ldrb r9, [r0, #3]
strh r6, [r1], r3 ; store it to immediate buffer
add r0, r0, #4
strh r7, [r1], r3
subs lr, lr, #1
strh r8, [r1], r3
strh r9, [r1], r3
bne bil_width_loop_null_1st
subs r12, r12, #1
add r0, r0, r2 ; move to next input line
add r11, r11, #2 ; move over to next column
mov r1, r11
bne bil_height_loop_null_1st
ldmia sp!, {r4 - r11, pc}
ENDP ; |vpx_filter_block2d_bil_first_pass_media|
;---------------------------------
; r0 unsigned short *src_ptr,
; r1 unsigned char *dst_ptr,
; r2 int dst_pitch,
; r3 unsigned int height,
; stack unsigned int width,
; stack const short *vpx_filter
;---------------------------------
|vpx_filter_block2d_bil_second_pass_media| PROC
stmdb sp!, {r4 - r11, lr}
ldr r11, [sp, #40] ; vpx_filter address
ldr r4, [sp, #36] ; width
ldr r5, [r11] ; load up filter coefficients
mov r12, r4 ; outer-loop counter = width, since we work on a transposed data matrix
mov r11, r1
cmp r5, #128 ; if filter coef = 128, then skip the filter
beq bil_null_2nd_filter
|bil_height_loop_2nd|
ldr r6, [r0] ; load the data
ldr r8, [r0, #4]
ldrh r10, [r0, #8]
mov lr, r3, lsr #2 ; loop counter
|bil_width_loop_2nd|
pkhtb r7, r6, r8 ; src[1] | src[2]
pkhtb r9, r8, r10 ; src[3] | src[4]
smuad r6, r6, r5 ; apply filter
smuad r8, r8, r5 ; apply filter
subs lr, lr, #1
smuadx r7, r7, r5 ; apply filter
smuadx r9, r9, r5 ; apply filter
add r0, r0, #8
add r6, r6, #0x40 ; round_shift_and_clamp
add r7, r7, #0x40
usat r6, #8, r6, asr #7
usat r7, #8, r7, asr #7
strb r6, [r1], r2 ; the result is transposed back and stored
add r8, r8, #0x40 ; round_shift_and_clamp
strb r7, [r1], r2
add r9, r9, #0x40
usat r8, #8, r8, asr #7
usat r9, #8, r9, asr #7
strb r8, [r1], r2 ; the result is transposed back and stored
ldrne r6, [r0] ; load data
strb r9, [r1], r2
ldrne r8, [r0, #4]
ldrneh r10, [r0, #8]
bne bil_width_loop_2nd
subs r12, r12, #1
add r0, r0, #4 ; update src for next row
add r11, r11, #1
mov r1, r11
bne bil_height_loop_2nd
ldmia sp!, {r4 - r11, pc}
|bil_null_2nd_filter|
|bil_height_loop_null_2nd|
mov lr, r3, lsr #2
|bil_width_loop_null_2nd|
ldr r6, [r0], #4 ; load data
subs lr, lr, #1
ldr r8, [r0], #4
strb r6, [r1], r2 ; store data
mov r7, r6, lsr #16
strb r7, [r1], r2
mov r9, r8, lsr #16
strb r8, [r1], r2
strb r9, [r1], r2
bne bil_width_loop_null_2nd
subs r12, r12, #1
add r0, r0, #4
add r11, r11, #1
mov r1, r11
bne bil_height_loop_null_2nd
ldmia sp!, {r4 - r11, pc}
ENDP ; |vpx_filter_block2d_bil_second_pass_media|
END

View file

@ -0,0 +1,105 @@
/*
* Copyright (c) 2010 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include "./vpx_config.h"
#include "./vpx_dsp_rtcd.h"
#include "vpx/vpx_integer.h"
#include "vpx_ports/mem.h"
#if HAVE_MEDIA
static const int16_t bilinear_filters_media[8][2] = {
{ 128, 0 },
{ 112, 16 },
{ 96, 32 },
{ 80, 48 },
{ 64, 64 },
{ 48, 80 },
{ 32, 96 },
{ 16, 112 }
};
extern void vpx_filter_block2d_bil_first_pass_media(const uint8_t *src_ptr,
uint16_t *dst_ptr,
uint32_t src_pitch,
uint32_t height,
uint32_t width,
const int16_t *filter);
extern void vpx_filter_block2d_bil_second_pass_media(const uint16_t *src_ptr,
uint8_t *dst_ptr,
int32_t src_pitch,
uint32_t height,
uint32_t width,
const int16_t *filter);
unsigned int vpx_sub_pixel_variance8x8_media(const uint8_t *src_ptr,
int src_pixels_per_line,
int xoffset, int yoffset,
const uint8_t *dst_ptr,
int dst_pixels_per_line,
unsigned int *sse) {
uint16_t first_pass[10*8];
uint8_t second_pass[8*8];
const int16_t *HFilter, *VFilter;
HFilter = bilinear_filters_media[xoffset];
VFilter = bilinear_filters_media[yoffset];
vpx_filter_block2d_bil_first_pass_media(src_ptr, first_pass,
src_pixels_per_line,
9, 8, HFilter);
vpx_filter_block2d_bil_second_pass_media(first_pass, second_pass,
8, 8, 8, VFilter);
return vpx_variance8x8_media(second_pass, 8, dst_ptr,
dst_pixels_per_line, sse);
}
unsigned int vpx_sub_pixel_variance16x16_media(const uint8_t *src_ptr,
int src_pixels_per_line,
int xoffset,
int yoffset,
const uint8_t *dst_ptr,
int dst_pixels_per_line,
unsigned int *sse) {
uint16_t first_pass[36*16];
uint8_t second_pass[20*16];
const int16_t *HFilter, *VFilter;
unsigned int var;
if (xoffset == 4 && yoffset == 0) {
var = vpx_variance_halfpixvar16x16_h_media(src_ptr, src_pixels_per_line,
dst_ptr, dst_pixels_per_line,
sse);
} else if (xoffset == 0 && yoffset == 4) {
var = vpx_variance_halfpixvar16x16_v_media(src_ptr, src_pixels_per_line,
dst_ptr, dst_pixels_per_line,
sse);
} else if (xoffset == 4 && yoffset == 4) {
var = vpx_variance_halfpixvar16x16_hv_media(src_ptr, src_pixels_per_line,
dst_ptr, dst_pixels_per_line,
sse);
} else {
HFilter = bilinear_filters_media[xoffset];
VFilter = bilinear_filters_media[yoffset];
vpx_filter_block2d_bil_first_pass_media(src_ptr, first_pass,
src_pixels_per_line,
17, 16, HFilter);
vpx_filter_block2d_bil_second_pass_media(first_pass, second_pass,
16, 16, 16, VFilter);
var = vpx_variance16x16_media(second_pass, 16, dst_ptr,
dst_pixels_per_line, sse);
}
return var;
}
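The three early-out branches above exist because an offset of 4 selects the {64, 64} filter pair, i.e. a plain half-pixel average, for which the dedicated halfpixvar kernels are cheaper than the general two-pass path. The C fallbacks added later in this commit make the equivalence explicit; a small wrapper showing it (illustration only; the helper name is made up):
static unsigned int halfpel_h_equiv(const uint8_t *src, int src_stride,
                                    const uint8_t *ref, int ref_stride,
                                    unsigned int *sse) {
  /* Same result as vpx_variance_halfpixvar16x16_h_c(): xoffset 4, yoffset 0. */
  return vpx_sub_pixel_variance16x16_c(src, src_stride, 4, 0, ref, ref_stride, sse);
}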
#endif // HAVE_MEDIA

View file

@ -9,14 +9,13 @@
*/
#include <arm_neon.h>
#include "./vp9_rtcd.h"
#include "./vpx_dsp_rtcd.h"
#include "./vpx_config.h"
#include "vpx_ports/mem.h"
#include "vpx/vpx_integer.h"
#include "vp9/common/vp9_filter.h"
#include "vpx_dsp/variance.h"
static const uint8_t bilinear_filters[8][2] = {
{ 128, 0, },
@ -35,9 +34,9 @@ static void var_filter_block2d_bil_w8(const uint8_t *src_ptr,
int pixel_step,
unsigned int output_height,
unsigned int output_width,
const uint8_t *vp9_filter) {
const uint8x8_t f0 = vmov_n_u8(vp9_filter[0]);
const uint8x8_t f1 = vmov_n_u8(vp9_filter[1]);
const uint8_t *filter) {
const uint8x8_t f0 = vmov_n_u8(filter[0]);
const uint8x8_t f1 = vmov_n_u8(filter[1]);
unsigned int i;
for (i = 0; i < output_height; ++i) {
const uint8x8_t src_0 = vld1_u8(&src_ptr[0]);
@ -58,9 +57,9 @@ static void var_filter_block2d_bil_w16(const uint8_t *src_ptr,
int pixel_step,
unsigned int output_height,
unsigned int output_width,
const uint8_t *vp9_filter) {
const uint8x8_t f0 = vmov_n_u8(vp9_filter[0]);
const uint8x8_t f1 = vmov_n_u8(vp9_filter[1]);
const uint8_t *filter) {
const uint8x8_t f0 = vmov_n_u8(filter[0]);
const uint8x8_t f1 = vmov_n_u8(filter[1]);
unsigned int i, j;
for (i = 0; i < output_height; ++i) {
for (j = 0; j < output_width; j += 16) {
@ -80,7 +79,7 @@ static void var_filter_block2d_bil_w16(const uint8_t *src_ptr,
}
}
unsigned int vp9_sub_pixel_variance8x8_neon(const uint8_t *src,
unsigned int vpx_sub_pixel_variance8x8_neon(const uint8_t *src,
int src_stride,
int xoffset,
int yoffset,
@ -98,7 +97,7 @@ unsigned int vp9_sub_pixel_variance8x8_neon(const uint8_t *src,
return vpx_variance8x8_neon(temp2, 8, dst, dst_stride, sse);
}
unsigned int vp9_sub_pixel_variance16x16_neon(const uint8_t *src,
unsigned int vpx_sub_pixel_variance16x16_neon(const uint8_t *src,
int src_stride,
int xoffset,
int yoffset,
@ -116,7 +115,7 @@ unsigned int vp9_sub_pixel_variance16x16_neon(const uint8_t *src,
return vpx_variance16x16_neon(temp2, 16, dst, dst_stride, sse);
}
unsigned int vp9_sub_pixel_variance32x32_neon(const uint8_t *src,
unsigned int vpx_sub_pixel_variance32x32_neon(const uint8_t *src,
int src_stride,
int xoffset,
int yoffset,
@ -134,7 +133,7 @@ unsigned int vp9_sub_pixel_variance32x32_neon(const uint8_t *src,
return vpx_variance32x32_neon(temp2, 32, dst, dst_stride, sse);
}
unsigned int vp9_sub_pixel_variance64x64_neon(const uint8_t *src,
unsigned int vpx_sub_pixel_variance64x64_neon(const uint8_t *src,
int src_stride,
int xoffset,
int yoffset,

View file

@ -9,7 +9,7 @@
;
EXPORT |vp8_variance_halfpixvar16x16_h_armv6|
EXPORT |vpx_variance_halfpixvar16x16_h_media|
ARM
REQUIRE8
@ -22,7 +22,7 @@
; r2 unsigned char *ref_ptr
; r3 int recon_stride
; stack unsigned int *sse
|vp8_variance_halfpixvar16x16_h_armv6| PROC
|vpx_variance_halfpixvar16x16_h_media| PROC
stmfd sp!, {r4-r12, lr}

View file

@ -9,7 +9,7 @@
;
EXPORT |vp8_variance_halfpixvar16x16_hv_armv6|
EXPORT |vpx_variance_halfpixvar16x16_hv_media|
ARM
REQUIRE8
@ -22,7 +22,7 @@
; r2 unsigned char *ref_ptr
; r3 int recon_stride
; stack unsigned int *sse
|vp8_variance_halfpixvar16x16_hv_armv6| PROC
|vpx_variance_halfpixvar16x16_hv_media| PROC
stmfd sp!, {r4-r12, lr}

View file

@ -9,7 +9,7 @@
;
EXPORT |vp8_variance_halfpixvar16x16_v_armv6|
EXPORT |vpx_variance_halfpixvar16x16_v_media|
ARM
REQUIRE8
@ -22,7 +22,7 @@
; r2 unsigned char *ref_ptr
; r3 int recon_stride
; stack unsigned int *sse
|vp8_variance_halfpixvar16x16_v_armv6| PROC
|vpx_variance_halfpixvar16x16_v_media| PROC
stmfd sp!, {r4-r12, lr}

File diff not shown because of its large size

View file

@ -8,13 +8,12 @@
* be found in the AUTHORS file in the root of the source tree.
*/
#include "./vp9_rtcd.h"
#include "./vpx_dsp_rtcd.h"
#include "vpx_ports/mem.h"
#include "vp9/common/vp9_filter.h"
#include "vp9/common/mips/msa/vp9_macros_msa.h"
#include "vpx_dsp/mips/macros_msa.h"
#include "vpx_dsp/variance.h"
static const uint8_t bilinear_filters[8][2] = {
static const uint8_t bilinear_filters_msa[8][2] = {
{ 128, 0, },
{ 112, 16, },
{ 96, 32, },
@ -707,8 +706,8 @@ static uint32_t sub_pixel_sse_diff_64width_hv_msa(const uint8_t *src,
#define VARIANCE_64Wx32H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 11);
#define VARIANCE_64Wx64H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 12);
#define VP9_SUB_PIXEL_VARIANCE_WDXHT_MSA(wd, ht) \
uint32_t vp9_sub_pixel_variance##wd##x##ht##_msa(const uint8_t *src, \
#define VPX_SUB_PIXEL_VARIANCE_WDXHT_MSA(wd, ht) \
uint32_t vpx_sub_pixel_variance##wd##x##ht##_msa(const uint8_t *src, \
int32_t src_stride, \
int32_t xoffset, \
int32_t yoffset, \
@ -717,8 +716,8 @@ uint32_t vp9_sub_pixel_variance##wd##x##ht##_msa(const uint8_t *src, \
uint32_t *sse) { \
int32_t diff; \
uint32_t var; \
const uint8_t *h_filter = bilinear_filters[xoffset]; \
const uint8_t *v_filter = bilinear_filters[yoffset]; \
const uint8_t *h_filter = bilinear_filters_msa[xoffset]; \
const uint8_t *v_filter = bilinear_filters_msa[yoffset]; \
\
if (yoffset) { \
if (xoffset) { \
@ -749,20 +748,20 @@ uint32_t vp9_sub_pixel_variance##wd##x##ht##_msa(const uint8_t *src, \
return var; \
}
VP9_SUB_PIXEL_VARIANCE_WDXHT_MSA(4, 4);
VP9_SUB_PIXEL_VARIANCE_WDXHT_MSA(4, 8);
VPX_SUB_PIXEL_VARIANCE_WDXHT_MSA(4, 4);
VPX_SUB_PIXEL_VARIANCE_WDXHT_MSA(4, 8);
VP9_SUB_PIXEL_VARIANCE_WDXHT_MSA(8, 4);
VP9_SUB_PIXEL_VARIANCE_WDXHT_MSA(8, 8);
VP9_SUB_PIXEL_VARIANCE_WDXHT_MSA(8, 16);
VPX_SUB_PIXEL_VARIANCE_WDXHT_MSA(8, 4);
VPX_SUB_PIXEL_VARIANCE_WDXHT_MSA(8, 8);
VPX_SUB_PIXEL_VARIANCE_WDXHT_MSA(8, 16);
VP9_SUB_PIXEL_VARIANCE_WDXHT_MSA(16, 8);
VP9_SUB_PIXEL_VARIANCE_WDXHT_MSA(16, 16);
VP9_SUB_PIXEL_VARIANCE_WDXHT_MSA(16, 32);
VPX_SUB_PIXEL_VARIANCE_WDXHT_MSA(16, 8);
VPX_SUB_PIXEL_VARIANCE_WDXHT_MSA(16, 16);
VPX_SUB_PIXEL_VARIANCE_WDXHT_MSA(16, 32);
VP9_SUB_PIXEL_VARIANCE_WDXHT_MSA(32, 16);
VP9_SUB_PIXEL_VARIANCE_WDXHT_MSA(32, 32);
VP9_SUB_PIXEL_VARIANCE_WDXHT_MSA(32, 64);
VPX_SUB_PIXEL_VARIANCE_WDXHT_MSA(32, 16);
VPX_SUB_PIXEL_VARIANCE_WDXHT_MSA(32, 32);
VPX_SUB_PIXEL_VARIANCE_WDXHT_MSA(32, 64);
VP9_SUB_PIXEL_VARIANCE_WDXHT_MSA(64, 32);
VP9_SUB_PIXEL_VARIANCE_WDXHT_MSA(64, 64);
VPX_SUB_PIXEL_VARIANCE_WDXHT_MSA(64, 32);
VPX_SUB_PIXEL_VARIANCE_WDXHT_MSA(64, 64);

View file

@ -14,13 +14,26 @@
#include "vpx_ports/mem.h"
#include "vpx/vpx_integer.h"
unsigned int vpx_get4x4sse_cs_c(const unsigned char *a, int a_stride,
const unsigned char *b, int b_stride) {
#include "vpx_dsp/variance.h"
static const uint8_t bilinear_filters[8][2] = {
{ 128, 0 },
{ 112, 16 },
{ 96, 32 },
{ 80, 48 },
{ 64, 64 },
{ 48, 80 },
{ 32, 96 },
{ 16, 112 },
};
uint32_t vpx_get4x4sse_cs_c(const uint8_t *a, int a_stride,
const uint8_t *b, int b_stride) {
int distortion = 0;
int r, c;
for (r = 0; r < 4; r++) {
for (c = 0; c < 4; c++) {
for (r = 0; r < 4; ++r) {
for (c = 0; c < 4; ++c) {
int diff = a[c] - b[c];
distortion += diff * diff;
}
@ -32,7 +45,7 @@ unsigned int vpx_get4x4sse_cs_c(const unsigned char *a, int a_stride,
return distortion;
}
unsigned int vpx_get_mb_ss_c(const int16_t *a) {
uint32_t vpx_get_mb_ss_c(const int16_t *a) {
unsigned int i, sum = 0;
for (i = 0; i < 256; ++i) {
@ -42,16 +55,38 @@ unsigned int vpx_get_mb_ss_c(const int16_t *a) {
return sum;
}
uint32_t vpx_variance_halfpixvar16x16_h_c(const uint8_t *a, int a_stride,
const uint8_t *b, int b_stride,
uint32_t *sse) {
return vpx_sub_pixel_variance16x16_c(a, a_stride, 4, 0,
b, b_stride, sse);
}
uint32_t vpx_variance_halfpixvar16x16_v_c(const uint8_t *a, int a_stride,
const uint8_t *b, int b_stride,
uint32_t *sse) {
return vpx_sub_pixel_variance16x16_c(a, a_stride, 0, 4,
b, b_stride, sse);
}
uint32_t vpx_variance_halfpixvar16x16_hv_c(const uint8_t *a, int a_stride,
const uint8_t *b, int b_stride,
uint32_t *sse) {
return vpx_sub_pixel_variance16x16_c(a, a_stride, 4, 4,
b, b_stride, sse);
}
static void variance(const uint8_t *a, int a_stride,
const uint8_t *b, int b_stride,
int w, int h, unsigned int *sse, int *sum) {
int w, int h, uint32_t *sse, int *sum) {
int i, j;
*sum = 0;
*sse = 0;
for (i = 0; i < h; i++) {
for (j = 0; j < w; j++) {
for (i = 0; i < h; ++i) {
for (j = 0; j < w; ++j) {
const int diff = a[j] - b[j];
*sum += diff;
*sse += diff * diff;
@ -62,15 +97,113 @@ static void variance(const uint8_t *a, int a_stride,
}
}
// Applies a 1-D 2-tap bilinear filter to the source block in either horizontal
// or vertical direction to produce the filtered output block. Used to implement
// the first pass of a 2-D separable filter.
//
// Produces int16_t output to retain precision for the next pass. Two filter
// taps should sum to FILTER_WEIGHT. pixel_step defines whether the filter is
// applied horizontally (pixel_step = 1) or vertically (pixel_step = stride).
// It defines the offset required to move from one input to the next.
static void var_filter_block2d_bil_first_pass(const uint8_t *a, uint16_t *b,
unsigned int src_pixels_per_line,
int pixel_step,
unsigned int output_height,
unsigned int output_width,
const uint8_t *filter) {
unsigned int i, j;
for (i = 0; i < output_height; ++i) {
for (j = 0; j < output_width; ++j) {
b[j] = ROUND_POWER_OF_TWO((int)a[0] * filter[0] +
(int)a[pixel_step] * filter[1],
FILTER_BITS);
++a;
}
a += src_pixels_per_line - output_width;
b += output_width;
}
}
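For a concrete feel of the filter above: the two taps always sum to FILTER_WEIGHT (128) and FILTER_BITS is 7, so with xoffset = 4 the taps are {64, 64} and two neighbouring pixels of 100 and 104 produce the rounded half-pel average:
/*   ROUND_POWER_OF_TWO(100 * 64 + 104 * 64, 7)
 *     = (6400 + 6656 + 64) >> 7
 *     = 13120 >> 7
 *     = 102
 */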
// Applies a 1-D 2-tap bilinear filter to the source block in either horizontal
// or vertical direction to produce the filtered output block. Used to implement
// the second pass of a 2-D separable filter.
//
// Requires 16-bit input as produced by filter_block2d_bil_first_pass. Two
// filter taps should sum to FILTER_WEIGHT. pixel_step defines whether the
// filter is applied horizontally (pixel_step = 1) or vertically
// (pixel_step = stride). It defines the offset required to move from one input
// to the next. Output is 8-bit.
static void var_filter_block2d_bil_second_pass(const uint16_t *a, uint8_t *b,
unsigned int src_pixels_per_line,
unsigned int pixel_step,
unsigned int output_height,
unsigned int output_width,
const uint8_t *filter) {
unsigned int i, j;
for (i = 0; i < output_height; ++i) {
for (j = 0; j < output_width; ++j) {
b[j] = ROUND_POWER_OF_TWO((int)a[0] * filter[0] +
(int)a[pixel_step] * filter[1],
FILTER_BITS);
++a;
}
a += src_pixels_per_line - output_width;
b += output_width;
}
}
#define VAR(W, H) \
unsigned int vpx_variance##W##x##H##_c(const uint8_t *a, int a_stride, \
const uint8_t *b, int b_stride, \
unsigned int *sse) { \
uint32_t vpx_variance##W##x##H##_c(const uint8_t *a, int a_stride, \
const uint8_t *b, int b_stride, \
uint32_t *sse) { \
int sum; \
variance(a, a_stride, b, b_stride, W, H, sse, &sum); \
return *sse - (((int64_t)sum * sum) / (W * H)); \
}
#define SUBPIX_VAR(W, H) \
uint32_t vpx_sub_pixel_variance##W##x##H##_c(const uint8_t *a, int a_stride, \
int xoffset, int yoffset, \
const uint8_t *b, int b_stride, \
uint32_t *sse) { \
uint16_t fdata3[(H + 1) * W]; \
uint8_t temp2[H * W]; \
\
var_filter_block2d_bil_first_pass(a, fdata3, a_stride, 1, H + 1, W, \
bilinear_filters[xoffset]); \
var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \
bilinear_filters[yoffset]); \
\
return vpx_variance##W##x##H##_c(temp2, W, b, b_stride, sse); \
}
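SUBPIX_VAR strings the two passes together: the first pass produces an (H + 1) x W block of 16-bit intermediates because the vertical second pass reads one row below each output row (pixel_step = W), and the second pass reduces that to an H x W 8-bit block whose variance against the reference is then measured. A usage sketch for the generated 8x8 function (illustration only; the wrapper name is made up):
unsigned int example_halfpel_variance8x8(const uint8_t *src, int src_stride,
                                         const uint8_t *ref, int ref_stride) {
  unsigned int sse;
  /* Offsets index the 8-entry bilinear_filters table; 4 is the half-pel case. */
  return vpx_sub_pixel_variance8x8_c(src, src_stride, 4, 4, ref, ref_stride, &sse);
}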
#define SUBPIX_AVG_VAR(W, H) \
uint32_t vpx_sub_pixel_avg_variance##W##x##H##_c(const uint8_t *a, \
int a_stride, \
int xoffset, int yoffset, \
const uint8_t *b, \
int b_stride, \
uint32_t *sse, \
const uint8_t *second_pred) { \
uint16_t fdata3[(H + 1) * W]; \
uint8_t temp2[H * W]; \
DECLARE_ALIGNED(16, uint8_t, temp3[H * W]); \
\
var_filter_block2d_bil_first_pass(a, fdata3, a_stride, 1, H + 1, W, \
bilinear_filters[xoffset]); \
var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \
bilinear_filters[yoffset]); \
\
vpx_comp_avg_pred(temp3, second_pred, W, H, temp2, W); \
\
return vpx_variance##W##x##H##_c(temp3, W, b, b_stride, sse); \
}
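SUBPIX_AVG_VAR differs only in that the filtered block is first blended with second_pred through vpx_comp_avg_pred before the variance is taken, which is what is needed when scoring an averaged (compound) prediction. A usage sketch (illustration only; the wrapper name and offsets are made up):
unsigned int example_avg_variance16x16(const uint8_t *src, int src_stride,
                                       const uint8_t *ref, int ref_stride,
                                       const uint8_t *second_pred /* 16x16 */) {
  unsigned int sse;
  return vpx_sub_pixel_avg_variance16x16_c(src, src_stride, 2, 6,
                                           ref, ref_stride, &sse, second_pred);
}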
/* Identical to the variance call except it takes an additional parameter, sum,
* and returns that value using pass-by-reference instead of returning
* sse - sum^2 / w*h
@ -78,7 +211,7 @@ unsigned int vpx_variance##W##x##H##_c(const uint8_t *a, int a_stride, \
#define GET_VAR(W, H) \
void vpx_get##W##x##H##var_c(const uint8_t *a, int a_stride, \
const uint8_t *b, int b_stride, \
unsigned int *sse, int *sum) { \
uint32_t *sse, int *sum) { \
variance(a, a_stride, b, b_stride, W, H, sse, sum); \
}
@ -87,27 +220,33 @@ void vpx_get##W##x##H##var_c(const uint8_t *a, int a_stride, \
* variable.
*/
#define MSE(W, H) \
unsigned int vpx_mse##W##x##H##_c(const uint8_t *a, int a_stride, \
const uint8_t *b, int b_stride, \
unsigned int *sse) { \
uint32_t vpx_mse##W##x##H##_c(const uint8_t *a, int a_stride, \
const uint8_t *b, int b_stride, \
uint32_t *sse) { \
int sum; \
variance(a, a_stride, b, b_stride, W, H, sse, &sum); \
return *sse; \
}
VAR(64, 64)
VAR(64, 32)
VAR(32, 64)
VAR(32, 32)
VAR(32, 16)
VAR(16, 32)
VAR(16, 16)
VAR(16, 8)
VAR(8, 16)
VAR(8, 8)
VAR(8, 4)
VAR(4, 8)
VAR(4, 4)
/* All three forms of the variance are available in the same sizes. */
#define VARIANCES(W, H) \
VAR(W, H) \
SUBPIX_VAR(W, H) \
SUBPIX_AVG_VAR(W, H)
VARIANCES(64, 64)
VARIANCES(64, 32)
VARIANCES(32, 64)
VARIANCES(32, 32)
VARIANCES(32, 16)
VARIANCES(16, 32)
VARIANCES(16, 16)
VARIANCES(16, 8)
VARIANCES(8, 16)
VARIANCES(8, 8)
VARIANCES(8, 4)
VARIANCES(4, 8)
VARIANCES(4, 4)
GET_VAR(16, 16)
GET_VAR(8, 8)
@ -117,12 +256,13 @@ MSE(16, 8)
MSE(8, 16)
MSE(8, 8)
void vpx_comp_avg_pred_c(uint8_t *comp_pred, const uint8_t *pred, int width,
int height, const uint8_t *ref, int ref_stride) {
void vpx_comp_avg_pred_c(uint8_t *comp_pred, const uint8_t *pred,
int width, int height,
const uint8_t *ref, int ref_stride) {
int i, j;
for (i = 0; i < height; i++) {
for (j = 0; j < width; j++) {
for (i = 0; i < height; ++i) {
for (j = 0; j < width; ++j) {
const int tmp = pred[j] + ref[j];
comp_pred[j] = ROUND_POWER_OF_TWO(tmp, 1);
}
@ -143,8 +283,8 @@ static void highbd_variance64(const uint8_t *a8, int a_stride,
*sum = 0;
*sse = 0;
for (i = 0; i < h; i++) {
for (j = 0; j < w; j++) {
for (i = 0; i < h; ++i) {
for (j = 0; j < w; ++j) {
const int diff = a[j] - b[j];
*sum += diff;
*sse += diff * diff;
@ -156,60 +296,60 @@ static void highbd_variance64(const uint8_t *a8, int a_stride,
static void highbd_8_variance(const uint8_t *a8, int a_stride,
const uint8_t *b8, int b_stride,
int w, int h, unsigned int *sse, int *sum) {
int w, int h, uint32_t *sse, int *sum) {
uint64_t sse_long = 0;
uint64_t sum_long = 0;
highbd_variance64(a8, a_stride, b8, b_stride, w, h, &sse_long, &sum_long);
*sse = (unsigned int)sse_long;
*sse = (uint32_t)sse_long;
*sum = (int)sum_long;
}
static void highbd_10_variance(const uint8_t *a8, int a_stride,
const uint8_t *b8, int b_stride,
int w, int h, unsigned int *sse, int *sum) {
int w, int h, uint32_t *sse, int *sum) {
uint64_t sse_long = 0;
uint64_t sum_long = 0;
highbd_variance64(a8, a_stride, b8, b_stride, w, h, &sse_long, &sum_long);
*sse = (unsigned int)ROUND_POWER_OF_TWO(sse_long, 4);
*sse = (uint32_t)ROUND_POWER_OF_TWO(sse_long, 4);
*sum = (int)ROUND_POWER_OF_TWO(sum_long, 2);
}
static void highbd_12_variance(const uint8_t *a8, int a_stride,
const uint8_t *b8, int b_stride,
int w, int h, unsigned int *sse, int *sum) {
int w, int h, uint32_t *sse, int *sum) {
uint64_t sse_long = 0;
uint64_t sum_long = 0;
highbd_variance64(a8, a_stride, b8, b_stride, w, h, &sse_long, &sum_long);
*sse = (unsigned int)ROUND_POWER_OF_TWO(sse_long, 8);
*sse = (uint32_t)ROUND_POWER_OF_TWO(sse_long, 8);
*sum = (int)ROUND_POWER_OF_TWO(sum_long, 4);
}
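The three wrappers above scale the accumulated totals back toward the 8-bit range so the 32-bit outputs do not overflow and results stay comparable across bit depths: 10-bit samples are 4x larger than 8-bit ones, so sum is rounded down by 2 bits and the quadratically growing sse by 4; 12-bit samples are 16x larger, giving shifts of 4 and 8. In table form (a summary of the code above):
/*  bit depth | sample scale vs 8-bit | sum shift | sse shift
 *      8     |          1x           |     0     |     0
 *     10     |          4x           |     2     |     4
 *     12     |         16x           |     4     |     8
 */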
#define HIGHBD_VAR(W, H) \
unsigned int vpx_highbd_8_variance##W##x##H##_c(const uint8_t *a, \
int a_stride, \
const uint8_t *b, \
int b_stride, \
unsigned int *sse) { \
uint32_t vpx_highbd_8_variance##W##x##H##_c(const uint8_t *a, \
int a_stride, \
const uint8_t *b, \
int b_stride, \
uint32_t *sse) { \
int sum; \
highbd_8_variance(a, a_stride, b, b_stride, W, H, sse, &sum); \
return *sse - (((int64_t)sum * sum) / (W * H)); \
} \
\
unsigned int vpx_highbd_10_variance##W##x##H##_c(const uint8_t *a, \
int a_stride, \
const uint8_t *b, \
int b_stride, \
unsigned int *sse) { \
uint32_t vpx_highbd_10_variance##W##x##H##_c(const uint8_t *a, \
int a_stride, \
const uint8_t *b, \
int b_stride, \
uint32_t *sse) { \
int sum; \
highbd_10_variance(a, a_stride, b, b_stride, W, H, sse, &sum); \
return *sse - (((int64_t)sum * sum) / (W * H)); \
} \
\
unsigned int vpx_highbd_12_variance##W##x##H##_c(const uint8_t *a, \
int a_stride, \
const uint8_t *b, \
int b_stride, \
unsigned int *sse) { \
uint32_t vpx_highbd_12_variance##W##x##H##_c(const uint8_t *a, \
int a_stride, \
const uint8_t *b, \
int b_stride, \
uint32_t *sse) { \
int sum; \
highbd_12_variance(a, a_stride, b, b_stride, W, H, sse, &sum); \
return *sse - (((int64_t)sum * sum) / (W * H)); \
@ -217,54 +357,243 @@ unsigned int vpx_highbd_12_variance##W##x##H##_c(const uint8_t *a, \
#define HIGHBD_GET_VAR(S) \
void vpx_highbd_8_get##S##x##S##var_c(const uint8_t *src, int src_stride, \
const uint8_t *ref, int ref_stride, \
unsigned int *sse, int *sum) { \
const uint8_t *ref, int ref_stride, \
uint32_t *sse, int *sum) { \
highbd_8_variance(src, src_stride, ref, ref_stride, S, S, sse, sum); \
} \
\
void vpx_highbd_10_get##S##x##S##var_c(const uint8_t *src, int src_stride, \
const uint8_t *ref, int ref_stride, \
unsigned int *sse, int *sum) { \
uint32_t *sse, int *sum) { \
highbd_10_variance(src, src_stride, ref, ref_stride, S, S, sse, sum); \
} \
\
void vpx_highbd_12_get##S##x##S##var_c(const uint8_t *src, int src_stride, \
const uint8_t *ref, int ref_stride, \
unsigned int *sse, int *sum) { \
uint32_t *sse, int *sum) { \
highbd_12_variance(src, src_stride, ref, ref_stride, S, S, sse, sum); \
}
#define HIGHBD_MSE(W, H) \
unsigned int vpx_highbd_8_mse##W##x##H##_c(const uint8_t *src, \
int src_stride, \
const uint8_t *ref, \
int ref_stride, \
unsigned int *sse) { \
uint32_t vpx_highbd_8_mse##W##x##H##_c(const uint8_t *src, \
int src_stride, \
const uint8_t *ref, \
int ref_stride, \
uint32_t *sse) { \
int sum; \
highbd_8_variance(src, src_stride, ref, ref_stride, W, H, sse, &sum); \
return *sse; \
} \
\
unsigned int vpx_highbd_10_mse##W##x##H##_c(const uint8_t *src, \
int src_stride, \
const uint8_t *ref, \
int ref_stride, \
unsigned int *sse) { \
uint32_t vpx_highbd_10_mse##W##x##H##_c(const uint8_t *src, \
int src_stride, \
const uint8_t *ref, \
int ref_stride, \
uint32_t *sse) { \
int sum; \
highbd_10_variance(src, src_stride, ref, ref_stride, W, H, sse, &sum); \
return *sse; \
} \
\
unsigned int vpx_highbd_12_mse##W##x##H##_c(const uint8_t *src, \
int src_stride, \
const uint8_t *ref, \
int ref_stride, \
unsigned int *sse) { \
uint32_t vpx_highbd_12_mse##W##x##H##_c(const uint8_t *src, \
int src_stride, \
const uint8_t *ref, \
int ref_stride, \
uint32_t *sse) { \
int sum; \
highbd_12_variance(src, src_stride, ref, ref_stride, W, H, sse, &sum); \
return *sse; \
}
static void highbd_var_filter_block2d_bil_first_pass(
const uint8_t *src_ptr8,
uint16_t *output_ptr,
unsigned int src_pixels_per_line,
int pixel_step,
unsigned int output_height,
unsigned int output_width,
const uint8_t *filter) {
unsigned int i, j;
uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src_ptr8);
for (i = 0; i < output_height; ++i) {
for (j = 0; j < output_width; ++j) {
output_ptr[j] =
ROUND_POWER_OF_TWO((int)src_ptr[0] * filter[0] +
(int)src_ptr[pixel_step] * filter[1],
FILTER_BITS);
++src_ptr;
}
// Next row...
src_ptr += src_pixels_per_line - output_width;
output_ptr += output_width;
}
}
static void highbd_var_filter_block2d_bil_second_pass(
const uint16_t *src_ptr,
uint16_t *output_ptr,
unsigned int src_pixels_per_line,
unsigned int pixel_step,
unsigned int output_height,
unsigned int output_width,
const uint8_t *filter) {
unsigned int i, j;
for (i = 0; i < output_height; ++i) {
for (j = 0; j < output_width; ++j) {
output_ptr[j] =
ROUND_POWER_OF_TWO((int)src_ptr[0] * filter[0] +
(int)src_ptr[pixel_step] * filter[1],
FILTER_BITS);
++src_ptr;
}
src_ptr += src_pixels_per_line - output_width;
output_ptr += output_width;
}
}
#define HIGHBD_SUBPIX_VAR(W, H) \
uint32_t vpx_highbd_8_sub_pixel_variance##W##x##H##_c( \
const uint8_t *src, int src_stride, \
int xoffset, int yoffset, \
const uint8_t *dst, int dst_stride, \
uint32_t *sse) { \
uint16_t fdata3[(H + 1) * W]; \
uint16_t temp2[H * W]; \
\
highbd_var_filter_block2d_bil_first_pass(src, fdata3, src_stride, 1, H + 1, \
W, bilinear_filters[xoffset]); \
highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \
bilinear_filters[yoffset]); \
\
return vpx_highbd_8_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp2), W, dst, \
dst_stride, sse); \
} \
\
uint32_t vpx_highbd_10_sub_pixel_variance##W##x##H##_c( \
const uint8_t *src, int src_stride, \
int xoffset, int yoffset, \
const uint8_t *dst, int dst_stride, \
uint32_t *sse) { \
uint16_t fdata3[(H + 1) * W]; \
uint16_t temp2[H * W]; \
\
highbd_var_filter_block2d_bil_first_pass(src, fdata3, src_stride, 1, H + 1, \
W, bilinear_filters[xoffset]); \
highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \
bilinear_filters[yoffset]); \
\
return vpx_highbd_10_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp2), \
W, dst, dst_stride, sse); \
} \
\
uint32_t vpx_highbd_12_sub_pixel_variance##W##x##H##_c( \
const uint8_t *src, int src_stride, \
int xoffset, int yoffset, \
const uint8_t *dst, int dst_stride, \
uint32_t *sse) { \
uint16_t fdata3[(H + 1) * W]; \
uint16_t temp2[H * W]; \
\
highbd_var_filter_block2d_bil_first_pass(src, fdata3, src_stride, 1, H + 1, \
W, bilinear_filters[xoffset]); \
highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \
bilinear_filters[yoffset]); \
\
return vpx_highbd_12_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp2), \
W, dst, dst_stride, sse); \
}
#define HIGHBD_SUBPIX_AVG_VAR(W, H) \
uint32_t vpx_highbd_8_sub_pixel_avg_variance##W##x##H##_c( \
const uint8_t *src, int src_stride, \
int xoffset, int yoffset, \
const uint8_t *dst, int dst_stride, \
uint32_t *sse, \
const uint8_t *second_pred) { \
uint16_t fdata3[(H + 1) * W]; \
uint16_t temp2[H * W]; \
DECLARE_ALIGNED(16, uint16_t, temp3[H * W]); \
\
highbd_var_filter_block2d_bil_first_pass(src, fdata3, src_stride, 1, H + 1, \
W, bilinear_filters[xoffset]); \
highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \
bilinear_filters[yoffset]); \
\
vpx_highbd_comp_avg_pred(temp3, second_pred, W, H, \
CONVERT_TO_BYTEPTR(temp2), W); \
\
return vpx_highbd_8_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), W, dst, \
dst_stride, sse); \
} \
\
uint32_t vpx_highbd_10_sub_pixel_avg_variance##W##x##H##_c( \
const uint8_t *src, int src_stride, \
int xoffset, int yoffset, \
const uint8_t *dst, int dst_stride, \
uint32_t *sse, \
const uint8_t *second_pred) { \
uint16_t fdata3[(H + 1) * W]; \
uint16_t temp2[H * W]; \
DECLARE_ALIGNED(16, uint16_t, temp3[H * W]); \
\
highbd_var_filter_block2d_bil_first_pass(src, fdata3, src_stride, 1, H + 1, \
W, bilinear_filters[xoffset]); \
highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \
bilinear_filters[yoffset]); \
\
vpx_highbd_comp_avg_pred(temp3, second_pred, W, H, \
CONVERT_TO_BYTEPTR(temp2), W); \
\
return vpx_highbd_10_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), \
W, dst, dst_stride, sse); \
} \
\
uint32_t vpx_highbd_12_sub_pixel_avg_variance##W##x##H##_c( \
const uint8_t *src, int src_stride, \
int xoffset, int yoffset, \
const uint8_t *dst, int dst_stride, \
uint32_t *sse, \
const uint8_t *second_pred) { \
uint16_t fdata3[(H + 1) * W]; \
uint16_t temp2[H * W]; \
DECLARE_ALIGNED(16, uint16_t, temp3[H * W]); \
\
highbd_var_filter_block2d_bil_first_pass(src, fdata3, src_stride, 1, H + 1, \
W, bilinear_filters[xoffset]); \
highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \
bilinear_filters[yoffset]); \
\
vpx_highbd_comp_avg_pred(temp3, second_pred, W, H, \
CONVERT_TO_BYTEPTR(temp2), W); \
\
return vpx_highbd_12_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), \
W, dst, dst_stride, sse); \
}
/* All three forms of the variance are available in the same sizes. */
#define HIGHBD_VARIANCES(W, H) \
HIGHBD_VAR(W, H) \
HIGHBD_SUBPIX_VAR(W, H) \
HIGHBD_SUBPIX_AVG_VAR(W, H)
HIGHBD_VARIANCES(64, 64)
HIGHBD_VARIANCES(64, 32)
HIGHBD_VARIANCES(32, 64)
HIGHBD_VARIANCES(32, 32)
HIGHBD_VARIANCES(32, 16)
HIGHBD_VARIANCES(16, 32)
HIGHBD_VARIANCES(16, 16)
HIGHBD_VARIANCES(16, 8)
HIGHBD_VARIANCES(8, 16)
HIGHBD_VARIANCES(8, 8)
HIGHBD_VARIANCES(8, 4)
HIGHBD_VARIANCES(4, 8)
HIGHBD_VARIANCES(4, 4)
HIGHBD_GET_VAR(8)
HIGHBD_GET_VAR(16)
@ -273,28 +602,14 @@ HIGHBD_MSE(16, 8)
HIGHBD_MSE(8, 16)
HIGHBD_MSE(8, 8)
HIGHBD_VAR(64, 64)
HIGHBD_VAR(64, 32)
HIGHBD_VAR(32, 64)
HIGHBD_VAR(32, 32)
HIGHBD_VAR(32, 16)
HIGHBD_VAR(16, 32)
HIGHBD_VAR(16, 16)
HIGHBD_VAR(16, 8)
HIGHBD_VAR(8, 16)
HIGHBD_VAR(8, 8)
HIGHBD_VAR(8, 4)
HIGHBD_VAR(4, 8)
HIGHBD_VAR(4, 4)
void vpx_highbd_comp_avg_pred(uint16_t *comp_pred, const uint8_t *pred8,
int width, int height, const uint8_t *ref8,
int ref_stride) {
int i, j;
uint16_t *pred = CONVERT_TO_SHORTPTR(pred8);
uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
for (i = 0; i < height; i++) {
for (j = 0; j < width; j++) {
for (i = 0; i < height; ++i) {
for (j = 0; j < width; ++j) {
const int tmp = pred[j] + ref[j];
comp_pred[j] = ROUND_POWER_OF_TWO(tmp, 1);
}

vpx_dsp/variance.h (new file, 94 lines)
View file

@ -0,0 +1,94 @@
/*
* Copyright (c) 2015 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#ifndef VPX_DSP_VARIANCE_H_
#define VPX_DSP_VARIANCE_H_
#include "./vpx_config.h"
#include "vpx/vpx_integer.h"
#ifdef __cplusplus
extern "C" {
#endif
#define FILTER_BITS 7
#define FILTER_WEIGHT 128
typedef unsigned int(*vpx_sad_fn_t)(const uint8_t *a, int a_stride,
const uint8_t *b_ptr, int b_stride);
typedef unsigned int(*vpx_sad_avg_fn_t)(const uint8_t *a_ptr, int a_stride,
const uint8_t *b_ptr, int b_stride,
const uint8_t *second_pred);
typedef void (*vp8_copy32xn_fn_t)(const uint8_t *a, int a_stride,
uint8_t *b, int b_stride, int n);
typedef void (*vpx_sad_multi_fn_t)(const uint8_t *a, int a_stride,
const uint8_t *b, int b_stride,
unsigned int *sad_array);
typedef void (*vpx_sad_multi_d_fn_t)(const uint8_t *a, int a_stride,
const uint8_t *const b_array[],
int b_stride,
unsigned int *sad_array);
typedef unsigned int (*vpx_variance_fn_t)(const uint8_t *a, int a_stride,
const uint8_t *b, int b_stride,
unsigned int *sse);
typedef unsigned int (*vpx_subpixvariance_fn_t)(const uint8_t *a, int a_stride,
int xoffset, int yoffset,
const uint8_t *b, int b_stride,
unsigned int *sse);
typedef unsigned int (*vpx_subp_avg_variance_fn_t)(const uint8_t *a_ptr,
int a_stride,
int xoffset, int yoffset,
const uint8_t *b_ptr,
int b_stride,
unsigned int *sse,
const uint8_t *second_pred);
#if CONFIG_VP8
typedef struct variance_vtable {
vpx_sad_fn_t sdf;
vpx_variance_fn_t vf;
vpx_subpixvariance_fn_t svf;
vpx_variance_fn_t svf_halfpix_h;
vpx_variance_fn_t svf_halfpix_v;
vpx_variance_fn_t svf_halfpix_hv;
vpx_sad_multi_fn_t sdx3f;
vpx_sad_multi_fn_t sdx8f;
vpx_sad_multi_d_fn_t sdx4df;
#if ARCH_X86 || ARCH_X86_64
vp8_copy32xn_fn_t copymem;
#endif
} vp8_variance_fn_ptr_t;
#endif // CONFIG_VP8
#if CONFIG_VP9
typedef struct vp9_variance_vtable {
vpx_sad_fn_t sdf;
vpx_sad_avg_fn_t sdaf;
vpx_variance_fn_t vf;
vpx_subpixvariance_fn_t svf;
vpx_subp_avg_variance_fn_t svaf;
vpx_sad_multi_fn_t sdx3f;
vpx_sad_multi_fn_t sdx8f;
vpx_sad_multi_d_fn_t sdx4df;
} vp9_variance_fn_ptr_t;
#endif // CONFIG_VP9
#ifdef __cplusplus
} // extern "C"
#endif
#endif // VPX_DSP_VARIANCE_H_
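These vtables collect the SAD and variance kernels a codec uses for one block size. A sketch of filling the vp9 table with the plain C versions defined elsewhere in this change (illustration only, not part of this commit; assumes the prototypes generated into ./vpx_dsp_rtcd.h and leaves the SAD slots empty):
#include "./vpx_dsp_rtcd.h"
#include "vpx_dsp/variance.h"

static const vp9_variance_fn_ptr_t example_fn_ptr_16x16 = {
  NULL,                               /* sdf */
  NULL,                               /* sdaf */
  vpx_variance16x16_c,                /* vf */
  vpx_sub_pixel_variance16x16_c,      /* svf */
  vpx_sub_pixel_avg_variance16x16_c,  /* svaf */
  NULL,                               /* sdx3f */
  NULL,                               /* sdx8f */
  NULL                                /* sdx4df */
};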

View file

@ -10,6 +10,8 @@
DSP_SRCS-yes += vpx_dsp.mk
DSP_SRCS-$(HAVE_MSA) += mips/macros_msa.h
ifeq ($(CONFIG_ENCODERS),yes)
DSP_SRCS-yes += sad.c
DSP_SRCS-yes += subtract.c
@ -19,7 +21,6 @@ DSP_SRCS-$(HAVE_NEON) += arm/sad4d_neon.c
DSP_SRCS-$(HAVE_NEON) += arm/sad_neon.c
DSP_SRCS-$(HAVE_NEON) += arm/subtract_neon.c
DSP_SRCS-$(HAVE_MSA) += mips/macros_msa.h
DSP_SRCS-$(HAVE_MSA) += mips/sad_msa.c
DSP_SRCS-$(HAVE_MSA) += mips/subtract_msa.c
@ -45,21 +46,36 @@ endif # CONFIG_ENCODERS
ifneq ($(filter yes,$(CONFIG_ENCODERS) $(CONFIG_POSTPROC) $(CONFIG_VP9_POSTPROC)),)
DSP_SRCS-yes += variance.c
DSP_SRCS-yes += variance.h
DSP_SRCS-$(HAVE_MEDIA) += arm/bilinear_filter_media$(ASM)
DSP_SRCS-$(HAVE_MEDIA) += arm/subpel_variance_media.c
DSP_SRCS-$(HAVE_MEDIA) += arm/variance_halfpixvar16x16_h_media$(ASM)
DSP_SRCS-$(HAVE_MEDIA) += arm/variance_halfpixvar16x16_hv_media$(ASM)
DSP_SRCS-$(HAVE_MEDIA) += arm/variance_halfpixvar16x16_v_media$(ASM)
DSP_SRCS-$(HAVE_MEDIA) += arm/variance_media$(ASM)
DSP_SRCS-$(HAVE_NEON) += arm/subpel_variance_neon.c
DSP_SRCS-$(HAVE_NEON) += arm/variance_neon.c
DSP_SRCS-$(HAVE_MSA) += mips/variance_msa.c
DSP_SRCS-$(HAVE_MSA) += mips/sub_pixel_variance_msa.c
DSP_SRCS-$(HAVE_MMX) += x86/variance_mmx.c
DSP_SRCS-$(HAVE_MMX) += x86/variance_impl_mmx.asm
DSP_SRCS-$(HAVE_SSE2) += x86/variance_sse2.c
DSP_SRCS-$(HAVE_SSE2) += x86/variance_sse2.c # Contains SSE2 and SSSE3
DSP_SRCS-$(HAVE_AVX2) += x86/variance_avx2.c
DSP_SRCS-$(HAVE_AVX2) += x86/variance_impl_avx2.c
ifeq ($(CONFIG_USE_X86INC),yes)
DSP_SRCS-$(HAVE_SSE2) += x86/subpel_variance_sse2.asm # Contains SSE2 and SSSE3
endif # CONFIG_USE_X86INC
ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes)
DSP_SRCS-$(HAVE_SSE2) += x86/highbd_variance_sse2.c
DSP_SRCS-$(HAVE_SSE2) += x86/highbd_variance_impl_sse2.asm
ifeq ($(CONFIG_USE_X86INC),yes)
DSP_SRCS-$(HAVE_SSE2) += x86/highbd_subpel_variance_impl_sse2.asm
endif # CONFIG_USE_X86INC
endif # CONFIG_VP9_HIGHBITDEPTH
endif # CONFIG_ENCODERS || CONFIG_POSTPROC || CONFIG_VP9_POSTPROC

View file

@ -412,6 +412,9 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
if (vpx_config("CONFIG_ENCODERS") eq "yes" || vpx_config("CONFIG_POSTPROC") eq "yes" || vpx_config("CONFIG_VP9_POSTPROC") eq "yes") {
#
# Variance
#
add_proto qw/unsigned int vpx_variance64x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vpx_variance64x64 sse2 avx2 neon msa/;
@ -451,7 +454,9 @@ add_proto qw/unsigned int vpx_variance4x8/, "const uint8_t *src_ptr, int source_
add_proto qw/unsigned int vpx_variance4x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vpx_variance4x4 mmx sse2 msa/;
#
# Specialty Variance
#
add_proto qw/void vpx_get16x16var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
specialize qw/vpx_get16x16var sse2 avx2 neon msa/;
@ -478,6 +483,99 @@ add_proto qw/unsigned int vpx_get4x4sse_cs/, "const unsigned char *src_ptr, int
add_proto qw/void vpx_comp_avg_pred/, "uint8_t *comp_pred, const uint8_t *pred, int width, int height, const uint8_t *ref, int ref_stride";
#
# Subpixel Variance
#
add_proto qw/uint32_t vpx_sub_pixel_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
specialize qw/vpx_sub_pixel_variance64x64 avx2 neon msa/, "$sse2_x86inc", "$ssse3_x86inc";
add_proto qw/uint32_t vpx_sub_pixel_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
specialize qw/vpx_sub_pixel_variance64x32 msa/, "$sse2_x86inc", "$ssse3_x86inc";
add_proto qw/uint32_t vpx_sub_pixel_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
specialize qw/vpx_sub_pixel_variance32x64 msa/, "$sse2_x86inc", "$ssse3_x86inc";
add_proto qw/uint32_t vpx_sub_pixel_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
specialize qw/vpx_sub_pixel_variance32x32 avx2 neon msa/, "$sse2_x86inc", "$ssse3_x86inc";
add_proto qw/uint32_t vpx_sub_pixel_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
specialize qw/vpx_sub_pixel_variance32x16 msa/, "$sse2_x86inc", "$ssse3_x86inc";
add_proto qw/uint32_t vpx_sub_pixel_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
specialize qw/vpx_sub_pixel_variance16x32 msa/, "$sse2_x86inc", "$ssse3_x86inc";
add_proto qw/uint32_t vpx_sub_pixel_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
specialize qw/vpx_sub_pixel_variance16x16 mmx media neon msa/, "$sse2_x86inc", "$ssse3_x86inc";
add_proto qw/uint32_t vpx_sub_pixel_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
specialize qw/vpx_sub_pixel_variance16x8 mmx msa/, "$sse2_x86inc", "$ssse3_x86inc";
add_proto qw/uint32_t vpx_sub_pixel_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
specialize qw/vpx_sub_pixel_variance8x16 mmx msa/, "$sse2_x86inc", "$ssse3_x86inc";
add_proto qw/uint32_t vpx_sub_pixel_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
specialize qw/vpx_sub_pixel_variance8x8 mmx media neon msa/, "$sse2_x86inc", "$ssse3_x86inc";
add_proto qw/uint32_t vpx_sub_pixel_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
specialize qw/vpx_sub_pixel_variance8x4 msa/, "$sse2_x86inc", "$ssse3_x86inc";
add_proto qw/uint32_t vpx_sub_pixel_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
specialize qw/vpx_sub_pixel_variance4x8 msa/, "$sse_x86inc", "$ssse3_x86inc";
add_proto qw/uint32_t vpx_sub_pixel_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
specialize qw/vpx_sub_pixel_variance4x4 mmx msa/, "$sse_x86inc", "$ssse3_x86inc";
add_proto qw/uint32_t vpx_sub_pixel_avg_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
specialize qw/vpx_sub_pixel_avg_variance64x64 avx2/, "$sse2_x86inc", "$ssse3_x86inc";
add_proto qw/uint32_t vpx_sub_pixel_avg_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
specialize qw/vpx_sub_pixel_avg_variance64x32/, "$sse2_x86inc", "$ssse3_x86inc";
add_proto qw/uint32_t vpx_sub_pixel_avg_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
specialize qw/vpx_sub_pixel_avg_variance32x64/, "$sse2_x86inc", "$ssse3_x86inc";
add_proto qw/uint32_t vpx_sub_pixel_avg_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
specialize qw/vpx_sub_pixel_avg_variance32x32 avx2/, "$sse2_x86inc", "$ssse3_x86inc";
add_proto qw/uint32_t vpx_sub_pixel_avg_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
specialize qw/vpx_sub_pixel_avg_variance32x16/, "$sse2_x86inc", "$ssse3_x86inc";
add_proto qw/uint32_t vpx_sub_pixel_avg_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
specialize qw/vpx_sub_pixel_avg_variance16x32/, "$sse2_x86inc", "$ssse3_x86inc";
add_proto qw/uint32_t vpx_sub_pixel_avg_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
specialize qw/vpx_sub_pixel_avg_variance16x16/, "$sse2_x86inc", "$ssse3_x86inc";
add_proto qw/uint32_t vpx_sub_pixel_avg_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
specialize qw/vpx_sub_pixel_avg_variance16x8/, "$sse2_x86inc", "$ssse3_x86inc";
add_proto qw/uint32_t vpx_sub_pixel_avg_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
specialize qw/vpx_sub_pixel_avg_variance8x16/, "$sse2_x86inc", "$ssse3_x86inc";
add_proto qw/uint32_t vpx_sub_pixel_avg_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
specialize qw/vpx_sub_pixel_avg_variance8x8/, "$sse2_x86inc", "$ssse3_x86inc";
add_proto qw/uint32_t vpx_sub_pixel_avg_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
specialize qw/vpx_sub_pixel_avg_variance8x4/, "$sse2_x86inc", "$ssse3_x86inc";
add_proto qw/uint32_t vpx_sub_pixel_avg_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
specialize qw/vpx_sub_pixel_avg_variance4x8/, "$sse_x86inc", "$ssse3_x86inc";
add_proto qw/uint32_t vpx_sub_pixel_avg_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
specialize qw/vpx_sub_pixel_avg_variance4x4/, "$sse_x86inc", "$ssse3_x86inc";
#
# Specialty Subpixel
#
add_proto qw/uint32_t vpx_variance_halfpixvar16x16_h/, "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, uint32_t *sse";
specialize qw/vpx_variance_halfpixvar16x16_h mmx media/;
add_proto qw/uint32_t vpx_variance_halfpixvar16x16_v/, "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, uint32_t *sse";
specialize qw/vpx_variance_halfpixvar16x16_v mmx media/;
add_proto qw/uint32_t vpx_variance_halfpixvar16x16_hv/, "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, uint32_t *sse";
specialize qw/vpx_variance_halfpixvar16x16_hv mmx media/;
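Each add_proto line above declares one function signature in the generated vpx_dsp_rtcd.h, and the matching specialize line lists the optimised versions the run-time dispatcher may select over the C fallback when the CPU supports them. Roughly, and only as a simplified sketch of the generated header rather than its literal contents, the first subpixel entry expands to:
uint32_t vpx_sub_pixel_variance64x64_c(const uint8_t *src_ptr, int source_stride,
                                       int xoffset, int yoffset,
                                       const uint8_t *ref_ptr, int ref_stride,
                                       uint32_t *sse);
uint32_t vpx_sub_pixel_variance64x64_avx2(const uint8_t *src_ptr, int source_stride,
                                          int xoffset, int yoffset,
                                          const uint8_t *ref_ptr, int ref_stride,
                                          uint32_t *sse);
RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance64x64)(const uint8_t *src_ptr,
                                                    int source_stride,
                                                    int xoffset, int yoffset,
                                                    const uint8_t *ref_ptr,
                                                    int ref_stride, uint32_t *sse);
/* ...and inside the rtcd setup function the pointer is switched to the best
 * version available, e.g.
 *   vpx_sub_pixel_variance64x64 = vpx_sub_pixel_variance64x64_c;
 *   if (flags & HAS_AVX2)
 *     vpx_sub_pixel_variance64x64 = vpx_sub_pixel_variance64x64_avx2;
 */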
if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
add_proto qw/unsigned int vpx_highbd_12_variance64x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vpx_highbd_12_variance64x64 sse2/;
@ -615,6 +713,226 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
specialize qw/vpx_highbd_12_mse8x8 sse2/;
add_proto qw/void vpx_highbd_comp_avg_pred/, "uint16_t *comp_pred, const uint8_t *pred8, int width, int height, const uint8_t *ref8, int ref_stride";
#
# Subpixel Variance
#
add_proto qw/uint32_t vpx_highbd_12_sub_pixel_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
specialize qw/vpx_highbd_12_sub_pixel_variance64x64/, "$sse2_x86inc";
add_proto qw/uint32_t vpx_highbd_12_sub_pixel_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
specialize qw/vpx_highbd_12_sub_pixel_variance64x32/, "$sse2_x86inc";
add_proto qw/uint32_t vpx_highbd_12_sub_pixel_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
specialize qw/vpx_highbd_12_sub_pixel_variance32x64/, "$sse2_x86inc";
add_proto qw/uint32_t vpx_highbd_12_sub_pixel_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
specialize qw/vpx_highbd_12_sub_pixel_variance32x32/, "$sse2_x86inc";
add_proto qw/uint32_t vpx_highbd_12_sub_pixel_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
specialize qw/vpx_highbd_12_sub_pixel_variance32x16/, "$sse2_x86inc";
add_proto qw/uint32_t vpx_highbd_12_sub_pixel_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
specialize qw/vpx_highbd_12_sub_pixel_variance16x32/, "$sse2_x86inc";
add_proto qw/uint32_t vpx_highbd_12_sub_pixel_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
specialize qw/vpx_highbd_12_sub_pixel_variance16x16/, "$sse2_x86inc";
add_proto qw/uint32_t vpx_highbd_12_sub_pixel_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
specialize qw/vpx_highbd_12_sub_pixel_variance16x8/, "$sse2_x86inc";
add_proto qw/uint32_t vpx_highbd_12_sub_pixel_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
specialize qw/vpx_highbd_12_sub_pixel_variance8x16/, "$sse2_x86inc";
add_proto qw/uint32_t vpx_highbd_12_sub_pixel_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
specialize qw/vpx_highbd_12_sub_pixel_variance8x8/, "$sse2_x86inc";
add_proto qw/uint32_t vpx_highbd_12_sub_pixel_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
specialize qw/vpx_highbd_12_sub_pixel_variance8x4/, "$sse2_x86inc";
add_proto qw/uint32_t vpx_highbd_12_sub_pixel_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
add_proto qw/uint32_t vpx_highbd_12_sub_pixel_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
add_proto qw/uint32_t vpx_highbd_10_sub_pixel_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
specialize qw/vpx_highbd_10_sub_pixel_variance64x64/, "$sse2_x86inc";
add_proto qw/uint32_t vpx_highbd_10_sub_pixel_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
specialize qw/vpx_highbd_10_sub_pixel_variance64x32/, "$sse2_x86inc";
add_proto qw/uint32_t vpx_highbd_10_sub_pixel_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
specialize qw/vpx_highbd_10_sub_pixel_variance32x64/, "$sse2_x86inc";
add_proto qw/uint32_t vpx_highbd_10_sub_pixel_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
specialize qw/vpx_highbd_10_sub_pixel_variance32x32/, "$sse2_x86inc";
add_proto qw/uint32_t vpx_highbd_10_sub_pixel_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
specialize qw/vpx_highbd_10_sub_pixel_variance32x16/, "$sse2_x86inc";
add_proto qw/uint32_t vpx_highbd_10_sub_pixel_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
specialize qw/vpx_highbd_10_sub_pixel_variance16x32/, "$sse2_x86inc";
add_proto qw/uint32_t vpx_highbd_10_sub_pixel_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
specialize qw/vpx_highbd_10_sub_pixel_variance16x16/, "$sse2_x86inc";
add_proto qw/uint32_t vpx_highbd_10_sub_pixel_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
specialize qw/vpx_highbd_10_sub_pixel_variance16x8/, "$sse2_x86inc";
add_proto qw/uint32_t vpx_highbd_10_sub_pixel_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
specialize qw/vpx_highbd_10_sub_pixel_variance8x16/, "$sse2_x86inc";
add_proto qw/uint32_t vpx_highbd_10_sub_pixel_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
specialize qw/vpx_highbd_10_sub_pixel_variance8x8/, "$sse2_x86inc";
add_proto qw/uint32_t vpx_highbd_10_sub_pixel_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
specialize qw/vpx_highbd_10_sub_pixel_variance8x4/, "$sse2_x86inc";
add_proto qw/uint32_t vpx_highbd_10_sub_pixel_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
add_proto qw/uint32_t vpx_highbd_10_sub_pixel_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
add_proto qw/uint32_t vpx_highbd_8_sub_pixel_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
specialize qw/vpx_highbd_8_sub_pixel_variance64x64/, "$sse2_x86inc";
add_proto qw/uint32_t vpx_highbd_8_sub_pixel_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
specialize qw/vpx_highbd_8_sub_pixel_variance64x32/, "$sse2_x86inc";
add_proto qw/uint32_t vpx_highbd_8_sub_pixel_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
specialize qw/vpx_highbd_8_sub_pixel_variance32x64/, "$sse2_x86inc";
add_proto qw/uint32_t vpx_highbd_8_sub_pixel_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
specialize qw/vpx_highbd_8_sub_pixel_variance32x32/, "$sse2_x86inc";
add_proto qw/uint32_t vpx_highbd_8_sub_pixel_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
specialize qw/vpx_highbd_8_sub_pixel_variance32x16/, "$sse2_x86inc";
add_proto qw/uint32_t vpx_highbd_8_sub_pixel_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
specialize qw/vpx_highbd_8_sub_pixel_variance16x32/, "$sse2_x86inc";
add_proto qw/uint32_t vpx_highbd_8_sub_pixel_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
specialize qw/vpx_highbd_8_sub_pixel_variance16x16/, "$sse2_x86inc";
add_proto qw/uint32_t vpx_highbd_8_sub_pixel_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
specialize qw/vpx_highbd_8_sub_pixel_variance16x8/, "$sse2_x86inc";
add_proto qw/uint32_t vpx_highbd_8_sub_pixel_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
specialize qw/vpx_highbd_8_sub_pixel_variance8x16/, "$sse2_x86inc";
add_proto qw/uint32_t vpx_highbd_8_sub_pixel_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
specialize qw/vpx_highbd_8_sub_pixel_variance8x8/, "$sse2_x86inc";
add_proto qw/uint32_t vpx_highbd_8_sub_pixel_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
specialize qw/vpx_highbd_8_sub_pixel_variance8x4/, "$sse2_x86inc";
add_proto qw/uint32_t vpx_highbd_8_sub_pixel_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
add_proto qw/uint32_t vpx_highbd_8_sub_pixel_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
add_proto qw/uint32_t vpx_highbd_12_sub_pixel_avg_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
specialize qw/vpx_highbd_12_sub_pixel_avg_variance64x64/, "$sse2_x86inc";
add_proto qw/uint32_t vpx_highbd_12_sub_pixel_avg_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
specialize qw/vpx_highbd_12_sub_pixel_avg_variance64x32/, "$sse2_x86inc";
add_proto qw/uint32_t vpx_highbd_12_sub_pixel_avg_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
specialize qw/vpx_highbd_12_sub_pixel_avg_variance32x64/, "$sse2_x86inc";
add_proto qw/uint32_t vpx_highbd_12_sub_pixel_avg_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
specialize qw/vpx_highbd_12_sub_pixel_avg_variance32x32/, "$sse2_x86inc";
add_proto qw/uint32_t vpx_highbd_12_sub_pixel_avg_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
specialize qw/vpx_highbd_12_sub_pixel_avg_variance32x16/, "$sse2_x86inc";
add_proto qw/uint32_t vpx_highbd_12_sub_pixel_avg_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
specialize qw/vpx_highbd_12_sub_pixel_avg_variance16x32/, "$sse2_x86inc";
add_proto qw/uint32_t vpx_highbd_12_sub_pixel_avg_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
specialize qw/vpx_highbd_12_sub_pixel_avg_variance16x16/, "$sse2_x86inc";
add_proto qw/uint32_t vpx_highbd_12_sub_pixel_avg_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
specialize qw/vpx_highbd_12_sub_pixel_avg_variance16x8/, "$sse2_x86inc";
add_proto qw/uint32_t vpx_highbd_12_sub_pixel_avg_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
specialize qw/vpx_highbd_12_sub_pixel_avg_variance8x16/, "$sse2_x86inc";
add_proto qw/uint32_t vpx_highbd_12_sub_pixel_avg_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
specialize qw/vpx_highbd_12_sub_pixel_avg_variance8x8/, "$sse2_x86inc";
add_proto qw/uint32_t vpx_highbd_12_sub_pixel_avg_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
specialize qw/vpx_highbd_12_sub_pixel_avg_variance8x4/, "$sse2_x86inc";
add_proto qw/uint32_t vpx_highbd_12_sub_pixel_avg_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
add_proto qw/uint32_t vpx_highbd_12_sub_pixel_avg_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
add_proto qw/uint32_t vpx_highbd_10_sub_pixel_avg_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
specialize qw/vpx_highbd_10_sub_pixel_avg_variance64x64/, "$sse2_x86inc";
add_proto qw/uint32_t vpx_highbd_10_sub_pixel_avg_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
specialize qw/vpx_highbd_10_sub_pixel_avg_variance64x32/, "$sse2_x86inc";
add_proto qw/uint32_t vpx_highbd_10_sub_pixel_avg_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
specialize qw/vpx_highbd_10_sub_pixel_avg_variance32x64/, "$sse2_x86inc";
add_proto qw/uint32_t vpx_highbd_10_sub_pixel_avg_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
specialize qw/vpx_highbd_10_sub_pixel_avg_variance32x32/, "$sse2_x86inc";
add_proto qw/uint32_t vpx_highbd_10_sub_pixel_avg_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
specialize qw/vpx_highbd_10_sub_pixel_avg_variance32x16/, "$sse2_x86inc";
add_proto qw/uint32_t vpx_highbd_10_sub_pixel_avg_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
specialize qw/vpx_highbd_10_sub_pixel_avg_variance16x32/, "$sse2_x86inc";
add_proto qw/uint32_t vpx_highbd_10_sub_pixel_avg_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
specialize qw/vpx_highbd_10_sub_pixel_avg_variance16x16/, "$sse2_x86inc";
add_proto qw/uint32_t vpx_highbd_10_sub_pixel_avg_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
specialize qw/vpx_highbd_10_sub_pixel_avg_variance16x8/, "$sse2_x86inc";
add_proto qw/uint32_t vpx_highbd_10_sub_pixel_avg_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
specialize qw/vpx_highbd_10_sub_pixel_avg_variance8x16/, "$sse2_x86inc";
add_proto qw/uint32_t vpx_highbd_10_sub_pixel_avg_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
specialize qw/vpx_highbd_10_sub_pixel_avg_variance8x8/, "$sse2_x86inc";
add_proto qw/uint32_t vpx_highbd_10_sub_pixel_avg_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
specialize qw/vpx_highbd_10_sub_pixel_avg_variance8x4/, "$sse2_x86inc";
add_proto qw/uint32_t vpx_highbd_10_sub_pixel_avg_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
add_proto qw/uint32_t vpx_highbd_10_sub_pixel_avg_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
add_proto qw/uint32_t vpx_highbd_8_sub_pixel_avg_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
specialize qw/vpx_highbd_8_sub_pixel_avg_variance64x64/, "$sse2_x86inc";
add_proto qw/uint32_t vpx_highbd_8_sub_pixel_avg_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
specialize qw/vpx_highbd_8_sub_pixel_avg_variance64x32/, "$sse2_x86inc";
add_proto qw/uint32_t vpx_highbd_8_sub_pixel_avg_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
specialize qw/vpx_highbd_8_sub_pixel_avg_variance32x64/, "$sse2_x86inc";
add_proto qw/uint32_t vpx_highbd_8_sub_pixel_avg_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
specialize qw/vpx_highbd_8_sub_pixel_avg_variance32x32/, "$sse2_x86inc";
add_proto qw/uint32_t vpx_highbd_8_sub_pixel_avg_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
specialize qw/vpx_highbd_8_sub_pixel_avg_variance32x16/, "$sse2_x86inc";
add_proto qw/uint32_t vpx_highbd_8_sub_pixel_avg_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
specialize qw/vpx_highbd_8_sub_pixel_avg_variance16x32/, "$sse2_x86inc";
add_proto qw/uint32_t vpx_highbd_8_sub_pixel_avg_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
specialize qw/vpx_highbd_8_sub_pixel_avg_variance16x16/, "$sse2_x86inc";
add_proto qw/uint32_t vpx_highbd_8_sub_pixel_avg_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
specialize qw/vpx_highbd_8_sub_pixel_avg_variance16x8/, "$sse2_x86inc";
add_proto qw/uint32_t vpx_highbd_8_sub_pixel_avg_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
specialize qw/vpx_highbd_8_sub_pixel_avg_variance8x16/, "$sse2_x86inc";
add_proto qw/uint32_t vpx_highbd_8_sub_pixel_avg_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
specialize qw/vpx_highbd_8_sub_pixel_avg_variance8x8/, "$sse2_x86inc";
add_proto qw/uint32_t vpx_highbd_8_sub_pixel_avg_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
specialize qw/vpx_highbd_8_sub_pixel_avg_variance8x4/, "$sse2_x86inc";
add_proto qw/uint32_t vpx_highbd_8_sub_pixel_avg_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
add_proto qw/uint32_t vpx_highbd_8_sub_pixel_avg_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
} # CONFIG_VP9_HIGHBITDEPTH
} # CONFIG_ENCODERS || CONFIG_POSTPROC || CONFIG_VP9_POSTPROC

View file

@ -8,6 +8,8 @@
; be found in the AUTHORS file in the root of the source tree.
;
%define program_name vpx
%include "third_party/x86inc/x86inc.asm"
SECTION_RODATA
@ -30,7 +32,7 @@ bilin_filter_m_sse2: times 8 dw 16
SECTION .text
; int vp9_sub_pixel_varianceNxh(const uint8_t *src, ptrdiff_t src_stride,
; int vpx_sub_pixel_varianceNxh(const uint8_t *src, ptrdiff_t src_stride,
; int x_offset, int y_offset,
; const uint8_t *dst, ptrdiff_t dst_stride,
; int height, unsigned int *sse);

Просмотреть файл

@ -8,9 +8,7 @@
* be found in the AUTHORS file in the root of the source tree.
*/
#include "./vpx_config.h"
#include "vp9/common/vp9_common.h"
#include "vp9/encoder/vp9_variance.h"
#include "vpx_ports/mem.h"
typedef uint32_t (*high_variance_fn_t) (const uint16_t *src, int src_stride,
@ -243,3 +241,341 @@ unsigned int vpx_highbd_12_mse8x8_sse2(const uint8_t *src8, int src_stride,
sse, &sum, vpx_highbd_calc8x8var_sse2, 8);
return *sse;
}
#if CONFIG_USE_X86INC
#define DECL(w, opt) \
int vpx_highbd_sub_pixel_variance##w##xh_##opt(const uint16_t *src, \
ptrdiff_t src_stride, \
int x_offset, int y_offset, \
const uint16_t *dst, \
ptrdiff_t dst_stride, \
int height, unsigned int *sse);
#define DECLS(opt1, opt2) \
DECL(8, opt1); \
DECL(16, opt1)
DECLS(sse2, sse);
// TODO(johannkoenig): enable the ssse3 or delete
// DECLS(ssse3, ssse3);
#undef DECLS
#undef DECL
#define FN(w, h, wf, wlog2, hlog2, opt, cast) \
uint32_t vpx_highbd_8_sub_pixel_variance##w##x##h##_##opt(const uint8_t *src8, \
int src_stride, \
int x_offset, \
int y_offset, \
const uint8_t *dst8, \
int dst_stride, \
uint32_t *sse_ptr) { \
uint32_t sse; \
uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \
int se = vpx_highbd_sub_pixel_variance##wf##xh_##opt(src, src_stride, \
x_offset, y_offset, \
dst, dst_stride, h, \
&sse); \
if (w > wf) { \
unsigned int sse2; \
int se2 = vpx_highbd_sub_pixel_variance##wf##xh_##opt(src + 16, \
src_stride, \
x_offset, y_offset, \
dst + 16, \
dst_stride, \
h, &sse2); \
se += se2; \
sse += sse2; \
if (w > wf * 2) { \
se2 = vpx_highbd_sub_pixel_variance##wf##xh_##opt(src + 32, src_stride, \
x_offset, y_offset, \
dst + 32, dst_stride, \
h, &sse2); \
se += se2; \
sse += sse2; \
se2 = vpx_highbd_sub_pixel_variance##wf##xh_##opt( \
src + 48, src_stride, x_offset, y_offset, \
dst + 48, dst_stride, h, &sse2); \
se += se2; \
sse += sse2; \
} \
} \
*sse_ptr = sse; \
return sse - ((cast se * se) >> (wlog2 + hlog2)); \
} \
\
uint32_t vpx_highbd_10_sub_pixel_variance##w##x##h##_##opt( \
const uint8_t *src8, int src_stride, int x_offset, int y_offset, \
const uint8_t *dst8, int dst_stride, uint32_t *sse_ptr) { \
uint32_t sse; \
uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \
int se = vpx_highbd_sub_pixel_variance##wf##xh_##opt(src, src_stride, \
x_offset, y_offset, \
dst, dst_stride, \
h, &sse); \
if (w > wf) { \
uint32_t sse2; \
int se2 = vpx_highbd_sub_pixel_variance##wf##xh_##opt(src + 16, \
src_stride, \
x_offset, y_offset, \
dst + 16, \
dst_stride, \
h, &sse2); \
se += se2; \
sse += sse2; \
if (w > wf * 2) { \
se2 = vpx_highbd_sub_pixel_variance##wf##xh_##opt(src + 32, src_stride, \
x_offset, y_offset, \
dst + 32, dst_stride, \
h, &sse2); \
se += se2; \
sse += sse2; \
se2 = vpx_highbd_sub_pixel_variance##wf##xh_##opt(src + 48, src_stride, \
x_offset, y_offset, \
dst + 48, dst_stride, \
h, &sse2); \
se += se2; \
sse += sse2; \
} \
} \
se = ROUND_POWER_OF_TWO(se, 2); \
sse = ROUND_POWER_OF_TWO(sse, 4); \
*sse_ptr = sse; \
return sse - ((cast se * se) >> (wlog2 + hlog2)); \
} \
\
uint32_t vpx_highbd_12_sub_pixel_variance##w##x##h##_##opt( \
const uint8_t *src8, int src_stride, int x_offset, int y_offset, \
const uint8_t *dst8, int dst_stride, uint32_t *sse_ptr) { \
int start_row; \
uint32_t sse; \
int se = 0; \
uint64_t long_sse = 0; \
uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \
for (start_row = 0; start_row < h; start_row +=16) { \
uint32_t sse2; \
int height = h - start_row < 16 ? h - start_row : 16; \
int se2 = vpx_highbd_sub_pixel_variance##wf##xh_##opt( \
src + (start_row * src_stride), src_stride, \
x_offset, y_offset, dst + (start_row * dst_stride), \
dst_stride, height, &sse2); \
se += se2; \
long_sse += sse2; \
if (w > wf) { \
se2 = vpx_highbd_sub_pixel_variance##wf##xh_##opt( \
src + 16 + (start_row * src_stride), src_stride, \
x_offset, y_offset, dst + 16 + (start_row * dst_stride), \
dst_stride, height, &sse2); \
se += se2; \
long_sse += sse2; \
if (w > wf * 2) { \
se2 = vpx_highbd_sub_pixel_variance##wf##xh_##opt( \
src + 32 + (start_row * src_stride), src_stride, \
x_offset, y_offset, dst + 32 + (start_row * dst_stride), \
dst_stride, height, &sse2); \
se += se2; \
long_sse += sse2; \
se2 = vpx_highbd_sub_pixel_variance##wf##xh_##opt( \
src + 48 + (start_row * src_stride), src_stride, \
x_offset, y_offset, dst + 48 + (start_row * dst_stride), \
dst_stride, height, &sse2); \
se += se2; \
long_sse += sse2; \
}\
} \
} \
se = ROUND_POWER_OF_TWO(se, 4); \
sse = ROUND_POWER_OF_TWO(long_sse, 8); \
*sse_ptr = sse; \
return sse - ((cast se * se) >> (wlog2 + hlog2)); \
}
#define FNS(opt1, opt2) \
FN(64, 64, 16, 6, 6, opt1, (int64_t)); \
FN(64, 32, 16, 6, 5, opt1, (int64_t)); \
FN(32, 64, 16, 5, 6, opt1, (int64_t)); \
FN(32, 32, 16, 5, 5, opt1, (int64_t)); \
FN(32, 16, 16, 5, 4, opt1, (int64_t)); \
FN(16, 32, 16, 4, 5, opt1, (int64_t)); \
FN(16, 16, 16, 4, 4, opt1, (int64_t)); \
FN(16, 8, 16, 4, 3, opt1, (int64_t)); \
FN(8, 16, 8, 3, 4, opt1, (int64_t)); \
FN(8, 8, 8, 3, 3, opt1, (int64_t)); \
FN(8, 4, 8, 3, 2, opt1, (int64_t));
FNS(sse2, sse);
#undef FNS
#undef FN
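The FN macro above stitches the exported functions together from narrow SSE2 column kernels: the assembly only processes an 8- or 16-pixel-wide strip, so wider blocks call it again at column offsets +16, +32 and +48 and add up the partial se/sse values. The 10- and 12-bit variants then scale the sums down with ROUND_POWER_OF_TWO so the final subtraction stays within 32 bits, and the 12-bit path additionally walks the block in 16-row slices with a 64-bit sse accumulator to avoid overflow. The combine step at the end of every variant reduces to the sketch below (illustrative only):

#include <stdint.h>

/* Fold two partial (sum, sse) results into a variance; wlog2 + hlog2 is
 * log2(width * height), e.g. 5 + 5 = 10 for a 32x32 block. */
static uint32_t combine_partial_variance(int se1, uint32_t sse1,
                                         int se2, uint32_t sse2,
                                         int wlog2, int hlog2,
                                         uint32_t *sse_out) {
  const int se = se1 + se2;
  const uint32_t sse = sse1 + sse2;
  *sse_out = sse;
  return sse - (uint32_t)(((int64_t)se * se) >> (wlog2 + hlog2));
}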
#define DECL(w, opt) \
int vpx_highbd_sub_pixel_avg_variance##w##xh_##opt(const uint16_t *src, \
ptrdiff_t src_stride, \
int x_offset, int y_offset, \
const uint16_t *dst, \
ptrdiff_t dst_stride, \
const uint16_t *sec, \
ptrdiff_t sec_stride, \
int height, \
unsigned int *sse);
#define DECLS(opt1) \
DECL(16, opt1) \
DECL(8, opt1)
DECLS(sse2);
#undef DECL
#undef DECLS
#define FN(w, h, wf, wlog2, hlog2, opt, cast) \
uint32_t vpx_highbd_8_sub_pixel_avg_variance##w##x##h##_##opt( \
const uint8_t *src8, int src_stride, int x_offset, int y_offset, \
const uint8_t *dst8, int dst_stride, uint32_t *sse_ptr, \
const uint8_t *sec8) { \
uint32_t sse; \
uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \
uint16_t *sec = CONVERT_TO_SHORTPTR(sec8); \
int se = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
src, src_stride, x_offset, \
y_offset, dst, dst_stride, sec, w, h, &sse); \
if (w > wf) { \
uint32_t sse2; \
int se2 = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
src + 16, src_stride, x_offset, y_offset, \
dst + 16, dst_stride, sec + 16, w, h, &sse2); \
se += se2; \
sse += sse2; \
if (w > wf * 2) { \
se2 = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
src + 32, src_stride, x_offset, y_offset, \
dst + 32, dst_stride, sec + 32, w, h, &sse2); \
se += se2; \
sse += sse2; \
se2 = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
src + 48, src_stride, x_offset, y_offset, \
dst + 48, dst_stride, sec + 48, w, h, &sse2); \
se += se2; \
sse += sse2; \
} \
} \
*sse_ptr = sse; \
return sse - ((cast se * se) >> (wlog2 + hlog2)); \
} \
\
uint32_t vpx_highbd_10_sub_pixel_avg_variance##w##x##h##_##opt( \
const uint8_t *src8, int src_stride, int x_offset, int y_offset, \
const uint8_t *dst8, int dst_stride, uint32_t *sse_ptr, \
const uint8_t *sec8) { \
uint32_t sse; \
uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \
uint16_t *sec = CONVERT_TO_SHORTPTR(sec8); \
int se = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
src, src_stride, x_offset, \
y_offset, dst, dst_stride, \
sec, w, h, &sse); \
if (w > wf) { \
uint32_t sse2; \
int se2 = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
src + 16, src_stride, \
x_offset, y_offset, \
dst + 16, dst_stride, \
sec + 16, w, h, &sse2); \
se += se2; \
sse += sse2; \
if (w > wf * 2) { \
se2 = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
src + 32, src_stride, \
x_offset, y_offset, \
dst + 32, dst_stride, \
sec + 32, w, h, &sse2); \
se += se2; \
sse += sse2; \
se2 = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
src + 48, src_stride, \
x_offset, y_offset, \
dst + 48, dst_stride, \
sec + 48, w, h, &sse2); \
se += se2; \
sse += sse2; \
} \
} \
se = ROUND_POWER_OF_TWO(se, 2); \
sse = ROUND_POWER_OF_TWO(sse, 4); \
*sse_ptr = sse; \
return sse - ((cast se * se) >> (wlog2 + hlog2)); \
} \
\
uint32_t vpx_highbd_12_sub_pixel_avg_variance##w##x##h##_##opt( \
const uint8_t *src8, int src_stride, int x_offset, int y_offset, \
const uint8_t *dst8, int dst_stride, uint32_t *sse_ptr, \
const uint8_t *sec8) { \
int start_row; \
uint32_t sse; \
int se = 0; \
uint64_t long_sse = 0; \
uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \
uint16_t *sec = CONVERT_TO_SHORTPTR(sec8); \
for (start_row = 0; start_row < h; start_row +=16) { \
uint32_t sse2; \
int height = h - start_row < 16 ? h - start_row : 16; \
int se2 = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
src + (start_row * src_stride), src_stride, x_offset, \
y_offset, dst + (start_row * dst_stride), dst_stride, \
sec + (start_row * w), w, height, &sse2); \
se += se2; \
long_sse += sse2; \
if (w > wf) { \
se2 = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
src + 16 + (start_row * src_stride), src_stride, \
x_offset, y_offset, \
dst + 16 + (start_row * dst_stride), dst_stride, \
sec + 16 + (start_row * w), w, height, &sse2); \
se += se2; \
long_sse += sse2; \
if (w > wf * 2) { \
se2 = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
src + 32 + (start_row * src_stride), src_stride, \
x_offset, y_offset, \
dst + 32 + (start_row * dst_stride), dst_stride, \
sec + 32 + (start_row * w), w, height, &sse2); \
se += se2; \
long_sse += sse2; \
se2 = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
src + 48 + (start_row * src_stride), src_stride, \
x_offset, y_offset, \
dst + 48 + (start_row * dst_stride), dst_stride, \
sec + 48 + (start_row * w), w, height, &sse2); \
se += se2; \
long_sse += sse2; \
} \
} \
} \
se = ROUND_POWER_OF_TWO(se, 4); \
sse = ROUND_POWER_OF_TWO(long_sse, 8); \
*sse_ptr = sse; \
return sse - ((cast se * se) >> (wlog2 + hlog2)); \
}
#define FNS(opt1) \
FN(64, 64, 16, 6, 6, opt1, (int64_t)); \
FN(64, 32, 16, 6, 5, opt1, (int64_t)); \
FN(32, 64, 16, 5, 6, opt1, (int64_t)); \
FN(32, 32, 16, 5, 5, opt1, (int64_t)); \
FN(32, 16, 16, 5, 4, opt1, (int64_t)); \
FN(16, 32, 16, 4, 5, opt1, (int64_t)); \
FN(16, 16, 16, 4, 4, opt1, (int64_t)); \
FN(16, 8, 16, 4, 3, opt1, (int64_t)); \
FN(8, 16, 8, 4, 3, opt1, (int64_t)); \
FN(8, 8, 8, 3, 3, opt1, (int64_t)); \
FN(8, 4, 8, 3, 2, opt1, (int64_t));
FNS(sse2);
#undef FNS
#undef FN
#endif // CONFIG_USE_X86INC

View file

@ -8,6 +8,8 @@
; be found in the AUTHORS file in the root of the source tree.
;
%define program_name vpx
%include "third_party/x86inc/x86inc.asm"
SECTION_RODATA
@ -39,7 +41,7 @@ bilin_filter_m_ssse3: times 8 db 16, 0
SECTION .text
; int vp9_sub_pixel_varianceNxh(const uint8_t *src, ptrdiff_t src_stride,
; int vpx_sub_pixel_varianceNxh(const uint8_t *src, ptrdiff_t src_stride,
; int x_offset, int y_offset,
; const uint8_t *dst, ptrdiff_t dst_stride,
; int height, unsigned int *sse);

Просмотреть файл

@ -91,3 +91,93 @@ unsigned int vpx_variance64x32_avx2(const uint8_t *src, int src_stride,
sse, &sum, vpx_get32x32var_avx2, 32);
return *sse - (((int64_t)sum * sum) >> 11);
}
unsigned int vpx_sub_pixel_variance32xh_avx2(const uint8_t *src, int src_stride,
int x_offset, int y_offset,
const uint8_t *dst, int dst_stride,
int height,
unsigned int *sse);
unsigned int vpx_sub_pixel_avg_variance32xh_avx2(const uint8_t *src,
int src_stride,
int x_offset,
int y_offset,
const uint8_t *dst,
int dst_stride,
const uint8_t *sec,
int sec_stride,
int height,
unsigned int *sseptr);
unsigned int vpx_sub_pixel_variance64x64_avx2(const uint8_t *src,
int src_stride,
int x_offset,
int y_offset,
const uint8_t *dst,
int dst_stride,
unsigned int *sse) {
unsigned int sse1;
const int se1 = vpx_sub_pixel_variance32xh_avx2(src, src_stride, x_offset,
y_offset, dst, dst_stride,
64, &sse1);
unsigned int sse2;
const int se2 = vpx_sub_pixel_variance32xh_avx2(src + 32, src_stride,
x_offset, y_offset,
dst + 32, dst_stride,
64, &sse2);
const int se = se1 + se2;
*sse = sse1 + sse2;
return *sse - (((int64_t)se * se) >> 12);
}
unsigned int vpx_sub_pixel_variance32x32_avx2(const uint8_t *src,
int src_stride,
int x_offset,
int y_offset,
const uint8_t *dst,
int dst_stride,
unsigned int *sse) {
const int se = vpx_sub_pixel_variance32xh_avx2(src, src_stride, x_offset,
y_offset, dst, dst_stride,
32, sse);
return *sse - (((int64_t)se * se) >> 10);
}
unsigned int vpx_sub_pixel_avg_variance64x64_avx2(const uint8_t *src,
int src_stride,
int x_offset,
int y_offset,
const uint8_t *dst,
int dst_stride,
unsigned int *sse,
const uint8_t *sec) {
unsigned int sse1;
const int se1 = vpx_sub_pixel_avg_variance32xh_avx2(src, src_stride, x_offset,
y_offset, dst, dst_stride,
sec, 64, 64, &sse1);
unsigned int sse2;
const int se2 =
vpx_sub_pixel_avg_variance32xh_avx2(src + 32, src_stride, x_offset,
y_offset, dst + 32, dst_stride,
sec + 32, 64, 64, &sse2);
const int se = se1 + se2;
*sse = sse1 + sse2;
return *sse - (((int64_t)se * se) >> 12);
}
unsigned int vpx_sub_pixel_avg_variance32x32_avx2(const uint8_t *src,
int src_stride,
int x_offset,
int y_offset,
const uint8_t *dst,
int dst_stride,
unsigned int *sse,
const uint8_t *sec) {
// Process 32 elements in parallel.
const int se = vpx_sub_pixel_avg_variance32xh_avx2(src, src_stride, x_offset,
y_offset, dst, dst_stride,
sec, 32, 32, sse);
return *sse - (((int64_t)se * se) >> 10);
}
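In all of these wrappers the final shift is simply log2 of the pixel count: a 64x64 block subtracts (sum * sum) >> 12 because 64 * 64 = 4096, and a 32x32 block uses >> 10. The common tail can be written as the small helper below (illustrative only):

#include <stdint.h>

/* variance = SSE - sum^2 / N, with N = width * height a power of two. */
static unsigned int variance_from_sums(int64_t sum, unsigned int sse, int log2_n) {
  return sse - (unsigned int)((sum * sum) >> log2_n);
}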

View file

@ -11,6 +11,27 @@
#include <immintrin.h> // AVX2
#include "./vpx_dsp_rtcd.h"
#include "vpx_ports/mem.h"
DECLARE_ALIGNED(32, static const uint8_t, bilinear_filters_avx2[512]) = {
16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0,
16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0,
14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2,
14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2,
12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4,
12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4,
10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6,
10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6,
8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10,
6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10,
4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12,
4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12,
2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14,
2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14,
};
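Each row of this table holds one pair of bilinear weights replicated across 32 bytes so that a single aligned load fills a ymm register; the functions below select a row with offset << 5. The two weights of every pair sum to 16, which matches the + 8 / >> 4 rounding applied in FILTER_SRC. A scalar sketch of one filtered pixel (illustrative only, w0/w1 being the selected pair):

/* One bilinear tap as performed by FILTER_SRC: w0 + w1 == 16. */
static unsigned char bilinear_tap(unsigned char p0, unsigned char p1,
                                  int w0, int w1) {
  return (unsigned char)((w0 * p0 + w1 * p1 + 8) >> 4);
}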
void vpx_get16x16var_avx2(const unsigned char *src_ptr,
int source_stride,
@ -213,3 +234,494 @@ void vpx_get32x32var_avx2(const unsigned char *src_ptr,
_mm_cvtsi128_si32(_mm256_extractf128_si256(sum_ref_src, 1));
}
}
#define FILTER_SRC(filter) \
/* filter the source */ \
exp_src_lo = _mm256_maddubs_epi16(exp_src_lo, filter); \
exp_src_hi = _mm256_maddubs_epi16(exp_src_hi, filter); \
\
/* add 8 to source */ \
exp_src_lo = _mm256_add_epi16(exp_src_lo, pw8); \
exp_src_hi = _mm256_add_epi16(exp_src_hi, pw8); \
\
/* divide source by 16 */ \
exp_src_lo = _mm256_srai_epi16(exp_src_lo, 4); \
exp_src_hi = _mm256_srai_epi16(exp_src_hi, 4);
#define MERGE_WITH_SRC(src_reg, reg) \
exp_src_lo = _mm256_unpacklo_epi8(src_reg, reg); \
exp_src_hi = _mm256_unpackhi_epi8(src_reg, reg);
#define LOAD_SRC_DST \
/* load source and destination */ \
src_reg = _mm256_loadu_si256((__m256i const *) (src)); \
dst_reg = _mm256_loadu_si256((__m256i const *) (dst));
#define AVG_NEXT_SRC(src_reg, size_stride) \
src_next_reg = _mm256_loadu_si256((__m256i const *) \
(src + size_stride)); \
/* average between current and next stride source */ \
src_reg = _mm256_avg_epu8(src_reg, src_next_reg);
#define MERGE_NEXT_SRC(src_reg, size_stride) \
src_next_reg = _mm256_loadu_si256((__m256i const *) \
(src + size_stride)); \
MERGE_WITH_SRC(src_reg, src_next_reg)
#define CALC_SUM_SSE_INSIDE_LOOP \
/* expand each byte to 2 bytes */ \
exp_dst_lo = _mm256_unpacklo_epi8(dst_reg, zero_reg); \
exp_dst_hi = _mm256_unpackhi_epi8(dst_reg, zero_reg); \
/* source - dest */ \
exp_src_lo = _mm256_sub_epi16(exp_src_lo, exp_dst_lo); \
exp_src_hi = _mm256_sub_epi16(exp_src_hi, exp_dst_hi); \
/* calculate sum */ \
sum_reg = _mm256_add_epi16(sum_reg, exp_src_lo); \
exp_src_lo = _mm256_madd_epi16(exp_src_lo, exp_src_lo); \
sum_reg = _mm256_add_epi16(sum_reg, exp_src_hi); \
exp_src_hi = _mm256_madd_epi16(exp_src_hi, exp_src_hi); \
/* calculate sse */ \
sse_reg = _mm256_add_epi32(sse_reg, exp_src_lo); \
sse_reg = _mm256_add_epi32(sse_reg, exp_src_hi);
// final calculation to sum and sse
#define CALC_SUM_AND_SSE \
res_cmp = _mm256_cmpgt_epi16(zero_reg, sum_reg); \
sse_reg_hi = _mm256_srli_si256(sse_reg, 8); \
sum_reg_lo = _mm256_unpacklo_epi16(sum_reg, res_cmp); \
sum_reg_hi = _mm256_unpackhi_epi16(sum_reg, res_cmp); \
sse_reg = _mm256_add_epi32(sse_reg, sse_reg_hi); \
sum_reg = _mm256_add_epi32(sum_reg_lo, sum_reg_hi); \
\
sse_reg_hi = _mm256_srli_si256(sse_reg, 4); \
sum_reg_hi = _mm256_srli_si256(sum_reg, 8); \
\
sse_reg = _mm256_add_epi32(sse_reg, sse_reg_hi); \
sum_reg = _mm256_add_epi32(sum_reg, sum_reg_hi); \
*((int*)sse)= _mm_cvtsi128_si32(_mm256_castsi256_si128(sse_reg)) + \
_mm_cvtsi128_si32(_mm256_extractf128_si256(sse_reg, 1)); \
sum_reg_hi = _mm256_srli_si256(sum_reg, 4); \
sum_reg = _mm256_add_epi32(sum_reg, sum_reg_hi); \
sum = _mm_cvtsi128_si32(_mm256_castsi256_si128(sum_reg)) + \
_mm_cvtsi128_si32(_mm256_extractf128_si256(sum_reg, 1));
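Taken together, these macros implement one step of the accumulation: form a predicted source pixel (raw, averaged, or bilinearly filtered), subtract the destination pixel, add the signed difference to the running sum and its square to the running SSE; CALC_SUM_AND_SSE then folds the 256-bit accumulators down to the two scalars the caller needs. Per pixel, the arithmetic is just (illustrative only):

#include <stdint.h>

/* Scalar equivalent of CALC_SUM_SSE_INSIDE_LOOP for a single pixel. */
static void accumulate_pixel(uint8_t pred, uint8_t dst, int *sum, uint32_t *sse) {
  const int diff = (int)pred - (int)dst;  /* source prediction minus destination */
  *sum += diff;                           /* signed sum, used for the mean */
  *sse += (uint32_t)(diff * diff);        /* sum of squared errors */
}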
unsigned int vpx_sub_pixel_variance32xh_avx2(const uint8_t *src,
int src_stride,
int x_offset,
int y_offset,
const uint8_t *dst,
int dst_stride,
int height,
unsigned int *sse) {
__m256i src_reg, dst_reg, exp_src_lo, exp_src_hi, exp_dst_lo, exp_dst_hi;
__m256i sse_reg, sum_reg, sse_reg_hi, res_cmp, sum_reg_lo, sum_reg_hi;
__m256i zero_reg;
int i, sum;
sum_reg = _mm256_set1_epi16(0);
sse_reg = _mm256_set1_epi16(0);
zero_reg = _mm256_set1_epi16(0);
// x_offset = 0 and y_offset = 0
if (x_offset == 0) {
if (y_offset == 0) {
for (i = 0; i < height ; i++) {
LOAD_SRC_DST
// expand each byte to 2 bytes
MERGE_WITH_SRC(src_reg, zero_reg)
CALC_SUM_SSE_INSIDE_LOOP
src+= src_stride;
dst+= dst_stride;
}
// x_offset = 0 and y_offset = 8
} else if (y_offset == 8) {
__m256i src_next_reg;
for (i = 0; i < height ; i++) {
LOAD_SRC_DST
AVG_NEXT_SRC(src_reg, src_stride)
// expand each byte to 2 bytes
MERGE_WITH_SRC(src_reg, zero_reg)
CALC_SUM_SSE_INSIDE_LOOP
src+= src_stride;
dst+= dst_stride;
}
// x_offset = 0 and y_offset = bilin interpolation
} else {
__m256i filter, pw8, src_next_reg;
y_offset <<= 5;
filter = _mm256_load_si256((__m256i const *)
(bilinear_filters_avx2 + y_offset));
pw8 = _mm256_set1_epi16(8);
for (i = 0; i < height ; i++) {
LOAD_SRC_DST
MERGE_NEXT_SRC(src_reg, src_stride)
FILTER_SRC(filter)
CALC_SUM_SSE_INSIDE_LOOP
src+= src_stride;
dst+= dst_stride;
}
}
// x_offset = 8 and y_offset = 0
} else if (x_offset == 8) {
if (y_offset == 0) {
__m256i src_next_reg;
for (i = 0; i < height ; i++) {
LOAD_SRC_DST
AVG_NEXT_SRC(src_reg, 1)
// expand each byte to 2 bytes
MERGE_WITH_SRC(src_reg, zero_reg)
CALC_SUM_SSE_INSIDE_LOOP
src+= src_stride;
dst+= dst_stride;
}
// x_offset = 8 and y_offset = 8
} else if (y_offset == 8) {
__m256i src_next_reg, src_avg;
// load the source row and the same row offset by one byte
src_reg = _mm256_loadu_si256((__m256i const *) (src));
AVG_NEXT_SRC(src_reg, 1)
for (i = 0; i < height ; i++) {
src_avg = src_reg;
src+= src_stride;
LOAD_SRC_DST
AVG_NEXT_SRC(src_reg, 1)
// average the previous row's half-pel result with the current one
src_avg = _mm256_avg_epu8(src_avg, src_reg);
// expand each byte to 2 bytes
MERGE_WITH_SRC(src_avg, zero_reg)
CALC_SUM_SSE_INSIDE_LOOP
dst+= dst_stride;
}
// x_offset = 8 and y_offset = bilin interpolation
} else {
__m256i filter, pw8, src_next_reg, src_avg;
y_offset <<= 5;
filter = _mm256_load_si256((__m256i const *)
(bilinear_filters_avx2 + y_offset));
pw8 = _mm256_set1_epi16(8);
// load the source row and the same row offset by one byte
src_reg = _mm256_loadu_si256((__m256i const *) (src));
AVG_NEXT_SRC(src_reg, 1)
for (i = 0; i < height ; i++) {
// save current source average
src_avg = src_reg;
src+= src_stride;
LOAD_SRC_DST
AVG_NEXT_SRC(src_reg, 1)
MERGE_WITH_SRC(src_avg, src_reg)
FILTER_SRC(filter)
CALC_SUM_SSE_INSIDE_LOOP
dst+= dst_stride;
}
}
// x_offset = bilin interpolation and y_offset = 0
} else {
if (y_offset == 0) {
__m256i filter, pw8, src_next_reg;
x_offset <<= 5;
filter = _mm256_load_si256((__m256i const *)
(bilinear_filters_avx2 + x_offset));
pw8 = _mm256_set1_epi16(8);
for (i = 0; i < height ; i++) {
LOAD_SRC_DST
MERGE_NEXT_SRC(src_reg, 1)
FILTER_SRC(filter)
CALC_SUM_SSE_INSIDE_LOOP
src+= src_stride;
dst+= dst_stride;
}
// x_offset = bilin interpolation and y_offset = 8
} else if (y_offset == 8) {
__m256i filter, pw8, src_next_reg, src_pack;
x_offset <<= 5;
filter = _mm256_load_si256((__m256i const *)
(bilinear_filters_avx2 + x_offset));
pw8 = _mm256_set1_epi16(8);
src_reg = _mm256_loadu_si256((__m256i const *) (src));
MERGE_NEXT_SRC(src_reg, 1)
FILTER_SRC(filter)
// pack the 16-bit results back to 8 bits within each 128-bit lane
src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
for (i = 0; i < height ; i++) {
src+= src_stride;
LOAD_SRC_DST
MERGE_NEXT_SRC(src_reg, 1)
FILTER_SRC(filter)
src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
// average the previous packed row with the current one
src_pack = _mm256_avg_epu8(src_pack, src_reg);
MERGE_WITH_SRC(src_pack, zero_reg)
CALC_SUM_SSE_INSIDE_LOOP
src_pack = src_reg;
dst+= dst_stride;
}
// x_offset = bilin interpolation and y_offset = bilin interpolation
} else {
__m256i xfilter, yfilter, pw8, src_next_reg, src_pack;
x_offset <<= 5;
xfilter = _mm256_load_si256((__m256i const *)
(bilinear_filters_avx2 + x_offset));
y_offset <<= 5;
yfilter = _mm256_load_si256((__m256i const *)
(bilinear_filters_avx2 + y_offset));
pw8 = _mm256_set1_epi16(8);
// load the source row and the same row offset by one byte
src_reg = _mm256_loadu_si256((__m256i const *) (src));
MERGE_NEXT_SRC(src_reg, 1)
FILTER_SRC(xfilter)
// pack the 16-bit results back to 8 bits within each 128-bit lane
src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
for (i = 0; i < height ; i++) {
src+= src_stride;
LOAD_SRC_DST
MERGE_NEXT_SRC(src_reg, 1)
FILTER_SRC(xfilter)
src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
// interleave the previous packed row with the current one
MERGE_WITH_SRC(src_pack, src_reg)
// filter the source
FILTER_SRC(yfilter)
src_pack = src_reg;
CALC_SUM_SSE_INSIDE_LOOP
dst+= dst_stride;
}
}
}
CALC_SUM_AND_SSE
return sum;
}
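The branching above is the same in every direction: an offset of 0 needs no interpolation, the half-pel case (the == 8 comparison) is handled with the cheaper rounded byte average _mm256_avg_epu8, and any other offset goes through the full bilinear filter; for vertical filtering the previously filtered row (src_pack/src_avg) is reused so each source row is loaded and filtered only once. The per-direction prediction choice reduces to (illustrative only):

#include <stdint.h>

/* Per-direction prediction as selected by the offset tests above;
 * w0 + w1 == 16 is the bilinear pair chosen from the filter table. */
static uint8_t predict_1d(uint8_t p0, uint8_t p1, int offset, int w0, int w1) {
  if (offset == 0) return p0;                             /* no interpolation */
  if (offset == 8) return (uint8_t)((p0 + p1 + 1) >> 1);  /* half-pel: rounded average */
  return (uint8_t)((w0 * p0 + w1 * p1 + 8) >> 4);         /* general bilinear tap */
}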
unsigned int vpx_sub_pixel_avg_variance32xh_avx2(const uint8_t *src,
int src_stride,
int x_offset,
int y_offset,
const uint8_t *dst,
int dst_stride,
const uint8_t *sec,
int sec_stride,
int height,
unsigned int *sse) {
__m256i sec_reg;
__m256i src_reg, dst_reg, exp_src_lo, exp_src_hi, exp_dst_lo, exp_dst_hi;
__m256i sse_reg, sum_reg, sse_reg_hi, res_cmp, sum_reg_lo, sum_reg_hi;
__m256i zero_reg;
int i, sum;
sum_reg = _mm256_set1_epi16(0);
sse_reg = _mm256_set1_epi16(0);
zero_reg = _mm256_set1_epi16(0);
// x_offset = 0 and y_offset = 0
if (x_offset == 0) {
if (y_offset == 0) {
for (i = 0; i < height ; i++) {
LOAD_SRC_DST
sec_reg = _mm256_loadu_si256((__m256i const *) (sec));
src_reg = _mm256_avg_epu8(src_reg, sec_reg);
sec+= sec_stride;
// expand each byte to 2 bytes
MERGE_WITH_SRC(src_reg, zero_reg)
CALC_SUM_SSE_INSIDE_LOOP
src+= src_stride;
dst+= dst_stride;
}
} else if (y_offset == 8) {
__m256i src_next_reg;
for (i = 0; i < height ; i++) {
LOAD_SRC_DST
AVG_NEXT_SRC(src_reg, src_stride)
sec_reg = _mm256_loadu_si256((__m256i const *) (sec));
src_reg = _mm256_avg_epu8(src_reg, sec_reg);
sec+= sec_stride;
// expand each byte to 2 bytes
MERGE_WITH_SRC(src_reg, zero_reg)
CALC_SUM_SSE_INSIDE_LOOP
src+= src_stride;
dst+= dst_stride;
}
// x_offset = 0 and y_offset = bilin interpolation
} else {
__m256i filter, pw8, src_next_reg;
y_offset <<= 5;
filter = _mm256_load_si256((__m256i const *)
(bilinear_filters_avx2 + y_offset));
pw8 = _mm256_set1_epi16(8);
for (i = 0; i < height ; i++) {
LOAD_SRC_DST
MERGE_NEXT_SRC(src_reg, src_stride)
FILTER_SRC(filter)
src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
sec_reg = _mm256_loadu_si256((__m256i const *) (sec));
src_reg = _mm256_avg_epu8(src_reg, sec_reg);
sec+= sec_stride;
MERGE_WITH_SRC(src_reg, zero_reg)
CALC_SUM_SSE_INSIDE_LOOP
src+= src_stride;
dst+= dst_stride;
}
}
// x_offset = 8 and y_offset = 0
} else if (x_offset == 8) {
if (y_offset == 0) {
__m256i src_next_reg;
for (i = 0; i < height ; i++) {
LOAD_SRC_DST
AVG_NEXT_SRC(src_reg, 1)
sec_reg = _mm256_loadu_si256((__m256i const *) (sec));
src_reg = _mm256_avg_epu8(src_reg, sec_reg);
sec+= sec_stride;
// expand each byte to 2 bytes
MERGE_WITH_SRC(src_reg, zero_reg)
CALC_SUM_SSE_INSIDE_LOOP
src+= src_stride;
dst+= dst_stride;
}
// x_offset = 8 and y_offset = 8
} else if (y_offset == 8) {
__m256i src_next_reg, src_avg;
// load the source row and the same row offset by one byte
src_reg = _mm256_loadu_si256((__m256i const *) (src));
AVG_NEXT_SRC(src_reg, 1)
for (i = 0; i < height ; i++) {
// save current source average
src_avg = src_reg;
src+= src_stride;
LOAD_SRC_DST
AVG_NEXT_SRC(src_reg, 1)
// average the previous row's half-pel result with the current one
src_avg = _mm256_avg_epu8(src_avg, src_reg);
sec_reg = _mm256_loadu_si256((__m256i const *) (sec));
src_avg = _mm256_avg_epu8(src_avg, sec_reg);
sec+= sec_stride;
// expand each byte to 2 bytes
MERGE_WITH_SRC(src_avg, zero_reg)
CALC_SUM_SSE_INSIDE_LOOP
dst+= dst_stride;
}
// x_offset = 8 and y_offset = bilin interpolation
} else {
__m256i filter, pw8, src_next_reg, src_avg;
y_offset <<= 5;
filter = _mm256_load_si256((__m256i const *)
(bilinear_filters_avx2 + y_offset));
pw8 = _mm256_set1_epi16(8);
// load the source row and the same row offset by one byte
src_reg = _mm256_loadu_si256((__m256i const *) (src));
AVG_NEXT_SRC(src_reg, 1)
for (i = 0; i < height ; i++) {
// save current source average
src_avg = src_reg;
src+= src_stride;
LOAD_SRC_DST
AVG_NEXT_SRC(src_reg, 1)
MERGE_WITH_SRC(src_avg, src_reg)
FILTER_SRC(filter)
src_avg = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
sec_reg = _mm256_loadu_si256((__m256i const *) (sec));
src_avg = _mm256_avg_epu8(src_avg, sec_reg);
// expand each byte to 2 bytes
MERGE_WITH_SRC(src_avg, zero_reg)
sec+= sec_stride;
CALC_SUM_SSE_INSIDE_LOOP
dst+= dst_stride;
}
}
// x_offset = bilin interpolation and y_offset = 0
} else {
if (y_offset == 0) {
__m256i filter, pw8, src_next_reg;
x_offset <<= 5;
filter = _mm256_load_si256((__m256i const *)
(bilinear_filters_avx2 + x_offset));
pw8 = _mm256_set1_epi16(8);
for (i = 0; i < height ; i++) {
LOAD_SRC_DST
MERGE_NEXT_SRC(src_reg, 1)
FILTER_SRC(filter)
src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
sec_reg = _mm256_loadu_si256((__m256i const *) (sec));
src_reg = _mm256_avg_epu8(src_reg, sec_reg);
MERGE_WITH_SRC(src_reg, zero_reg)
sec+= sec_stride;
CALC_SUM_SSE_INSIDE_LOOP
src+= src_stride;
dst+= dst_stride;
}
// x_offset = bilin interpolation and y_offset = 8
} else if (y_offset == 8) {
__m256i filter, pw8, src_next_reg, src_pack;
x_offset <<= 5;
filter = _mm256_load_si256((__m256i const *)
(bilinear_filters_avx2 + x_offset));
pw8 = _mm256_set1_epi16(8);
src_reg = _mm256_loadu_si256((__m256i const *) (src));
MERGE_NEXT_SRC(src_reg, 1)
FILTER_SRC(filter)
// pack the 16-bit results back to 8 bits within each 128-bit lane
src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
for (i = 0; i < height ; i++) {
src+= src_stride;
LOAD_SRC_DST
MERGE_NEXT_SRC(src_reg, 1)
FILTER_SRC(filter)
src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
// average the previous packed row with the current one
src_pack = _mm256_avg_epu8(src_pack, src_reg);
sec_reg = _mm256_loadu_si256((__m256i const *) (sec));
src_pack = _mm256_avg_epu8(src_pack, sec_reg);
sec+= sec_stride;
MERGE_WITH_SRC(src_pack, zero_reg)
src_pack = src_reg;
CALC_SUM_SSE_INSIDE_LOOP
dst+= dst_stride;
}
// x_offset = bilin interpolation and y_offset = bilin interpolation
} else {
__m256i xfilter, yfilter, pw8, src_next_reg, src_pack;
x_offset <<= 5;
xfilter = _mm256_load_si256((__m256i const *)
(bilinear_filters_avx2 + x_offset));
y_offset <<= 5;
yfilter = _mm256_load_si256((__m256i const *)
(bilinear_filters_avx2 + y_offset));
pw8 = _mm256_set1_epi16(8);
// load the source row and the same row offset by one byte
src_reg = _mm256_loadu_si256((__m256i const *) (src));
MERGE_NEXT_SRC(src_reg, 1)
FILTER_SRC(xfilter)
// pack the 16-bit results back to 8 bits within each 128-bit lane
src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
for (i = 0; i < height ; i++) {
src+= src_stride;
LOAD_SRC_DST
MERGE_NEXT_SRC(src_reg, 1)
FILTER_SRC(xfilter)
src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
// interleave the previous packed row with the current one
MERGE_WITH_SRC(src_pack, src_reg)
// filter the source
FILTER_SRC(yfilter)
src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
sec_reg = _mm256_loadu_si256((__m256i const *) (sec));
src_pack = _mm256_avg_epu8(src_pack, sec_reg);
MERGE_WITH_SRC(src_pack, zero_reg)
src_pack = src_reg;
sec+= sec_stride;
CALC_SUM_SSE_INSIDE_LOOP
dst+= dst_stride;
}
}
}
CALC_SUM_AND_SSE
return sum;
}
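The _avg_ variant differs from the plain function only in one extra step: after the sub-pixel prediction is formed, each 32-byte row is averaged with the matching row of the second predictor sec before the difference against dst is accumulated. Per byte, _mm256_avg_epu8 computes a rounding average, i.e. (illustrative only):

#include <stdint.h>

/* The compound-prediction step added by the _avg_ variant. */
static uint8_t compound_pred(uint8_t subpel_pred, uint8_t second_pred) {
  return (uint8_t)((subpel_pred + second_pred + 1) >> 1);  /* rounding average */
}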

View file

@ -11,6 +11,8 @@
%include "vpx_ports/x86_abi_support.asm"
%define mmx_filter_shift 7
;unsigned int vpx_get_mb_ss_mmx( short *src_ptr )
global sym(vpx_get_mb_ss_mmx) PRIVATE
sym(vpx_get_mb_ss_mmx):
@ -52,7 +54,6 @@ sym(vpx_get_mb_ss_mmx):
movsxd rcx, dword ptr [rsp+4]
add rax, rcx
; begin epilog
add rsp, 8
pop rdi
@ -62,7 +63,6 @@ sym(vpx_get_mb_ss_mmx):
pop rbp
ret
;void vpx_get8x8var_mmx
;(
; unsigned char *src_ptr,
@ -83,7 +83,6 @@ sym(vpx_get8x8var_mmx):
sub rsp, 16
; end prolog
pxor mm5, mm5 ; Blank mm5
pxor mm6, mm6 ; Blank mm6
pxor mm7, mm7 ; Blank mm7
@ -117,7 +116,6 @@ sym(vpx_get8x8var_mmx):
paddd mm7, mm0 ; accumulate in mm7
paddd mm7, mm2 ; accumulate in mm7
; Row 2
movq mm0, [rax] ; Copy eight bytes to mm0
movq mm2, mm0 ; Take copies
@ -298,7 +296,6 @@ sym(vpx_get8x8var_mmx):
mov dword ptr [rdi], edx
xor rax, rax ; return 0
; begin epilog
add rsp, 16
pop rbx
@ -308,8 +305,6 @@ sym(vpx_get8x8var_mmx):
pop rbp
ret
;void
;vpx_get4x4var_mmx
;(
@ -331,7 +326,6 @@ sym(vpx_get4x4var_mmx):
sub rsp, 16
; end prolog
pxor mm5, mm5 ; Blank mm5
pxor mm6, mm6 ; Blank mm6
pxor mm7, mm7 ; Blank mm7
@ -354,7 +348,6 @@ sym(vpx_get4x4var_mmx):
movd mm1, [rbx] ; Copy four bytes to mm1
paddd mm7, mm0 ; accumulate in mm7
; Row 2
movd mm0, [rax] ; Copy four bytes to mm0
punpcklbw mm0, mm6 ; unpack to higher precision
@ -393,7 +386,6 @@ sym(vpx_get4x4var_mmx):
pmaddwd mm0, mm0 ; square and accumulate
paddd mm7, mm0 ; accumulate in mm7
; Now accumulate the final results.
movq QWORD PTR [rsp+8], mm5 ; copy back accumulated results into normal memory
movq QWORD PTR [rsp], mm7 ; copy back accumulated results into normal memory
@ -413,7 +405,6 @@ sym(vpx_get4x4var_mmx):
mov dword ptr [rdi], edx
xor rax, rax ; return 0
; begin epilog
add rsp, 16
pop rbx
@ -422,3 +413,332 @@ sym(vpx_get4x4var_mmx):
UNSHADOW_ARGS
pop rbp
ret
;void vpx_filter_block2d_bil4x4_var_mmx
;(
; unsigned char *ref_ptr,
; int ref_pixels_per_line,
; unsigned char *src_ptr,
; int src_pixels_per_line,
; unsigned short *HFilter,
; unsigned short *VFilter,
; int *sum,
; unsigned int *sumsquared
;)
global sym(vpx_filter_block2d_bil4x4_var_mmx) PRIVATE
sym(vpx_filter_block2d_bil4x4_var_mmx):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 8
GET_GOT rbx
push rsi
push rdi
sub rsp, 16
; end prolog
pxor mm6, mm6 ;
pxor mm7, mm7 ;
mov rax, arg(4) ;HFilter ;
mov rdx, arg(5) ;VFilter ;
mov rsi, arg(0) ;ref_ptr ;
mov rdi, arg(2) ;src_ptr ;
mov rcx, 4 ;
pxor mm0, mm0 ;
movd mm1, [rsi] ;
movd mm3, [rsi+1] ;
punpcklbw mm1, mm0 ;
pmullw mm1, [rax] ;
punpcklbw mm3, mm0 ;
pmullw mm3, [rax+8] ;
paddw mm1, mm3 ;
paddw mm1, [GLOBAL(mmx_bi_rd)] ;
psraw mm1, mmx_filter_shift ;
movq mm5, mm1
%if ABI_IS_32BIT
add rsi, dword ptr arg(1) ;ref_pixels_per_line ;
%else
movsxd r8, dword ptr arg(1) ;ref_pixels_per_line ;
add rsi, r8
%endif
.filter_block2d_bil4x4_var_mmx_loop:
movd mm1, [rsi] ;
movd mm3, [rsi+1] ;
punpcklbw mm1, mm0 ;
pmullw mm1, [rax] ;
punpcklbw mm3, mm0 ;
pmullw mm3, [rax+8] ;
paddw mm1, mm3 ;
paddw mm1, [GLOBAL(mmx_bi_rd)] ;
psraw mm1, mmx_filter_shift ;
movq mm3, mm5 ;
movq mm5, mm1 ;
pmullw mm3, [rdx] ;
pmullw mm1, [rdx+8] ;
paddw mm1, mm3 ;
paddw mm1, [GLOBAL(mmx_bi_rd)] ;
psraw mm1, mmx_filter_shift ;
movd mm3, [rdi] ;
punpcklbw mm3, mm0 ;
psubw mm1, mm3 ;
paddw mm6, mm1 ;
pmaddwd mm1, mm1 ;
paddd mm7, mm1 ;
%if ABI_IS_32BIT
add rsi, dword ptr arg(1) ;ref_pixels_per_line ;
add rdi, dword ptr arg(3) ;src_pixels_per_line ;
%else
movsxd r8, dword ptr arg(1) ;ref_pixels_per_line
movsxd r9, dword ptr arg(3) ;src_pixels_per_line
add rsi, r8
add rdi, r9
%endif
sub rcx, 1 ;
jnz .filter_block2d_bil4x4_var_mmx_loop ;
pxor mm3, mm3 ;
pxor mm2, mm2 ;
punpcklwd mm2, mm6 ;
punpckhwd mm3, mm6 ;
paddd mm2, mm3 ;
movq mm6, mm2 ;
psrlq mm6, 32 ;
paddd mm2, mm6 ;
psrad mm2, 16 ;
movq mm4, mm7 ;
psrlq mm4, 32 ;
paddd mm4, mm7 ;
mov rdi, arg(6) ;sum
mov rsi, arg(7) ;sumsquared
movd dword ptr [rdi], mm2 ;
movd dword ptr [rsi], mm4 ;
; begin epilog
add rsp, 16
pop rdi
pop rsi
RESTORE_GOT
UNSHADOW_ARGS
pop rbp
ret
;void vpx_filter_block2d_bil_var_mmx
;(
; unsigned char *ref_ptr,
; int ref_pixels_per_line,
; unsigned char *src_ptr,
; int src_pixels_per_line,
; unsigned int Height,
; unsigned short *HFilter,
; unsigned short *VFilter,
; int *sum,
; unsigned int *sumsquared
;)
global sym(vpx_filter_block2d_bil_var_mmx) PRIVATE
sym(vpx_filter_block2d_bil_var_mmx):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 9
GET_GOT rbx
push rsi
push rdi
sub rsp, 16
; end prolog
pxor mm6, mm6 ;
pxor mm7, mm7 ;
mov rax, arg(5) ;HFilter ;
mov rdx, arg(6) ;VFilter ;
mov rsi, arg(0) ;ref_ptr ;
mov rdi, arg(2) ;src_ptr ;
movsxd rcx, dword ptr arg(4) ;Height ;
pxor mm0, mm0 ;
movq mm1, [rsi] ;
movq mm3, [rsi+1] ;
movq mm2, mm1 ;
movq mm4, mm3 ;
punpcklbw mm1, mm0 ;
punpckhbw mm2, mm0 ;
pmullw mm1, [rax] ;
pmullw mm2, [rax] ;
punpcklbw mm3, mm0 ;
punpckhbw mm4, mm0 ;
pmullw mm3, [rax+8] ;
pmullw mm4, [rax+8] ;
paddw mm1, mm3 ;
paddw mm2, mm4 ;
paddw mm1, [GLOBAL(mmx_bi_rd)] ;
psraw mm1, mmx_filter_shift ;
paddw mm2, [GLOBAL(mmx_bi_rd)] ;
psraw mm2, mmx_filter_shift ;
movq mm5, mm1
packuswb mm5, mm2 ;
%if ABI_IS_32BIT
add rsi, dword ptr arg(1) ;ref_pixels_per_line
%else
movsxd r8, dword ptr arg(1) ;ref_pixels_per_line
add rsi, r8
%endif
.filter_block2d_bil_var_mmx_loop:
movq mm1, [rsi] ;
movq mm3, [rsi+1] ;
movq mm2, mm1 ;
movq mm4, mm3 ;
punpcklbw mm1, mm0 ;
punpckhbw mm2, mm0 ;
pmullw mm1, [rax] ;
pmullw mm2, [rax] ;
punpcklbw mm3, mm0 ;
punpckhbw mm4, mm0 ;
pmullw mm3, [rax+8] ;
pmullw mm4, [rax+8] ;
paddw mm1, mm3 ;
paddw mm2, mm4 ;
paddw mm1, [GLOBAL(mmx_bi_rd)] ;
psraw mm1, mmx_filter_shift ;
paddw mm2, [GLOBAL(mmx_bi_rd)] ;
psraw mm2, mmx_filter_shift ;
movq mm3, mm5 ;
movq mm4, mm5 ;
punpcklbw mm3, mm0 ;
punpckhbw mm4, mm0 ;
movq mm5, mm1 ;
packuswb mm5, mm2 ;
pmullw mm3, [rdx] ;
pmullw mm4, [rdx] ;
pmullw mm1, [rdx+8] ;
pmullw mm2, [rdx+8] ;
paddw mm1, mm3 ;
paddw mm2, mm4 ;
paddw mm1, [GLOBAL(mmx_bi_rd)] ;
paddw mm2, [GLOBAL(mmx_bi_rd)] ;
psraw mm1, mmx_filter_shift ;
psraw mm2, mmx_filter_shift ;
movq mm3, [rdi] ;
movq mm4, mm3 ;
punpcklbw mm3, mm0 ;
punpckhbw mm4, mm0 ;
psubw mm1, mm3 ;
psubw mm2, mm4 ;
paddw mm6, mm1 ;
pmaddwd mm1, mm1 ;
paddw mm6, mm2 ;
pmaddwd mm2, mm2 ;
paddd mm7, mm1 ;
paddd mm7, mm2 ;
%if ABI_IS_32BIT
add rsi, dword ptr arg(1) ;ref_pixels_per_line ;
add rdi, dword ptr arg(3) ;src_pixels_per_line ;
%else
movsxd r8, dword ptr arg(1) ;ref_pixels_per_line ;
movsxd r9, dword ptr arg(3) ;src_pixels_per_line ;
add rsi, r8
add rdi, r9
%endif
sub rcx, 1 ;
jnz .filter_block2d_bil_var_mmx_loop ;
pxor mm3, mm3 ;
pxor mm2, mm2 ;
punpcklwd mm2, mm6 ;
punpckhwd mm3, mm6 ;
paddd mm2, mm3 ;
movq mm6, mm2 ;
psrlq mm6, 32 ;
paddd mm2, mm6 ;
psrad mm2, 16 ;
movq mm4, mm7 ;
psrlq mm4, 32 ;
paddd mm4, mm7 ;
mov rdi, arg(7) ;sum
mov rsi, arg(8) ;sumsquared
movd dword ptr [rdi], mm2 ;
movd dword ptr [rsi], mm4 ;
; begin epilog
add rsp, 16
pop rdi
pop rsi
RESTORE_GOT
UNSHADOW_ARGS
pop rbp
ret
SECTION_RODATA
;short mmx_bi_rd[4] = { 64, 64, 64, 64};
align 16
mmx_bi_rd:
times 4 dw 64

View file

@ -10,12 +10,45 @@
#include "./vpx_dsp_rtcd.h"
#include "vpx_ports/mem.h"
DECLARE_ALIGNED(16, static const int16_t, bilinear_filters_mmx[8][8]) = {
{ 128, 128, 128, 128, 0, 0, 0, 0 },
{ 112, 112, 112, 112, 16, 16, 16, 16 },
{ 96, 96, 96, 96, 32, 32, 32, 32 },
{ 80, 80, 80, 80, 48, 48, 48, 48 },
{ 64, 64, 64, 64, 64, 64, 64, 64 },
{ 48, 48, 48, 48, 80, 80, 80, 80 },
{ 32, 32, 32, 32, 96, 96, 96, 96 },
{ 16, 16, 16, 16, 112, 112, 112, 112 }
};
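Unlike the 16-weight AVX2 table, these MMX filter pairs sum to 128: the assembly multiplies with pmullw, adds the rounding constant mmx_bi_rd (64) and shifts right by mmx_filter_shift (7). Each row stores the first weight four times followed by the second weight four times, matching the [rax] / [rax+8] loads in the assembly. One tap therefore works out to (illustrative only):

#include <stdint.h>

/* One bilinear tap as computed by the MMX passes: filter[0..3] is the first
 * weight, filter[4..7] the second, and the weights sum to 128. */
static int16_t mmx_bilinear_tap(uint8_t p0, uint8_t p1, const int16_t *filter) {
  return (int16_t)((filter[0] * p0 + filter[4] * p1 + 64) >> 7);
}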
extern void vpx_get4x4var_mmx(const uint8_t *a, int a_stride,
const uint8_t *b, int b_stride,
unsigned int *sse, int *sum);
unsigned int vpx_variance4x4_mmx(const unsigned char *a, int a_stride,
const unsigned char *b, int b_stride,
extern void vpx_filter_block2d_bil4x4_var_mmx(const unsigned char *ref_ptr,
int ref_pixels_per_line,
const unsigned char *src_ptr,
int src_pixels_per_line,
const int16_t *HFilter,
const int16_t *VFilter,
int *sum,
unsigned int *sumsquared);
extern void vpx_filter_block2d_bil_var_mmx(const unsigned char *ref_ptr,
int ref_pixels_per_line,
const unsigned char *src_ptr,
int src_pixels_per_line,
unsigned int Height,
const int16_t *HFilter,
const int16_t *VFilter,
int *sum,
unsigned int *sumsquared);
unsigned int vpx_variance4x4_mmx(const unsigned char *a, int a_stride,
const unsigned char *b, int b_stride,
unsigned int *sse) {
unsigned int var;
int avg;
@@ -25,8 +58,8 @@ unsigned int vpx_variance4x4_mmx(const unsigned char *a, int a_stride,
return (var - (((unsigned int)avg * avg) >> 4));
}
unsigned int vpx_variance8x8_mmx(const unsigned char *a, int a_stride,
const unsigned char *b, int b_stride,
unsigned int *sse) {
unsigned int var;
int avg;
@@ -37,8 +70,8 @@ unsigned int vpx_variance8x8_mmx(const unsigned char *a, int a_stride,
return (var - (((unsigned int)avg * avg) >> 6));
}
unsigned int vpx_mse16x16_mmx(const unsigned char *a, int a_stride,
const unsigned char *b, int b_stride,
unsigned int *sse) {
unsigned int sse0, sse1, sse2, sse3, var;
int sum0, sum1, sum2, sum3;
@@ -55,8 +88,8 @@ unsigned int vpx_mse16x16_mmx(const unsigned char *a, int a_stride,
return var;
}
unsigned int vpx_variance16x16_mmx(const unsigned char *a, int a_stride,
const unsigned char *b, int b_stride,
unsigned int *sse) {
unsigned int sse0, sse1, sse2, sse3, var;
int sum0, sum1, sum2, sum3, avg;
@@ -74,8 +107,8 @@ unsigned int vpx_variance16x16_mmx(const unsigned char *a, int a_stride,
return (var - (((unsigned int)avg * avg) >> 8));
}
unsigned int vpx_variance16x8_mmx(const unsigned char *a, int a_stride,
const unsigned char *b, int b_stride,
unsigned int *sse) {
unsigned int sse0, sse1, var;
int sum0, sum1, avg;
@@ -89,8 +122,8 @@ unsigned int vpx_variance16x8_mmx(const unsigned char *a, int a_stride,
return (var - (((unsigned int)avg * avg) >> 7));
}
unsigned int vpx_variance8x16_mmx(const unsigned char *a, int a_stride,
const unsigned char *b, int b_stride,
unsigned int *sse) {
unsigned int sse0, sse1, var;
int sum0, sum1, avg;
@@ -105,3 +138,112 @@ unsigned int vpx_variance8x16_mmx(const unsigned char *a, int a_stride,
return (var - (((unsigned int)avg * avg) >> 7));
}
uint32_t vpx_sub_pixel_variance4x4_mmx(const uint8_t *a, int a_stride,
int xoffset, int yoffset,
const uint8_t *b, int b_stride,
uint32_t *sse) {
int xsum;
unsigned int xxsum;
vpx_filter_block2d_bil4x4_var_mmx(a, a_stride, b, b_stride,
bilinear_filters_mmx[xoffset],
bilinear_filters_mmx[yoffset],
&xsum, &xxsum);
*sse = xxsum;
return (xxsum - (((unsigned int)xsum * xsum) >> 4));
}
uint32_t vpx_sub_pixel_variance8x8_mmx(const uint8_t *a, int a_stride,
int xoffset, int yoffset,
const uint8_t *b, int b_stride,
uint32_t *sse) {
int xsum;
uint32_t xxsum;
vpx_filter_block2d_bil_var_mmx(a, a_stride, b, b_stride, 8,
bilinear_filters_mmx[xoffset],
bilinear_filters_mmx[yoffset],
&xsum, &xxsum);
*sse = xxsum;
return (xxsum - (((uint32_t)xsum * xsum) >> 6));
}
uint32_t vpx_sub_pixel_variance16x16_mmx(const uint8_t *a, int a_stride,
int xoffset, int yoffset,
const uint8_t *b, int b_stride,
uint32_t *sse) {
int xsum0, xsum1;
unsigned int xxsum0, xxsum1;
vpx_filter_block2d_bil_var_mmx(a, a_stride, b, b_stride, 16,
bilinear_filters_mmx[xoffset],
bilinear_filters_mmx[yoffset],
&xsum0, &xxsum0);
vpx_filter_block2d_bil_var_mmx(a + 8, a_stride, b + 8, b_stride, 16,
bilinear_filters_mmx[xoffset],
bilinear_filters_mmx[yoffset],
&xsum1, &xxsum1);
xsum0 += xsum1;
xxsum0 += xxsum1;
*sse = xxsum0;
return (xxsum0 - (((uint32_t)xsum0 * xsum0) >> 8));
}
uint32_t vpx_sub_pixel_variance16x8_mmx(const uint8_t *a, int a_stride,
int xoffset, int yoffset,
const uint8_t *b, int b_stride,
uint32_t *sse) {
int xsum0, xsum1;
unsigned int xxsum0, xxsum1;
vpx_filter_block2d_bil_var_mmx(a, a_stride, b, b_stride, 8,
bilinear_filters_mmx[xoffset],
bilinear_filters_mmx[yoffset],
&xsum0, &xxsum0);
vpx_filter_block2d_bil_var_mmx(a + 8, a_stride, b + 8, b_stride, 8,
bilinear_filters_mmx[xoffset],
bilinear_filters_mmx[yoffset],
&xsum1, &xxsum1);
xsum0 += xsum1;
xxsum0 += xxsum1;
*sse = xxsum0;
return (xxsum0 - (((uint32_t)xsum0 * xsum0) >> 7));
}
uint32_t vpx_sub_pixel_variance8x16_mmx(const uint8_t *a, int a_stride,
int xoffset, int yoffset,
const uint8_t *b, int b_stride,
uint32_t *sse) {
int xsum;
unsigned int xxsum;
vpx_filter_block2d_bil_var_mmx(a, a_stride, b, b_stride, 16,
bilinear_filters_mmx[xoffset],
bilinear_filters_mmx[yoffset],
&xsum, &xxsum);
*sse = xxsum;
return (xxsum - (((uint32_t)xsum * xsum) >> 7));
}
uint32_t vpx_variance_halfpixvar16x16_h_mmx(const uint8_t *a, int a_stride,
const uint8_t *b, int b_stride,
uint32_t *sse) {
return vpx_sub_pixel_variance16x16_mmx(a, a_stride, 4, 0, b, b_stride, sse);
}
uint32_t vpx_variance_halfpixvar16x16_v_mmx(const uint8_t *a, int a_stride,
const uint8_t *b, int b_stride,
uint32_t *sse) {
return vpx_sub_pixel_variance16x16_mmx(a, a_stride, 0, 4, b, b_stride, sse);
}
uint32_t vpx_variance_halfpixvar16x16_hv_mmx(const uint8_t *a, int a_stride,
const uint8_t *b, int b_stride,
uint32_t *sse) {
return vpx_sub_pixel_variance16x16_mmx(a, a_stride, 4, 4, b, b_stride, sse);
}
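A minimal, hypothetical usage sketch of the 16x16 MMX wrapper follows (buffer sizes and fill values are made up; the reference buffer is padded by one row and one column because the bilinear taps read one pixel beyond the block in each direction):

#include <stdint.h>
#include <string.h>

uint32_t vpx_sub_pixel_variance16x16_mmx(const uint8_t *a, int a_stride,
                                         int xoffset, int yoffset,
                                         const uint8_t *b, int b_stride,
                                         uint32_t *sse);

static uint32_t variance_example(void) {
  uint8_t ref[17 * 17];  /* reference, padded for the sub-pixel filter */
  uint8_t src[16 * 16];  /* source block */
  uint32_t sse;

  memset(ref, 128, sizeof(ref));
  memset(src, 130, sizeof(src));

  /* Half-pel in both directions (xoffset = yoffset = 4); the return value is
   * sse - (sum * sum >> 8) for the 256-pixel block. */
  return vpx_sub_pixel_variance16x16_mmx(ref, 17, 4, 4, src, 16, &sse);
}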

View file

@@ -307,3 +307,171 @@ unsigned int vpx_mse16x16_sse2(const uint8_t *src, int src_stride,
vpx_variance16x16_sse2(src, src_stride, ref, ref_stride, sse);
return *sse;
}
#if CONFIG_USE_X86INC
// The 2 unused parameters are placeholders for the PIC-enabled build.
// These definitions are for functions defined in subpel_variance.asm
#define DECL(w, opt) \
int vpx_sub_pixel_variance##w##xh_##opt(const uint8_t *src, \
ptrdiff_t src_stride, \
int x_offset, int y_offset, \
const uint8_t *dst, \
ptrdiff_t dst_stride, \
int height, unsigned int *sse, \
void *unused0, void *unused)
#define DECLS(opt1, opt2) \
DECL(4, opt2); \
DECL(8, opt1); \
DECL(16, opt1)
DECLS(sse2, sse);
DECLS(ssse3, ssse3);
#undef DECLS
#undef DECL
#define FN(w, h, wf, wlog2, hlog2, opt, cast) \
unsigned int vpx_sub_pixel_variance##w##x##h##_##opt(const uint8_t *src, \
int src_stride, \
int x_offset, \
int y_offset, \
const uint8_t *dst, \
int dst_stride, \
unsigned int *sse_ptr) { \
unsigned int sse; \
int se = vpx_sub_pixel_variance##wf##xh_##opt(src, src_stride, x_offset, \
y_offset, dst, dst_stride, \
h, &sse, NULL, NULL); \
if (w > wf) { \
unsigned int sse2; \
int se2 = vpx_sub_pixel_variance##wf##xh_##opt(src + 16, src_stride, \
x_offset, y_offset, \
dst + 16, dst_stride, \
h, &sse2, NULL, NULL); \
se += se2; \
sse += sse2; \
if (w > wf * 2) { \
se2 = vpx_sub_pixel_variance##wf##xh_##opt(src + 32, src_stride, \
x_offset, y_offset, \
dst + 32, dst_stride, \
h, &sse2, NULL, NULL); \
se += se2; \
sse += sse2; \
se2 = vpx_sub_pixel_variance##wf##xh_##opt(src + 48, src_stride, \
x_offset, y_offset, \
dst + 48, dst_stride, \
h, &sse2, NULL, NULL); \
se += se2; \
sse += sse2; \
} \
} \
*sse_ptr = sse; \
return sse - ((cast se * se) >> (wlog2 + hlog2)); \
}
#define FNS(opt1, opt2) \
FN(64, 64, 16, 6, 6, opt1, (int64_t)); \
FN(64, 32, 16, 6, 5, opt1, (int64_t)); \
FN(32, 64, 16, 5, 6, opt1, (int64_t)); \
FN(32, 32, 16, 5, 5, opt1, (int64_t)); \
FN(32, 16, 16, 5, 4, opt1, (int64_t)); \
FN(16, 32, 16, 4, 5, opt1, (int64_t)); \
FN(16, 16, 16, 4, 4, opt1, (uint32_t)); \
FN(16, 8, 16, 4, 3, opt1, (uint32_t)); \
FN(8, 16, 8, 3, 4, opt1, (uint32_t)); \
FN(8, 8, 8, 3, 3, opt1, (uint32_t)); \
FN(8, 4, 8, 3, 2, opt1, (uint32_t)); \
FN(4, 8, 4, 2, 3, opt2, (uint32_t)); \
FN(4, 4, 4, 2, 2, opt2, (uint32_t))
FNS(sse2, sse);
FNS(ssse3, ssse3);
#undef FNS
#undef FN
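As an illustration of the macro machinery, this is roughly what FN(32, 32, 16, 5, 5, sse2, (int64_t)) produces (a hand-written sketch, not a literal preprocessor dump): blocks wider than the 16-pixel asm helper are split into 16-wide columns, and the se * se correction term is computed in 64 bits because it can exceed 32 bits for blocks of 32x32 and larger.

#include <stddef.h>
#include <stdint.h>

int vpx_sub_pixel_variance16xh_sse2(const uint8_t *src, ptrdiff_t src_stride,
                                    int x_offset, int y_offset,
                                    const uint8_t *dst, ptrdiff_t dst_stride,
                                    int height, unsigned int *sse,
                                    void *unused0, void *unused);

unsigned int vpx_sub_pixel_variance32x32_sse2(const uint8_t *src, int src_stride,
                                              int x_offset, int y_offset,
                                              const uint8_t *dst, int dst_stride,
                                              unsigned int *sse_ptr) {
  unsigned int sse, sse2;
  int se = vpx_sub_pixel_variance16xh_sse2(src, src_stride, x_offset, y_offset,
                                           dst, dst_stride, 32, &sse, NULL, NULL);
  int se2 = vpx_sub_pixel_variance16xh_sse2(src + 16, src_stride, x_offset,
                                            y_offset, dst + 16, dst_stride, 32,
                                            &sse2, NULL, NULL);
  se += se2;
  sse += sse2;
  *sse_ptr = sse;
  return sse - (unsigned int)(((int64_t)se * se) >> (5 + 5));
}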
// The 2 unused parameters are placeholders for the PIC-enabled build.
#define DECL(w, opt) \
int vpx_sub_pixel_avg_variance##w##xh_##opt(const uint8_t *src, \
ptrdiff_t src_stride, \
int x_offset, int y_offset, \
const uint8_t *dst, \
ptrdiff_t dst_stride, \
const uint8_t *sec, \
ptrdiff_t sec_stride, \
int height, unsigned int *sse, \
void *unused0, void *unused)
#define DECLS(opt1, opt2) \
DECL(4, opt2); \
DECL(8, opt1); \
DECL(16, opt1)
DECLS(sse2, sse);
DECLS(ssse3, ssse3);
#undef DECL
#undef DECLS
#define FN(w, h, wf, wlog2, hlog2, opt, cast) \
unsigned int vpx_sub_pixel_avg_variance##w##x##h##_##opt(const uint8_t *src, \
int src_stride, \
int x_offset, \
int y_offset, \
const uint8_t *dst, \
int dst_stride, \
unsigned int *sseptr, \
const uint8_t *sec) { \
unsigned int sse; \
int se = vpx_sub_pixel_avg_variance##wf##xh_##opt(src, src_stride, x_offset, \
y_offset, dst, dst_stride, \
sec, w, h, &sse, NULL, \
NULL); \
if (w > wf) { \
unsigned int sse2; \
int se2 = vpx_sub_pixel_avg_variance##wf##xh_##opt(src + 16, src_stride, \
x_offset, y_offset, \
dst + 16, dst_stride, \
sec + 16, w, h, &sse2, \
NULL, NULL); \
se += se2; \
sse += sse2; \
if (w > wf * 2) { \
se2 = vpx_sub_pixel_avg_variance##wf##xh_##opt(src + 32, src_stride, \
x_offset, y_offset, \
dst + 32, dst_stride, \
sec + 32, w, h, &sse2, \
NULL, NULL); \
se += se2; \
sse += sse2; \
se2 = vpx_sub_pixel_avg_variance##wf##xh_##opt(src + 48, src_stride, \
x_offset, y_offset, \
dst + 48, dst_stride, \
sec + 48, w, h, &sse2, \
NULL, NULL); \
se += se2; \
sse += sse2; \
} \
} \
*sseptr = sse; \
return sse - ((cast se * se) >> (wlog2 + hlog2)); \
}
#define FNS(opt1, opt2) \
FN(64, 64, 16, 6, 6, opt1, (int64_t)); \
FN(64, 32, 16, 6, 5, opt1, (int64_t)); \
FN(32, 64, 16, 5, 6, opt1, (int64_t)); \
FN(32, 32, 16, 5, 5, opt1, (int64_t)); \
FN(32, 16, 16, 5, 4, opt1, (int64_t)); \
FN(16, 32, 16, 4, 5, opt1, (int64_t)); \
FN(16, 16, 16, 4, 4, opt1, (uint32_t)); \
FN(16, 8, 16, 4, 3, opt1, (uint32_t)); \
FN(8, 16, 8, 3, 4, opt1, (uint32_t)); \
FN(8, 8, 8, 3, 3, opt1, (uint32_t)); \
FN(8, 4, 8, 3, 2, opt1, (uint32_t)); \
FN(4, 8, 4, 2, 3, opt2, (uint32_t)); \
FN(4, 4, 4, 2, 2, opt2, (uint32_t))
FNS(sse2, sse);
FNS(ssse3, ssse3);
#undef FNS
#undef FN
#endif // CONFIG_USE_X86INC
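The avg variants take an extra second predictor (sec); conceptually the asm blends the sub-pixel prediction with it by a rounded average before the difference against dst is taken. A minimal sketch of that per-pixel step (avg_pred_pixel is a hypothetical helper, shown only to explain the sec argument):

#include <stdint.h>

static uint8_t avg_pred_pixel(uint8_t filtered_pred, uint8_t second_pred) {
  /* Rounded average of the bilinear prediction and the second predictor. */
  return (uint8_t)((filtered_pred + second_pred + 1) >> 1);
}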