Merge "SSE2/SSSE3 optimizations and unit test for sub_pixel_avg_variance()."

Ronald S. Bultje 2013-06-21 12:49:43 -07:00 committed by Gerrit Code Review
Parent 869d770610 1e6a32f1af
Commit 9a480482cb
4 changed files with 543 additions and 63 deletions

View file

@ -76,6 +76,34 @@ static unsigned int subpel_variance_ref(const uint8_t *ref, const uint8_t *src,
return sse - (((int64_t) se * se) >> (l2w + l2h));
}
static unsigned int subpel_avg_variance_ref(const uint8_t *ref,
const uint8_t *src,
const uint8_t *second_pred,
int l2w, int l2h,
int xoff, int yoff,
unsigned int *sse_ptr) {
int se = 0;
unsigned int sse = 0;
const int w = 1 << l2w, h = 1 << l2h;
for (int y = 0; y < h; y++) {
for (int x = 0; x < w; x++) {
// bilinear interpolation at a 16th pel step
const int a1 = ref[(w + 1) * (y + 0) + x + 0];
const int a2 = ref[(w + 1) * (y + 0) + x + 1];
const int b1 = ref[(w + 1) * (y + 1) + x + 0];
const int b2 = ref[(w + 1) * (y + 1) + x + 1];
const int a = a1 + (((a2 - a1) * xoff + 8) >> 4);
const int b = b1 + (((b2 - b1) * xoff + 8) >> 4);
const int r = a + (((b - a) * yoff + 8) >> 4);
int diff = ((r + second_pred[w * y + x] + 1) >> 1) - src[w * y + x];
se += diff;
sse += diff * diff;
}
}
*sse_ptr = sse;
return sse - (((int64_t) se * se) >> (l2w + l2h));
}
template<typename VarianceFunctionType>
class VarianceTest :
public ::testing::TestWithParam<tuple<int, int, VarianceFunctionType> > {
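Both reference implementations return the block variance scaled by the pixel count. With N = 2^(l2w+l2h) pixels and per-pixel differences d_i, so that se = Σ d_i and sse = Σ d_i^2, the returned value is, up to the rounding of the arithmetic shift,

$$\mathit{sse} - \frac{\mathit{se}^2}{N} \;=\; N\left(\frac{\mathit{sse}}{N} - \Big(\frac{\mathit{se}}{N}\Big)^{2}\right) \;=\; N \cdot \operatorname{Var}(d), \qquad N = 2^{\,l2w + l2h}.$$

The avg variant differs only in that each interpolated pixel r is first averaged with the second predictor, rounding up: (r + second_pred + 1) >> 1, which is exactly what the pavgb instruction used by the SIMD versions computes.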
@ -174,6 +202,7 @@ class SubpelVarianceTest :
rnd(ACMRandom::DeterministicSeed());
block_size_ = width_ * height_;
src_ = new uint8_t[block_size_];
sec_ = new uint8_t[block_size_];
ref_ = new uint8_t[block_size_ + width_ + height_ + 1];
ASSERT_TRUE(src_ != NULL);
ASSERT_TRUE(ref_ != NULL);
@ -182,6 +211,7 @@ class SubpelVarianceTest :
virtual void TearDown() {
delete[] src_;
delete[] ref_;
delete[] sec_;
}
protected:
@ -190,6 +220,7 @@ class SubpelVarianceTest :
ACMRandom rnd;
uint8_t *src_;
uint8_t *ref_;
uint8_t *sec_;
int width_, log2width_;
int height_, log2height_;
int block_size_;
@ -217,6 +248,29 @@ void SubpelVarianceTest<SubpelVarianceFunctionType>::RefTest() {
}
}
template<>
void SubpelVarianceTest<vp9_subp_avg_variance_fn_t>::RefTest() {
for (int x = 0; x < 16; ++x) {
for (int y = 0; y < 16; ++y) {
for (int j = 0; j < block_size_; j++) {
src_[j] = rnd.Rand8();
sec_[j] = rnd.Rand8();
}
for (int j = 0; j < block_size_ + width_ + height_ + 1; j++) {
ref_[j] = rnd.Rand8();
}
unsigned int sse1, sse2;
const unsigned int var1 = subpel_variance_(ref_, width_ + 1, x, y,
src_, width_, &sse1, sec_);
const unsigned int var2 = subpel_avg_variance_ref(ref_, src_, sec_,
log2width_, log2height_,
x, y, &sse2);
EXPECT_EQ(sse1, sse2) << "at position " << x << ", " << y;
EXPECT_EQ(var1, var2) << "at position " << x << ", " << y;
}
}
}
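The specialization above relies on the avg flavor taking the second predictor as a trailing argument. Inferred from the call site here and the rtcd prototypes later in this commit, vp9_subp_avg_variance_fn_t presumably has the following shape (a sketch, not copied from the real header; note the test passes its padded ref_ buffer as the block being filtered):

typedef unsigned int (*vp9_subp_avg_variance_fn_t)(
    const uint8_t *src_ptr, int source_stride,  // block filtered at (x, y)
    int xoffset, int yoffset,                   // 1/16th-pel offsets, 0..15
    const uint8_t *ref_ptr, int ref_stride,     // block compared against
    unsigned int *sse,                          // out: sum of squared error
    const uint8_t *second_pred);                // averaged into the prediction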
// -----------------------------------------------------------------------------
// VP8 test cases.
@ -283,10 +337,12 @@ namespace vp9 {
#if CONFIG_VP9_ENCODER
typedef VarianceTest<vp9_variance_fn_t> VP9VarianceTest;
typedef SubpelVarianceTest<vp9_subpixvariance_fn_t> VP9SubpelVarianceTest;
typedef SubpelVarianceTest<vp9_subp_avg_variance_fn_t> VP9SubpelAvgVarianceTest;
TEST_P(VP9VarianceTest, Zero) { ZeroTest(); }
TEST_P(VP9VarianceTest, Ref) { RefTest(); }
TEST_P(VP9SubpelVarianceTest, Ref) { RefTest(); }
TEST_P(VP9SubpelAvgVarianceTest, Ref) { RefTest(); }
TEST_P(VP9VarianceTest, OneQuarter) { OneQuarterTest(); }
const vp9_variance_fn_t variance4x4_c = vp9_variance4x4_c;
@ -360,6 +416,48 @@ INSTANTIATE_TEST_CASE_P(
make_tuple(6, 5, subpel_variance64x32_c),
make_tuple(6, 6, subpel_variance64x64_c)));
const vp9_subp_avg_variance_fn_t subpel_avg_variance4x4_c =
vp9_sub_pixel_avg_variance4x4_c;
const vp9_subp_avg_variance_fn_t subpel_avg_variance4x8_c =
vp9_sub_pixel_avg_variance4x8_c;
const vp9_subp_avg_variance_fn_t subpel_avg_variance8x4_c =
vp9_sub_pixel_avg_variance8x4_c;
const vp9_subp_avg_variance_fn_t subpel_avg_variance8x8_c =
vp9_sub_pixel_avg_variance8x8_c;
const vp9_subp_avg_variance_fn_t subpel_avg_variance8x16_c =
vp9_sub_pixel_avg_variance8x16_c;
const vp9_subp_avg_variance_fn_t subpel_avg_variance16x8_c =
vp9_sub_pixel_avg_variance16x8_c;
const vp9_subp_avg_variance_fn_t subpel_avg_variance16x16_c =
vp9_sub_pixel_avg_variance16x16_c;
const vp9_subp_avg_variance_fn_t subpel_avg_variance16x32_c =
vp9_sub_pixel_avg_variance16x32_c;
const vp9_subp_avg_variance_fn_t subpel_avg_variance32x16_c =
vp9_sub_pixel_avg_variance32x16_c;
const vp9_subp_avg_variance_fn_t subpel_avg_variance32x32_c =
vp9_sub_pixel_avg_variance32x32_c;
const vp9_subp_avg_variance_fn_t subpel_avg_variance32x64_c =
vp9_sub_pixel_avg_variance32x64_c;
const vp9_subp_avg_variance_fn_t subpel_avg_variance64x32_c =
vp9_sub_pixel_avg_variance64x32_c;
const vp9_subp_avg_variance_fn_t subpel_avg_variance64x64_c =
vp9_sub_pixel_avg_variance64x64_c;
INSTANTIATE_TEST_CASE_P(
C, VP9SubpelAvgVarianceTest,
::testing::Values(make_tuple(2, 2, subpel_avg_variance4x4_c),
make_tuple(2, 3, subpel_avg_variance4x8_c),
make_tuple(3, 2, subpel_avg_variance8x4_c),
make_tuple(3, 3, subpel_avg_variance8x8_c),
make_tuple(3, 4, subpel_avg_variance8x16_c),
make_tuple(4, 3, subpel_avg_variance16x8_c),
make_tuple(4, 4, subpel_avg_variance16x16_c),
make_tuple(4, 5, subpel_avg_variance16x32_c),
make_tuple(5, 4, subpel_avg_variance32x16_c),
make_tuple(5, 5, subpel_avg_variance32x32_c),
make_tuple(5, 6, subpel_avg_variance32x64_c),
make_tuple(6, 5, subpel_avg_variance64x32_c),
make_tuple(6, 6, subpel_avg_variance64x64_c)));
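In every make_tuple the first two values are log2(width) and log2(height): for example make_tuple(6, 5, ...) exercises a 64x32 block, since 1 << 6 = 64 and 1 << 5 = 32, matching the l2w/l2h parameters of the reference functions above.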
#if HAVE_MMX
const vp9_variance_fn_t variance4x4_mmx = vp9_variance4x4_mmx;
const vp9_variance_fn_t variance8x8_mmx = vp9_variance8x8_mmx;
@ -446,6 +544,48 @@ INSTANTIATE_TEST_CASE_P(
make_tuple(5, 6, subpel_variance32x64_sse2),
make_tuple(6, 5, subpel_variance64x32_sse2),
make_tuple(6, 6, subpel_variance64x64_sse2)));
const vp9_subp_avg_variance_fn_t subpel_avg_variance4x4_sse =
vp9_sub_pixel_avg_variance4x4_sse;
const vp9_subp_avg_variance_fn_t subpel_avg_variance4x8_sse =
vp9_sub_pixel_avg_variance4x8_sse;
const vp9_subp_avg_variance_fn_t subpel_avg_variance8x4_sse2 =
vp9_sub_pixel_avg_variance8x4_sse2;
const vp9_subp_avg_variance_fn_t subpel_avg_variance8x8_sse2 =
vp9_sub_pixel_avg_variance8x8_sse2;
const vp9_subp_avg_variance_fn_t subpel_avg_variance8x16_sse2 =
vp9_sub_pixel_avg_variance8x16_sse2;
const vp9_subp_avg_variance_fn_t subpel_avg_variance16x8_sse2 =
vp9_sub_pixel_avg_variance16x8_sse2;
const vp9_subp_avg_variance_fn_t subpel_avg_variance16x16_sse2 =
vp9_sub_pixel_avg_variance16x16_sse2;
const vp9_subp_avg_variance_fn_t subpel_avg_variance16x32_sse2 =
vp9_sub_pixel_avg_variance16x32_sse2;
const vp9_subp_avg_variance_fn_t subpel_avg_variance32x16_sse2 =
vp9_sub_pixel_avg_variance32x16_sse2;
const vp9_subp_avg_variance_fn_t subpel_avg_variance32x32_sse2 =
vp9_sub_pixel_avg_variance32x32_sse2;
const vp9_subp_avg_variance_fn_t subpel_avg_variance32x64_sse2 =
vp9_sub_pixel_avg_variance32x64_sse2;
const vp9_subp_avg_variance_fn_t subpel_avg_variance64x32_sse2 =
vp9_sub_pixel_avg_variance64x32_sse2;
const vp9_subp_avg_variance_fn_t subpel_avg_variance64x64_sse2 =
vp9_sub_pixel_avg_variance64x64_sse2;
INSTANTIATE_TEST_CASE_P(
SSE2, VP9SubpelAvgVarianceTest,
::testing::Values(make_tuple(2, 2, subpel_avg_variance4x4_sse),
make_tuple(2, 3, subpel_avg_variance4x8_sse),
make_tuple(3, 2, subpel_avg_variance8x4_sse2),
make_tuple(3, 3, subpel_avg_variance8x8_sse2),
make_tuple(3, 4, subpel_avg_variance8x16_sse2),
make_tuple(4, 3, subpel_avg_variance16x8_sse2),
make_tuple(4, 4, subpel_avg_variance16x16_sse2),
make_tuple(4, 5, subpel_avg_variance16x32_sse2),
make_tuple(5, 4, subpel_avg_variance32x16_sse2),
make_tuple(5, 5, subpel_avg_variance32x32_sse2),
make_tuple(5, 6, subpel_avg_variance32x64_sse2),
make_tuple(6, 5, subpel_avg_variance64x32_sse2),
make_tuple(6, 6, subpel_avg_variance64x64_sse2)));
#endif
#if HAVE_SSSE3
@ -490,6 +630,48 @@ INSTANTIATE_TEST_CASE_P(
make_tuple(5, 6, subpel_variance32x64_ssse3),
make_tuple(6, 5, subpel_variance64x32_ssse3),
make_tuple(6, 6, subpel_variance64x64_ssse3)));
const vp9_subp_avg_variance_fn_t subpel_avg_variance4x4_ssse3 =
vp9_sub_pixel_avg_variance4x4_ssse3;
const vp9_subp_avg_variance_fn_t subpel_avg_variance4x8_ssse3 =
vp9_sub_pixel_avg_variance4x8_ssse3;
const vp9_subp_avg_variance_fn_t subpel_avg_variance8x4_ssse3 =
vp9_sub_pixel_avg_variance8x4_ssse3;
const vp9_subp_avg_variance_fn_t subpel_avg_variance8x8_ssse3 =
vp9_sub_pixel_avg_variance8x8_ssse3;
const vp9_subp_avg_variance_fn_t subpel_avg_variance8x16_ssse3 =
vp9_sub_pixel_avg_variance8x16_ssse3;
const vp9_subp_avg_variance_fn_t subpel_avg_variance16x8_ssse3 =
vp9_sub_pixel_avg_variance16x8_ssse3;
const vp9_subp_avg_variance_fn_t subpel_avg_variance16x16_ssse3 =
vp9_sub_pixel_avg_variance16x16_ssse3;
const vp9_subp_avg_variance_fn_t subpel_avg_variance16x32_ssse3 =
vp9_sub_pixel_avg_variance16x32_ssse3;
const vp9_subp_avg_variance_fn_t subpel_avg_variance32x16_ssse3 =
vp9_sub_pixel_avg_variance32x16_ssse3;
const vp9_subp_avg_variance_fn_t subpel_avg_variance32x32_ssse3 =
vp9_sub_pixel_avg_variance32x32_ssse3;
const vp9_subp_avg_variance_fn_t subpel_avg_variance32x64_ssse3 =
vp9_sub_pixel_avg_variance32x64_ssse3;
const vp9_subp_avg_variance_fn_t subpel_avg_variance64x32_ssse3 =
vp9_sub_pixel_avg_variance64x32_ssse3;
const vp9_subp_avg_variance_fn_t subpel_avg_variance64x64_ssse3 =
vp9_sub_pixel_avg_variance64x64_ssse3;
INSTANTIATE_TEST_CASE_P(
SSSE3, VP9SubpelAvgVarianceTest,
::testing::Values(make_tuple(2, 2, subpel_avg_variance4x4_ssse3),
make_tuple(2, 3, subpel_avg_variance4x8_ssse3),
make_tuple(3, 2, subpel_avg_variance8x4_ssse3),
make_tuple(3, 3, subpel_avg_variance8x8_ssse3),
make_tuple(3, 4, subpel_avg_variance8x16_ssse3),
make_tuple(4, 3, subpel_avg_variance16x8_ssse3),
make_tuple(4, 4, subpel_avg_variance16x16_ssse3),
make_tuple(4, 5, subpel_avg_variance16x32_ssse3),
make_tuple(5, 4, subpel_avg_variance32x16_ssse3),
make_tuple(5, 5, subpel_avg_variance32x32_ssse3),
make_tuple(5, 6, subpel_avg_variance32x64_ssse3),
make_tuple(6, 5, subpel_avg_variance64x32_ssse3),
make_tuple(6, 6, subpel_avg_variance64x64_ssse3)));
#endif
#endif // CONFIG_VP9_ENCODER

View file

@ -269,81 +269,81 @@ prototype unsigned int vp9_sub_pixel_variance64x64 "const uint8_t *src_ptr, int
specialize vp9_sub_pixel_variance64x64 sse2 ssse3
prototype unsigned int vp9_sub_pixel_avg_variance64x64 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"
specialize vp9_sub_pixel_avg_variance64x64
specialize vp9_sub_pixel_avg_variance64x64 sse2 ssse3
prototype unsigned int vp9_sub_pixel_variance32x64 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
specialize vp9_sub_pixel_variance32x64 sse2 ssse3
prototype unsigned int vp9_sub_pixel_avg_variance32x64 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"
specialize vp9_sub_pixel_avg_variance32x64
specialize vp9_sub_pixel_avg_variance32x64 sse2 ssse3
prototype unsigned int vp9_sub_pixel_variance64x32 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
specialize vp9_sub_pixel_variance64x32 sse2 ssse3
prototype unsigned int vp9_sub_pixel_avg_variance64x32 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"
specialize vp9_sub_pixel_avg_variance64x32
specialize vp9_sub_pixel_avg_variance64x32 sse2 ssse3
prototype unsigned int vp9_sub_pixel_variance32x16 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
specialize vp9_sub_pixel_variance32x16 sse2 ssse3
prototype unsigned int vp9_sub_pixel_avg_variance32x16 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"
specialize vp9_sub_pixel_avg_variance32x16
specialize vp9_sub_pixel_avg_variance32x16 sse2 ssse3
prototype unsigned int vp9_sub_pixel_variance16x32 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
specialize vp9_sub_pixel_variance16x32 sse2 ssse3
prototype unsigned int vp9_sub_pixel_avg_variance16x32 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"
specialize vp9_sub_pixel_avg_variance16x32
specialize vp9_sub_pixel_avg_variance16x32 sse2 ssse3
prototype unsigned int vp9_sub_pixel_variance32x32 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
specialize vp9_sub_pixel_variance32x32 sse2 ssse3
prototype unsigned int vp9_sub_pixel_avg_variance32x32 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"
specialize vp9_sub_pixel_avg_variance32x32
specialize vp9_sub_pixel_avg_variance32x32 sse2 ssse3
prototype unsigned int vp9_sub_pixel_variance16x16 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
specialize vp9_sub_pixel_variance16x16 sse2 ssse3
prototype unsigned int vp9_sub_pixel_avg_variance16x16 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"
specialize vp9_sub_pixel_avg_variance16x16
specialize vp9_sub_pixel_avg_variance16x16 sse2 ssse3
prototype unsigned int vp9_sub_pixel_variance8x16 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
specialize vp9_sub_pixel_variance8x16 sse2 ssse3
prototype unsigned int vp9_sub_pixel_avg_variance8x16 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"
specialize vp9_sub_pixel_avg_variance8x16
specialize vp9_sub_pixel_avg_variance8x16 sse2 ssse3
prototype unsigned int vp9_sub_pixel_variance16x8 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
specialize vp9_sub_pixel_variance16x8 sse2 ssse3
prototype unsigned int vp9_sub_pixel_avg_variance16x8 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"
specialize vp9_sub_pixel_avg_variance16x8
specialize vp9_sub_pixel_avg_variance16x8 sse2 ssse3
prototype unsigned int vp9_sub_pixel_variance8x8 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
specialize vp9_sub_pixel_variance8x8 sse2 ssse3
prototype unsigned int vp9_sub_pixel_avg_variance8x8 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"
specialize vp9_sub_pixel_avg_variance8x8
specialize vp9_sub_pixel_avg_variance8x8 sse2 ssse3
# TODO(jingning): need to convert 8x4/4x8 functions into mmx/sse form
prototype unsigned int vp9_sub_pixel_variance8x4 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
specialize vp9_sub_pixel_variance8x4 sse2 ssse3
prototype unsigned int vp9_sub_pixel_avg_variance8x4 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"
specialize vp9_sub_pixel_avg_variance8x4
specialize vp9_sub_pixel_avg_variance8x4 sse2 ssse3
prototype unsigned int vp9_sub_pixel_variance4x8 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
specialize vp9_sub_pixel_variance4x8 sse ssse3
prototype unsigned int vp9_sub_pixel_avg_variance4x8 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"
specialize vp9_sub_pixel_avg_variance4x8
specialize vp9_sub_pixel_avg_variance4x8 sse ssse3
prototype unsigned int vp9_sub_pixel_variance4x4 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
specialize vp9_sub_pixel_variance4x4 sse ssse3
#vp9_sub_pixel_variance4x4_sse2=vp9_sub_pixel_variance4x4_wmt
prototype unsigned int vp9_sub_pixel_avg_variance4x4 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"
specialize vp9_sub_pixel_avg_variance4x4
specialize vp9_sub_pixel_avg_variance4x4 sse ssse3
prototype unsigned int vp9_sad64x64 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad"
specialize vp9_sad64x64 sse2
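Each prototype line declares a signature, and the specialize line after it lists the optimized variants the RTCD (run-time CPU detection) machinery may dispatch to, so appending sse2/ssse3 (or sse for the 4-wide versions) is what makes the new assembly reachable. A rough C sketch of the resulting dispatch for one function, with hypothetical feature flags (the actual generated header differs):

#include <stdint.h>

/* Function type matching the avg-variance prototype lines above. */
typedef unsigned int subp_avg_var_t(const uint8_t *src_ptr, int source_stride,
                                    int xoffset, int yoffset,
                                    const uint8_t *ref_ptr, int ref_stride,
                                    unsigned int *sse,
                                    const uint8_t *second_pred);

extern subp_avg_var_t vp9_sub_pixel_avg_variance16x16_c,
                      vp9_sub_pixel_avg_variance16x16_sse2,
                      vp9_sub_pixel_avg_variance16x16_ssse3;

/* Start at the C version, then upgrade to the best variant the CPU
 * supports; has_sse2/has_ssse3 stand in for the real CPUID checks. */
subp_avg_var_t *vp9_sub_pixel_avg_variance16x16 =
    vp9_sub_pixel_avg_variance16x16_c;

static void rtcd_dispatch_sketch(int has_sse2, int has_ssse3) {
  if (has_sse2)
    vp9_sub_pixel_avg_variance16x16 = vp9_sub_pixel_avg_variance16x16_sse2;
  if (has_ssse3)
    vp9_sub_pixel_avg_variance16x16 = vp9_sub_pixel_avg_variance16x16_ssse3;
}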

View file

@ -116,7 +116,7 @@ bilin_filter_m_ssse3: times 8 db 16, 0
RET
%endmacro
%macro SUBPEL_VARIANCE 1 ; W
%macro SUBPEL_VARIANCE 1-2 0 ; W
%if cpuflag(ssse3)
%define bilin_filter_m bilin_filter_m_ssse3
%define filter_idx_shift 4
@ -128,12 +128,38 @@ bilin_filter_m_ssse3: times 8 db 16, 0
; 11, not 13, if the registers are ordered correctly. May make a minor speed
; difference on Win64
%ifdef PIC
%if %2 == 1 ; avg
cglobal sub_pixel_avg_variance%1xh, 9, 10, 13, src, src_stride, \
x_offset, y_offset, \
dst, dst_stride, \
sec, sec_stride, height, sse
%define sec_str sec_strideq
%else
cglobal sub_pixel_variance%1xh, 7, 8, 13, src, src_stride, x_offset, y_offset, \
dst, dst_stride, height, sse
%endif
%define h heightd
%define bilin_filter sseq
%else
%if %2 == 1 ; avg
cglobal sub_pixel_avg_variance%1xh, 7 + 2 * ARCH_X86_64, \
7 + 2 * ARCH_X86_64, 13, src, src_stride, \
x_offset, y_offset, \
dst, dst_stride, \
sec, sec_stride, \
height, sse
%if ARCH_X86_64
%define h heightd
%define sec_str sec_strideq
%else
%define h dword heightm
%define sec_str sec_stridemp
%endif
%else
cglobal sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, x_offset, y_offset, \
dst, dst_stride, height, sse
%define h heightd
%endif
%define bilin_filter bilin_filter_m
%endif
ASSERT %1 <= 16 ; m6 overflows if w > 16
@ -143,7 +169,10 @@ cglobal sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, x_offset, y_offset, \
; could perhaps use it for something more productive then
pxor m5, m5 ; dedicated zero register
%if %1 < 16
sar heightd, 1
sar h, 1
%if %2 == 1 ; avg
shl sec_str, 1
%endif
%endif
; FIXME(rbultje) replace by jumptable?
@ -158,30 +187,55 @@ cglobal sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, x_offset, y_offset, \
%if %1 == 16
movu m0, [srcq]
mova m1, [dstq]
punpckhbw m2, m0, m5
punpcklbw m0, m5
%if %2 == 1 ; avg
pavgb m0, [secq]
punpckhbw m3, m1, m5
punpcklbw m1, m5
%endif
punpckhbw m2, m0, m5
punpcklbw m0, m5
%if %2 == 0 ; !avg
punpckhbw m3, m1, m5
punpcklbw m1, m5
%endif
SUM_SSE m0, m1, m2, m3, m6, m7
add srcq, src_strideq
add dstq, dst_strideq
dec heightd
%else ; %1 < 16
movh m0, [srcq]
%if %2 == 1 ; avg
%if mmsize == 16
movhps m0, [srcq+src_strideq]
%else ; mmsize == 8
punpckldq m0, [srcq+src_strideq]
%endif
%else ; !avg
movh m2, [srcq+src_strideq]
%endif
movh m1, [dstq]
movh m3, [dstq+dst_strideq]
%if %2 == 1 ; avg
pavgb m0, [secq]
punpcklbw m3, m5
punpcklbw m1, m5
punpckhbw m2, m0, m5
punpcklbw m0, m5
%else ; !avg
punpcklbw m0, m5
punpcklbw m2, m5
punpcklbw m3, m5
punpcklbw m1, m5
%endif
SUM_SSE m0, m1, m2, m3, m6, m7
lea srcq, [srcq+src_strideq*2]
lea dstq, [dstq+dst_strideq*2]
dec heightd
%endif
%if %2 == 1 ; avg
add secq, sec_str
%endif
dec h
jg .x_zero_y_zero_loop
STORE_AND_RET
@ -196,18 +250,40 @@ cglobal sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, x_offset, y_offset, \
movu m4, [srcq+src_strideq]
mova m1, [dstq]
pavgb m0, m4
punpckhbw m3, m1, m5
%if %2 == 1 ; avg
pavgb m0, [secq]
%endif
punpcklbw m1, m5
punpckhbw m2, m0, m5
punpcklbw m0, m5
punpckhbw m3, m1, m5
punpcklbw m1, m5
SUM_SSE m0, m1, m2, m3, m6, m7
add srcq, src_strideq
add dstq, dst_strideq
dec heightd
%else ; %1 < 16
movh m0, [srcq]
movh m2, [srcq+src_strideq]
%if %2 == 1 ; avg
%if mmsize == 16
movhps m2, [srcq+src_strideq*2]
%else ; mmsize == 8
punpckldq m2, [srcq+src_strideq*2]
%endif
movh m1, [dstq]
%if mmsize == 16
movlhps m0, m2
%else ; mmsize == 8
punpckldq m0, m2
%endif
movh m3, [dstq+dst_strideq]
pavgb m0, m2
punpcklbw m1, m5
pavgb m0, [secq]
punpcklbw m3, m5
punpckhbw m2, m0, m5
punpcklbw m0, m5
%else ; !avg
movh m4, [srcq+src_strideq*2]
movh m1, [dstq]
pavgb m0, m2
@ -217,12 +293,16 @@ cglobal sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, x_offset, y_offset, \
punpcklbw m2, m5
punpcklbw m3, m5
punpcklbw m1, m5
%endif
SUM_SSE m0, m1, m2, m3, m6, m7
lea srcq, [srcq+src_strideq*2]
lea dstq, [dstq+dst_strideq*2]
dec heightd
%endif
%if %2 == 1 ; avg
add secq, sec_str
%endif
dec h
jg .x_zero_y_half_loop
STORE_AND_RET
@ -280,13 +360,19 @@ cglobal sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, x_offset, y_offset, \
%endif
psraw m2, 4
psraw m0, 4
%if %2 == 1 ; avg
; FIXME(rbultje) pipeline
packuswb m0, m2
pavgb m0, [secq]
punpckhbw m2, m0, m5
punpcklbw m0, m5
%endif
punpckhbw m3, m1, m5
punpcklbw m1, m5
SUM_SSE m0, m1, m2, m3, m6, m7
add srcq, src_strideq
add dstq, dst_strideq
dec heightd
%else ; %1 < 16
movh m0, [srcq]
movh m2, [srcq+src_strideq]
@ -318,13 +404,23 @@ cglobal sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, x_offset, y_offset, \
%endif
psraw m0, 4
psraw m2, 4
%if %2 == 1 ; avg
; FIXME(rbultje) pipeline
packuswb m0, m2
pavgb m0, [secq]
punpckhbw m2, m0, m5
punpcklbw m0, m5
%endif
punpcklbw m1, m5
SUM_SSE m0, m1, m2, m3, m6, m7
lea srcq, [srcq+src_strideq*2]
lea dstq, [dstq+dst_strideq*2]
dec heightd
%endif
%if %2 == 1 ; avg
add secq, sec_str
%endif
dec h
jg .x_zero_y_other_loop
%undef filter_y_a
%undef filter_y_b
@ -345,18 +441,37 @@ cglobal sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, x_offset, y_offset, \
movu m4, [srcq+1]
mova m1, [dstq]
pavgb m0, m4
punpckhbw m3, m1, m5
%if %2 == 1 ; avg
pavgb m0, [secq]
%endif
punpcklbw m1, m5
punpckhbw m2, m0, m5
punpcklbw m0, m5
punpckhbw m3, m1, m5
punpcklbw m1, m5
SUM_SSE m0, m1, m2, m3, m6, m7
add srcq, src_strideq
add dstq, dst_strideq
dec heightd
%else ; %1 < 16
movh m0, [srcq]
movh m4, [srcq+1]
%if %2 == 1 ; avg
%if mmsize == 16
movhps m0, [srcq+src_strideq]
movhps m4, [srcq+src_strideq+1]
%else ; mmsize == 8
punpckldq m0, [srcq+src_strideq]
punpckldq m4, [srcq+src_strideq+1]
%endif
movh m1, [dstq]
movh m3, [dstq+dst_strideq]
pavgb m0, m4
punpcklbw m3, m5
pavgb m0, [secq]
punpcklbw m1, m5
punpckhbw m2, m0, m5
punpcklbw m0, m5
%else ; !avg
movh m2, [srcq+src_strideq]
movh m1, [dstq]
pavgb m0, m4
@ -367,12 +482,16 @@ cglobal sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, x_offset, y_offset, \
punpcklbw m2, m5
punpcklbw m3, m5
punpcklbw m1, m5
%endif
SUM_SSE m0, m1, m2, m3, m6, m7
lea srcq, [srcq+src_strideq*2]
lea dstq, [dstq+dst_strideq*2]
dec heightd
%endif
%if %2 == 1 ; avg
add secq, sec_str
%endif
dec h
jg .x_half_y_zero_loop
STORE_AND_RET
@ -391,17 +510,23 @@ cglobal sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, x_offset, y_offset, \
movu m3, [srcq+1]
mova m1, [dstq]
pavgb m4, m3
punpckhbw m3, m1, m5
pavgb m0, m4
%if %2 == 1 ; avg
punpcklbw m1, m5
pavgb m0, [secq]
punpckhbw m2, m0, m5
punpcklbw m0, m5
%else
punpckhbw m2, m0, m5
punpcklbw m0, m5
punpckhbw m3, m1, m5
punpcklbw m1, m5
%endif
SUM_SSE m0, m1, m2, m3, m6, m7
mova m0, m4
add srcq, src_strideq
add dstq, dst_strideq
dec heightd
%else ; %1 < 16
movh m0, [srcq]
movh m3, [srcq+1]
@ -410,6 +535,31 @@ cglobal sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, x_offset, y_offset, \
.x_half_y_half_loop:
movh m2, [srcq]
movh m3, [srcq+1]
%if %2 == 1 ; avg
%if mmsize == 16
movhps m2, [srcq+src_strideq]
movhps m3, [srcq+src_strideq+1]
%else
punpckldq m2, [srcq+src_strideq]
punpckldq m3, [srcq+src_strideq+1]
%endif
pavgb m2, m3
%if mmsize == 16
movlhps m0, m2
movhlps m4, m2
%else ; mmsize == 8
punpckldq m0, m2
pshufw m4, m2, 0xe
%endif
movh m1, [dstq]
pavgb m0, m2
movh m3, [dstq+dst_strideq]
pavgb m0, [secq]
punpcklbw m3, m5
punpcklbw m1, m5
punpckhbw m2, m0, m5
punpcklbw m0, m5
%else ; !avg
movh m4, [srcq+src_strideq]
movh m1, [srcq+src_strideq+1]
pavgb m2, m3
@ -422,13 +572,17 @@ cglobal sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, x_offset, y_offset, \
punpcklbw m2, m5
punpcklbw m3, m5
punpcklbw m1, m5
%endif
SUM_SSE m0, m1, m2, m3, m6, m7
mova m0, m4
lea srcq, [srcq+src_strideq*2]
lea dstq, [dstq+dst_strideq*2]
dec heightd
%endif
%if %2 == 1 ; avg
add secq, sec_str
%endif
dec h
jg .x_half_y_half_loop
STORE_AND_RET
@ -488,13 +642,19 @@ cglobal sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, x_offset, y_offset, \
%endif
punpckhbw m3, m1, m5
psraw m0, 4
%if %2 == 1 ; avg
; FIXME(rbultje) pipeline
packuswb m0, m2
pavgb m0, [secq]
punpckhbw m2, m0, m5
punpcklbw m0, m5
%endif
punpcklbw m1, m5
SUM_SSE m0, m1, m2, m3, m6, m7
mova m0, m4
add srcq, src_strideq
add dstq, dst_strideq
dec heightd
%else ; %1 < 16
movh m0, [srcq]
movh m3, [srcq+1]
@ -536,14 +696,24 @@ cglobal sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, x_offset, y_offset, \
%endif
psraw m0, 4
psraw m2, 4
%if %2 == 1 ; avg
; FIXME(rbultje) pipeline
packuswb m0, m2
pavgb m0, [secq]
punpckhbw m2, m0, m5
punpcklbw m0, m5
%endif
punpcklbw m1, m5
SUM_SSE m0, m1, m2, m3, m6, m7
mova m0, m4
lea srcq, [srcq+src_strideq*2]
lea dstq, [dstq+dst_strideq*2]
dec heightd
%endif
%if %2 == 1 ; avg
add secq, sec_str
%endif
dec h
jg .x_half_y_other_loop
%undef filter_y_a
%undef filter_y_b
@ -602,13 +772,19 @@ cglobal sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, x_offset, y_offset, \
%endif
psraw m2, 4
psraw m0, 4
%if %2 == 1 ; avg
; FIXME(rbultje) pipeline
packuswb m0, m2
pavgb m0, [secq]
punpckhbw m2, m0, m5
punpcklbw m0, m5
%endif
punpckhbw m3, m1, m5
punpcklbw m1, m5
SUM_SSE m0, m1, m2, m3, m6, m7
add srcq, src_strideq
add dstq, dst_strideq
dec heightd
%else ; %1 < 16
movh m0, [srcq]
movh m1, [srcq+1]
@ -642,13 +818,23 @@ cglobal sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, x_offset, y_offset, \
%endif
psraw m0, 4
psraw m2, 4
%if %2 == 1 ; avg
; FIXME(rbultje) pipeline
packuswb m0, m2
pavgb m0, [secq]
punpckhbw m2, m0, m5
punpcklbw m0, m5
%endif
punpcklbw m1, m5
SUM_SSE m0, m1, m2, m3, m6, m7
lea srcq, [srcq+src_strideq*2]
lea dstq, [dstq+dst_strideq*2]
dec heightd
%endif
%if %2 == 1 ; avg
add secq, sec_str
%endif
dec h
jg .x_other_y_zero_loop
%undef filter_x_a
%undef filter_x_b
@ -724,8 +910,6 @@ cglobal sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, x_offset, y_offset, \
pavgb m0, m4
punpckhbw m3, m1, m5
punpcklbw m1, m5
punpckhbw m2, m0, m5
punpcklbw m0, m5
%else
punpckhbw m2, m4, m5
punpckhbw m1, m3, m5
@ -750,15 +934,18 @@ cglobal sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, x_offset, y_offset, \
packuswb m4, m2
punpcklbw m1, m5
pavgb m0, m4
%endif
%if %2 == 1 ; avg
; FIXME(rbultje) pipeline
pavgb m0, [secq]
%endif
punpckhbw m2, m0, m5
punpcklbw m0, m5
%endif
SUM_SSE m0, m1, m2, m3, m6, m7
mova m0, m4
add srcq, src_strideq
add dstq, dst_strideq
dec heightd
%else ; %1 < 16
movh m0, [srcq]
movh m1, [srcq+1]
@ -810,6 +997,13 @@ cglobal sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, x_offset, y_offset, \
psraw m4, 4
pavgw m0, m2
pavgw m2, m4
%if %2 == 1 ; avg
; FIXME(rbultje) pipeline - also consider going to bytes here
packuswb m0, m2
pavgb m0, [secq]
punpckhbw m2, m0, m5
punpcklbw m0, m5
%endif
punpcklbw m3, m5
punpcklbw m1, m5
SUM_SSE m0, m1, m2, m3, m6, m7
@ -817,8 +1011,11 @@ cglobal sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, x_offset, y_offset, \
lea srcq, [srcq+src_strideq*2]
lea dstq, [dstq+dst_strideq*2]
dec heightd
%endif
%if %2 == 1 ; avg
add secq, sec_str
%endif
dec h
jg .x_other_y_half_loop
%undef filter_x_a
%undef filter_x_b
@ -941,13 +1138,19 @@ cglobal sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, x_offset, y_offset, \
punpckhbw m3, m1, m5
psraw m0, 4
punpcklbw m1, m5
%endif
%if %2 == 1 ; avg
; FIXME(rbultje) pipeline
packuswb m0, m2
pavgb m0, [secq]
punpckhbw m2, m0, m5
punpcklbw m0, m5
%endif
SUM_SSE m0, m1, m2, m3, m6, m7
mova m0, m4
add srcq, src_strideq
add dstq, dst_strideq
dec heightd
%else ; %1 < 16
movh m0, [srcq]
movh m1, [srcq+1]
@ -1025,14 +1228,24 @@ cglobal sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, x_offset, y_offset, \
psraw m2, 4
punpcklbw m3, m5
punpcklbw m1, m5
%endif
%if %2 == 1 ; avg
; FIXME(rbultje) pipeline
packuswb m0, m2
pavgb m0, [secq]
punpckhbw m2, m0, m5
punpcklbw m0, m5
%endif
SUM_SSE m0, m1, m2, m3, m6, m7
mova m0, m4
lea srcq, [srcq+src_strideq*2]
lea dstq, [dstq+dst_strideq*2]
dec heightd
%endif
%if %2 == 1 ; avg
add secq, sec_str
%endif
dec h
jg .x_other_y_other_loop
%undef filter_x_a
%undef filter_x_b
@ -1059,3 +1272,15 @@ SUBPEL_VARIANCE 4
INIT_XMM ssse3
SUBPEL_VARIANCE 8
SUBPEL_VARIANCE 16
INIT_MMX sse
SUBPEL_VARIANCE 4, 1
INIT_XMM sse2
SUBPEL_VARIANCE 8, 1
SUBPEL_VARIANCE 16, 1
INIT_MMX ssse3
SUBPEL_VARIANCE 4, 1
INIT_XMM ssse3
SUBPEL_VARIANCE 8, 1
SUBPEL_VARIANCE 16, 1
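The trailing 1 in these instantiations sets the macro's optional second parameter (declared 1-2 0, so it defaults to 0 elsewhere), enabling every %if %2 == 1 avg path above: the extra sec/sec_stride arguments, the pavgb m0, [secq] after filtering, and the add secq, sec_str per iteration. pavgb rounds up, so one row of the avg path behaves like this scalar sketch (names illustrative, not taken from the asm):

#include <stdint.h>

/* Scalar model of one row of the avg path: average the filtered
 * prediction with the second predictor using pavgb semantics
 * ((a + b + 1) >> 1), then accumulate sum and sum of squares. */
static void avg_row_sketch(const uint8_t *pred, const uint8_t *sec,
                           const uint8_t *dst, int w,
                           int *se, unsigned int *sse) {
  for (int x = 0; x < w; ++x) {
    const int p = (pred[x] + sec[x] + 1) >> 1;  /* pavgb rounding */
    const int d = p - dst[x];
    *se += d;
    *sse += d * d;
  }
}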

View file

@ -343,29 +343,22 @@ unsigned int vp9_variance32x64_sse2(const uint8_t *src_ptr,
return (var - (((int64_t)avg * avg) >> 11));
}
#define DECLS(opt1, opt2) \
int vp9_sub_pixel_variance4xh_##opt2(const uint8_t *src, \
ptrdiff_t src_stride, \
int x_offset, int y_offset, \
const uint8_t *dst, \
ptrdiff_t dst_stride, \
int height, unsigned int *sse); \
int vp9_sub_pixel_variance8xh_##opt1(const uint8_t *src, \
ptrdiff_t src_stride, \
int x_offset, int y_offset, \
const uint8_t *dst, \
ptrdiff_t dst_stride, \
int height, unsigned int *sse); \
int vp9_sub_pixel_variance16xh_##opt1(const uint8_t *src, \
#define DECL(w, opt) \
int vp9_sub_pixel_variance##w##xh_##opt(const uint8_t *src, \
ptrdiff_t src_stride, \
int x_offset, int y_offset, \
const uint8_t *dst, \
ptrdiff_t dst_stride, \
int height, unsigned int *sse)
#define DECLS(opt1, opt2) \
DECL(4, opt2); \
DECL(8, opt1); \
DECL(16, opt1)
DECLS(sse2, sse);
DECLS(ssse3, ssse3);
#undef DECLS
#undef DECL
#define FN(w, h, wf, wlog2, hlog2, opt, cast) \
unsigned int vp9_sub_pixel_variance##w##x##h##_##opt(const uint8_t *src, \
@ -427,6 +420,86 @@ FNS(ssse3, ssse3);
#undef FNS
#undef FN
#define DECL(w, opt) \
int vp9_sub_pixel_avg_variance##w##xh_##opt(const uint8_t *src, \
ptrdiff_t src_stride, \
int x_offset, int y_offset, \
const uint8_t *dst, \
ptrdiff_t dst_stride, \
const uint8_t *sec, \
ptrdiff_t sec_stride, \
int height, unsigned int *sse)
#define DECLS(opt1, opt2) \
DECL(4, opt2); \
DECL(8, opt1); \
DECL(16, opt1)
DECLS(sse2, sse);
DECLS(ssse3, ssse3);
#undef DECL
#undef DECLS
#define FN(w, h, wf, wlog2, hlog2, opt, cast) \
unsigned int vp9_sub_pixel_avg_variance##w##x##h##_##opt(const uint8_t *src, \
int src_stride, \
int x_offset, \
int y_offset, \
const uint8_t *dst, \
int dst_stride, \
unsigned int *sseptr, \
const uint8_t *sec) { \
unsigned int sse; \
int se = vp9_sub_pixel_avg_variance##wf##xh_##opt(src, src_stride, x_offset, \
y_offset, dst, dst_stride, \
sec, w, h, &sse); \
if (w > wf) { \
unsigned int sse2; \
int se2 = vp9_sub_pixel_avg_variance##wf##xh_##opt(src + 16, src_stride, \
x_offset, y_offset, \
dst + 16, dst_stride, \
sec + 16, w, h, &sse2); \
se += se2; \
sse += sse2; \
if (w > wf * 2) { \
se2 = vp9_sub_pixel_avg_variance##wf##xh_##opt(src + 32, src_stride, \
x_offset, y_offset, \
dst + 32, dst_stride, \
sec + 32, w, h, &sse2); \
se += se2; \
sse += sse2; \
se2 = vp9_sub_pixel_avg_variance##wf##xh_##opt(src + 48, src_stride, \
x_offset, y_offset, \
dst + 48, dst_stride, \
sec + 48, w, h, &sse2); \
se += se2; \
sse += sse2; \
} \
} \
*sseptr = sse; \
return sse - ((cast se * se) >> (wlog2 + hlog2)); \
}
#define FNS(opt1, opt2) \
FN(64, 64, 16, 6, 6, opt1, (int64_t)); \
FN(64, 32, 16, 6, 5, opt1, (int64_t)); \
FN(32, 64, 16, 5, 6, opt1, (int64_t)); \
FN(32, 32, 16, 5, 5, opt1, (int64_t)); \
FN(32, 16, 16, 5, 4, opt1, (int64_t)); \
FN(16, 32, 16, 4, 5, opt1, (int64_t)); \
FN(16, 16, 16, 4, 4, opt1, (unsigned int)); \
FN(16, 8, 16, 4, 3, opt1,); \
FN(8, 16, 8, 3, 4, opt1,); \
FN(8, 8, 8, 3, 3, opt1,); \
FN(8, 4, 8, 3, 2, opt1,); \
FN(4, 8, 4, 2, 3, opt2,); \
FN(4, 4, 4, 2, 2, opt2,)
FNS(sse2, sse);
FNS(ssse3, ssse3);
#undef FNS
#undef FN
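Because the widest SIMD kernel processes 16 columns at a time (wf is the kernel width: 16, 8, or 4), the FN wrapper stitches larger widths together from column strips, summing the per-strip se/sse before applying the variance formula, and casts se * se to 64 bits where a 32-bit product could overflow. An illustrative loop form of what the unrolled 64-wide expansion computes (the macro body above is the authoritative version):

/* Loop rewrite of FN(64, 64, 16, 6, 6, sse2, (int64_t)); relies on the
 * DECL(16, sse2) declaration of the 16xh kernel above. */
unsigned int sub_pixel_avg_variance64x64_sketch(
    const uint8_t *src, int src_stride, int x_offset, int y_offset,
    const uint8_t *dst, int dst_stride, unsigned int *sseptr,
    const uint8_t *sec) {
  int se = 0;
  unsigned int sse = 0;
  for (int col = 0; col < 64; col += 16) {  /* strips at 0, 16, 32, 48 */
    unsigned int sse_part;
    se += vp9_sub_pixel_avg_variance16xh_sse2(
        src + col, src_stride, x_offset, y_offset, dst + col, dst_stride,
        sec + col, /*sec_stride=*/64, /*height=*/64, &sse_part);
    sse += sse_part;
  }
  *sseptr = sse;
  return sse - (unsigned int)(((int64_t)se * se) >> (6 + 6));
}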
unsigned int vp9_variance_halfpixvar16x16_h_wmt(
const unsigned char *src_ptr,
int src_pixels_per_line,