Add optimized vpx_blend_mask6

This is to replace vp10/common/reconinter.c:build_masked_compound. Functionality is equivalent, but the interface is slightly more generic. Total encoder speedup with ext-inter: ~7.5% Change-Id: Iee18b83ae324ffc9c7f7dc16d4b2b06adb4d4305
2016-05-20 16:33:12 +01:00 · 2016-05-20 16:33:12 +01:00 · a661bc87c4
--- a/test/assertion_helpers.h
+++ b/test/assertion_helpers.h
@ -0,0 +1,278 @@
+/*
+ *  Copyright (c) 2016 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef TEST_ASSERTION_HELPERS_H_
+#define TEST_ASSERTION_HELPERS_H_
+
+#include "third_party/googletest/src/include/gtest/gtest.h"
+
+namespace libvpx_test {
+namespace assertion_helpers {
+
+// Checks two equally sized 1D arrays element by element; on the first
+// mismatch, fails with a message naming the index and both values.
+template<typename E, size_t n>
+::testing::AssertionResult ArraysEq(const E (&a)[n], const E (&b)[n]) {
+  size_t pos = 0;
+  // Advance past the common prefix of the two arrays
+  while (pos < n && !(a[pos] != b[pos]))
+    ++pos;
+
+  if (pos == n)
+    return ::testing::AssertionSuccess();
+
+  return ::testing::AssertionFailure()
+    << "Arrays do not equal at index "
+    << "[" << pos << "]"
+    << " values are: " << a[pos] << " vs " << b[pos];
+}
+
+// Compares two equally sized 1D arrays over the half-open index range
+// [lo, hi) only; elements outside that range are never read.
+template<typename E, size_t n>
+::testing::AssertionResult ArraysEqWithin(const E (&a)[n],
+                                          const E (&b)[n],
+                                          const size_t lo,
+                                          const size_t hi) {
+  // The range must be non-empty and must fit inside the arrays
+  assert(hi > lo);
+  assert(hi <= n);
+
+  size_t pos = lo;
+  while (pos < hi && !(a[pos] != b[pos]))
+    ++pos;
+
+  if (pos == hi)
+    return ::testing::AssertionSuccess();
+
+  return ::testing::AssertionFailure()
+    << "Arrays do not equal at index "
+    << "[" << pos << "]"
+    << " values are: " << a[pos] << " vs " << b[pos];
+}
+
+// Compares two equally sized 1D arrays everywhere except the half-open
+// index range [lo, hi), which is skipped without being read.
+template<typename E, size_t n>
+::testing::AssertionResult ArraysEqOutside(const E (&a)[n],
+                                           const E (&b)[n],
+                                           const size_t lo,
+                                           const size_t hi) {
+  // The skipped range must be non-empty and must fit inside the arrays
+  assert(hi > lo);
+  assert(hi <= n);
+
+  size_t pos = 0;
+  while (pos < n) {
+    // De Morgan form of "lo <= pos && pos < hi"
+    const bool skipped = !(pos < lo || pos >= hi);
+
+    if (!skipped && a[pos] != b[pos]) {
+      return ::testing::AssertionFailure()
+        << "Arrays do not equal at index "
+        << "[" << pos << "]"
+        << " values are: " << a[pos] << " vs " << b[pos];
+    }
+    ++pos;
+  }
+  return ::testing::AssertionSuccess();
+}
+
+// Checks two equally sized 2D arrays element by element in row-major
+// order; the first mismatch is reported with its [row][column] position.
+template<typename E, size_t n, size_t m>
+::testing::AssertionResult ArraysEq(const E (&a)[n][m],
+                                    const E (&b)[n][m]) {
+  // Walk a single flat counter and split it into row/column on the fly
+  for (size_t k = 0; k < n * m; ++k) {
+    const size_t row = k / m;
+    const size_t col = k % m;
+
+    if (a[row][col] != b[row][col]) {
+      return ::testing::AssertionFailure()
+        << "Arrays do not equal at index "
+        << "[" << row << "][" << col << "]"
+        << " values are: " << a[row][col] << " vs " << b[row][col];
+    }
+  }
+  return ::testing::AssertionSuccess();
+}
+
+// Compares two equally sized 2D arrays inside the rectangle of rows
+// [lo0, hi0) and columns [lo1, hi1); nothing outside it is read.
+template<typename E, size_t n, size_t m>
+::testing::AssertionResult ArraysEqWithin(const E (&a)[n][m],
+                                          const E (&b)[n][m],
+                                          const size_t lo0,
+                                          const size_t hi0,
+                                          const size_t lo1,
+                                          const size_t hi1) {
+  // Both ranges must be non-empty and must fit inside the arrays
+  assert(hi0 > lo0);
+  assert(hi0 <= n);
+  assert(hi1 > lo1);
+  assert(hi1 <= m);
+
+  for (size_t row = lo0; row < hi0; ++row) {
+    const E (&ra)[m] = a[row];
+    const E (&rb)[m] = b[row];
+
+    for (size_t col = lo1; col < hi1; ++col) {
+      if (ra[col] != rb[col])
+        return ::testing::AssertionFailure()
+          << "Arrays do not equal at index "
+          << "[" << row << "][" << col << "]"
+          << " values are: " << ra[col] << " vs " << rb[col];
+    }
+  }
+  return ::testing::AssertionSuccess();
+}
+
+// Arrays (2D) are element-wise equal outside the rectangle of rows
+// [lo0, hi0) x columns [lo1, hi1); only the rectangle itself is skipped.
+template<typename E, size_t n, size_t m>
+::testing::AssertionResult ArraysEqOutside(const E (&a)[n][m],
+                                           const E (&b)[n][m],
+                                           const size_t lo0,
+                                           const size_t hi0,
+                                           const size_t lo1,
+                                           const size_t hi1) {
+  assert(hi0 > lo0);
+  assert(hi0 <= n);
+  assert(hi1 > lo1);
+  assert(hi1 <= m);
+
+  for (size_t i = 0; i < n; i++) {
+    // A row overlaps the skipped rectangle only if it lies in [lo0, hi0)
+    const bool row_inside = lo0 <= i && i < hi0;
+    for (size_t j = 0; j < m; j++) {
+      // Skip the rectangle itself, but not the rest of its rows/columns
+      if (row_inside && lo1 <= j && j < hi1)
+        continue;
+
+      const E &va = a[i][j];
+      const E &vb = b[i][j];
+      if (va != vb) {
+        return ::testing::AssertionFailure()
+          << "Arrays do not equal at index "
+          << "[" << i << "][" << j << "]"
+          << " values are: " << va << " vs " << vb;
+      }
+    }
+  }
+
+  return ::testing::AssertionSuccess();
+}
+
+// Non contiguous 2D array buffers are element-wise equal inside a
+// rows x cols window. Each buffer is addressed as a flat array: its window
+// starts 'offset' elements from &buf[0][0], with window rows 'stride'
+// elements apart. A mismatch is reported with the element's flat indices.
+template<typename E, size_t n, size_t m>
+::testing::AssertionResult BuffersEqWithin(const E (&a)[n][m],
+                                           const E (&b)[n][m],
+                                           const size_t stridea,
+                                           const size_t strideb,
+                                           const size_t offseta,
+                                           const size_t offsetb,
+                                           const size_t rows,
+                                           const size_t cols) {
+  assert(rows <= n);
+  assert(cols <= m);
+  assert(stridea <= m);
+  assert(strideb <= m);
+  assert(cols <= stridea);
+  assert(cols <= strideb);
+  assert(offseta < n * m);
+  assert(offsetb < n * m);
+  assert(offseta + (rows - 1) * stridea + (cols - 1) < n * m);
+  assert(offsetb + (rows - 1) * strideb + (cols - 1) < n * m);
+
+  const E *pa = &a[0][0] + offseta;
+  const E *pb = &b[0][0] + offsetb;
+
+  for (size_t r = 0 ; r < rows ; r++) {
+    for (size_t c = 0 ; c < cols ; c++) {
+      if (pa[c] != pb[c]) {
+        return ::testing::AssertionFailure()
+          << "Arrays do not equal at linear index "
+          << "[" << pa + c - &a[0][0] << "] vs [" << pb + c - &b[0][0] << "]"
+          << " row=" << r << " col=" << c
+          << " values are: " << pa[c] << " vs " << pb[c];
+      }
+    }
+    pa += stridea;
+    pb += strideb;
+  }
+
+  return ::testing::AssertionSuccess();
+}
+
+// Non contiguous 2D array buffers are element-wise equal everywhere except
+// inside a rows x cols window. Both buffers are addressed as flat arrays:
+// the window starts 'offset' elements in and consecutive window rows are
+// 'stride' elements apart; gaps between window rows are compared too.
+template<typename E, size_t n, size_t m>
+::testing::AssertionResult BuffersEqOutside(const E (&a)[n][m],
+                                            const E (&b)[n][m],
+                                            const size_t stride,
+                                            const size_t offset,
+                                            const size_t rows,
+                                            const size_t cols ) {
+  assert(rows <= n);
+  assert(cols <= m);
+  assert(stride <= m);
+  assert(cols <= stride);
+  assert(offset < n * m);
+  assert(offset + (rows - 1) * stride + (cols - 1) < n * m);
+
+  const E *const pa = &a[0][0];
+  const E *const pb = &b[0][0];
+
+  size_t idx = 0;
+  size_t r = 0;
+  // Compare up to the first window row, or the whole buffer if no window
+  size_t end = rows > 0 ? offset : n * m;
+
+  while (idx < n * m) {
+    while (idx < end) {   // until beginning of row or end of buffer
+      const E &va = pa[idx];
+      const E &vb = pb[idx];
+      if (va != vb) {
+        return ::testing::AssertionFailure()
+          << "Arrays do not equal at index "
+          << "[" << idx / m << "][" << idx % m << "]"
+          << " values are: " << va << " vs " << vb;
+      }
+      idx++;
+    }
+
+    // All window rows are behind us, so the tail was just compared
+    if (r == rows)
+      break;
+
+    // Move past row end
+    idx += cols;
+
+    // Next stop: the following window row, or the end of the buffer
+    end = (++r < rows) ? end + stride : n * m;
+  }
+
+  // Sanity check: stopped exactly at the end, even if the window ends there
+  assert(idx == n * m);
+
+  return ::testing::AssertionSuccess();
+}
+
+}   // namespace assertion_helpers
+}   // namespace libvpx_test
+
+#endif  // TEST_ASSERTION_HELPERS_H_
--- a/test/blend_mask6_test.cc
+++ b/test/blend_mask6_test.cc
@ -0,0 +1,311 @@
+/*
+ *  Copyright (c) 2016 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <math.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "third_party/googletest/src/include/gtest/gtest.h"
+#include "test/register_state_check.h"
+
+#include "test/function_equivalence_test.h"
+#include "test/randomise.h"
+#include "test/snapshot.h"
+
+#include "./vpx_config.h"
+#include "./vpx_dsp_rtcd.h"
+#include "vpx/vpx_integer.h"
+
+#include "./vp10_rtcd.h"
+
+#include "test/assertion_helpers.h"
+#include "vp10/common/enums.h"
+
+using libvpx_test::assertion_helpers::BuffersEqWithin;
+using libvpx_test::assertion_helpers::BuffersEqOutside;
+using libvpx_test::assertion_helpers::ArraysEq;
+using libvpx_test::FunctionEquivalenceTest;
+using libvpx_test::Snapshot;
+using libvpx_test::Randomise;
+using std::tr1::make_tuple;
+
+namespace {
+
+template<typename F, typename T>  // F: function type, T: dst/src element
+class BlendMask6Test : public FunctionEquivalenceTest<F> {
+ protected:
+  virtual ~BlendMask6Test() {}
+  // Calls ref_func_ and tst_func_; implemented by the fixtures below
+  virtual void Execute(T *p_src0, T *p_src1) = 0;
+  // One iteration: random geometry, snapshot, Execute(), then verify
+  void Common() {
+    w = 1 << randomise.uniform<int>(2, MAX_SB_SIZE_LOG2 + 1);
+    h = 1 << randomise.uniform<int>(2, MAX_SB_SIZE_LOG2 + 1);
+    // (w and h are powers of two, from 4 up to 1 << MAX_SB_SIZE_LOG2)
+    randomise(subx);
+    randomise(suby);
+    // Offsets are drawn from [0, 32), strides from [w, MAX_SB_SIZE * 5]
+    randomise(dst_offset, 0, 32);
+    randomise(dst_stride, w, MAX_SB_SIZE * 5 + 1);
+
+    randomise(src0_offset, 0, 32);
+    randomise(src0_stride, w, MAX_SB_SIZE * 5 + 1);
+
+    randomise(src1_offset, 0, 32);
+    randomise(src1_stride, w, MAX_SB_SIZE * 5 + 1);
+
+    randomise(mask_stride, w * (subx ? 2: 1), 2 * MAX_SB_SIZE + 1);
+    // NOTE(review): Execute() passes sizeof(mask[0]), never mask_stride
+    T *p_src0;
+    T *p_src1;
+    // Randomly alias one source with the tested destination, or none
+    switch (randomise.uniform<int>(3)) {
+      case 0:   // Separate sources
+        p_src0 = &src0[0][0];
+        p_src1 = &src1[0][0];
+        break;
+      case 1:   // src0 == dst
+        p_src0 = &dst_tst[0][0];
+        src0_stride = dst_stride;
+        src0_offset = dst_offset;
+        p_src1 = &src1[0][0];
+        break;
+      case 2:   // src1 == dst
+        p_src0 = &src0[0][0];
+        p_src1 = &dst_tst[0][0];
+        src1_stride = dst_stride;
+        src1_offset = dst_offset;
+        break;
+      default:
+        FAIL();
+    }
+
+    //////////////////////////////////////////////////////////////////////////
+    // Prepare: snapshot every buffer so that changes can be detected
+    //////////////////////////////////////////////////////////////////////////
+
+    snapshot(dst_ref);
+    snapshot(dst_tst);
+
+    snapshot(src0);
+    snapshot(src1);
+
+    snapshot(mask);
+
+    //////////////////////////////////////////////////////////////////////////
+    // Execute
+    //////////////////////////////////////////////////////////////////////////
+
+    Execute(p_src0, p_src1);
+
+    //////////////////////////////////////////////////////////////////////////
+    // Check: equal outputs, untouched inputs, no writes outside h x w
+    //////////////////////////////////////////////////////////////////////////
+
+    ASSERT_TRUE(BuffersEqWithin(dst_ref, dst_tst,
+                                dst_stride, dst_stride,
+                                dst_offset, dst_offset,
+                                h, w));
+
+    ASSERT_TRUE(ArraysEq(snapshot.get(src0), src0));
+    ASSERT_TRUE(ArraysEq(snapshot.get(src1), src1));
+    ASSERT_TRUE(ArraysEq(snapshot.get(mask), mask));
+
+    ASSERT_TRUE(BuffersEqOutside(snapshot.get(dst_ref), dst_ref,
+                                 dst_stride,
+                                 dst_offset,
+                                 h, w));
+
+    ASSERT_TRUE(BuffersEqOutside(snapshot.get(dst_tst), dst_tst,
+                                 dst_stride,
+                                 dst_offset,
+                                 h, w));
+  }
+
+  Snapshot snapshot;    // pre-Execute() copies of the buffers
+  Randomise randomise;  // source of all random test inputs
+
+  T dst_ref[MAX_SB_SIZE][MAX_SB_SIZE * 5];  // output of ref_func_
+  T dst_tst[MAX_SB_SIZE][MAX_SB_SIZE * 5];  // output of tst_func_
+  size_t dst_stride;  // elements between consecutive output rows
+  size_t dst_offset;  // elements from &dst[0][0] to the output block
+
+  T src0[MAX_SB_SIZE][MAX_SB_SIZE * 5];  // first source (when not aliased)
+  size_t src0_stride;
+  size_t src0_offset;
+
+  T src1[MAX_SB_SIZE][MAX_SB_SIZE * 5];  // second source (when not aliased)
+  size_t src1_stride;
+  size_t src1_offset;
+
+  uint8_t mask[2 * MAX_SB_SIZE][2 * MAX_SB_SIZE];  // passed from &mask[0][0]
+  size_t mask_stride;  // randomised in Common() but not read elsewhere here
+
+  int w;  // number of columns checked in the output block
+  int h;  // number of rows checked in the output block
+
+  bool suby;  // forwarded to both functions as the 'suby' argument
+  bool subx;  // forwarded to both functions as the 'subx' argument
+};
+
+//////////////////////////////////////////////////////////////////////////////
+// 8 bit version: buffers hold uint8_t elements
+//////////////////////////////////////////////////////////////////////////////
+// Common signature of the 8 bit reference and tested functions
+typedef void (*F8B)(uint8_t *dst, uint32_t dst_stride,
+                      uint8_t *src0, uint32_t src0_stride,
+                      uint8_t *src1, uint32_t src1_stride,
+                      const uint8_t *mask, uint32_t mask_stride,
+                      int h, int w, int suby, int subx);
+// Fixture calling both functions with the same arguments, except for dst
+class BlendMask6Test8B : public BlendMask6Test<F8B, uint8_t> {
+ protected:
+  void Execute(uint8_t *p_src0, uint8_t *p_src1) {
+    ref_func_(&dst_ref[0][dst_offset], dst_stride,
+              p_src0 + src0_offset, src0_stride,
+              p_src1 + src1_offset, src1_stride,
+              &mask[0][0], sizeof(mask[0]),
+              h, w, suby, subx);
+    // Same call for the tested function, wrapped in the register check macro
+    ASM_REGISTER_STATE_CHECK(
+      tst_func_(&dst_tst[0][dst_offset], dst_stride,
+                p_src0 + src0_offset, src0_stride,
+                p_src1 + src1_offset, src1_stride,
+                &mask[0][0], sizeof(mask[0]),
+                h, w, suby, subx));
+  }
+};
+
+TEST_P(BlendMask6Test8B, RandomValues) {
+  for (int i = 0 ; i < 10000 && !HasFatalFailure(); i++) {
+    //////////////////////////////////////////////////////////////////////////
+    // Randomise: buffers over the full uint8_t range, mask in [0, 65)
+    //////////////////////////////////////////////////////////////////////////
+
+    randomise(dst_ref);
+    randomise(dst_tst);
+
+    randomise(src0);
+    randomise(src1);
+
+    randomise(mask, 65);
+    // Compare the two functions on the buffers filled above
+    Common();
+  }
+}
+
+TEST_P(BlendMask6Test8B, ExtremeValues) {
+  for (int i = 0 ; i < 1000 && !HasFatalFailure(); i++) {
+    //////////////////////////////////////////////////////////////////////////
+    // Randomise: buffers hold only 254 or 255, the mask only 63 or 64
+    //////////////////////////////////////////////////////////////////////////
+
+    randomise(dst_ref, 254, 256);
+    randomise(dst_tst, 254, 256);
+
+    randomise(src0, 254, 256);
+    randomise(src1, 254, 256);
+
+    randomise(mask, 63, 65);
+    // Compare the two functions on the buffers filled above
+    Common();
+  }
+}
+
+#if HAVE_SSE4_1
+INSTANTIATE_TEST_CASE_P(  // tuple order: (reference, tested) function
+  SSE4_1_C_COMPARE, BlendMask6Test8B,
+  ::testing::Values(make_tuple(&vpx_blend_mask6_c, &vpx_blend_mask6_sse4_1)));
+#endif  // HAVE_SSE4_1
+
+#if CONFIG_VP9_HIGHBITDEPTH
+//////////////////////////////////////////////////////////////////////////////
+// High bit-depth version: buffers hold uint16_t elements
+//////////////////////////////////////////////////////////////////////////////
+// Common signature of the high bit-depth functions ('bd' gets bit_depth)
+typedef void (*FHBD)(uint8_t *dst, uint32_t dst_stride,
+                       uint8_t *src0, uint32_t src0_stride,
+                       uint8_t *src1, uint32_t src1_stride,
+                       const uint8_t *mask, uint32_t mask_stride,
+                       int h, int w, int suby, int subx, int bd);
+// Fixture passing its uint16_t buffers through CONVERT_TO_BYTEPTR
+class BlendMask6TestHBD : public BlendMask6Test<FHBD, uint16_t> {
+ protected:
+  void Execute(uint16_t *p_src0, uint16_t *p_src1) {
+    ref_func_(CONVERT_TO_BYTEPTR(&dst_ref[0][dst_offset]), dst_stride,
+              CONVERT_TO_BYTEPTR(p_src0 + src0_offset), src0_stride,
+              CONVERT_TO_BYTEPTR(p_src1 + src1_offset), src1_stride,
+              &mask[0][0], sizeof(mask[0]),
+              h, w, suby, subx, bit_depth);
+    // Same call for the tested function, wrapped in the register check macro
+    ASM_REGISTER_STATE_CHECK(
+      tst_func_(CONVERT_TO_BYTEPTR(&dst_tst[0][dst_offset]), dst_stride,
+                CONVERT_TO_BYTEPTR(p_src0 + src0_offset), src0_stride,
+                CONVERT_TO_BYTEPTR(p_src1 + src1_offset), src1_stride,
+                &mask[0][0], sizeof(mask[0]),
+                h, w, suby, subx, bit_depth));
+  }
+
+  int bit_depth;  // set by each test before it calls Common()
+};
+
+TEST_P(BlendMask6TestHBD, RandomValues) {
+  for (int i = 0 ; i < 10000 && !HasFatalFailure(); i++) {
+    //////////////////////////////////////////////////////////////////////////
+    // Randomise: buffers in [0, 1 << bit_depth), mask in [0, 65)
+    //////////////////////////////////////////////////////////////////////////
+
+    bit_depth = randomise.choice(8, 10, 12);
+
+    const int hi = 1 << bit_depth;
+
+    randomise(dst_ref, hi);
+    randomise(dst_tst, hi);
+
+    randomise(src0, hi);
+    randomise(src1, hi);
+
+    randomise(mask, 65);
+    // Compare the two functions on the buffers filled above
+    Common();
+  }
+}
+
+TEST_P(BlendMask6TestHBD, ExtremeValues) {
+  for (int i = 0 ; i < 1000 && !HasFatalFailure(); i++) {
+    //////////////////////////////////////////////////////////////////////////
+    // Randomise: buffers hold the two largest values of the bit depth
+    //////////////////////////////////////////////////////////////////////////
+
+    bit_depth = randomise.choice(8, 10, 12);
+
+    const int hi = 1 << bit_depth;
+    const int lo = hi - 2;
+
+    randomise(dst_ref, lo, hi);
+    randomise(dst_tst, lo, hi);
+
+    randomise(src0, lo, hi);
+    randomise(src1, lo, hi);
+
+    randomise(mask, 63, 65);
+    // Compare the two functions on the buffers filled above
+    Common();
+  }
+}
+
+#if HAVE_SSE4_1
+INSTANTIATE_TEST_CASE_P(  // tuple order: (reference, tested) function
+  SSE4_1_C_COMPARE, BlendMask6TestHBD,
+  ::testing::Values(make_tuple(&vpx_highbd_blend_mask6_c,
+                               &vpx_highbd_blend_mask6_sse4_1)));
+#endif  // HAVE_SSE4_1
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+}  // namespace
--- a/test/function_equivalence_test.h
+++ b/test/function_equivalence_test.h
@ -0,0 +1,40 @@
+/*
+ *  Copyright (c) 2016 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef TEST_FUNCTION_EQUIVALENCE_TEST_H_
+#define TEST_FUNCTION_EQUIVALENCE_TEST_H_
+
+#include "third_party/googletest/src/include/gtest/gtest.h"
+#include "test/clear_system_state.h"
+#include "test/util.h"
+
+namespace libvpx_test {
+// Fixture for tests comparing a reference function with one under test;
+// the test parameter is the tuple (reference, tested), both of type T.
+template <typename T>
+class FunctionEquivalenceTest :
+  public ::testing::TestWithParam< std::tr1::tuple< T, T > > {
+ public:
+  virtual ~FunctionEquivalenceTest() {}
+
+  virtual void SetUp() {
+    const std::tr1::tuple< T, T > &funcs = this->GetParam();
+    ref_func_ = std::tr1::get<0>(funcs);
+    tst_func_ = std::tr1::get<1>(funcs);
+  }
+  virtual void TearDown() { libvpx_test::ClearSystemState(); }
+
+ protected:
+  T ref_func_;  // element 0 of the parameter tuple
+  T tst_func_;  // element 1 of the parameter tuple
+};
+
+}   // namespace libvpx_test
+#endif  // TEST_FUNCTION_EQUIVALENCE_TEST_H_
--- a/test/randomise.h
+++ b/test/randomise.h
@ -0,0 +1,207 @@
+/*
+ *  Copyright (c) 2016 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef TEST_RANDOMISE_H_
+#define TEST_RANDOMISE_H_
+
+#include <stdint.h>
+
+#include <limits>
+
+#include "third_party/googletest/src/include/gtest/gtest.h"
+
+#include "test/acm_random.h"
+
+namespace libvpx_test {
+
+// Compile-time integer check. TODO(any): Replace when built with C++11
+#define STATIC_ASSERT_INTEGER_TYPE_(T) \
+  GTEST_COMPILE_ASSERT_(std::numeric_limits<T>::is_integer, \
+    integer_type_required);
+
+/**
+ * Deterministic random number generator with various convenience methods.
+ * Each instance is seeded with ACMRandom::DeterministicSeed().
+ */
+class Randomise {
+ public:
+  Randomise() { rnd_.Reset(ACMRandom::DeterministicSeed()); }
+
+  virtual ~Randomise() { }
+
+  // Uniformly distributed random number from the full range of R, i.e.
+  // [std::numeric_limits<R>::min(), std::numeric_limits<R>::max()]. Generic
+  // version built from random bytes, for types without a specialisation.
+  template<typename R>
+  R uniform() {
+    STATIC_ASSERT_INTEGER_TYPE_(R);
+    uint64_t bits = 0;
+    for (size_t i = 0; i < sizeof(R); i++)
+      bits = (bits << 8) | rnd_.Rand8();
+    return static_cast<R>(bits);
+  }
+
+  // Uniformly distributed random number from the range [0, hi)
+  template<typename R, typename H>
+  R uniform(H hi) {
+    assert(hi > 0);
+    R v = uniform<R>();
+    if (std::numeric_limits<R>::is_signed && v < 0) {
+      // Negating the most negative value would overflow: step off it first
+      if (v == std::numeric_limits<R>::min())
+        v = static_cast<R>(std::numeric_limits<R>::min() + 1);
+      return -v % hi;
+    }
+    return v % hi;
+  }
+
+  // Uniformly distributed random number from the range [lo, hi)
+  template<typename R, typename L, typename H>
+  R uniform(L lo, H hi) {
+    assert(hi > lo);
+    return uniform<R, H>(hi - lo) + lo;
+  }
+
+  // Randomly pick and return one of the arguments
+  template<typename T>
+  T choice(T v0, T v1) { return uniform<int>(2) == 0 ? v0 : v1; }
+
+  // Randomly pick and return one of the arguments
+  template<typename T>
+  T choice(T v0, T v1, T v2) {
+    switch (uniform<int>(3)) {
+      case 0: return v0;
+      case 1: return v1;
+      default: return v2;
+    }
+  }
+
+  // Set a scalar to a value from the full range of its type
+  template<typename T>
+  void operator()(T &e) {  // NOLINT
+    STATIC_ASSERT_INTEGER_TYPE_(T);
+    e = uniform<T>();
+  }
+
+  // Set a scalar to a value from [0, hi)
+  template<typename T, typename H>
+  void operator()(T &e, H hi) {  // NOLINT
+    STATIC_ASSERT_INTEGER_TYPE_(T);
+    e = uniform<T, H>(hi);
+  }
+
+  // Set a scalar to a value from [lo, hi)
+  template<typename T, typename L, typename H>
+  void operator()(T &e, L lo, H hi) {  // NOLINT
+    STATIC_ASSERT_INTEGER_TYPE_(T);
+    e = uniform<T, L, H>(lo, hi);
+  }
+
+  // Fill a 1D array with values from the full range of the element type
+  template<typename T, size_t n>
+  void operator()(T (&arr)[n]) {
+    STATIC_ASSERT_INTEGER_TYPE_(T);
+    for (size_t i = 0; i < n ; i++)
+      arr[i] = uniform<T>();
+  }
+
+  // Fill a 1D array with values from [0, hi)
+  template<typename T, size_t n, typename H>
+  void operator()(T (&arr)[n], H hi) {
+    STATIC_ASSERT_INTEGER_TYPE_(T);
+    for (size_t i = 0; i < n ; i++)
+      arr[i] = uniform<T, H>(hi);
+  }
+
+  // Fill a 1D array with values from [lo, hi)
+  template<typename T, size_t n, typename L, typename H>
+  void operator()(T (&arr)[n], L lo, H hi) {
+    STATIC_ASSERT_INTEGER_TYPE_(T);
+    for (size_t i = 0; i < n ; i++)
+      arr[i] = uniform<T, L, H>(lo, hi);
+  }
+
+  // Fill a 2D array with values from the full range of the element type
+  template<typename T, size_t n, size_t m>
+  void operator()(T (&arr)[n][m]) {
+    STATIC_ASSERT_INTEGER_TYPE_(T);
+    for (size_t i = 0; i < n ; i++)
+      for (size_t j = 0; j < m ; j++)
+        arr[i][j] = uniform<T>();
+  }
+
+  // Fill a 2D array with values from [0, hi)
+  template<typename T, size_t n, size_t m, typename H>
+  void operator()(T (&arr)[n][m], H hi) {
+    STATIC_ASSERT_INTEGER_TYPE_(T);
+    for (size_t i = 0; i < n ; i++)
+      for (size_t j = 0; j < m ; j++)
+        arr[i][j] = uniform<T, H>(hi);
+  }
+
+  // Fill a 2D array with values from [lo, hi)
+  template<typename T, size_t n, size_t m, typename L, typename H>
+  void operator()(T (&arr)[n][m], L lo, H hi) {
+    STATIC_ASSERT_INTEGER_TYPE_(T);
+    for (size_t i = 0; i < n ; i++)
+      for (size_t j = 0; j < m ; j++)
+        arr[i][j] = uniform<T, L, H>(lo, hi);
+  }
+
+ private:
+  libvpx_test::ACMRandom rnd_;
+};
+
+// Add further specialisations as necessary
+// 'inline' keeps one definition when several sources include this header
+template<>
+inline bool Randomise::uniform<bool>() {
+  return rnd_.Rand8() & 1 ? true : false;
+}
+
+template<>
+inline uint8_t Randomise::uniform<uint8_t>() {
+  return rnd_.Rand8();
+}
+
+template<>
+inline uint16_t Randomise::uniform<uint16_t>() {
+  return rnd_.Rand16();
+}
+
+template<>
+inline uint32_t Randomise::uniform<uint32_t>() {
+  const uint32_t l = uniform<uint16_t>();
+  const uint32_t h = uniform<uint16_t>();
+  return h << 16 | l;
+}
+
+template<>
+inline uint64_t Randomise::uniform<uint64_t>() {
+  const uint64_t l = uniform<uint32_t>();
+  const uint64_t h = uniform<uint32_t>();
+  return h << 32 | l;
+}
+
+template<>
+inline int8_t Randomise::uniform<int8_t>() { return uniform<uint8_t>(); }
+
+template<>
+inline int16_t Randomise::uniform<int16_t>() { return uniform<uint16_t>(); }
+
+template<>
+inline int32_t Randomise::uniform<int32_t>() { return uniform<uint32_t>(); }
+
+template<>
+inline int64_t Randomise::uniform<int64_t>() { return uniform<uint64_t>(); }
+
+}  // namespace libvpx_test
+
+#endif  // TEST_RANDOMISE_H_
--- a/test/snapshot.h
+++ b/test/snapshot.h
@ -0,0 +1,104 @@
+/*
+ *  Copyright (c) 2016 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef TEST_SNAPSHOT_H_
+#define TEST_SNAPSHOT_H_
+
+#include <map>
+
+namespace libvpx_test {
+
+/**
+ * Allows capturing and retrieving snapshots of arbitrary blobs of memory,
+ * blob size is based on compile time type information.
+ *
+ * Snapshots are keyed by object address and are raw memcpy copies, so only
+ * trivially copyable objects are suitable. Not copyable (owns the copies).
+ *
+ * Usage:
+ * void example() {
+ *   Snapshot snapshot;
+ *
+ *   int foo = 4;
+ *
+ *   snapshot(foo);
+ *
+ *   foo = 10;
+ *
+ *   assert(snapshot.get(foo) == 4);     // Pass
+ *   assert(snapshot.get(foo) == foo);   // Fail (4 != 10)
+ *
+ *   char bar[10][10];
+ *   memset(bar, 3, sizeof(bar));
+ *
+ *   snapshot(bar);
+ *
+ *   memset(bar, 8, sizeof(bar));
+ *
+ *   assert(sum(bar) == 800);                 // Pass
+ *   assert(sum(snapshot.get(bar)) == 300);   // Pass
+ * }
+ */
+class Snapshot {
+ public:
+  Snapshot() {}
+
+  virtual ~Snapshot() {
+    for (snapshot_map_t::iterator it = snapshots_.begin();
+         it != snapshots_.end(); it++) {
+      delete[] it->second;
+    }
+  }
+
+  /**
+   * Take new snapshot for object, replacing any earlier one
+   */
+  template<typename E>
+  void take(const E &e) {
+    const void *const key = reinterpret_cast<const void*>(&e);
+
+    const char *&slot = snapshots_[key];    // NULL when the key is new
+    char *const buf = new char[sizeof(E)];  // if this throws, slot is intact
+    memcpy(buf, &e, sizeof(E));
+    delete[] slot;
+    slot = buf;
+  }
+
+  /**
+   * Same as 'take'
+   */
+  template<typename E>
+  void operator()(const E &e) { take(e); }
+
+  /**
+   * Retrieve last snapshot for object
+   */
+  template<typename E>
+  const E& get(const E &e) const {
+    const void *const key = reinterpret_cast<const void*>(&e);
+    snapshot_map_t::const_iterator it = snapshots_.find(key);
+
+    assert(it != snapshots_.end());
+    return *reinterpret_cast<const E*>(it->second);
+  }
+
+ private:
+  typedef std::map<const void*, const char*> snapshot_map_t;
+
+  // Not copyable: a copy would delete[] the same buffers a second time
+  Snapshot(const Snapshot&);
+  Snapshot &operator=(const Snapshot&);
+
+  snapshot_map_t snapshots_;
+};
+
+}   // namespace libvpx_test
+
+#endif  // TEST_SNAPSHOT_H_
--- a/test/test.mk
+++ b/test/test.mk
@ -178,6 +178,7 @@ LIBVPX_TEST_SRCS-$(CONFIG_VP10_ENCODER) += subtract_test.cc
 ifeq ($(CONFIG_EXT_INTER),yes)
 LIBVPX_TEST_SRCS-$(HAVE_SSSE3) += masked_variance_test.cc
 LIBVPX_TEST_SRCS-$(HAVE_SSSE3) += masked_sad_test.cc
+LIBVPX_TEST_SRCS-$(CONFIG_VP10_ENCODER) += blend_mask6_test.cc
 endif

 ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes)
--- a/vp10/common/reconinter.c
+++ b/vp10/common/reconinter.c
@ -11,6 +11,7 @@
 #include <assert.h>

 #include "./vpx_scale_rtcd.h"
+#include "./vpx_dsp_rtcd.h"
 #include "./vpx_config.h"

 #include "vpx/vpx_integer.h"
@ -410,119 +411,6 @@ const uint8_t *vp10_get_soft_mask(int wedge_index,
  return mask;
 }

-static void build_masked_compound(uint8_t *dst, int dst_stride,
-                                  uint8_t *dst1, int dst1_stride,
-                                  uint8_t *dst2, int dst2_stride,
-                                  const uint8_t *mask,
-                                  int h, int w, int subh, int subw) {
-  int i, j;
-  if (subw == 0 && subh == 0) {
-    for (i = 0; i < h; ++i)
-      for (j = 0; j < w; ++j) {
-        int m = mask[i * MASK_MASTER_STRIDE + j];
-        dst[i * dst_stride + j] = (dst1[i * dst1_stride + j] * m +
-                                   dst2[i * dst2_stride + j] *
-                                   ((1 << WEDGE_WEIGHT_BITS) - m) +
-                                   (1 << (WEDGE_WEIGHT_BITS - 1))) >>
-            WEDGE_WEIGHT_BITS;
-
-      }
-  } else if (subw == 1 && subh == 1) {
-    for (i = 0; i < h; ++i)
-      for (j = 0; j < w; ++j) {
-        int m = (mask[(2 * i) * MASK_MASTER_STRIDE + (2 * j)] +
-                 mask[(2 * i + 1) * MASK_MASTER_STRIDE + (2 * j)] +
-                 mask[(2 * i) * MASK_MASTER_STRIDE + (2 * j + 1)] +
-                 mask[(2 * i + 1) * MASK_MASTER_STRIDE + (2 * j + 1)] + 2) >> 2;
-        dst[i * dst_stride + j] = (dst1[i * dst1_stride + j] * m +
-                                   dst2[i * dst2_stride + j] *
-                                   ((1 << WEDGE_WEIGHT_BITS) - m) +
-                                   (1 << (WEDGE_WEIGHT_BITS - 1))) >>
-            WEDGE_WEIGHT_BITS;
-      }
-  } else if (subw == 1 && subh == 0) {
-    for (i = 0; i < h; ++i)
-      for (j = 0; j < w; ++j) {
-        int m = (mask[i * MASK_MASTER_STRIDE + (2 * j)] +
-                 mask[i * MASK_MASTER_STRIDE + (2 * j + 1)] + 1) >> 1;
-        dst[i * dst_stride + j] = (dst1[i * dst1_stride + j] * m +
-                                   dst2[i * dst2_stride + j] *
-                                   ((1 << WEDGE_WEIGHT_BITS) - m) +
-                                   (1 << (WEDGE_WEIGHT_BITS - 1))) >>
-            WEDGE_WEIGHT_BITS;
-      }
-  } else {
-    for (i = 0; i < h; ++i)
-      for (j = 0; j < w; ++j) {
-        int m = (mask[(2 * i) * MASK_MASTER_STRIDE + j] +
-                 mask[(2 * i + 1) * MASK_MASTER_STRIDE + j] + 1) >> 1;
-        dst[i * dst_stride + j] = (dst1[i * dst1_stride + j] * m +
-                                   dst2[i * dst2_stride + j] *
-                                   ((1 << WEDGE_WEIGHT_BITS) - m) +
-                                   (1 << (WEDGE_WEIGHT_BITS - 1))) >>
-            WEDGE_WEIGHT_BITS;
-      }
-  }
-}
-
-#if CONFIG_VP9_HIGHBITDEPTH
-static void build_masked_compound_highbd(uint8_t *dst_8, int dst_stride,
-                                         uint8_t *dst1_8, int dst1_stride,
-                                         uint8_t *dst2_8, int dst2_stride,
-                                         const uint8_t *mask,
-                                         int h, int w, int subh, int subw) {
-  int i, j;
-  uint16_t *dst = CONVERT_TO_SHORTPTR(dst_8);
-  uint16_t *dst1 = CONVERT_TO_SHORTPTR(dst1_8);
-  uint16_t *dst2 = CONVERT_TO_SHORTPTR(dst2_8);
-  if (subw == 0 && subh == 0) {
-    for (i = 0; i < h; ++i)
-      for (j = 0; j < w; ++j) {
-        int m = mask[i * MASK_MASTER_STRIDE + j];
-        dst[i * dst_stride + j] = (dst1[i * dst1_stride + j] * m +
-                                   dst2[i * dst2_stride + j] *
-                                   ((1 << WEDGE_WEIGHT_BITS) - m) +
-                                   (1 << (WEDGE_WEIGHT_BITS - 1))) >>
-            WEDGE_WEIGHT_BITS;
-      }
-  } else if (subw == 1 && subh == 1) {
-    for (i = 0; i < h; ++i)
-      for (j = 0; j < w; ++j) {
-        int m = (mask[(2 * i) * MASK_MASTER_STRIDE + (2 * j)] +
-                 mask[(2 * i + 1) * MASK_MASTER_STRIDE + (2 * j)] +
-                 mask[(2 * i) * MASK_MASTER_STRIDE + (2 * j + 1)] +
-                 mask[(2 * i + 1) * MASK_MASTER_STRIDE + (2 * j + 1)] + 2) >> 2;
-        dst[i * dst_stride + j] = (dst1[i * dst1_stride + j] * m +
-                                   dst2[i * dst2_stride + j] *
-                                   ((1 << WEDGE_WEIGHT_BITS) - m) +
-                                   (1 << (WEDGE_WEIGHT_BITS - 1))) >>
-            WEDGE_WEIGHT_BITS;
-      }
-  } else if (subw == 1 && subh == 0) {
-    for (i = 0; i < h; ++i)
-      for (j = 0; j < w; ++j) {
-        int m = (mask[i * MASK_MASTER_STRIDE + (2 * j)] +
-                 mask[i * MASK_MASTER_STRIDE + (2 * j + 1)] + 1) >> 1;
-        dst[i * dst_stride + j] = (dst1[i * dst1_stride + j] * m +
-                                   dst2[i * dst2_stride + j] *
-                                   ((1 << WEDGE_WEIGHT_BITS) - m) +
-                                   (1 << (WEDGE_WEIGHT_BITS - 1))) >>
-            WEDGE_WEIGHT_BITS;
-      }
-  } else {
-    for (i = 0; i < h; ++i)
-      for (j = 0; j < w; ++j) {
-        int m = (mask[(2 * i) * MASK_MASTER_STRIDE + j] +
-                 mask[(2 * i + 1) * MASK_MASTER_STRIDE + j] + 1) >> 1;
-        dst[i * dst_stride + j] = (dst1[i * dst1_stride + j] * m +
-                                   dst2[i * dst2_stride + j] *
-                                   ((1 << WEDGE_WEIGHT_BITS) - m) +
-                                   (1 << (WEDGE_WEIGHT_BITS - 1))) >>
-            WEDGE_WEIGHT_BITS;
-      }
-  }
-}
-#endif  // CONFIG_VP9_HIGHBITDEPTH

 #if CONFIG_SUPERTX
 static void build_masked_compound_wedge_extend(
@ -537,9 +425,11 @@ static void build_masked_compound_wedge_extend(
  const int subw = (2 << b_width_log2_lookup[sb_type]) == w;
  const uint8_t *mask = vp10_get_soft_mask(
     wedge_index, wedge_sign, sb_type, wedge_offset_x, wedge_offset_y);
-  build_masked_compound(dst, dst_stride,
-                        dst, dst_stride, dst2, dst2_stride, mask,
-                        h, w, subh, subw);
+  vpx_blend_mask6(dst, dst_stride,
+                  dst, dst_stride,
+                  dst2, dst2_stride,
+                  mask, MASK_MASTER_STRIDE,
+                  h, w, subh, subw);
 }

 #if CONFIG_VP9_HIGHBITDEPTH
@ -549,14 +439,16 @@ static void build_masked_compound_wedge_extend_highbd(
    int wedge_index, int wedge_sign,
    BLOCK_SIZE sb_type,
    int wedge_offset_x, int wedge_offset_y,
-    int h, int w) {
+    int h, int w, int bd) {
  const int subh = (2 << b_height_log2_lookup[sb_type]) == h;
  const int subw = (2 << b_width_log2_lookup[sb_type]) == w;
  const uint8_t *mask = vp10_get_soft_mask(
      wedge_index, wedge_sign, sb_type, wedge_offset_x, wedge_offset_y);
-  build_masked_compound_highbd(dst_8, dst_stride,
-                               dst_8, dst_stride, dst2_8, dst2_stride, mask,
-                               h, w, subh, subw);
+  vpx_highbd_blend_mask6(dst_8, dst_stride,
+                         dst_8, dst_stride,
+                         dst2_8, dst2_stride,
+                         mask, MASK_MASTER_STRIDE,
+                         h, w, subh, subw, bd);
 }
 #endif  // CONFIG_VP9_HIGHBITDEPTH

@ -573,9 +465,11 @@ static void build_masked_compound_wedge(uint8_t *dst, int dst_stride,
  const int subw = (2 << b_width_log2_lookup[sb_type]) == w;
  const uint8_t *mask = vp10_get_soft_mask(wedge_index, wedge_sign,
                                           sb_type, 0, 0);
-  build_masked_compound(dst, dst_stride,
-                        dst, dst_stride, dst2, dst2_stride, mask,
-                        h, w, subh, subw);
+  vpx_blend_mask6(dst, dst_stride,
+                  dst, dst_stride,
+                  dst2, dst2_stride,
+                  mask, MASK_MASTER_STRIDE,
+                  h, w, subh, subw);
 }

 #if CONFIG_VP9_HIGHBITDEPTH
@ -583,16 +477,18 @@ static void build_masked_compound_wedge_highbd(uint8_t *dst_8, int dst_stride,
                                               uint8_t *dst2_8, int dst2_stride,
                                               int wedge_index, int wedge_sign,
                                               BLOCK_SIZE sb_type,
-                                               int h, int w) {
+                                               int h, int w, int bd) {
  // Derive subsampling from h and w passed in. May be refactored to
  // pass in subsampling factors directly.
  const int subh = (2 << b_height_log2_lookup[sb_type]) == h;
  const int subw = (2 << b_width_log2_lookup[sb_type]) == w;
  const uint8_t *mask = vp10_get_soft_mask(wedge_index, wedge_sign,
                                           sb_type, 0, 0);
-  build_masked_compound_highbd(dst_8, dst_stride,
-                               dst_8, dst_stride, dst2_8, dst2_stride, mask,
-                               h, w, subh, subw);
+  vpx_highbd_blend_mask6(dst_8, dst_stride,
+                         dst_8, dst_stride,
+                         dst2_8, dst2_stride,
+                         mask, MASK_MASTER_STRIDE,
+                         h, w, subh, subw, bd);
 }
 #endif  // CONFIG_VP9_HIGHBITDEPTH
 #endif  // CONFIG_SUPERTX
@ -641,7 +537,7 @@ void vp10_make_masked_inter_predictor(
        mi->mbmi.interinter_wedge_index,
        mi->mbmi.interinter_wedge_sign,
        mi->mbmi.sb_type,
-        wedge_offset_x, wedge_offset_y, h, w);
+        wedge_offset_x, wedge_offset_y, h, w, xd->cur_buf->bit_depth);
  else
    build_masked_compound_wedge_extend(
        dst, dst_stride, tmp_dst, MAX_SB_SIZE,
@ -655,7 +551,7 @@ void vp10_make_masked_inter_predictor(
        dst, dst_stride, tmp_dst, MAX_SB_SIZE,
        mi->mbmi.interinter_wedge_index,
        mi->mbmi.interinter_wedge_sign,
-        mi->mbmi.sb_type, h, w);
+        mi->mbmi.sb_type, h, w, xd->cur_buf->bit_depth);
  else
    build_masked_compound_wedge(
        dst, dst_stride, tmp_dst, MAX_SB_SIZE,
@ -1872,10 +1768,11 @@ static void combine_interintra(INTERINTRA_MODE mode,
                                               bsize, 0, 0);
      const int subw = 2 * num_4x4_blocks_wide_lookup[bsize] == bw;
      const int subh = 2 * num_4x4_blocks_high_lookup[bsize] == bh;
-      build_masked_compound(comppred, compstride,
-                            intrapred, intrastride,
-                            interpred, interstride, mask,
-                            bh, bw, subh, subw);
+      vpx_blend_mask6(comppred, compstride,
+                      intrapred, intrastride,
+                      interpred, interstride,
+                      mask, MASK_MASTER_STRIDE,
+                      bh, bw, subh, subw);
    }
    return;
  }
@ -1995,7 +1892,6 @@ static void combine_interintra_highbd(INTERINTRA_MODE mode,
  uint16_t *comppred = CONVERT_TO_SHORTPTR(comppred8);
  uint16_t *interpred = CONVERT_TO_SHORTPTR(interpred8);
  uint16_t *intrapred = CONVERT_TO_SHORTPTR(intrapred8);
-  (void) bd;

  if (use_wedge_interintra) {
    if (is_interintra_wedge_used(bsize)) {
@ -2003,10 +1899,11 @@ static void combine_interintra_highbd(INTERINTRA_MODE mode,
                                               bsize, 0, 0);
      const int subh = 2 * num_4x4_blocks_high_lookup[bsize] == bh;
      const int subw = 2 * num_4x4_blocks_wide_lookup[bsize] == bw;
-      build_masked_compound_highbd(comppred8, compstride,
-                                   intrapred8, intrastride,
-                                   interpred8, interstride, mask,
-                                   bh, bw, subh, subw);
+      vpx_highbd_blend_mask6(comppred8, compstride,
+                             intrapred8, intrastride,
+                             interpred8, interstride,
+                             mask, MASK_MASTER_STRIDE,
+                             bh, bw, subh, subw, bd);
    }
    return;
  }
@ -2460,7 +2357,7 @@ static void build_wedge_inter_predictor_from_buf(MACROBLOCKD *xd, int plane,
            mi->mbmi.interinter_wedge_index,
            mi->mbmi.interinter_wedge_sign,
            mi->mbmi.sb_type,
-            wedge_offset_x, wedge_offset_y, h, w);
+            wedge_offset_x, wedge_offset_y, h, w, xd->cur_buf->bit_depth);
      } else {
        build_masked_compound_wedge_extend(
            dst, dst_buf->stride, tmp_dst, MAX_SB_SIZE,
@ -2484,7 +2381,8 @@ static void build_wedge_inter_predictor_from_buf(MACROBLOCKD *xd, int plane,
                                           MAX_SB_SIZE,
                                           mi->mbmi.interinter_wedge_index,
                                           mi->mbmi.interinter_wedge_sign,
-                                           mi->mbmi.sb_type, h, w);
+                                           mi->mbmi.sb_type, h, w,
+                                           xd->cur_buf->bit_depth);
      else
 #endif  // CONFIG_VP9_HIGHBITDEPTH
        build_masked_compound_wedge(dst, dst_buf->stride, tmp_dst, MAX_SB_SIZE,
--- a/vpx_dsp/blend_mask6.c
+++ b/vpx_dsp/blend_mask6.c
@ -0,0 +1,180 @
+/*
+ *  Copyright (c) 2016 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+
+#include "vpx/vpx_integer.h"
+#include "vpx_ports/mem.h"
+#include "vpx_dsp/vpx_dsp_common.h"
+
+#include "./vpx_dsp_rtcd.h"
+
+// Number of fractional bits in a blend weight: a weight m0 applied to src0 is
+// paired with (1 << MASK_BITS) - m0 applied to src1.
+#define MASK_BITS 6
+
+// Blends two blocks of 8-bit pixels into dst. For each of the h x w output
+// pixels a weight m0 is derived from mask and the result is
+//
+//   ROUND_POWER_OF_TWO(src0 * m0 + src1 * ((1 << MASK_BITS) - m0), MASK_BITS)
+//
+// Every buffer is indexed as row * stride + column using its own stride.
+// dst may be the same pointer as src0 or src1 provided the strides are equal
+// (asserted). h and w must be powers of two and at least 4 (asserted).
+//
+// subw and subh select how m0 is obtained for output pixel (i, j):
+//   subw == 0 && subh == 0: the mask value at (i, j).
+//   subw == 1 && subh == 1: ROUND_POWER_OF_TWO(sum, 2) of the four mask
+//                           values in rows 2i, 2i + 1 and columns 2j, 2j + 1.
+//   subw == 1 && subh == 0: ROUND_POWER_OF_TWO(sum, 1) of the mask values at
+//                           (i, 2j) and (i, 2j + 1).
+//   otherwise:              ROUND_POWER_OF_TWO(sum, 1) of the mask values at
+//                           (2i, j) and (2i + 1, j).
+void vpx_blend_mask6_c(uint8_t *dst, uint32_t dst_stride,
+                       uint8_t *src0, uint32_t src0_stride,
+                       uint8_t *src1, uint32_t src1_stride,
+                       const uint8_t *mask, uint32_t mask_stride,
+                       int h, int w, int subh, int subw) {
+  int i, j;
+
+  // Blending in place is only valid when the aliased buffers share a stride.
+  assert(IMPLIES(src0 == dst, src0_stride == dst_stride));
+  assert(IMPLIES(src1 == dst, src1_stride == dst_stride));
+
+  assert(h >= 4);
+  assert(w >= 4);
+  assert(IS_POWER_OF_TWO(h));
+  assert(IS_POWER_OF_TWO(w));
+
+  if (subw == 0 && subh == 0) {
+    for (i = 0; i < h; ++i)
+      for (j = 0; j < w; ++j) {
+        const int m0 = mask[i * mask_stride + j];
+        const int m1 = ((1 << MASK_BITS) - m0);
+        dst[i * dst_stride + j] =
+            ROUND_POWER_OF_TWO(src0[i * src0_stride + j] * m0 +
+                               src1[i * src1_stride + j] * m1, MASK_BITS);
+      }
+  } else if (subw == 1 && subh == 1) {
+    for (i = 0; i < h; ++i)
+      for (j = 0; j < w; ++j) {
+        const int m0 =
+            ROUND_POWER_OF_TWO(mask[(2 * i) * mask_stride + (2 * j)] +
+                               mask[(2 * i + 1) * mask_stride + (2 * j)] +
+                               mask[(2 * i) * mask_stride + (2 * j + 1)] +
+                               mask[(2 * i + 1) * mask_stride + (2 * j + 1)],
+                               2);
+        const int m1 = ((1 << MASK_BITS) - m0);
+        dst[i * dst_stride + j] =
+            ROUND_POWER_OF_TWO(src0[i * src0_stride + j] * m0 +
+                               src1[i * src1_stride + j] * m1, MASK_BITS);
+      }
+  } else if (subw == 1 && subh == 0) {
+    for (i = 0; i < h; ++i)
+      for (j = 0; j < w; ++j) {
+        const int m0 =
+            ROUND_POWER_OF_TWO(mask[i * mask_stride + (2 * j)] +
+                               mask[i * mask_stride + (2 * j + 1)], 1);
+        const int m1 = ((1 << MASK_BITS) - m0);
+        dst[i * dst_stride + j] =
+            ROUND_POWER_OF_TWO(src0[i * src0_stride + j] * m0 +
+                               src1[i * src1_stride + j] * m1, MASK_BITS);
+      }
+  } else {
+    for (i = 0; i < h; ++i)
+      for (j = 0; j < w; ++j) {
+        const int m0 =
+            ROUND_POWER_OF_TWO(mask[(2 * i) * mask_stride + j] +
+                               mask[(2 * i + 1) * mask_stride + j], 1);
+        const int m1 = ((1 << MASK_BITS) - m0);
+        dst[i * dst_stride + j] =
+            ROUND_POWER_OF_TWO(src0[i * src0_stride + j] * m0 +
+                               src1[i * src1_stride + j] * m1, MASK_BITS);
+      }
+  }
+}
+
+#if CONFIG_VP9_HIGHBITDEPTH
+// High bit-depth counterpart of vpx_blend_mask6_c. dst_8, src0_8 and src1_8
+// are converted with CONVERT_TO_SHORTPTR and accessed as uint16_t pixels,
+// with the strides applied to the converted pointers; mask stays 8-bit. The
+// blend formula and the handling of subw and subh match vpx_blend_mask6_c.
+// bd must be 8, 10 or 12 (asserted) and is not used in the arithmetic.
+void vpx_highbd_blend_mask6_c(uint8_t *dst_8, uint32_t dst_stride,
+                              uint8_t *src0_8, uint32_t src0_stride,
+                              uint8_t *src1_8, uint32_t src1_stride,
+                              const uint8_t *mask, uint32_t mask_stride,
+                              int h, int w, int subh, int subw, int bd) {
+  int i, j;
+  uint16_t *dst = CONVERT_TO_SHORTPTR(dst_8);
+  uint16_t *src0 = CONVERT_TO_SHORTPTR(src0_8);
+  uint16_t *src1 = CONVERT_TO_SHORTPTR(src1_8);
+
+  // Blending in place is only valid when the aliased buffers share a stride.
+  assert(IMPLIES(src0 == dst, src0_stride == dst_stride));
+  assert(IMPLIES(src1 == dst, src1_stride == dst_stride));
+
+  assert(h >= 4);
+  assert(w >= 4);
+  assert(IS_POWER_OF_TWO(h));
+  assert(IS_POWER_OF_TWO(w));
+
+  assert(bd == 8 || bd == 10 || bd == 12);
+  // bd is only read by the assert above; mark it used for NDEBUG builds.
+  (void)bd;
+
+  if (subw == 0 && subh == 0) {
+    for (i = 0; i < h; ++i)
+      for (j = 0; j < w; ++j) {
+        const int m0 = mask[i * mask_stride + j];
+        const int m1 = ((1 << MASK_BITS) - m0);
+        dst[i * dst_stride + j] =
+            ROUND_POWER_OF_TWO(src0[i * src0_stride + j] * m0 +
+                               src1[i * src1_stride + j] * m1, MASK_BITS);
+      }
+  } else if (subw == 1 && subh == 1) {
+    for (i = 0; i < h; ++i)
+      for (j = 0; j < w; ++j) {
+        const int m0 =
+            ROUND_POWER_OF_TWO(mask[(2 * i) * mask_stride + (2 * j)] +
+                               mask[(2 * i + 1) * mask_stride + (2 * j)] +
+                               mask[(2 * i) * mask_stride + (2 * j + 1)] +
+                               mask[(2 * i + 1) * mask_stride + (2 * j + 1)],
+                               2);
+        const int m1 = ((1 << MASK_BITS) - m0);
+        dst[i * dst_stride + j] =
+            ROUND_POWER_OF_TWO(src0[i * src0_stride + j] * m0 +
+                               src1[i * src1_stride + j] * m1, MASK_BITS);
+      }
+  } else if (subw == 1 && subh == 0) {
+    for (i = 0; i < h; ++i)
+      for (j = 0; j < w; ++j) {
+        const int m0 =
+            ROUND_POWER_OF_TWO(mask[i * mask_stride + (2 * j)] +
+                               mask[i * mask_stride + (2 * j + 1)], 1);
+        const int m1 = ((1 << MASK_BITS) - m0);
+        dst[i * dst_stride + j] =
+            ROUND_POWER_OF_TWO(src0[i * src0_stride + j] * m0 +
+                               src1[i * src1_stride + j] * m1, MASK_BITS);
+      }
+  } else {
+    for (i = 0; i < h; ++i)
+      for (j = 0; j < w; ++j) {
+        const int m0 =
+            ROUND_POWER_OF_TWO(mask[(2 * i) * mask_stride + j] +
+                               mask[(2 * i + 1) * mask_stride + j], 1);
+        const int m1 = ((1 << MASK_BITS) - m0);
+        dst[i * dst_stride + j] =
+            ROUND_POWER_OF_TWO(src0[i * src0_stride + j] * m0 +
+                               src1[i * src1_stride + j] * m1, MASK_BITS);
+      }
+  }
+}
+#endif  // CONFIG_VP9_HIGHBITDEPTH
--- a/vpx_dsp/vpx_dsp.mk
+++ b/vpx_dsp/vpx_dsp.mk
@ -65,6 +65,15 @@ DSP_SRCS-$(HAVE_DSPR2)  += mips/intrapred16_dspr2.c
 DSP_SRCS-$(HAVE_DSPR2)  += mips/common_dspr2.h
 DSP_SRCS-$(HAVE_DSPR2)  += mips/common_dspr2.c

+# inter predictions
+
+ifeq ($(CONFIG_VP10),yes)
+ifeq ($(CONFIG_EXT_INTER),yes)
+DSP_SRCS-yes            += blend_mask6.c
+DSP_SRCS-$(HAVE_SSE4_1) += x86/blend_mask6_sse4.c
+endif  #CONFIG_EXT_INTER
+endif  #CONFIG_VP10
+
 # interpolation filters
 DSP_SRCS-yes += vpx_convolve.c
 DSP_SRCS-yes += vpx_convolve.h
--- a/vpx_dsp/vpx_dsp_common.h
+++ b/vpx_dsp/vpx_dsp_common.h
@ -32,6 +32,8 @@ extern "C" {

 #define IMPLIES(a, b)  (!(a) || (b))  //  Logical 'a implies b' (or 'a -> b')

+#define IS_POWER_OF_TWO(x)  (((x) & ((x) - 1)) == 0)  //  Also true for x == 0
+
 // These can be used to give a hint about branch outcomes.
 // This can have an effect, even if your target processor has a
 // good branch predictor, as these hints can affect basic block
--- a/vpx_dsp/vpx_dsp_rtcd_defs.pl
+++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl
@ -1358,10 +1358,10 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
  }
 }  # CONFIG_VP9_HIGHBITDEPTH

+if (vpx_config("CONFIG_EXT_INTER") eq "yes") {
 #
 # Masked Variance / Masked Subpixel Variance
 #
-if (vpx_config("CONFIG_EXT_INTER") eq "yes") {
  foreach (@block_sizes) {
    ($w, $h) = @$_;
    add_proto qw/unsigned int/, "vpx_masked_variance${w}x${h}", "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *mask, int mask_stride, unsigned int *sse";
@ -1381,6 +1381,14 @@ if (vpx_config("CONFIG_EXT_INTER") eq "yes") {
      }
    }
  }
+
+  add_proto qw/void vpx_blend_mask6/, "uint8_t *dst, uint32_t dst_stride,  uint8_t *src0, uint32_t src0_stride, uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int h, int w, int suby, int subx";
+  specialize "vpx_blend_mask6", qw/sse4_1/;
+
+  if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
+    add_proto qw/void vpx_highbd_blend_mask6/, "uint8_t *dst, uint32_t dst_stride,  uint8_t *src0, uint32_t src0_stride, uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int h, int w, int suby, int subx, int bd";
+    specialize "vpx_highbd_blend_mask6", qw/sse4_1/;
+  }
 }

 #
--- a/vpx_dsp/x86/blend_mask6_sse4.c
+++ b/vpx_dsp/x86/blend_mask6_sse4.c