From a661bc87c40c95ccaa61c05d325760ed0224a6db Mon Sep 17 00:00:00 2001 From: Geza Lore Date: Fri, 20 May 2016 16:33:12 +0100 Subject: [PATCH] Add optimized vpx_blend_mask6 This is to replace vp10/common/reconinter.c:build_masked_compound. Functionality is equivalent, but the interface is slightly more generic. Total encoder speedup with ext-inter: ~7.5% Change-Id: Iee18b83ae324ffc9c7f7dc16d4b2b06adb4d4305 --- test/assertion_helpers.h | 278 ++++++++ test/blend_mask6_test.cc | 311 ++++++++ test/function_equivalence_test.h | 40 ++ test/randomise.h | 207 ++++++ test/snapshot.h | 104 +++ test/test.mk | 1 + vp10/common/reconinter.c | 178 +---- vpx_dsp/blend_mask6.c | 152 ++++ vpx_dsp/vpx_dsp.mk | 9 + vpx_dsp/vpx_dsp_common.h | 2 + vpx_dsp/vpx_dsp_rtcd_defs.pl | 10 +- vpx_dsp/x86/blend_mask6_sse4.c | 1146 ++++++++++++++++++++++++++++++ 12 files changed, 2297 insertions(+), 141 deletions(-) create mode 100644 test/assertion_helpers.h create mode 100644 test/blend_mask6_test.cc create mode 100644 test/function_equivalence_test.h create mode 100644 test/randomise.h create mode 100644 test/snapshot.h create mode 100644 vpx_dsp/blend_mask6.c create mode 100644 vpx_dsp/x86/blend_mask6_sse4.c diff --git a/test/assertion_helpers.h b/test/assertion_helpers.h new file mode 100644 index 000000000..108c40a94 --- /dev/null +++ b/test/assertion_helpers.h @@ -0,0 +1,278 @@ +/* + * Copyright (c) 2016 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + + +#ifndef TEST_ASSERTION_HELPERS_H_ +#define TEST_ASSERTION_HELPERS_H_ + +#include "third_party/googletest/src/include/gtest/gtest.h" + +namespace libvpx_test { +namespace assertion_helpers { + +// Arrays (1D) are element-wise equal +template +::testing::AssertionResult ArraysEq(const E (&a)[n], + const E (&b)[n]) { + for (size_t i = 0; i < n; i++) { + const E &va = a[i]; + const E &vb = b[i]; + if (va != vb) { + return ::testing::AssertionFailure() + << "Arrays do not equal at index " + << "[" << i << "]" + << " values are: " << va << " vs " << vb; + } + } + + return ::testing::AssertionSuccess(); +} + +// Arrays (1D) are element-wise equal +// within the index interval [lo, hi) +template +::testing::AssertionResult ArraysEqWithin(const E (&a)[n], + const E (&b)[n], + const size_t lo, + const size_t hi) { + assert(hi > lo); + assert(hi <= n); + + for (size_t i = lo; i < hi; i++) { + const E &va = a[i]; + const E &vb = b[i]; + if (va != vb) { + return ::testing::AssertionFailure() + << "Arrays do not equal at index " + << "[" << i << "]" + << " values are: " << va << " vs " << vb; + } + } + + return ::testing::AssertionSuccess(); +} + +// Arrays (1D) are element-wise equal +// outside the index interval [lo, hi) +template +::testing::AssertionResult ArraysEqOutside(const E (&a)[n], + const E (&b)[n], + const size_t lo, + const size_t hi) { + assert(hi > lo); + assert(hi <= n); + + for (size_t i = 0; i < n; i++) { + if (lo <= i && i < hi) + continue; + + const E &va = a[i]; + const E &vb = b[i]; + if (va != vb) { + return ::testing::AssertionFailure() + << "Arrays do not equal at index " + << "[" << i << "]" + << " values are: " << va << " vs " << vb; + } + } + + return ::testing::AssertionSuccess(); +} + +// Arrays (2D) are element-wise equal +template +::testing::AssertionResult ArraysEq(const E (&a)[n][m], + const E (&b)[n][m]) { + for (size_t i = 0; i < n; i++) { + for (size_t j = 0; j < m; j++) { + const E &va = a[i][j]; + const E &vb = b[i][j]; + if (va != vb) { + return ::testing::AssertionFailure() + << "Arrays do not equal at index " + << "[" << i << "][" << j << "]" + << " values are: " << va << " vs " << vb; + } + } + } + + return ::testing::AssertionSuccess(); +} + +// Arrays (2D) are element-wise equal +// within the index interval [lo0, hi0) x [lo1, hi1) (Cartesian product) +template +::testing::AssertionResult ArraysEqWithin(const E (&a)[n][m], + const E (&b)[n][m], + const size_t lo0, + const size_t hi0, + const size_t lo1, + const size_t hi1) { + assert(hi0 > lo0); + assert(hi0 <= n); + assert(hi1 > lo1); + assert(hi1 <= m); + + for (size_t i = lo0; i < hi0; i++) { + for (size_t j = lo1; j < hi1; j++) { + const E &va = a[i][j]; + const E &vb = b[i][j]; + if (va != vb) { + return ::testing::AssertionFailure() + << "Arrays do not equal at index " + << "[" << i << "][" << j << "]" + << " values are: " << va << " vs " << vb; + } + } + } + + return ::testing::AssertionSuccess(); +} + +// Arrays (2D) are element-wise equal +// outside the index interval [lo0, hi0) x [lo1, hi1) (Cartesian product) +template +::testing::AssertionResult ArraysEqOutside(const E (&a)[n][m], + const E (&b)[n][m], + const size_t lo0, + const size_t hi0, + const size_t lo1, + const size_t hi1) { + assert(hi0 > lo0); + assert(hi0 <= n); + assert(hi1 > lo1); + assert(hi1 <= m); + + for (size_t i = 0; i < n; i++) { + if (lo0 <= i && i < hi0) + continue; + + for (size_t j = 0; j < m; j++) { + if (lo1 <= j && j < hi1) + continue; + + const E &va = a[i][j]; + const E &vb = b[i][j]; + if (va != vb) { + 
return ::testing::AssertionFailure() + << "Arrays do not equal at index " + << "[" << i << "][" << j << "]" + << " values are: " << va << " vs " << vb; + } + } + } + + return ::testing::AssertionSuccess(); +} + +// Non contiguous 2D array buffers are element-wise equal +// at corresponding linear indices specified by rows/cols/stride/offset +template +::testing::AssertionResult BuffersEqWithin(const E (&a)[n][m], + const E (&b)[n][m], + const size_t stridea, + const size_t strideb, + const size_t offseta, + const size_t offsetb, + const size_t rows, + const size_t cols) { + assert(rows <= n); + assert(cols <= m); + assert(stridea <= m); + assert(strideb <= m); + assert(cols <= stridea); + assert(cols <= strideb); + assert(offseta < n * m); + assert(offsetb < n * m); + assert(offseta + (rows - 1) * stridea + (cols - 1) < n * m); + assert(offsetb + (rows - 1) * strideb + (cols - 1) < n * m); + + const E *pa = &a[0][0] + offseta; + const E *pb = &b[0][0] + offsetb; + + for (size_t r = 0 ; r < rows ; r++) { + for (size_t c = 0 ; c < cols ; c++) { + const E &va = pa[c]; + const E &vb = pb[c]; + if (va != vb) { + return ::testing::AssertionFailure() + << "Arrays do not equal at linear index " + << "[" << pa - &a[0][0] << "] vs [" << pb - &b[0][0] << "]" + << " row=" << r << " col=" << c + << " values are: " << va << " vs " << vb; + } + } + pa += stridea; + pb += strideb; + } + + return ::testing::AssertionSuccess(); +} + +// Non contiguous 2D array buffers are element-wise equal +// except at corresponding linear indices specified by +// rows/cols/stride/offset. +template +::testing::AssertionResult BuffersEqOutside(const E (&a)[n][m], + const E (&b)[n][m], + const size_t stride, + const size_t offset, + const size_t rows, + const size_t cols ) { + assert(rows <= n); + assert(cols <= m); + assert(stride <= m); + assert(cols <= stride); + assert(offset < n * m); + assert(offset + (rows - 1) * stride + (cols - 1) < n * m); + + const E *const pa = &a[0][0]; + const E *const pb = &b[0][0]; + + size_t idx = 0; + size_t r = 0; + size_t end = offset; // beginning of first row + + while (idx < n * m) { + while (idx < end) { // until beginning of row or end of buffer + const E &va = pa[idx]; + const E &vb = pb[idx]; + if (va != vb) { + return ::testing::AssertionFailure() + << "Arrays do not equal at index " + << "[" << idx / m << "][" << idx % m << "]" + << " values are: " << va << " vs " << vb; + } + + idx++; + } + + // Move past row end + idx += cols; + + if (++r < rows) { + // Move to next row + end += stride; + } else { + // Move to end of buffer + end = n * m; + } + } + + // Sanity check + assert(idx == n * m + cols); + + return ::testing::AssertionSuccess(); +} + +} // namespace assertion_helpers +} // namespace libvpx_test + +#endif // TEST_ASSERTION_HELPERS_H_ diff --git a/test/blend_mask6_test.cc b/test/blend_mask6_test.cc new file mode 100644 index 000000000..d737dddb5 --- /dev/null +++ b/test/blend_mask6_test.cc @@ -0,0 +1,311 @@ +/* + * Copyright (c) 2016 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#include +#include +#include + +#include "third_party/googletest/src/include/gtest/gtest.h" +#include "test/register_state_check.h" + +#include "test/function_equivalence_test.h" +#include "test/randomise.h" +#include "test/snapshot.h" + +#include "./vpx_config.h" +#include "./vpx_dsp_rtcd.h" +#include "vpx/vpx_integer.h" + +#include "./vp10_rtcd.h" + +#include "test/assertion_helpers.h" +#include "vp10/common/enums.h" + +using libvpx_test::assertion_helpers::BuffersEqWithin; +using libvpx_test::assertion_helpers::BuffersEqOutside; +using libvpx_test::assertion_helpers::ArraysEq; +using libvpx_test::FunctionEquivalenceTest; +using libvpx_test::Snapshot; +using libvpx_test::Randomise; +using std::tr1::make_tuple; + +namespace { + +template +class BlendMask6Test : public FunctionEquivalenceTest { + protected: + virtual ~BlendMask6Test() {} + + virtual void Execute(T *p_src0, T *p_src1) = 0; + + void Common() { + w = 1 << randomise.uniform(2, MAX_SB_SIZE_LOG2 + 1); + h = 1 << randomise.uniform(2, MAX_SB_SIZE_LOG2 + 1); + + randomise(subx); + randomise(suby); + + randomise(dst_offset, 0, 32); + randomise(dst_stride, w, MAX_SB_SIZE * 5 + 1); + + randomise(src0_offset, 0, 32); + randomise(src0_stride, w, MAX_SB_SIZE * 5 + 1); + + randomise(src1_offset, 0, 32); + randomise(src1_stride, w, MAX_SB_SIZE * 5 + 1); + + randomise(mask_stride, w * (subx ? 2: 1), 2 * MAX_SB_SIZE + 1); + + T *p_src0; + T *p_src1; + + switch (randomise.uniform(3)) { + case 0: // Separate sources + p_src0 = &src0[0][0]; + p_src1 = &src1[0][0]; + break; + case 1: // src0 == dst + p_src0 = &dst_tst[0][0]; + src0_stride = dst_stride; + src0_offset = dst_offset; + p_src1 = &src1[0][0]; + break; + case 2: // src1 == dst + p_src0 = &src0[0][0]; + p_src1 = &dst_tst[0][0]; + src1_stride = dst_stride; + src1_offset = dst_offset; + break; + default: + FAIL(); + } + + ////////////////////////////////////////////////////////////////////////// + // Prepare + ////////////////////////////////////////////////////////////////////////// + + snapshot(dst_ref); + snapshot(dst_tst); + + snapshot(src0); + snapshot(src1); + + snapshot(mask); + + ////////////////////////////////////////////////////////////////////////// + // Execute + ////////////////////////////////////////////////////////////////////////// + + Execute(p_src0, p_src1); + + ////////////////////////////////////////////////////////////////////////// + // Check + ////////////////////////////////////////////////////////////////////////// + + ASSERT_TRUE(BuffersEqWithin(dst_ref, dst_tst, + dst_stride, dst_stride, + dst_offset, dst_offset, + h, w)); + + ASSERT_TRUE(ArraysEq(snapshot.get(src0), src0)); + ASSERT_TRUE(ArraysEq(snapshot.get(src1), src1)); + ASSERT_TRUE(ArraysEq(snapshot.get(mask), mask)); + + ASSERT_TRUE(BuffersEqOutside(snapshot.get(dst_ref), dst_ref, + dst_stride, + dst_offset, + h, w)); + + ASSERT_TRUE(BuffersEqOutside(snapshot.get(dst_tst), dst_tst, + dst_stride, + dst_offset, + h, w)); + } + + Snapshot snapshot; + Randomise randomise; + + T dst_ref[MAX_SB_SIZE][MAX_SB_SIZE * 5]; + T dst_tst[MAX_SB_SIZE][MAX_SB_SIZE * 5]; + size_t dst_stride; + size_t dst_offset; + + T src0[MAX_SB_SIZE][MAX_SB_SIZE * 5]; + size_t src0_stride; + size_t src0_offset; + + T src1[MAX_SB_SIZE][MAX_SB_SIZE * 5]; + size_t src1_stride; + size_t src1_offset; + + uint8_t mask[2 * MAX_SB_SIZE][2 * MAX_SB_SIZE]; + size_t mask_stride; + + int w; + int h; + + bool suby; + bool subx; +}; + +////////////////////////////////////////////////////////////////////////////// +// 8 bit version 
+////////////////////////////////////////////////////////////////////////////// + +typedef void (*F8B)(uint8_t *dst, uint32_t dst_stride, + uint8_t *src0, uint32_t src0_stride, + uint8_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, + int h, int w, int suby, int subx); + +class BlendMask6Test8B : public BlendMask6Test { + protected: + void Execute(uint8_t *p_src0, uint8_t *p_src1) { + ref_func_(&dst_ref[0][dst_offset], dst_stride, + p_src0 + src0_offset, src0_stride, + p_src1 + src1_offset, src1_stride, + &mask[0][0], sizeof(mask[0]), + h, w, suby, subx); + + ASM_REGISTER_STATE_CHECK( + tst_func_(&dst_tst[0][dst_offset], dst_stride, + p_src0 + src0_offset, src0_stride, + p_src1 + src1_offset, src1_stride, + &mask[0][0], sizeof(mask[0]), + h, w, suby, subx)); + } +}; + +TEST_P(BlendMask6Test8B, RandomValues) { + for (int i = 0 ; i < 10000 && !HasFatalFailure(); i++) { + ////////////////////////////////////////////////////////////////////////// + // Randomise + ////////////////////////////////////////////////////////////////////////// + + randomise(dst_ref); + randomise(dst_tst); + + randomise(src0); + randomise(src1); + + randomise(mask, 65); + + Common(); + } +} + +TEST_P(BlendMask6Test8B, ExtremeValues) { + for (int i = 0 ; i < 1000 && !HasFatalFailure(); i++) { + ////////////////////////////////////////////////////////////////////////// + // Randomise + ////////////////////////////////////////////////////////////////////////// + + randomise(dst_ref, 254, 256); + randomise(dst_tst, 254, 256); + + randomise(src0, 254, 256); + randomise(src1, 254, 256); + + randomise(mask, 63, 65); + + Common(); + } +} + +#if HAVE_SSE4_1 +INSTANTIATE_TEST_CASE_P( + SSE4_1_C_COMPARE, BlendMask6Test8B, + ::testing::Values(make_tuple(&vpx_blend_mask6_c, &vpx_blend_mask6_sse4_1))); +#endif // HAVE_SSE4_1 + +#if CONFIG_VP9_HIGHBITDEPTH +////////////////////////////////////////////////////////////////////////////// +// High bit-depth version +////////////////////////////////////////////////////////////////////////////// + +typedef void (*FHBD)(uint8_t *dst, uint32_t dst_stride, + uint8_t *src0, uint32_t src0_stride, + uint8_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, + int h, int w, int suby, int subx, int bd); + +class BlendMask6TestHBD : public BlendMask6Test { + protected: + void Execute(uint16_t *p_src0, uint16_t *p_src1) { + ref_func_(CONVERT_TO_BYTEPTR(&dst_ref[0][dst_offset]), dst_stride, + CONVERT_TO_BYTEPTR(p_src0 + src0_offset), src0_stride, + CONVERT_TO_BYTEPTR(p_src1 + src1_offset), src1_stride, + &mask[0][0], sizeof(mask[0]), + h, w, suby, subx, bit_depth); + + ASM_REGISTER_STATE_CHECK( + tst_func_(CONVERT_TO_BYTEPTR(&dst_tst[0][dst_offset]), dst_stride, + CONVERT_TO_BYTEPTR(p_src0 + src0_offset), src0_stride, + CONVERT_TO_BYTEPTR(p_src1 + src1_offset), src1_stride, + &mask[0][0], sizeof(mask[0]), + h, w, suby, subx, bit_depth)); + } + + int bit_depth; +}; + +TEST_P(BlendMask6TestHBD, RandomValues) { + for (int i = 0 ; i < 10000 && !HasFatalFailure(); i++) { + ////////////////////////////////////////////////////////////////////////// + // Randomise + ////////////////////////////////////////////////////////////////////////// + + bit_depth = randomise.choice(8, 10, 12); + + const int hi = 1 << bit_depth; + + randomise(dst_ref, hi); + randomise(dst_tst, hi); + + randomise(src0, hi); + randomise(src1, hi); + + randomise(mask, 65); + + Common(); + } +} + +TEST_P(BlendMask6TestHBD, ExtremeValues) { + for (int i = 0 ; i < 1000 && 
!HasFatalFailure(); i++) {
+    //////////////////////////////////////////////////////////////////////////
+    // Randomise
+    //////////////////////////////////////////////////////////////////////////
+
+    bit_depth = randomise.choice(8, 10, 12);
+
+    const int hi = 1 << bit_depth;
+    const int lo = hi - 2;
+
+    randomise(dst_ref, lo, hi);
+    randomise(dst_tst, lo, hi);
+
+    randomise(src0, lo, hi);
+    randomise(src1, lo, hi);
+
+    randomise(mask, 63, 65);
+
+    Common();
+  }
+}
+
+#if HAVE_SSE4_1
+INSTANTIATE_TEST_CASE_P(
+    SSE4_1_C_COMPARE, BlendMask6TestHBD,
+    ::testing::Values(make_tuple(&vpx_highbd_blend_mask6_c,
+                                 &vpx_highbd_blend_mask6_sse4_1)));
+#endif  // HAVE_SSE4_1
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+}  // namespace
diff --git a/test/function_equivalence_test.h b/test/function_equivalence_test.h
new file mode 100644
index 000000000..50ad4c560
--- /dev/null
+++ b/test/function_equivalence_test.h
@@ -0,0 +1,40 @@
+/*
+ * Copyright (c) 2016 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef TEST_FUNCTION_EQUIVALENCE_TEST_H_
+#define TEST_FUNCTION_EQUIVALENCE_TEST_H_
+
+#include "third_party/googletest/src/include/gtest/gtest.h"
+#include "test/clear_system_state.h"
+#include "test/util.h"
+
+namespace libvpx_test {
+template<typename T>
+class FunctionEquivalenceTest :
+    public ::testing::TestWithParam< std::tr1::tuple< T, T > > {
+ public:
+  virtual ~FunctionEquivalenceTest() {}
+
+  virtual void SetUp() {
+    ref_func_ = std::tr1::get<0>(this->GetParam());
+    tst_func_ = std::tr1::get<1>(this->GetParam());
+  }
+
+  virtual void TearDown() {
+    libvpx_test::ClearSystemState();
+  }
+
+ protected:
+  T ref_func_;
+  T tst_func_;
+};
+
+}  // namespace libvpx_test
+#endif  // TEST_FUNCTION_EQUIVALENCE_TEST_H_
diff --git a/test/randomise.h b/test/randomise.h
new file mode 100644
index 000000000..fbf419c68
--- /dev/null
+++ b/test/randomise.h
@@ -0,0 +1,207 @@
+/*
+ * Copyright (c) 2016 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef TEST_RANDOMISE_H_
+#define TEST_RANDOMISE_H_
+
+#include <stddef.h>
+
+#include <limits>
+
+#include "third_party/googletest/src/include/gtest/gtest.h"
+
+#include "test/acm_random.h"
+
+namespace libvpx_test {
+
+// TODO(any): Replace this when built with C++11
+#define STATIC_ASSERT_INTEGER_TYPE_(T) \
+  GTEST_COMPILE_ASSERT_(std::numeric_limits<T>::is_integer, \
+                        integer_type_required);
+
+/**
+ * Deterministic random number generator with various convenience methods.
+ */ +class Randomise { + public: + Randomise() { + rnd_.Reset(ACMRandom::DeterministicSeed()); + } + + virtual ~Randomise() { } + + // Uniformly distributed random number from the range + // [std::numeric_limits::min(), and std::numeric_limits::max()] + template + R uniform() { + STATIC_ASSERT_INTEGER_TYPE_(R); + } + + // Uniformly distributed random number from the range + // [0, hi) + template + R uniform(H hi) { + assert(hi > 0); + R v = uniform(); + if (std::numeric_limits::is_signed && v < 0) + return -v % hi; + else + return v % hi; + } + + // Uniformly distributed random number from the range + // [lo, hi) + template + R uniform(L lo, H hi) { + assert(hi > lo); + return uniform(hi - lo) + lo; + } + + // Randomly pick and return one of the arguments + template + T choice(T v0, T v1) { + switch (uniform(2)) { + case 0: return v0; + default: return v1; + } + } + + // Randomly pick and return one of the arguments + template + T choice(T v0, T v1, T v2) { + switch (uniform(3)) { + case 0: return v0; + case 1: return v1; + default: return v2; + } + } + + template + void operator()(T &e) { // NOLINT + STATIC_ASSERT_INTEGER_TYPE_(T); + e = uniform(); + } + + template + void operator()(T &e, H hi) { // NOLINT + STATIC_ASSERT_INTEGER_TYPE_(T); + e = uniform(hi); + } + + template + void operator()(T &e, L lo, H hi) { // NOLINT + STATIC_ASSERT_INTEGER_TYPE_(T); + e = uniform(lo, hi); + } + + template + void operator()(T (&arr)[n]) { + STATIC_ASSERT_INTEGER_TYPE_(T); + for (size_t i = 0; i < n ; i++) { + arr[i] = uniform(); + } + } + + template + void operator()(T (&arr)[n], H hi) { + STATIC_ASSERT_INTEGER_TYPE_(T); + for (size_t i = 0; i < n ; i++) { + arr[i] = uniform(hi); + } + } + + template + void operator()(T (&arr)[n], L lo, H hi) { + STATIC_ASSERT_INTEGER_TYPE_(T); + for (size_t i = 0; i < n ; i++) { + arr[i] = uniform(lo, hi); + } + } + + template + void operator()(T (&arr)[n][m]) { + STATIC_ASSERT_INTEGER_TYPE_(T); + for (size_t i = 0; i < n ; i++) { + for (size_t j = 0; j < m ; j++) { + arr[i][j] = uniform(); + } + } + } + + template + void operator()(T (&arr)[n][m], H hi) { + STATIC_ASSERT_INTEGER_TYPE_(T); + for (size_t i = 0; i < n ; i++) { + for (size_t j = 0; j < m ; j++) { + arr[i][j] = uniform(hi); + } + } + } + + template + void operator()(T (&arr)[n][m], L lo, H hi) { + STATIC_ASSERT_INTEGER_TYPE_(T); + for (size_t i = 0; i < n ; i++) { + for (size_t j = 0; j < m ; j++) { + arr[i][j] = uniform(lo, hi); + } + } + } + + private: + libvpx_test::ACMRandom rnd_; +}; + +// Add further specialisations as necessary + +template<> +bool Randomise::uniform() { + return rnd_.Rand8() & 1 ? 
true : false; +} + +template<> +uint8_t Randomise::uniform() { + return rnd_.Rand8(); +} + +template<> +uint16_t Randomise::uniform() { + return rnd_.Rand16(); +} + +template<> +uint32_t Randomise::uniform() { + const uint32_t l = uniform(); + const uint32_t h = uniform(); + return h << 16 | l; +} + +template<> +uint64_t Randomise::uniform() { + const uint64_t l = uniform(); + const uint64_t h = uniform(); + return h << 32 | l; +} + +template<> +int8_t Randomise::uniform() { return uniform(); } + +template<> +int16_t Randomise::uniform() { return uniform(); } + +template<> +int32_t Randomise::uniform() { return uniform(); } + +template<> +int64_t Randomise::uniform() { return uniform(); } + +} // namespace libvpx_test + +#endif // TEST_RANDOMISE_H_ diff --git a/test/snapshot.h b/test/snapshot.h new file mode 100644 index 000000000..b67eddef7 --- /dev/null +++ b/test/snapshot.h @@ -0,0 +1,104 @@ +/* + * Copyright (c) 2016 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef TEST_SNAPSHOT_H_ +#define TEST_SNAPSHOT_H_ + +#include + +namespace libvpx_test { + +/** + * Allows capturing and retrieving snapshots of arbitrary blobs of memory, + * blob size is based on compile time type information. + * + * Usage: + * void example() { + * Snapshot snapshot; + * + * int foo = 4; + * + * snapshot(foo); + * + * foo = 10; + * + * assert(snapshot.get(foo) == 4); // Pass + * assert(snapshot.get(foo) == foo); // Fail (4 != 10) + * + * char bar[10][10]; + * memset(bar, 3, sizeof(bar)); + * + * snapshot(bar); + * + * memset(bar, 8, sizeof(bar)); + * + * assert(sum(bar) == 800); // Pass + * assert(sum(snapshot.get(bar)) == 300); // Pass + * } + */ +class Snapshot { + public: + virtual ~Snapshot() { + for (snapshot_map_t::iterator it = snapshots_.begin(); + it != snapshots_.end(); it++) { + delete[] it->second; + } + } + + /** + * Take new snapshot for object + */ + template + void take(const E &e) { + const void *const key = reinterpret_cast(&e); + + snapshot_map_t::iterator it = snapshots_.find(key); + + if (it != snapshots_.end()) + delete[] it->second; + + char *const buf = new char[sizeof(E)]; + + memcpy(buf, &e, sizeof(E)); + + snapshots_[key] = buf; + } + + /** + * Same as 'take' + */ + template + void operator()(const E &e) { + take(e); + } + + /** + * Retrieve last snapshot for object + */ + template + const E& get(const E &e) const { + const void *const key = reinterpret_cast(&e); + + snapshot_map_t::const_iterator it = snapshots_.find(key); + + assert(it != snapshots_.end()); + + return *reinterpret_cast(it->second); + } + + private: + typedef std::map snapshot_map_t; + + snapshot_map_t snapshots_; +}; + +} // namespace libvpx_test + +#endif // TEST_SNAPSHOT_H_ diff --git a/test/test.mk b/test/test.mk index 77b00a5b9..339e274de 100644 --- a/test/test.mk +++ b/test/test.mk @@ -178,6 +178,7 @@ LIBVPX_TEST_SRCS-$(CONFIG_VP10_ENCODER) += subtract_test.cc ifeq ($(CONFIG_EXT_INTER),yes) LIBVPX_TEST_SRCS-$(HAVE_SSSE3) += masked_variance_test.cc LIBVPX_TEST_SRCS-$(HAVE_SSSE3) += masked_sad_test.cc +LIBVPX_TEST_SRCS-$(CONFIG_VP10_ENCODER) += blend_mask6_test.cc endif ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes) diff --git a/vp10/common/reconinter.c b/vp10/common/reconinter.c 
index d6ac4bb8d..825fff390 100644 --- a/vp10/common/reconinter.c +++ b/vp10/common/reconinter.c @@ -11,6 +11,7 @@ #include #include "./vpx_scale_rtcd.h" +#include "./vpx_dsp_rtcd.h" #include "./vpx_config.h" #include "vpx/vpx_integer.h" @@ -410,119 +411,6 @@ const uint8_t *vp10_get_soft_mask(int wedge_index, return mask; } -static void build_masked_compound(uint8_t *dst, int dst_stride, - uint8_t *dst1, int dst1_stride, - uint8_t *dst2, int dst2_stride, - const uint8_t *mask, - int h, int w, int subh, int subw) { - int i, j; - if (subw == 0 && subh == 0) { - for (i = 0; i < h; ++i) - for (j = 0; j < w; ++j) { - int m = mask[i * MASK_MASTER_STRIDE + j]; - dst[i * dst_stride + j] = (dst1[i * dst1_stride + j] * m + - dst2[i * dst2_stride + j] * - ((1 << WEDGE_WEIGHT_BITS) - m) + - (1 << (WEDGE_WEIGHT_BITS - 1))) >> - WEDGE_WEIGHT_BITS; - - } - } else if (subw == 1 && subh == 1) { - for (i = 0; i < h; ++i) - for (j = 0; j < w; ++j) { - int m = (mask[(2 * i) * MASK_MASTER_STRIDE + (2 * j)] + - mask[(2 * i + 1) * MASK_MASTER_STRIDE + (2 * j)] + - mask[(2 * i) * MASK_MASTER_STRIDE + (2 * j + 1)] + - mask[(2 * i + 1) * MASK_MASTER_STRIDE + (2 * j + 1)] + 2) >> 2; - dst[i * dst_stride + j] = (dst1[i * dst1_stride + j] * m + - dst2[i * dst2_stride + j] * - ((1 << WEDGE_WEIGHT_BITS) - m) + - (1 << (WEDGE_WEIGHT_BITS - 1))) >> - WEDGE_WEIGHT_BITS; - } - } else if (subw == 1 && subh == 0) { - for (i = 0; i < h; ++i) - for (j = 0; j < w; ++j) { - int m = (mask[i * MASK_MASTER_STRIDE + (2 * j)] + - mask[i * MASK_MASTER_STRIDE + (2 * j + 1)] + 1) >> 1; - dst[i * dst_stride + j] = (dst1[i * dst1_stride + j] * m + - dst2[i * dst2_stride + j] * - ((1 << WEDGE_WEIGHT_BITS) - m) + - (1 << (WEDGE_WEIGHT_BITS - 1))) >> - WEDGE_WEIGHT_BITS; - } - } else { - for (i = 0; i < h; ++i) - for (j = 0; j < w; ++j) { - int m = (mask[(2 * i) * MASK_MASTER_STRIDE + j] + - mask[(2 * i + 1) * MASK_MASTER_STRIDE + j] + 1) >> 1; - dst[i * dst_stride + j] = (dst1[i * dst1_stride + j] * m + - dst2[i * dst2_stride + j] * - ((1 << WEDGE_WEIGHT_BITS) - m) + - (1 << (WEDGE_WEIGHT_BITS - 1))) >> - WEDGE_WEIGHT_BITS; - } - } -} - -#if CONFIG_VP9_HIGHBITDEPTH -static void build_masked_compound_highbd(uint8_t *dst_8, int dst_stride, - uint8_t *dst1_8, int dst1_stride, - uint8_t *dst2_8, int dst2_stride, - const uint8_t *mask, - int h, int w, int subh, int subw) { - int i, j; - uint16_t *dst = CONVERT_TO_SHORTPTR(dst_8); - uint16_t *dst1 = CONVERT_TO_SHORTPTR(dst1_8); - uint16_t *dst2 = CONVERT_TO_SHORTPTR(dst2_8); - if (subw == 0 && subh == 0) { - for (i = 0; i < h; ++i) - for (j = 0; j < w; ++j) { - int m = mask[i * MASK_MASTER_STRIDE + j]; - dst[i * dst_stride + j] = (dst1[i * dst1_stride + j] * m + - dst2[i * dst2_stride + j] * - ((1 << WEDGE_WEIGHT_BITS) - m) + - (1 << (WEDGE_WEIGHT_BITS - 1))) >> - WEDGE_WEIGHT_BITS; - } - } else if (subw == 1 && subh == 1) { - for (i = 0; i < h; ++i) - for (j = 0; j < w; ++j) { - int m = (mask[(2 * i) * MASK_MASTER_STRIDE + (2 * j)] + - mask[(2 * i + 1) * MASK_MASTER_STRIDE + (2 * j)] + - mask[(2 * i) * MASK_MASTER_STRIDE + (2 * j + 1)] + - mask[(2 * i + 1) * MASK_MASTER_STRIDE + (2 * j + 1)] + 2) >> 2; - dst[i * dst_stride + j] = (dst1[i * dst1_stride + j] * m + - dst2[i * dst2_stride + j] * - ((1 << WEDGE_WEIGHT_BITS) - m) + - (1 << (WEDGE_WEIGHT_BITS - 1))) >> - WEDGE_WEIGHT_BITS; - } - } else if (subw == 1 && subh == 0) { - for (i = 0; i < h; ++i) - for (j = 0; j < w; ++j) { - int m = (mask[i * MASK_MASTER_STRIDE + (2 * j)] + - mask[i * MASK_MASTER_STRIDE + (2 * j + 1)] + 1) >> 1; - dst[i * 
dst_stride + j] = (dst1[i * dst1_stride + j] * m + - dst2[i * dst2_stride + j] * - ((1 << WEDGE_WEIGHT_BITS) - m) + - (1 << (WEDGE_WEIGHT_BITS - 1))) >> - WEDGE_WEIGHT_BITS; - } - } else { - for (i = 0; i < h; ++i) - for (j = 0; j < w; ++j) { - int m = (mask[(2 * i) * MASK_MASTER_STRIDE + j] + - mask[(2 * i + 1) * MASK_MASTER_STRIDE + j] + 1) >> 1; - dst[i * dst_stride + j] = (dst1[i * dst1_stride + j] * m + - dst2[i * dst2_stride + j] * - ((1 << WEDGE_WEIGHT_BITS) - m) + - (1 << (WEDGE_WEIGHT_BITS - 1))) >> - WEDGE_WEIGHT_BITS; - } - } -} -#endif // CONFIG_VP9_HIGHBITDEPTH #if CONFIG_SUPERTX static void build_masked_compound_wedge_extend( @@ -537,9 +425,11 @@ static void build_masked_compound_wedge_extend( const int subw = (2 << b_width_log2_lookup[sb_type]) == w; const uint8_t *mask = vp10_get_soft_mask( wedge_index, wedge_sign, sb_type, wedge_offset_x, wedge_offset_y); - build_masked_compound(dst, dst_stride, - dst, dst_stride, dst2, dst2_stride, mask, - h, w, subh, subw); + vpx_blend_mask6(dst, dst_stride, + dst, dst_stride, + dst2, dst2_stride, + mask, MASK_MASTER_STRIDE, + h, w, subh, subw); } #if CONFIG_VP9_HIGHBITDEPTH @@ -549,14 +439,16 @@ static void build_masked_compound_wedge_extend_highbd( int wedge_index, int wedge_sign, BLOCK_SIZE sb_type, int wedge_offset_x, int wedge_offset_y, - int h, int w) { + int h, int w, int bd) { const int subh = (2 << b_height_log2_lookup[sb_type]) == h; const int subw = (2 << b_width_log2_lookup[sb_type]) == w; const uint8_t *mask = vp10_get_soft_mask( wedge_index, wedge_sign, sb_type, wedge_offset_x, wedge_offset_y); - build_masked_compound_highbd(dst_8, dst_stride, - dst_8, dst_stride, dst2_8, dst2_stride, mask, - h, w, subh, subw); + vpx_highbd_blend_mask6(dst_8, dst_stride, + dst_8, dst_stride, + dst2_8, dst2_stride, + mask, MASK_MASTER_STRIDE, + h, w, subh, subw, bd); } #endif // CONFIG_VP9_HIGHBITDEPTH @@ -573,9 +465,11 @@ static void build_masked_compound_wedge(uint8_t *dst, int dst_stride, const int subw = (2 << b_width_log2_lookup[sb_type]) == w; const uint8_t *mask = vp10_get_soft_mask(wedge_index, wedge_sign, sb_type, 0, 0); - build_masked_compound(dst, dst_stride, - dst, dst_stride, dst2, dst2_stride, mask, - h, w, subh, subw); + vpx_blend_mask6(dst, dst_stride, + dst, dst_stride, + dst2, dst2_stride, + mask, MASK_MASTER_STRIDE, + h, w, subh, subw); } #if CONFIG_VP9_HIGHBITDEPTH @@ -583,16 +477,18 @@ static void build_masked_compound_wedge_highbd(uint8_t *dst_8, int dst_stride, uint8_t *dst2_8, int dst2_stride, int wedge_index, int wedge_sign, BLOCK_SIZE sb_type, - int h, int w) { + int h, int w, int bd) { // Derive subsampling from h and w passed in. May be refactored to // pass in subsampling factors directly. 
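  // The mask from vp10_get_soft_mask() is defined at the full block size, so
  // when this plane's h or w is half that size, subh/subw become 1 and
  // vpx_highbd_blend_mask6() averages pairs (or 2x2 groups) of mask values
  // per output pixel.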
const int subh = (2 << b_height_log2_lookup[sb_type]) == h; const int subw = (2 << b_width_log2_lookup[sb_type]) == w; const uint8_t *mask = vp10_get_soft_mask(wedge_index, wedge_sign, sb_type, 0, 0); - build_masked_compound_highbd(dst_8, dst_stride, - dst_8, dst_stride, dst2_8, dst2_stride, mask, - h, w, subh, subw); + vpx_highbd_blend_mask6(dst_8, dst_stride, + dst_8, dst_stride, + dst2_8, dst2_stride, + mask, MASK_MASTER_STRIDE, + h, w, subh, subw, bd); } #endif // CONFIG_VP9_HIGHBITDEPTH #endif // CONFIG_SUPERTX @@ -641,7 +537,7 @@ void vp10_make_masked_inter_predictor( mi->mbmi.interinter_wedge_index, mi->mbmi.interinter_wedge_sign, mi->mbmi.sb_type, - wedge_offset_x, wedge_offset_y, h, w); + wedge_offset_x, wedge_offset_y, h, w, xd->cur_buf->bit_depth); else build_masked_compound_wedge_extend( dst, dst_stride, tmp_dst, MAX_SB_SIZE, @@ -655,7 +551,7 @@ void vp10_make_masked_inter_predictor( dst, dst_stride, tmp_dst, MAX_SB_SIZE, mi->mbmi.interinter_wedge_index, mi->mbmi.interinter_wedge_sign, - mi->mbmi.sb_type, h, w); + mi->mbmi.sb_type, h, w, xd->cur_buf->bit_depth); else build_masked_compound_wedge( dst, dst_stride, tmp_dst, MAX_SB_SIZE, @@ -1872,10 +1768,11 @@ static void combine_interintra(INTERINTRA_MODE mode, bsize, 0, 0); const int subw = 2 * num_4x4_blocks_wide_lookup[bsize] == bw; const int subh = 2 * num_4x4_blocks_high_lookup[bsize] == bh; - build_masked_compound(comppred, compstride, - intrapred, intrastride, - interpred, interstride, mask, - bh, bw, subh, subw); + vpx_blend_mask6(comppred, compstride, + intrapred, intrastride, + interpred, interstride, + mask, MASK_MASTER_STRIDE, + bh, bw, subh, subw); } return; } @@ -1995,7 +1892,6 @@ static void combine_interintra_highbd(INTERINTRA_MODE mode, uint16_t *comppred = CONVERT_TO_SHORTPTR(comppred8); uint16_t *interpred = CONVERT_TO_SHORTPTR(interpred8); uint16_t *intrapred = CONVERT_TO_SHORTPTR(intrapred8); - (void) bd; if (use_wedge_interintra) { if (is_interintra_wedge_used(bsize)) { @@ -2003,10 +1899,11 @@ static void combine_interintra_highbd(INTERINTRA_MODE mode, bsize, 0, 0); const int subh = 2 * num_4x4_blocks_high_lookup[bsize] == bh; const int subw = 2 * num_4x4_blocks_wide_lookup[bsize] == bw; - build_masked_compound_highbd(comppred8, compstride, - intrapred8, intrastride, - interpred8, interstride, mask, - bh, bw, subh, subw); + vpx_highbd_blend_mask6(comppred8, compstride, + intrapred8, intrastride, + interpred8, interstride, + mask, MASK_MASTER_STRIDE, + bh, bw, subh, subw, bd); } return; } @@ -2460,7 +2357,7 @@ static void build_wedge_inter_predictor_from_buf(MACROBLOCKD *xd, int plane, mi->mbmi.interinter_wedge_index, mi->mbmi.interinter_wedge_sign, mi->mbmi.sb_type, - wedge_offset_x, wedge_offset_y, h, w); + wedge_offset_x, wedge_offset_y, h, w, xd->cur_buf->bit_depth); } else { build_masked_compound_wedge_extend( dst, dst_buf->stride, tmp_dst, MAX_SB_SIZE, @@ -2484,7 +2381,8 @@ static void build_wedge_inter_predictor_from_buf(MACROBLOCKD *xd, int plane, MAX_SB_SIZE, mi->mbmi.interinter_wedge_index, mi->mbmi.interinter_wedge_sign, - mi->mbmi.sb_type, h, w); + mi->mbmi.sb_type, h, w, + xd->cur_buf->bit_depth); else #endif // CONFIG_VP9_HIGHBITDEPTH build_masked_compound_wedge(dst, dst_buf->stride, tmp_dst, MAX_SB_SIZE, diff --git a/vpx_dsp/blend_mask6.c b/vpx_dsp/blend_mask6.c new file mode 100644 index 000000000..584ee6a78 --- /dev/null +++ b/vpx_dsp/blend_mask6.c @@ -0,0 +1,152 @@ +/* + * Copyright (c) 2016 The WebM project authors. All Rights Reserved. 
+ * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include + +#include "vpx/vpx_integer.h" +#include "vpx_ports/mem.h" +#include "vpx_dsp/vpx_dsp_common.h" + +#include "./vpx_dsp_rtcd.h" + +#define MASK_BITS 6 + +void vpx_blend_mask6_c(uint8_t *dst, uint32_t dst_stride, + uint8_t *src0, uint32_t src0_stride, + uint8_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, + int h, int w, int subh, int subw) { + int i, j; + + assert(IMPLIES(src0 == dst, src0_stride == dst_stride)); + assert(IMPLIES(src1 == dst, src1_stride == dst_stride)); + + assert(h >= 4); + assert(w >= 4); + assert(IS_POWER_OF_TWO(h)); + assert(IS_POWER_OF_TWO(w)); + + if (subw == 0 && subh == 0) { + for (i = 0; i < h; ++i) + for (j = 0; j < w; ++j) { + const int m0 = mask[i * mask_stride + j]; + const int m1 = ((1 << MASK_BITS) - m0); + dst[i * dst_stride + j] = + ROUND_POWER_OF_TWO(src0[i * src0_stride + j] * m0 + + src1[i * src1_stride + j] * m1, MASK_BITS); + } + } else if (subw == 1 && subh == 1) { + for (i = 0; i < h; ++i) + for (j = 0; j < w; ++j) { + const int m0 = + ROUND_POWER_OF_TWO(mask[(2 * i) * mask_stride + (2 * j)] + + mask[(2 * i + 1) * mask_stride + (2 * j)] + + mask[(2 * i) * mask_stride + (2 * j + 1)] + + mask[(2 * i + 1) * mask_stride + (2 * j + 1)], + 2); + const int m1 = ((1 << MASK_BITS) - m0); + dst[i * dst_stride + j] = + ROUND_POWER_OF_TWO(src0[i * src0_stride + j] * m0 + + src1[i * src1_stride + j] * m1, MASK_BITS); + } + } else if (subw == 1 && subh == 0) { + for (i = 0; i < h; ++i) + for (j = 0; j < w; ++j) { + const int m0 = + ROUND_POWER_OF_TWO(mask[i * mask_stride + (2 * j)] + + mask[i * mask_stride + (2 * j + 1)], 1); + const int m1 = ((1 << MASK_BITS) - m0); + dst[i * dst_stride + j] = + ROUND_POWER_OF_TWO(src0[i * src0_stride + j] * m0 + + src1[i * src1_stride + j] * m1, MASK_BITS); + } + } else { + for (i = 0; i < h; ++i) + for (j = 0; j < w; ++j) { + const int m0 = + ROUND_POWER_OF_TWO(mask[(2 * i) * mask_stride + j] + + mask[(2 * i + 1) * mask_stride + j], 1); + const int m1 = ((1 << MASK_BITS) - m0); + dst[i * dst_stride + j] = + ROUND_POWER_OF_TWO(src0[i * src0_stride + j] * m0 + + src1[i * src1_stride + j] * m1, MASK_BITS); + } + } +} + +#if CONFIG_VP9_HIGHBITDEPTH +void vpx_highbd_blend_mask6_c(uint8_t *dst_8, uint32_t dst_stride, + uint8_t *src0_8, uint32_t src0_stride, + uint8_t *src1_8, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, + int h, int w, int subh, int subw, int bd) { + int i, j; + uint16_t *dst = CONVERT_TO_SHORTPTR(dst_8); + uint16_t *src0 = CONVERT_TO_SHORTPTR(src0_8); + uint16_t *src1 = CONVERT_TO_SHORTPTR(src1_8); + + assert(IMPLIES(src0 == dst, src0_stride == dst_stride)); + assert(IMPLIES(src1 == dst, src1_stride == dst_stride)); + + assert(h >= 4); + assert(w >= 4); + assert(IS_POWER_OF_TWO(h)); + assert(IS_POWER_OF_TWO(w)); + + assert(bd == 8 || bd == 10 || bd == 12); + + if (subw == 0 && subh == 0) { + for (i = 0; i < h; ++i) + for (j = 0; j < w; ++j) { + const int m0 = mask[i * mask_stride + j]; + const int m1 = ((1 << MASK_BITS) - m0); + dst[i * dst_stride + j] = + ROUND_POWER_OF_TWO(src0[i * src0_stride + j] * m0 + + src1[i * src1_stride + j] * m1, MASK_BITS); + } + } else if (subw == 1 && subh == 1) { + for (i = 0; i < h; ++i) 
+ for (j = 0; j < w; ++j) { + const int m0 = + ROUND_POWER_OF_TWO(mask[(2 * i) * mask_stride + (2 * j)] + + mask[(2 * i + 1) * mask_stride + (2 * j)] + + mask[(2 * i) * mask_stride + (2 * j + 1)] + + mask[(2 * i + 1) * mask_stride + (2 * j + 1)], + 2); + const int m1 = ((1 << MASK_BITS) - m0); + dst[i * dst_stride + j] = + ROUND_POWER_OF_TWO(src0[i * src0_stride + j] * m0 + + src1[i * src1_stride + j] * m1, MASK_BITS); + } + } else if (subw == 1 && subh == 0) { + for (i = 0; i < h; ++i) + for (j = 0; j < w; ++j) { + const int m0 = + ROUND_POWER_OF_TWO(mask[i * mask_stride + (2 * j)] + + mask[i * mask_stride + (2 * j + 1)], 1); + const int m1 = ((1 << MASK_BITS) - m0); + dst[i * dst_stride + j] = + ROUND_POWER_OF_TWO(src0[i * src0_stride + j] * m0 + + src1[i * src1_stride + j] * m1, MASK_BITS); + } + } else { + for (i = 0; i < h; ++i) + for (j = 0; j < w; ++j) { + const int m0 = + ROUND_POWER_OF_TWO(mask[(2 * i) * mask_stride + j] + + mask[(2 * i + 1) * mask_stride + j], 1); + const int m1 = ((1 << MASK_BITS) - m0); + dst[i * dst_stride + j] = + ROUND_POWER_OF_TWO(src0[i * src0_stride + j] * m0 + + src1[i * src1_stride + j] * m1, MASK_BITS); + } + } +} +#endif // CONFIG_VP9_HIGHBITDEPTH diff --git a/vpx_dsp/vpx_dsp.mk b/vpx_dsp/vpx_dsp.mk index 46ef5fc38..430cae1f2 100644 --- a/vpx_dsp/vpx_dsp.mk +++ b/vpx_dsp/vpx_dsp.mk @@ -65,6 +65,15 @@ DSP_SRCS-$(HAVE_DSPR2) += mips/intrapred16_dspr2.c DSP_SRCS-$(HAVE_DSPR2) += mips/common_dspr2.h DSP_SRCS-$(HAVE_DSPR2) += mips/common_dspr2.c +# inter predictions + +ifeq ($(CONFIG_VP10),yes) +ifeq ($(CONFIG_EXT_INTER),yes) +DSP_SRCS-yes += blend_mask6.c +DSP_SRCS-$(HAVE_SSE4_1) += x86/blend_mask6_sse4.c +endif #CONFIG_EXT_INTER +endif #CONFIG_VP10 + # interpolation filters DSP_SRCS-yes += vpx_convolve.c DSP_SRCS-yes += vpx_convolve.h diff --git a/vpx_dsp/vpx_dsp_common.h b/vpx_dsp/vpx_dsp_common.h index 3571eeada..7aaa89f6f 100644 --- a/vpx_dsp/vpx_dsp_common.h +++ b/vpx_dsp/vpx_dsp_common.h @@ -32,6 +32,8 @@ extern "C" { #define IMPLIES(a, b) (!(a) || (b)) // Logical 'a implies b' (or 'a -> b') +#define IS_POWER_OF_TWO(x) (((x) & ((x) - 1)) == 0) + // These can be used to give a hint about branch outcomes. 
// This can have an effect, even if your target processor has a // good branch predictor, as these hints can affect basic block diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl index ad524a232..7bae0375d 100644 --- a/vpx_dsp/vpx_dsp_rtcd_defs.pl +++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl @@ -1358,10 +1358,10 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { } } # CONFIG_VP9_HIGHBITDEPTH +if (vpx_config("CONFIG_EXT_INTER") eq "yes") { # # Masked Variance / Masked Subpixel Variance # -if (vpx_config("CONFIG_EXT_INTER") eq "yes") { foreach (@block_sizes) { ($w, $h) = @$_; add_proto qw/unsigned int/, "vpx_masked_variance${w}x${h}", "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *mask, int mask_stride, unsigned int *sse"; @@ -1381,6 +1381,14 @@ if (vpx_config("CONFIG_EXT_INTER") eq "yes") { } } } + + add_proto qw/void vpx_blend_mask6/, "uint8_t *dst, uint32_t dst_stride, uint8_t *src0, uint32_t src0_stride, uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int h, int w, int suby, int subx"; + specialize "vpx_blend_mask6", qw/sse4_1/; + + if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { + add_proto qw/void vpx_highbd_blend_mask6/, "uint8_t *dst, uint32_t dst_stride, uint8_t *src0, uint32_t src0_stride, uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int h, int w, int suby, int subx, int bd"; + specialize "vpx_highbd_blend_mask6", qw/sse4_1/; + } } # diff --git a/vpx_dsp/x86/blend_mask6_sse4.c b/vpx_dsp/x86/blend_mask6_sse4.c new file mode 100644 index 000000000..5de3e23ca --- /dev/null +++ b/vpx_dsp/x86/blend_mask6_sse4.c @@ -0,0 +1,1146 @@ +/* + * Copyright (c) 2016 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#include // SSE4.1 + +#include + +#include "vpx/vpx_integer.h" +#include "vpx_ports/mem.h" +#include "vpx_dsp/vpx_dsp_common.h" + +#include "./vpx_dsp_rtcd.h" + +#define MASK_BITS 6 + +static INLINE __m128i mm_loadl_32(const void *a) { + return _mm_cvtsi32_si128(*(const uint32_t*)a); +} + +static INLINE __m128i mm_loadl_64(const void *a) { + return _mm_loadl_epi64((const __m128i*)a); +} + +static INLINE __m128i mm_loadu_128(const void *a) { + return _mm_loadu_si128((const __m128i*)a); +} + +static INLINE void mm_storel_32(void *const a, const __m128i v) { + *(uint32_t*)a = _mm_cvtsi128_si32(v); +} + +static INLINE void mm_storel_64(void *const a, const __m128i v) { + _mm_storel_epi64((__m128i*)a, v); +} + +static INLINE void mm_storeu_128(void *const a, const __m128i v) { + _mm_storeu_si128((__m128i*)a, v); +} + +static INLINE __m128i mm_round_epu16(__m128i v_val_w) { + return _mm_avg_epu16(v_val_w, _mm_setzero_si128()); +} + +static INLINE __m128i mm_roundn_epu16(__m128i v_val_w, int bits) { + const __m128i v_s_w =_mm_srli_epi16(v_val_w, bits-1); + return _mm_avg_epu16(v_s_w, _mm_setzero_si128()); +} + +////////////////////////////////////////////////////////////////////////////// +// Common kernels +////////////////////////////////////////////////////////////////////////////// + +static INLINE __m128i blend_4(uint8_t*src0, uint8_t *src1, + const __m128i v_m0_w, const __m128i v_m1_w) { + const __m128i v_s0_b = mm_loadl_32(src0); + const __m128i v_s1_b = mm_loadl_32(src1); + const __m128i v_s0_w = _mm_cvtepu8_epi16(v_s0_b); + const __m128i v_s1_w = _mm_cvtepu8_epi16(v_s1_b); + + const __m128i v_p0_w = _mm_mullo_epi16(v_s0_w, v_m0_w); + const __m128i v_p1_w = _mm_mullo_epi16(v_s1_w, v_m1_w); + + const __m128i v_sum_w = _mm_add_epi16(v_p0_w, v_p1_w); + + const __m128i v_res_w = mm_roundn_epu16(v_sum_w, MASK_BITS); + + return v_res_w; +} + +static INLINE __m128i blend_8(uint8_t*src0, uint8_t *src1, + const __m128i v_m0_w, const __m128i v_m1_w) { + const __m128i v_s0_b = mm_loadl_64(src0); + const __m128i v_s1_b = mm_loadl_64(src1); + const __m128i v_s0_w = _mm_cvtepu8_epi16(v_s0_b); + const __m128i v_s1_w = _mm_cvtepu8_epi16(v_s1_b); + + const __m128i v_p0_w = _mm_mullo_epi16(v_s0_w, v_m0_w); + const __m128i v_p1_w = _mm_mullo_epi16(v_s1_w, v_m1_w); + + const __m128i v_sum_w = _mm_add_epi16(v_p0_w, v_p1_w); + + const __m128i v_res_w = mm_roundn_epu16(v_sum_w, MASK_BITS); + + return v_res_w; +} + +////////////////////////////////////////////////////////////////////////////// +// No sub-sampling +////////////////////////////////////////////////////////////////////////////// + +static void blend_mask6_w4_sse4_1( + uint8_t *dst, uint32_t dst_stride, + uint8_t *src0, uint32_t src0_stride, + uint8_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, + int h, int w) { + const __m128i v_maxval_w = _mm_set1_epi16(1 << MASK_BITS); + + (void)w; + + do { + const __m128i v_m0_b = mm_loadl_32(mask); + const __m128i v_m0_w = _mm_cvtepu8_epi16(v_m0_b); + const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w); + + const __m128i v_res_w = blend_4(src0, src1, v_m0_w, v_m1_w); + + const __m128i v_res_b = _mm_packus_epi16(v_res_w, v_res_w); + + mm_storel_32(dst, v_res_b); + + dst += dst_stride; + src0 += src0_stride; + src1 += src1_stride; + mask += mask_stride; + } while (--h); +} + +static void blend_mask6_w8_sse4_1( + uint8_t *dst, uint32_t dst_stride, + uint8_t *src0, uint32_t src0_stride, + uint8_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, + 
int h, int w) { + const __m128i v_maxval_w = _mm_set1_epi16(1 << MASK_BITS); + + (void)w; + + do { + const __m128i v_m0_b = mm_loadl_64(mask); + const __m128i v_m0_w = _mm_cvtepu8_epi16(v_m0_b); + const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w); + + const __m128i v_res_w = blend_8(src0, src1, v_m0_w, v_m1_w); + + const __m128i v_res_b = _mm_packus_epi16(v_res_w, v_res_w); + + mm_storel_64(dst, v_res_b); + + dst += dst_stride; + src0 += src0_stride; + src1 += src1_stride; + mask += mask_stride; + } while (--h); +} + +static void blend_mask6_w16n_sse4_1( + uint8_t *dst, uint32_t dst_stride, + uint8_t *src0, uint32_t src0_stride, + uint8_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, + int h, int w) { + const __m128i v_maxval_w = _mm_set1_epi16(1 << MASK_BITS); + + do { + int c; + for (c = 0; c < w; c += 16) { + const __m128i v_m0l_b = mm_loadl_64(mask + c); + const __m128i v_m0h_b = mm_loadl_64(mask + c + 8); + const __m128i v_m0l_w = _mm_cvtepu8_epi16(v_m0l_b); + const __m128i v_m0h_w = _mm_cvtepu8_epi16(v_m0h_b); + const __m128i v_m1l_w = _mm_sub_epi16(v_maxval_w, v_m0l_w); + const __m128i v_m1h_w = _mm_sub_epi16(v_maxval_w, v_m0h_w); + + const __m128i v_resl_w = blend_8(src0 + c, src1 + c, + v_m0l_w, v_m1l_w); + const __m128i v_resh_w = blend_8(src0 + c + 8, src1 + c + 8, + v_m0h_w, v_m1h_w); + + const __m128i v_res_b = _mm_packus_epi16(v_resl_w, v_resh_w); + + mm_storeu_128(dst + c, v_res_b); + } + dst += dst_stride; + src0 += src0_stride; + src1 += src1_stride; + mask += mask_stride; + } while (--h); +} + +////////////////////////////////////////////////////////////////////////////// +// Horizontal sub-sampling +////////////////////////////////////////////////////////////////////////////// + +static void blend_mask6_sx_w4_sse4_1( + uint8_t *dst, uint32_t dst_stride, + uint8_t *src0, uint32_t src0_stride, + uint8_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, + int h, int w) { + const __m128i v_zmask_b = _mm_set_epi8(0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff, + 0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff); + const __m128i v_maxval_w = _mm_set1_epi16(1 << MASK_BITS); + + (void)w; + + do { + const __m128i v_r_b = mm_loadl_64(mask); + const __m128i v_a_b = _mm_avg_epu8(v_r_b, _mm_srli_si128(v_r_b, 1)); + + const __m128i v_m0_w = _mm_and_si128(v_a_b, v_zmask_b); + const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w); + + const __m128i v_res_w = blend_4(src0, src1, v_m0_w, v_m1_w); + + const __m128i v_res_b = _mm_packus_epi16(v_res_w, v_res_w); + + mm_storel_32(dst, v_res_b); + + dst += dst_stride; + src0 += src0_stride; + src1 += src1_stride; + mask += mask_stride; + } while (--h); +} + +static void blend_mask6_sx_w8_sse4_1( + uint8_t *dst, uint32_t dst_stride, + uint8_t *src0, uint32_t src0_stride, + uint8_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, + int h, int w) { + const __m128i v_zmask_b = _mm_set_epi8(0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff, + 0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff); + const __m128i v_maxval_w = _mm_set1_epi16(1 << MASK_BITS); + + (void)w; + + do { + const __m128i v_r_b = mm_loadu_128(mask); + const __m128i v_a_b = _mm_avg_epu8(v_r_b, _mm_srli_si128(v_r_b, 1)); + + const __m128i v_m0_w = _mm_and_si128(v_a_b, v_zmask_b); + const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w); + + const __m128i v_res_w = blend_8(src0, src1, v_m0_w, v_m1_w); + + const __m128i v_res_b = _mm_packus_epi16(v_res_w, v_res_w); + + mm_storel_64(dst, v_res_b); + + dst += dst_stride; + src0 += src0_stride; + src1 += 
src1_stride; + mask += mask_stride; + } while (--h); +} + +static void blend_mask6_sx_w16n_sse4_1( + uint8_t *dst, uint32_t dst_stride, + uint8_t *src0, uint32_t src0_stride, + uint8_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, + int h, int w) { + const __m128i v_zmask_b = _mm_set_epi8(0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff, + 0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff); + const __m128i v_maxval_w = _mm_set1_epi16(1 << MASK_BITS); + + do { + int c; + for (c = 0; c < w; c += 16) { + const __m128i v_rl_b = mm_loadu_128(mask + 2 * c); + const __m128i v_rh_b = mm_loadu_128(mask + 2 * c + 16); + const __m128i v_al_b = _mm_avg_epu8(v_rl_b, _mm_srli_si128(v_rl_b, 1)); + const __m128i v_ah_b = _mm_avg_epu8(v_rh_b, _mm_srli_si128(v_rh_b, 1)); + + const __m128i v_m0l_w = _mm_and_si128(v_al_b, v_zmask_b); + const __m128i v_m0h_w = _mm_and_si128(v_ah_b, v_zmask_b); + const __m128i v_m1l_w = _mm_sub_epi16(v_maxval_w, v_m0l_w); + const __m128i v_m1h_w = _mm_sub_epi16(v_maxval_w, v_m0h_w); + + const __m128i v_resl_w = blend_8(src0 + c, src1 + c, + v_m0l_w, v_m1l_w); + const __m128i v_resh_w = blend_8(src0 + c + 8, src1 + c + 8, + v_m0h_w, v_m1h_w); + + const __m128i v_res_b = _mm_packus_epi16(v_resl_w, v_resh_w); + + mm_storeu_128(dst + c, v_res_b); + } + dst += dst_stride; + src0 += src0_stride; + src1 += src1_stride; + mask += mask_stride; + } while (--h); +} + +////////////////////////////////////////////////////////////////////////////// +// Vertical sub-sampling +////////////////////////////////////////////////////////////////////////////// + +static void blend_mask6_sy_w4_sse4_1( + uint8_t *dst, uint32_t dst_stride, + uint8_t *src0, uint32_t src0_stride, + uint8_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, + int h, int w) { + const __m128i v_maxval_w = _mm_set1_epi16(1 << MASK_BITS); + + (void)w; + + do { + const __m128i v_ra_b = mm_loadl_32(mask); + const __m128i v_rb_b = mm_loadl_32(mask + mask_stride); + const __m128i v_a_b = _mm_avg_epu8(v_ra_b, v_rb_b); + + const __m128i v_m0_w = _mm_cvtepu8_epi16(v_a_b); + const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w); + + const __m128i v_res_w = blend_4(src0, src1, v_m0_w, v_m1_w); + + const __m128i v_res_b = _mm_packus_epi16(v_res_w, v_res_w); + + mm_storel_32(dst, v_res_b); + + dst += dst_stride; + src0 += src0_stride; + src1 += src1_stride; + mask += 2 * mask_stride; + } while (--h); +} + +static void blend_mask6_sy_w8_sse4_1( + uint8_t *dst, uint32_t dst_stride, + uint8_t *src0, uint32_t src0_stride, + uint8_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, + int h, int w) { + const __m128i v_maxval_w = _mm_set1_epi16(1 << MASK_BITS); + + (void)w; + + do { + const __m128i v_ra_b = mm_loadl_64(mask); + const __m128i v_rb_b = mm_loadl_64(mask + mask_stride); + const __m128i v_a_b = _mm_avg_epu8(v_ra_b, v_rb_b); + + const __m128i v_m0_w = _mm_cvtepu8_epi16(v_a_b); + const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w); + + const __m128i v_res_w = blend_8(src0, src1, v_m0_w, v_m1_w); + + const __m128i v_res_b = _mm_packus_epi16(v_res_w, v_res_w); + + mm_storel_64(dst, v_res_b); + + dst += dst_stride; + src0 += src0_stride; + src1 += src1_stride; + mask += 2 * mask_stride; + } while (--h); +} + +static void blend_mask6_sy_w16n_sse4_1( + uint8_t *dst, uint32_t dst_stride, + uint8_t *src0, uint32_t src0_stride, + uint8_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, + int h, int w) { + const __m128i v_zero = _mm_setzero_si128(); + const __m128i 
v_maxval_w = _mm_set1_epi16(1 << MASK_BITS); + + do { + int c; + for (c = 0; c < w; c += 16) { + const __m128i v_ra_b = mm_loadu_128(mask + c); + const __m128i v_rb_b = mm_loadu_128(mask + c + mask_stride); + const __m128i v_a_b = _mm_avg_epu8(v_ra_b, v_rb_b); + + const __m128i v_m0l_w = _mm_cvtepu8_epi16(v_a_b); + const __m128i v_m0h_w = _mm_unpackhi_epi8(v_a_b, v_zero); + const __m128i v_m1l_w = _mm_sub_epi16(v_maxval_w, v_m0l_w); + const __m128i v_m1h_w = _mm_sub_epi16(v_maxval_w, v_m0h_w); + + const __m128i v_resl_w = blend_8(src0 + c, src1 + c, + v_m0l_w, v_m1l_w); + const __m128i v_resh_w = blend_8(src0 + c + 8, src1 + c + 8, + v_m0h_w, v_m1h_w); + + const __m128i v_res_b = _mm_packus_epi16(v_resl_w, v_resh_w); + + mm_storeu_128(dst + c, v_res_b); + } + dst += dst_stride; + src0 += src0_stride; + src1 += src1_stride; + mask += 2 * mask_stride; + } while (--h); +} + +////////////////////////////////////////////////////////////////////////////// +// Horizontal and Vertical sub-sampling +////////////////////////////////////////////////////////////////////////////// + +static void blend_mask6_sx_sy_w4_sse4_1( + uint8_t *dst, uint32_t dst_stride, + uint8_t *src0, uint32_t src0_stride, + uint8_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, + int h, int w) { + const __m128i v_zmask_b = _mm_set_epi8(0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff, + 0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff); + const __m128i v_maxval_w = _mm_set1_epi16(1 << MASK_BITS); + + (void)w; + + do { + const __m128i v_ra_b = mm_loadl_64(mask); + const __m128i v_rb_b = mm_loadl_64(mask + mask_stride); + const __m128i v_rvs_b = _mm_add_epi8(v_ra_b, v_rb_b); + const __m128i v_rvsa_w = _mm_and_si128(v_rvs_b, v_zmask_b); + const __m128i v_rvsb_w = _mm_and_si128(_mm_srli_si128(v_rvs_b, 1), + v_zmask_b); + const __m128i v_rs_w = _mm_add_epi16(v_rvsa_w, v_rvsb_w); + + const __m128i v_m0_w = mm_roundn_epu16(v_rs_w, 2); + const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w); + + const __m128i v_res_w = blend_4(src0, src1, v_m0_w, v_m1_w); + + const __m128i v_res_b = _mm_packus_epi16(v_res_w, v_res_w); + + mm_storel_32(dst, v_res_b); + + dst += dst_stride; + src0 += src0_stride; + src1 += src1_stride; + mask += 2 * mask_stride; + } while (--h); +} + +static void blend_mask6_sx_sy_w8_sse4_1( + uint8_t *dst, uint32_t dst_stride, + uint8_t *src0, uint32_t src0_stride, + uint8_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, + int h, int w) { + const __m128i v_zmask_b = _mm_set_epi8(0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff, + 0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff); + const __m128i v_maxval_w = _mm_set1_epi16(1 << MASK_BITS); + + (void)w; + + do { + const __m128i v_ra_b = mm_loadu_128(mask); + const __m128i v_rb_b = mm_loadu_128(mask + mask_stride); + const __m128i v_rvs_b = _mm_add_epi8(v_ra_b, v_rb_b); + const __m128i v_rvsa_w = _mm_and_si128(v_rvs_b, v_zmask_b); + const __m128i v_rvsb_w = _mm_and_si128(_mm_srli_si128(v_rvs_b, 1), + v_zmask_b); + const __m128i v_rs_w = _mm_add_epi16(v_rvsa_w, v_rvsb_w); + + const __m128i v_m0_w = mm_roundn_epu16(v_rs_w, 2); + const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w); + + const __m128i v_res_w = blend_8(src0, src1, v_m0_w, v_m1_w); + + const __m128i v_res_b = _mm_packus_epi16(v_res_w, v_res_w); + + mm_storel_64(dst, v_res_b); + + dst += dst_stride; + src0 += src0_stride; + src1 += src1_stride; + mask += 2 * mask_stride; + } while (--h); +} + +static void blend_mask6_sx_sy_w16n_sse4_1( + uint8_t *dst, uint32_t dst_stride, + uint8_t *src0, uint32_t 
src0_stride, + uint8_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, + int h, int w) { + const __m128i v_zmask_b = _mm_set_epi8(0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff, + 0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff); + const __m128i v_maxval_w = _mm_set1_epi16(1 << MASK_BITS); + + do { + int c; + for (c = 0; c < w; c += 16) { + const __m128i v_ral_b = mm_loadu_128(mask + 2 * c); + const __m128i v_rah_b = mm_loadu_128(mask + 2 * c + 16); + const __m128i v_rbl_b = mm_loadu_128(mask + mask_stride + 2 * c); + const __m128i v_rbh_b = mm_loadu_128(mask + mask_stride + 2 * c + 16); + const __m128i v_rvsl_b = _mm_add_epi8(v_ral_b, v_rbl_b); + const __m128i v_rvsh_b = _mm_add_epi8(v_rah_b, v_rbh_b); + const __m128i v_rvsal_w = _mm_and_si128(v_rvsl_b, v_zmask_b); + const __m128i v_rvsah_w = _mm_and_si128(v_rvsh_b, v_zmask_b); + const __m128i v_rvsbl_w = _mm_and_si128(_mm_srli_si128(v_rvsl_b, 1), + v_zmask_b); + const __m128i v_rvsbh_w = _mm_and_si128(_mm_srli_si128(v_rvsh_b, 1), + v_zmask_b); + const __m128i v_rsl_w = _mm_add_epi16(v_rvsal_w, v_rvsbl_w); + const __m128i v_rsh_w = _mm_add_epi16(v_rvsah_w, v_rvsbh_w); + + const __m128i v_m0l_w = mm_roundn_epu16(v_rsl_w, 2); + const __m128i v_m0h_w = mm_roundn_epu16(v_rsh_w, 2); + const __m128i v_m1l_w = _mm_sub_epi16(v_maxval_w, v_m0l_w); + const __m128i v_m1h_w = _mm_sub_epi16(v_maxval_w, v_m0h_w); + + const __m128i v_resl_w = blend_8(src0 + c, src1 + c, + v_m0l_w, v_m1l_w); + const __m128i v_resh_w = blend_8(src0 + c + 8, src1 + c + 8, + v_m0h_w, v_m1h_w); + + const __m128i v_res_b = _mm_packus_epi16(v_resl_w, v_resh_w); + + mm_storeu_128(dst + c, v_res_b); + } + dst += dst_stride; + src0 += src0_stride; + src1 += src1_stride; + mask += 2 * mask_stride; + } while (--h); +} + +////////////////////////////////////////////////////////////////////////////// +// Dispatch +////////////////////////////////////////////////////////////////////////////// + +void vpx_blend_mask6_sse4_1(uint8_t *dst, uint32_t dst_stride, + uint8_t *src0, uint32_t src0_stride, + uint8_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, + int h, int w, int suby, int subx) { + typedef void (*blend_fn)(uint8_t *dst, uint32_t dst_stride, + uint8_t *src0, uint32_t src0_stride, + uint8_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, + int h, int w); + + static blend_fn blend[3][2][2] = { // width_index X subx X suby + { // w % 16 == 0 + {blend_mask6_w16n_sse4_1, blend_mask6_sy_w16n_sse4_1}, + {blend_mask6_sx_w16n_sse4_1, blend_mask6_sx_sy_w16n_sse4_1} + }, { // w == 4 + {blend_mask6_w4_sse4_1, blend_mask6_sy_w4_sse4_1}, + {blend_mask6_sx_w4_sse4_1, blend_mask6_sx_sy_w4_sse4_1} + }, { // w == 8 + {blend_mask6_w8_sse4_1, blend_mask6_sy_w8_sse4_1}, + {blend_mask6_sx_w8_sse4_1, blend_mask6_sx_sy_w8_sse4_1} + } + }; + + assert(IMPLIES(src0 == dst, src0_stride == dst_stride)); + assert(IMPLIES(src1 == dst, src1_stride == dst_stride)); + + assert(h >= 4); + assert(w >= 4); + assert(IS_POWER_OF_TWO(h)); + assert(IS_POWER_OF_TWO(w)); + + blend[(w >> 2) & 3][subx != 0][suby != 0](dst, dst_stride, + src0, src0_stride, + src1, src1_stride, + mask, mask_stride, + h, w); +} + +#if CONFIG_VP9_HIGHBITDEPTH +////////////////////////////////////////////////////////////////////////////// +// Common kernels +////////////////////////////////////////////////////////////////////////////// + +typedef __m128i (*blend_unit_fn)(uint16_t*src0, uint16_t *src1, + const __m128i v_m0_w, const __m128i v_m1_w); + +static INLINE __m128i 
blend_4_b10(uint16_t*src0, uint16_t *src1, + const __m128i v_m0_w, const __m128i v_m1_w) { + const __m128i v_s0_w = mm_loadl_64(src0); + const __m128i v_s1_w = mm_loadl_64(src1); + + const __m128i v_p0_w = _mm_mullo_epi16(v_s0_w, v_m0_w); + const __m128i v_p1_w = _mm_mullo_epi16(v_s1_w, v_m1_w); + + const __m128i v_sum_w = _mm_add_epi16(v_p0_w, v_p1_w); + + const __m128i v_res_w = mm_roundn_epu16(v_sum_w, MASK_BITS); + + return v_res_w; +} + +static INLINE __m128i blend_8_b10(uint16_t*src0, uint16_t *src1, + const __m128i v_m0_w, const __m128i v_m1_w) { + const __m128i v_s0_w = mm_loadu_128(src0); + const __m128i v_s1_w = mm_loadu_128(src1); + + const __m128i v_p0_w = _mm_mullo_epi16(v_s0_w, v_m0_w); + const __m128i v_p1_w = _mm_mullo_epi16(v_s1_w, v_m1_w); + + const __m128i v_sum_w = _mm_add_epi16(v_p0_w, v_p1_w); + + const __m128i v_res_w = mm_roundn_epu16(v_sum_w, MASK_BITS); + + return v_res_w; +} + +static INLINE __m128i blend_4_b12(uint16_t*src0, uint16_t *src1, + const __m128i v_m0_w, const __m128i v_m1_w) { + const __m128i v_s0_w = mm_loadl_64(src0); + const __m128i v_s1_w = mm_loadl_64(src1); + + // Interleave + const __m128i v_m01_w = _mm_unpacklo_epi16(v_m0_w, v_m1_w); + const __m128i v_s01_w = _mm_unpacklo_epi16(v_s0_w, v_s1_w); + + // Multiply-Add + const __m128i v_sum_d = _mm_madd_epi16(v_s01_w, v_m01_w); + + // Scale + const __m128i v_ssum_d = _mm_srli_epi32(v_sum_d, MASK_BITS - 1); + + // Pack + const __m128i v_pssum_d = _mm_packs_epi32(v_ssum_d, v_ssum_d); + + // Round + const __m128i v_res_w = mm_round_epu16(v_pssum_d); + + return v_res_w; +} + +static INLINE __m128i blend_8_b12(uint16_t*src0, uint16_t *src1, + const __m128i v_m0_w, const __m128i v_m1_w) { + const __m128i v_s0_w = mm_loadu_128(src0); + const __m128i v_s1_w = mm_loadu_128(src1); + + // Interleave + const __m128i v_m01l_w = _mm_unpacklo_epi16(v_m0_w, v_m1_w); + const __m128i v_m01h_w = _mm_unpackhi_epi16(v_m0_w, v_m1_w); + const __m128i v_s01l_w = _mm_unpacklo_epi16(v_s0_w, v_s1_w); + const __m128i v_s01h_w = _mm_unpackhi_epi16(v_s0_w, v_s1_w); + + // Multiply-Add + const __m128i v_suml_d = _mm_madd_epi16(v_s01l_w, v_m01l_w); + const __m128i v_sumh_d = _mm_madd_epi16(v_s01h_w, v_m01h_w); + + // Scale + const __m128i v_ssuml_d = _mm_srli_epi32(v_suml_d, MASK_BITS - 1); + const __m128i v_ssumh_d = _mm_srli_epi32(v_sumh_d, MASK_BITS - 1); + + // Pack + const __m128i v_pssum_d = _mm_packs_epi32(v_ssuml_d, v_ssumh_d); + + // Round + const __m128i v_res_w = mm_round_epu16(v_pssum_d); + + return v_res_w; +} + +////////////////////////////////////////////////////////////////////////////// +// No sub-sampling +////////////////////////////////////////////////////////////////////////////// + +static INLINE void blend_mask6_bn_w4_sse4_1( + uint16_t *dst, uint32_t dst_stride, + uint16_t *src0, uint32_t src0_stride, + uint16_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, + int h, blend_unit_fn blend) { + const __m128i v_maxval_w = _mm_set1_epi16(1 << MASK_BITS); + + do { + const __m128i v_m0_b = mm_loadl_32(mask); + const __m128i v_m0_w = _mm_cvtepu8_epi16(v_m0_b); + const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w); + + const __m128i v_res_w = blend(src0, src1, v_m0_w, v_m1_w); + + mm_storel_64(dst, v_res_w); + + dst += dst_stride; + src0 += src0_stride; + src1 += src1_stride; + mask += mask_stride; + } while (--h); +} + +static void blend_mask6_b10_w4_sse4_1( + uint16_t *dst, uint32_t dst_stride, + uint16_t *src0, uint32_t src0_stride, + uint16_t *src1, uint32_t src1_stride, + const 
uint8_t *mask, uint32_t mask_stride, + int h, int w) { + (void)w; + blend_mask6_bn_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1, + src1_stride, mask, mask_stride, h, + blend_4_b10); +} + +static void blend_mask6_b12_w4_sse4_1( + uint16_t *dst, uint32_t dst_stride, + uint16_t *src0, uint32_t src0_stride, + uint16_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, + int h, int w) { + (void)w; + blend_mask6_bn_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1, + src1_stride, mask, mask_stride, h, + blend_4_b12); +} + +static INLINE void blend_mask6_bn_w8n_sse4_1( + uint16_t *dst, uint32_t dst_stride, + uint16_t *src0, uint32_t src0_stride, + uint16_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, + int h, int w, blend_unit_fn blend) { + const __m128i v_maxval_w = _mm_set1_epi16(1 << MASK_BITS); + + do { + int c; + for (c = 0; c < w; c += 8) { + const __m128i v_m0_b = mm_loadl_64(mask + c); + const __m128i v_m0_w = _mm_cvtepu8_epi16(v_m0_b); + const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w); + + const __m128i v_res_w = blend(src0 + c, src1 + c, v_m0_w, v_m1_w); + + mm_storeu_128(dst + c, v_res_w); + } + dst += dst_stride; + src0 += src0_stride; + src1 += src1_stride; + mask += mask_stride; + } while (--h); +} + +static void blend_mask6_b10_w8n_sse4_1( + uint16_t *dst, uint32_t dst_stride, + uint16_t *src0, uint32_t src0_stride, + uint16_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, + int h, int w) { + blend_mask6_bn_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1, + src1_stride, mask, mask_stride, h, w, + blend_8_b10); +} + +static void blend_mask6_b12_w8n_sse4_1( + uint16_t *dst, uint32_t dst_stride, + uint16_t *src0, uint32_t src0_stride, + uint16_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, + int h, int w) { + blend_mask6_bn_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1, + src1_stride, mask, mask_stride, h, w, + blend_8_b12); +} + +////////////////////////////////////////////////////////////////////////////// +// Horizontal sub-sampling +////////////////////////////////////////////////////////////////////////////// + +static INLINE void blend_mask6_bn_sx_w4_sse4_1( + uint16_t *dst, uint32_t dst_stride, + uint16_t *src0, uint32_t src0_stride, + uint16_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, + int h, blend_unit_fn blend) { + const __m128i v_zmask_b = _mm_set_epi8(0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff, + 0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff); + const __m128i v_maxval_w = _mm_set1_epi16(1 << MASK_BITS); + + do { + const __m128i v_r_b = mm_loadl_64(mask); + const __m128i v_a_b = _mm_avg_epu8(v_r_b, _mm_srli_si128(v_r_b, 1)); + + const __m128i v_m0_w = _mm_and_si128(v_a_b, v_zmask_b); + const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w); + + const __m128i v_res_w = blend(src0, src1, v_m0_w, v_m1_w); + + mm_storel_64(dst, v_res_w); + + dst += dst_stride; + src0 += src0_stride; + src1 += src1_stride; + mask += mask_stride; + } while (--h); +} + +static void blend_mask6_b10_sx_w4_sse4_1( + uint16_t *dst, uint32_t dst_stride, + uint16_t *src0, uint32_t src0_stride, + uint16_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, + int h, int w) { + (void)w; + blend_mask6_bn_sx_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1, + src1_stride, mask, mask_stride, h, + blend_4_b10); +} + +static void blend_mask6_b12_sx_w4_sse4_1( + uint16_t *dst, uint32_t dst_stride, + uint16_t *src0, uint32_t src0_stride, +
uint16_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, + int h, int w) { + (void)w; + blend_mask6_bn_sx_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1, + src1_stride, mask, mask_stride, h, + blend_4_b12); +} + +static INLINE void blend_mask6_bn_sx_w8n_sse4_1( + uint16_t *dst, uint32_t dst_stride, + uint16_t *src0, uint32_t src0_stride, + uint16_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, + int h, int w, blend_unit_fn blend) { + const __m128i v_zmask_b = _mm_set_epi8(0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff, + 0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff); + const __m128i v_maxval_w = _mm_set1_epi16(1 << MASK_BITS); + + do { + int c; + for (c = 0; c < w; c += 8) { + const __m128i v_r_b = mm_loadu_128(mask + 2 * c); + const __m128i v_a_b = _mm_avg_epu8(v_r_b, _mm_srli_si128(v_r_b, 1)); + + const __m128i v_m0_w = _mm_and_si128(v_a_b, v_zmask_b); + const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w); + + const __m128i v_res_w = blend(src0 + c, src1 + c, v_m0_w, v_m1_w); + + mm_storeu_128(dst + c, v_res_w); + } + dst += dst_stride; + src0 += src0_stride; + src1 += src1_stride; + mask += mask_stride; + } while (--h); +} + +static void blend_mask6_b10_sx_w8n_sse4_1( + uint16_t *dst, uint32_t dst_stride, + uint16_t *src0, uint32_t src0_stride, + uint16_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, + int h, int w) { + blend_mask6_bn_sx_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1, + src1_stride, mask, mask_stride, h, w, + blend_8_b10); +} + +static void blend_mask6_b12_sx_w8n_sse4_1( + uint16_t *dst, uint32_t dst_stride, + uint16_t *src0, uint32_t src0_stride, + uint16_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, + int h, int w) { + blend_mask6_bn_sx_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1, + src1_stride, mask, mask_stride, h, w, + blend_8_b12); +} + +////////////////////////////////////////////////////////////////////////////// +// Vertical sub-sampling +////////////////////////////////////////////////////////////////////////////// + +static INLINE void blend_mask6_bn_sy_w4_sse4_1( + uint16_t *dst, uint32_t dst_stride, + uint16_t *src0, uint32_t src0_stride, + uint16_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, + int h, blend_unit_fn blend) { + const __m128i v_maxval_w = _mm_set1_epi16(1 << MASK_BITS); + + do { + const __m128i v_ra_b = mm_loadl_32(mask); + const __m128i v_rb_b = mm_loadl_32(mask + mask_stride); + const __m128i v_a_b = _mm_avg_epu8(v_ra_b, v_rb_b); + + const __m128i v_m0_w = _mm_cvtepu8_epi16(v_a_b); + const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w); + + const __m128i v_res_w = blend(src0, src1, v_m0_w, v_m1_w); + + mm_storel_64(dst, v_res_w); + + dst += dst_stride; + src0 += src0_stride; + src1 += src1_stride; + mask += 2 * mask_stride; + } while (--h); +} + +static void blend_mask6_b10_sy_w4_sse4_1( + uint16_t *dst, uint32_t dst_stride, + uint16_t *src0, uint32_t src0_stride, + uint16_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, + int h, int w) { + (void)w; + blend_mask6_bn_sy_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1, + src1_stride, mask, mask_stride, h, + blend_4_b10); +} + +static void blend_mask6_b12_sy_w4_sse4_1( + uint16_t *dst, uint32_t dst_stride, + uint16_t *src0, uint32_t src0_stride, + uint16_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, + int h, int w) { + (void)w; + blend_mask6_bn_sy_w4_sse4_1(dst, dst_stride, src0, src0_stride, 
src1, + src1_stride, mask, mask_stride, h, + blend_4_b12); +} + +static INLINE void blend_mask6_bn_sy_w8n_sse4_1( + uint16_t *dst, uint32_t dst_stride, + uint16_t *src0, uint32_t src0_stride, + uint16_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, + int h, int w, blend_unit_fn blend) { + const __m128i v_maxval_w = _mm_set1_epi16(1 << MASK_BITS); + + do { + int c; + for (c = 0; c < w; c += 8) { + const __m128i v_ra_b = mm_loadl_64(mask + c); + const __m128i v_rb_b = mm_loadl_64(mask + c + mask_stride); + const __m128i v_a_b = _mm_avg_epu8(v_ra_b, v_rb_b); + + const __m128i v_m0_w = _mm_cvtepu8_epi16(v_a_b); + const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w); + + const __m128i v_res_w = blend(src0 + c, src1 + c, v_m0_w, v_m1_w); + + mm_storeu_128(dst + c, v_res_w); + } + dst += dst_stride; + src0 += src0_stride; + src1 += src1_stride; + mask += 2 * mask_stride; + } while (--h); +} + +static void blend_mask6_b10_sy_w8n_sse4_1( + uint16_t *dst, uint32_t dst_stride, + uint16_t *src0, uint32_t src0_stride, + uint16_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, + int h, int w) { + blend_mask6_bn_sy_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1, + src1_stride, mask, mask_stride, h, w, + blend_8_b10); +} + +static void blend_mask6_b12_sy_w8n_sse4_1( + uint16_t *dst, uint32_t dst_stride, + uint16_t *src0, uint32_t src0_stride, + uint16_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, + int h, int w) { + blend_mask6_bn_sy_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1, + src1_stride, mask, mask_stride, h, w, + blend_8_b12); +} + +////////////////////////////////////////////////////////////////////////////// +// Horizontal and Vertical sub-sampling +////////////////////////////////////////////////////////////////////////////// + +static INLINE void blend_mask6_bn_sx_sy_w4_sse4_1( + uint16_t *dst, uint32_t dst_stride, + uint16_t *src0, uint32_t src0_stride, + uint16_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, + int h, blend_unit_fn blend) { + const __m128i v_zmask_b = _mm_set_epi8(0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff, + 0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff); + const __m128i v_maxval_w = _mm_set1_epi16(1 << MASK_BITS); + + do { + const __m128i v_ra_b = mm_loadl_64(mask); + const __m128i v_rb_b = mm_loadl_64(mask + mask_stride); + const __m128i v_rvs_b = _mm_add_epi8(v_ra_b, v_rb_b); + const __m128i v_rvsa_w = _mm_and_si128(v_rvs_b, v_zmask_b); + const __m128i v_rvsb_w = _mm_and_si128(_mm_srli_si128(v_rvs_b, 1), + v_zmask_b); + const __m128i v_rs_w = _mm_add_epi16(v_rvsa_w, v_rvsb_w); + + const __m128i v_m0_w = mm_roundn_epu16(v_rs_w, 2); + const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w); + + const __m128i v_res_w = blend(src0, src1, v_m0_w, v_m1_w); + + mm_storel_64(dst, v_res_w); + + dst += dst_stride; + src0 += src0_stride; + src1 += src1_stride; + mask += 2 * mask_stride; + } while (--h); +} + +static void blend_mask6_b10_sx_sy_w4_sse4_1( + uint16_t *dst, uint32_t dst_stride, + uint16_t *src0, uint32_t src0_stride, + uint16_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, + int h, int w) { + (void)w; + blend_mask6_bn_sx_sy_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1, + src1_stride, mask, mask_stride, h, + blend_4_b10); +} + +static void blend_mask6_b12_sx_sy_w4_sse4_1( + uint16_t *dst, uint32_t dst_stride, + uint16_t *src0, uint32_t src0_stride, + uint16_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t 
mask_stride, + int h, int w) { + (void)w; + blend_mask6_bn_sx_sy_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1, + src1_stride, mask, mask_stride, h, + blend_4_b12); +} + +static INLINE void blend_mask6_bn_sx_sy_w8n_sse4_1( + uint16_t *dst, uint32_t dst_stride, + uint16_t *src0, uint32_t src0_stride, + uint16_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, + int h, int w, blend_unit_fn blend) { + const __m128i v_zmask_b = _mm_set_epi8(0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff, + 0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff); + const __m128i v_maxval_w = _mm_set1_epi16(1 << MASK_BITS); + + do { + int c; + for (c = 0; c < w; c += 8) { + const __m128i v_ra_b = mm_loadu_128(mask + 2 * c); + const __m128i v_rb_b = mm_loadu_128(mask + 2 * c + mask_stride); + const __m128i v_rvs_b = _mm_add_epi8(v_ra_b, v_rb_b); + const __m128i v_rvsa_w = _mm_and_si128(v_rvs_b, v_zmask_b); + const __m128i v_rvsb_w = _mm_and_si128(_mm_srli_si128(v_rvs_b, 1), + v_zmask_b); + const __m128i v_rs_w = _mm_add_epi16(v_rvsa_w, v_rvsb_w); + + const __m128i v_m0_w = mm_roundn_epu16(v_rs_w, 2); + const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w); + + const __m128i v_res_w = blend(src0 + c, src1 + c, v_m0_w, v_m1_w); + + mm_storeu_128(dst + c, v_res_w); + } + dst += dst_stride; + src0 += src0_stride; + src1 += src1_stride; + mask += 2 * mask_stride; + } while (--h); +} + +static void blend_mask6_b10_sx_sy_w8n_sse4_1( + uint16_t *dst, uint32_t dst_stride, + uint16_t *src0, uint32_t src0_stride, + uint16_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, + int h, int w) { + blend_mask6_bn_sx_sy_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1, + src1_stride, mask, mask_stride, h, w, + blend_8_b10); +} + +static void blend_mask6_b12_sx_sy_w8n_sse4_1( + uint16_t *dst, uint32_t dst_stride, + uint16_t *src0, uint32_t src0_stride, + uint16_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, + int h, int w) { + blend_mask6_bn_sx_sy_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1, + src1_stride, mask, mask_stride, h, w, + blend_8_b12); +} + +////////////////////////////////////////////////////////////////////////////// +// Dispatch +////////////////////////////////////////////////////////////////////////////// + +void vpx_highbd_blend_mask6_sse4_1(uint8_t *dst_8, uint32_t dst_stride, + uint8_t *src0_8, uint32_t src0_stride, + uint8_t *src1_8, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, + int h, int w, int suby, int subx, int bd) { + uint16_t *const dst = CONVERT_TO_SHORTPTR(dst_8); + uint16_t *const src0 = CONVERT_TO_SHORTPTR(src0_8); + uint16_t *const src1 = CONVERT_TO_SHORTPTR(src1_8); + + typedef void (*blend_fn)(uint16_t *dst, uint32_t dst_stride, + uint16_t *src0, uint32_t src0_stride, + uint16_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, + int h, int w); + + static blend_fn blend[2][2][2][2] = { // bd_index X width_index X subx X suby + { // bd == 8 or 10 + { // w % 8 == 0 + {blend_mask6_b10_w8n_sse4_1, blend_mask6_b10_sy_w8n_sse4_1}, + {blend_mask6_b10_sx_w8n_sse4_1, blend_mask6_b10_sx_sy_w8n_sse4_1} + }, { // w == 4 + {blend_mask6_b10_w4_sse4_1, blend_mask6_b10_sy_w4_sse4_1}, + {blend_mask6_b10_sx_w4_sse4_1, blend_mask6_b10_sx_sy_w4_sse4_1} + } + }, + { // bd == 12 + { // w % 8 == 0 + {blend_mask6_b12_w8n_sse4_1, blend_mask6_b12_sy_w8n_sse4_1}, + {blend_mask6_b12_sx_w8n_sse4_1, blend_mask6_b12_sx_sy_w8n_sse4_1} + }, { // w == 4 + {blend_mask6_b12_w4_sse4_1, blend_mask6_b12_sy_w4_sse4_1}, +
{blend_mask6_b12_sx_w4_sse4_1, blend_mask6_b12_sx_sy_w4_sse4_1} + } + } + }; + + assert(IMPLIES(src0 == dst, src0_stride == dst_stride)); + assert(IMPLIES(src1 == dst, src1_stride == dst_stride)); + + assert(h >= 4); + assert(w >= 4); + assert(IS_POWER_OF_TWO(h)); + assert(IS_POWER_OF_TWO(w)); + + assert(bd == 8 || bd == 10 || bd == 12); + + blend[bd == 12][(w >> 2) & 1][subx != 0][suby != 0](dst, dst_stride, + src0, src0_stride, + src1, src1_stride, + mask, mask_stride, + h, w); +} +#endif // CONFIG_VP9_HIGHBITDEPTH
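
For reference, the per-pixel operation that all of the kernels above vectorize can be summarized by the scalar sketch below. It is an illustrative sketch only, not the generic C reference this patch adds in vpx_dsp/blend_mask6.c: the function name and the SKETCH_* macros are placeholders, and MASK_BITS is assumed to be 6 (mask values in [0, 64]), matching the "mask6" naming and the (1 << MASK_BITS) maximum used by the kernels.

#include <stdint.h>

// Placeholder macros for this sketch only; the patch itself defines MASK_BITS
// and uses libvpx's own rounding helpers.
#define SKETCH_MASK_BITS 6
#define SKETCH_ROUND_POWER_OF_TWO(value, n) (((value) + (1 << ((n) - 1))) >> (n))

static void blend_mask6_scalar_sketch(uint8_t *dst, uint32_t dst_stride,
                                      const uint8_t *src0, uint32_t src0_stride,
                                      const uint8_t *src1, uint32_t src1_stride,
                                      const uint8_t *mask, uint32_t mask_stride,
                                      int h, int w, int suby, int subx) {
  int i, j;
  for (i = 0; i < h; ++i) {
    for (j = 0; j < w; ++j) {
      int m0;
      if (subx && suby) {
        // Mask is 2x sub-sampled both ways: average a 2x2 block of mask values
        // with rounding, as the sx_sy kernels do via _mm_add_epi8 followed by
        // mm_roundn_epu16(sum, 2).
        m0 = SKETCH_ROUND_POWER_OF_TWO(
            mask[(2 * i) * mask_stride + 2 * j] +
            mask[(2 * i) * mask_stride + 2 * j + 1] +
            mask[(2 * i + 1) * mask_stride + 2 * j] +
            mask[(2 * i + 1) * mask_stride + 2 * j + 1], 2);
      } else if (subx) {
        // Horizontally sub-sampled: (a + b + 1) >> 1 of two horizontal
        // neighbours, matching _mm_avg_epu8 in the sx kernels.
        m0 = SKETCH_ROUND_POWER_OF_TWO(mask[i * mask_stride + 2 * j] +
                                       mask[i * mask_stride + 2 * j + 1], 1);
      } else if (suby) {
        // Vertically sub-sampled: average two vertically adjacent mask rows.
        m0 = SKETCH_ROUND_POWER_OF_TWO(mask[(2 * i) * mask_stride + j] +
                                       mask[(2 * i + 1) * mask_stride + j], 1);
      } else {
        m0 = mask[i * mask_stride + j];
      }
      {
        // Complementary weight and rounded blend, as in blend_4()/blend_8().
        const int m1 = (1 << SKETCH_MASK_BITS) - m0;
        dst[i * dst_stride + j] = (uint8_t)SKETCH_ROUND_POWER_OF_TWO(
            m0 * src0[i * src0_stride + j] + m1 * src1[i * src1_stride + j],
            SKETCH_MASK_BITS);
      }
    }
  }
}

On dispatch, the 8-bit table index (w >> 2) & 3 maps w == 4 to 1, w == 8 to 2 and any multiple of 16 to 0, matching the table comments. The high-bitdepth table only separates w == 4 from multiples of 8, and selects the b12 kernels solely for bd == 12: for 8- and 10-bit samples the product of sample and mask still fits the plain 16-bit _mm_mullo_epi16 path, while 12-bit samples need the widening _mm_madd_epi16 path.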