From 1f19ebbab612c2a382b98787a85f2825279ef42a Mon Sep 17 00:00:00 2001 From: Dmitry Kovalev Date: Sat, 6 Sep 2014 00:10:25 -0700 Subject: [PATCH] Replacing vp9_get_mb_ss_sse2 asm implementation with intrinsics. Change-Id: Ib4f5dd733eb2939b108070a01e83da5d9990bac0 --- test/variance_test.cc | 63 ++++++++++++++++++++ vp9/encoder/vp9_variance.c | 3 +- vp9/encoder/x86/vp9_variance_impl_sse2.asm | 69 ---------------------- vp9/encoder/x86/vp9_variance_sse2.c | 15 +++++ vp9/vp9cx.mk | 1 - 5 files changed, 80 insertions(+), 71 deletions(-) delete mode 100644 vp9/encoder/x86/vp9_variance_impl_sse2.asm diff --git a/test/variance_test.cc b/test/variance_test.cc index b1354703d..f76402e4b 100644 --- a/test/variance_test.cc +++ b/test/variance_test.cc @@ -35,6 +35,14 @@ using ::std::tr1::make_tuple; using ::std::tr1::tuple; using libvpx_test::ACMRandom; +static unsigned int mb_ss_ref(const int16_t *src) { + unsigned int res = 0; + for (int i = 0; i < 256; ++i) { + res += src[i] * src[i]; + } + return res; +} + static unsigned int variance_ref(const uint8_t *ref, const uint8_t *src, int l2w, int l2h, unsigned int *sse_ptr) { int se = 0; @@ -76,6 +84,50 @@ static unsigned int subpel_variance_ref(const uint8_t *ref, const uint8_t *src, return sse - (((int64_t) se * se) >> (l2w + l2h)); } +typedef unsigned int (*SumOfSquaresFunction)(const int16_t *src); + +class SumOfSquaresTest : public ::testing::TestWithParam<SumOfSquaresFunction> { + public: + SumOfSquaresTest() : func_(GetParam()) {} + + virtual ~SumOfSquaresTest() { + libvpx_test::ClearSystemState(); + } + + protected: + void ConstTest(); + void RefTest(); + + SumOfSquaresFunction func_; + ACMRandom rnd_; +}; + +void SumOfSquaresTest::ConstTest() { + int16_t mem[256]; + unsigned int res; + for (int v = 0; v < 256; ++v) { + for (int i = 0; i < 256; ++i) { + mem[i] = v; + } + ASM_REGISTER_STATE_CHECK(res = func_(mem)); + EXPECT_EQ(256u * (v * v), res); + } +} + +void SumOfSquaresTest::RefTest() { + int16_t mem[256]; + for (int i = 0; i < 100; ++i) 
{ + for (int j = 0; j < 256; ++j) { + mem[j] = rnd_.Rand8() - rnd_.Rand8(); + } + + const unsigned int expected = mb_ss_ref(mem); + unsigned int res; + ASM_REGISTER_STATE_CHECK(res = func_(mem)); + EXPECT_EQ(expected, res); + } +} + template class VarianceTest : public ::testing::TestWithParam > { @@ -362,6 +414,13 @@ INSTANTIATE_TEST_CASE_P( namespace vp9 { #if CONFIG_VP9_ENCODER + +TEST_P(SumOfSquaresTest, Const) { ConstTest(); } +TEST_P(SumOfSquaresTest, Ref) { RefTest(); } + +INSTANTIATE_TEST_CASE_P(C, SumOfSquaresTest, + ::testing::Values(vp9_get_mb_ss_c)); + typedef VarianceTest VP9VarianceTest; typedef SubpelVarianceTest VP9SubpelVarianceTest; typedef SubpelVarianceTest VP9SubpelAvgVarianceTest; @@ -487,6 +546,10 @@ INSTANTIATE_TEST_CASE_P( #if HAVE_SSE2 #if CONFIG_USE_X86INC + +INSTANTIATE_TEST_CASE_P(SSE2, SumOfSquaresTest, + ::testing::Values(vp9_get_mb_ss_sse2)); + const vp9_variance_fn_t variance4x4_sse2 = vp9_variance4x4_sse2; const vp9_variance_fn_t variance4x8_sse2 = vp9_variance4x8_sse2; const vp9_variance_fn_t variance8x4_sse2 = vp9_variance8x4_sse2; diff --git a/vp9/encoder/vp9_variance.c b/vp9/encoder/vp9_variance.c index eb5ae2e41..afbb191ad 100644 --- a/vp9/encoder/vp9_variance.c +++ b/vp9/encoder/vp9_variance.c @@ -103,8 +103,9 @@ static void var_filter_block2d_bil_second_pass(const uint16_t *src_ptr, unsigned int vp9_get_mb_ss_c(const int16_t *src_ptr) { unsigned int i, sum = 0; - for (i = 0; i < 256; i++) + for (i = 0; i < 256; ++i) { sum += src_ptr[i] * src_ptr[i]; + } return sum; } diff --git a/vp9/encoder/x86/vp9_variance_impl_sse2.asm b/vp9/encoder/x86/vp9_variance_impl_sse2.asm deleted file mode 100644 index 6278f2a78..000000000 --- a/vp9/encoder/x86/vp9_variance_impl_sse2.asm +++ /dev/null @@ -1,69 +0,0 @@ -; -; Copyright (c) 2010 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. 
An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. -; - - -%include "vpx_ports/x86_abi_support.asm" - -;unsigned int vp9_get_mb_ss_sse2 -;( -; short *src_ptr -;) -global sym(vp9_get_mb_ss_sse2) PRIVATE -sym(vp9_get_mb_ss_sse2): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 1 - GET_GOT rbx - push rsi - push rdi - sub rsp, 16 - ; end prolog - - - mov rax, arg(0) ;[src_ptr] - mov rcx, 8 - pxor xmm4, xmm4 - -.NEXTROW: - movdqa xmm0, [rax] - movdqa xmm1, [rax+16] - movdqa xmm2, [rax+32] - movdqa xmm3, [rax+48] - pmaddwd xmm0, xmm0 - pmaddwd xmm1, xmm1 - pmaddwd xmm2, xmm2 - pmaddwd xmm3, xmm3 - - paddd xmm0, xmm1 - paddd xmm2, xmm3 - paddd xmm4, xmm0 - paddd xmm4, xmm2 - - add rax, 0x40 - dec rcx - ja .NEXTROW - - movdqa xmm3,xmm4 - psrldq xmm4,8 - paddd xmm4,xmm3 - movdqa xmm3,xmm4 - psrldq xmm4,4 - paddd xmm4,xmm3 - movq rax,xmm4 - - - ; begin epilog - add rsp, 16 - pop rdi - pop rsi - RESTORE_GOT - UNSHADOW_ARGS - pop rbp - ret diff --git a/vp9/encoder/x86/vp9_variance_sse2.c b/vp9/encoder/x86/vp9_variance_sse2.c index 1cc4bbca6..b4d2b0ac4 100644 --- a/vp9/encoder/x86/vp9_variance_sse2.c +++ b/vp9/encoder/x86/vp9_variance_sse2.c @@ -19,6 +19,21 @@ typedef unsigned int (*variance_fn_t) (const unsigned char *src, int src_stride, const unsigned char *ref, int ref_stride, unsigned int *sse, int *sum); +unsigned int vp9_get_mb_ss_sse2(const int16_t *src) { + __m128i vsum = _mm_setzero_si128(); + int i; + + for (i = 0; i < 32; ++i) { + const __m128i v = _mm_loadu_si128((const __m128i *)src); + vsum = _mm_add_epi32(vsum, _mm_madd_epi16(v, v)); + src += 8; + } + + vsum = _mm_add_epi32(vsum, _mm_srli_si128(vsum, 8)); + vsum = _mm_add_epi32(vsum, _mm_srli_si128(vsum, 4)); + return _mm_cvtsi128_si32(vsum); +} + #define READ64(p, stride, i) \ _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const uint32_t *)(p + i * stride)), \ _mm_cvtsi32_si128(*(const 
uint32_t *)(p + (i + 1) * stride))) diff --git a/vp9/vp9cx.mk b/vp9/vp9cx.mk index 312f71700..e450f7b7f 100644 --- a/vp9/vp9cx.mk +++ b/vp9/vp9cx.mk @@ -93,7 +93,6 @@ VP9_CX_SRCS-yes += encoder/vp9_temporal_filter.h VP9_CX_SRCS-yes += encoder/vp9_mbgraph.c VP9_CX_SRCS-yes += encoder/vp9_mbgraph.h -VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_variance_impl_sse2.asm VP9_CX_SRCS-$(HAVE_AVX2) += encoder/x86/vp9_variance_impl_intrin_avx2.c VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_sad4d_sse2.asm VP9_CX_SRCS-$(HAVE_AVX2) += encoder/x86/vp9_sad4d_intrin_avx2.c