Replacing vp9_get_mb_ss_sse2 asm implementation with intrinsics.

Change-Id: Ib4f5dd733eb2939b108070a01e83da5d9990bac0
This commit is contained in:
Dmitry Kovalev 2014-09-06 00:10:25 -07:00
Родитель 89963bf586
Коммит 1f19ebbab6
5 изменённых файлов: 80 добавлений и 71 удалений

Просмотреть файл

@ -35,6 +35,14 @@ using ::std::tr1::make_tuple;
using ::std::tr1::tuple; using ::std::tr1::tuple;
using libvpx_test::ACMRandom; using libvpx_test::ACMRandom;
// Reference sum of squares over a block of 256 int16_t values.
// Each product is formed in int and folded into an unsigned accumulator,
// so the total wraps modulo 2^32 on overflow.
static unsigned int mb_ss_ref(const int16_t *src) {
  const int16_t *const end = src + 256;
  unsigned int total = 0;
  while (src != end) {
    const int coeff = *src++;
    total += coeff * coeff;
  }
  return total;
}
static unsigned int variance_ref(const uint8_t *ref, const uint8_t *src, static unsigned int variance_ref(const uint8_t *ref, const uint8_t *src,
int l2w, int l2h, unsigned int *sse_ptr) { int l2w, int l2h, unsigned int *sse_ptr) {
int se = 0; int se = 0;
@ -76,6 +84,50 @@ static unsigned int subpel_variance_ref(const uint8_t *ref, const uint8_t *src,
return sse - (((int64_t) se * se) >> (l2w + l2h)); return sse - (((int64_t) se * se) >> (l2w + l2h));
} }
// Signature shared by the sum-of-squares implementations under test: takes a
// pointer to int16_t values and returns an unsigned int.
typedef unsigned int (*SumOfSquaresFunction)(const int16_t *src);
// Value-parameterized fixture; the test parameter is the function to exercise.
class SumOfSquaresTest : public ::testing::TestWithParam<SumOfSquaresFunction> {
public:
// Caches the function pointer supplied as the test parameter.
SumOfSquaresTest() : func_(GetParam()) {}
virtual ~SumOfSquaresTest() {
// NOTE(review): presumably resets CPU state left behind by SIMD code --
// confirm against the definition of ClearSystemState().
libvpx_test::ClearSystemState();
}
protected:
// Feeds blocks filled with one constant value and checks the result against
// the closed form 256 * v * v.
void ConstTest();
// Feeds random blocks and checks the result against mb_ss_ref().
void RefTest();
// Implementation under test.
SumOfSquaresFunction func_;
// Random source used by RefTest().
ACMRandom rnd_;
};
// Checks func_ on blocks where all 256 entries hold the same value, for every
// value in [0, 255]. The expected sum of squares is then 256 * value^2.
void SumOfSquaresTest::ConstTest() {
  int16_t block[256];
  for (int value = 0; value < 256; ++value) {
    // Fill the whole block with the current constant.
    for (int16_t *p = block; p != block + 256; ++p) {
      *p = value;
    }
    unsigned int actual;
    ASM_REGISTER_STATE_CHECK(actual = func_(block));
    EXPECT_EQ(256u * (value * value), actual);
  }
}
// Checks func_ against mb_ss_ref() on 100 random blocks. Each entry is the
// difference of two Rand8() draws.
void SumOfSquaresTest::RefTest() {
  int16_t block[256];
  int round = 0;
  while (round < 100) {
    for (int k = 0; k < 256; ++k) {
      block[k] = rnd_.Rand8() - rnd_.Rand8();
    }

    const unsigned int want = mb_ss_ref(block);
    unsigned int got;
    ASM_REGISTER_STATE_CHECK(got = func_(block));
    EXPECT_EQ(want, got);
    ++round;
  }
}
template<typename VarianceFunctionType> template<typename VarianceFunctionType>
class VarianceTest class VarianceTest
: public ::testing::TestWithParam<tuple<int, int, VarianceFunctionType> > { : public ::testing::TestWithParam<tuple<int, int, VarianceFunctionType> > {
@ -362,6 +414,13 @@ INSTANTIATE_TEST_CASE_P(
namespace vp9 { namespace vp9 {
#if CONFIG_VP9_ENCODER #if CONFIG_VP9_ENCODER
// Each TEST_P forwards to the fixture method of the same purpose, so both
// checks run once per instantiated function.
TEST_P(SumOfSquaresTest, Const) { ConstTest(); }
TEST_P(SumOfSquaresTest, Ref) { RefTest(); }
// Instantiation labelled "C", parameterized with vp9_get_mb_ss_c.
INSTANTIATE_TEST_CASE_P(C, SumOfSquaresTest,
::testing::Values(vp9_get_mb_ss_c));
typedef VarianceTest<vp9_variance_fn_t> VP9VarianceTest; typedef VarianceTest<vp9_variance_fn_t> VP9VarianceTest;
typedef SubpelVarianceTest<vp9_subpixvariance_fn_t> VP9SubpelVarianceTest; typedef SubpelVarianceTest<vp9_subpixvariance_fn_t> VP9SubpelVarianceTest;
typedef SubpelVarianceTest<vp9_subp_avg_variance_fn_t> VP9SubpelAvgVarianceTest; typedef SubpelVarianceTest<vp9_subp_avg_variance_fn_t> VP9SubpelAvgVarianceTest;
@ -487,6 +546,10 @@ INSTANTIATE_TEST_CASE_P(
#if HAVE_SSE2 #if HAVE_SSE2
#if CONFIG_USE_X86INC #if CONFIG_USE_X86INC
// Instantiation labelled "SSE2", parameterized with vp9_get_mb_ss_sse2.
// NOTE(review): this sits inside the CONFIG_USE_X86INC guard, but the
// function is now written with intrinsics and presumably does not need
// x86inc -- confirm whether it should be guarded by HAVE_SSE2 only.
INSTANTIATE_TEST_CASE_P(SSE2, SumOfSquaresTest,
::testing::Values(vp9_get_mb_ss_sse2));
const vp9_variance_fn_t variance4x4_sse2 = vp9_variance4x4_sse2; const vp9_variance_fn_t variance4x4_sse2 = vp9_variance4x4_sse2;
const vp9_variance_fn_t variance4x8_sse2 = vp9_variance4x8_sse2; const vp9_variance_fn_t variance4x8_sse2 = vp9_variance4x8_sse2;
const vp9_variance_fn_t variance8x4_sse2 = vp9_variance8x4_sse2; const vp9_variance_fn_t variance8x4_sse2 = vp9_variance8x4_sse2;

Просмотреть файл

@ -103,8 +103,9 @@ static void var_filter_block2d_bil_second_pass(const uint16_t *src_ptr,
unsigned int vp9_get_mb_ss_c(const int16_t *src_ptr) { unsigned int vp9_get_mb_ss_c(const int16_t *src_ptr) {
unsigned int i, sum = 0; unsigned int i, sum = 0;
for (i = 0; i < 256; i++) for (i = 0; i < 256; ++i) {
sum += src_ptr[i] * src_ptr[i]; sum += src_ptr[i] * src_ptr[i];
}
return sum; return sum;
} }

Просмотреть файл

@ -1,69 +0,0 @@
;
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;
%include "vpx_ports/x86_abi_support.asm"
; Sum of squares of 256 signed 16-bit values starting at src_ptr.
; The loop runs 8 times and consumes 64 bytes (32 words) per iteration.
; movdqa is used for the loads, so src_ptr must be 16-byte aligned.
;unsigned int vp9_get_mb_ss_sse2
;(
; short *src_ptr
;)
global sym(vp9_get_mb_ss_sse2) PRIVATE
sym(vp9_get_mb_ss_sse2):
push rbp
mov rbp, rsp
; NOTE(review): SHADOW_ARGS_TO_STACK, GET_GOT, arg() and the matching epilog
; macros are presumably provided by vpx_ports/x86_abi_support.asm -- confirm.
SHADOW_ARGS_TO_STACK 1
GET_GOT rbx
; rsi and rdi are saved here, and 16 bytes of stack are reserved, but none of
; them is referenced by the visible body.
push rsi
push rdi
sub rsp, 16
; end prolog
mov rax, arg(0) ;[src_ptr]
; rcx = number of 64-byte chunks left to process
mov rcx, 8
; xmm4 = four 32-bit partial sums, start at zero
pxor xmm4, xmm4
.NEXTROW:
; load 4 x 8 words
movdqa xmm0, [rax]
movdqa xmm1, [rax+16]
movdqa xmm2, [rax+32]
movdqa xmm3, [rax+48]
; pmaddwd x, x squares each word and adds adjacent pairs into 32-bit lanes
pmaddwd xmm0, xmm0
pmaddwd xmm1, xmm1
pmaddwd xmm2, xmm2
pmaddwd xmm3, xmm3
; fold the four registers into the running accumulator
paddd xmm0, xmm1
paddd xmm2, xmm3
paddd xmm4, xmm0
paddd xmm4, xmm2
add rax, 0x40
dec rcx
; NOTE(review): ja tests CF as well as ZF, and dec leaves CF untouched, so
; this branch depends on CF being clear after the add above. jnz would
; express the loop condition directly.
ja .NEXTROW
; horizontal add: fold the upper 64 bits onto the lower, then lane 1 onto lane 0
movdqa xmm3,xmm4
psrldq xmm4,8
paddd xmm4,xmm3
movdqa xmm3,xmm4
psrldq xmm4,4
paddd xmm4,xmm3
; the low 32 bits of the result hold the total
movq rax,xmm4
; begin epilog
add rsp, 16
pop rdi
pop rsi
RESTORE_GOT
UNSHADOW_ARGS
pop rbp
ret

Просмотреть файл

@ -19,6 +19,21 @@ typedef unsigned int (*variance_fn_t) (const unsigned char *src, int src_stride,
const unsigned char *ref, int ref_stride, const unsigned char *ref, int ref_stride,
unsigned int *sse, int *sum); unsigned int *sse, int *sum);
// Returns the sum of squares of the 256 int16_t values at src.
// Unaligned loads are used, so src carries no alignment requirement.
unsigned int vp9_get_mb_ss_sse2(const int16_t *src) {
  // View the input as 32 vectors of eight 16-bit values.
  const __m128i *const rows = (const __m128i *)src;
  __m128i acc = _mm_setzero_si128();
  __m128i folded;
  int row;

  for (row = 0; row < 32; ++row) {
    const __m128i coeffs = _mm_loadu_si128(rows + row);
    // madd(x, x) squares each value and adds adjacent pairs into 32-bit lanes.
    const __m128i squares = _mm_madd_epi16(coeffs, coeffs);
    acc = _mm_add_epi32(acc, squares);
  }

  // Horizontal add: upper half onto lower half, then lane 1 onto lane 0.
  folded = _mm_add_epi32(acc, _mm_srli_si128(acc, 8));
  folded = _mm_add_epi32(folded, _mm_srli_si128(folded, 4));
  return _mm_cvtsi128_si32(folded);
}
#define READ64(p, stride, i) \ #define READ64(p, stride, i) \
_mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const uint32_t *)(p + i * stride)), \ _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const uint32_t *)(p + i * stride)), \
_mm_cvtsi32_si128(*(const uint32_t *)(p + (i + 1) * stride))) _mm_cvtsi32_si128(*(const uint32_t *)(p + (i + 1) * stride)))

Просмотреть файл

@ -93,7 +93,6 @@ VP9_CX_SRCS-yes += encoder/vp9_temporal_filter.h
VP9_CX_SRCS-yes += encoder/vp9_mbgraph.c VP9_CX_SRCS-yes += encoder/vp9_mbgraph.c
VP9_CX_SRCS-yes += encoder/vp9_mbgraph.h VP9_CX_SRCS-yes += encoder/vp9_mbgraph.h
VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_variance_impl_sse2.asm
VP9_CX_SRCS-$(HAVE_AVX2) += encoder/x86/vp9_variance_impl_intrin_avx2.c VP9_CX_SRCS-$(HAVE_AVX2) += encoder/x86/vp9_variance_impl_intrin_avx2.c
VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_sad4d_sse2.asm VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_sad4d_sse2.asm
VP9_CX_SRCS-$(HAVE_AVX2) += encoder/x86/vp9_sad4d_intrin_avx2.c VP9_CX_SRCS-$(HAVE_AVX2) += encoder/x86/vp9_sad4d_intrin_avx2.c