Replacing vp9_get_mb_ss_sse2 asm implementation with intrinsics.
Change-Id: Ib4f5dd733eb2939b108070a01e83da5d9990bac0
This commit is contained in:
Родитель
89963bf586
Коммит
1f19ebbab6
|
@ -35,6 +35,14 @@ using ::std::tr1::make_tuple;
|
|||
using ::std::tr1::tuple;
|
||||
using libvpx_test::ACMRandom;
|
||||
|
||||
// Reference implementation used to validate optimized versions: adds up the
// squares of the 256 int16_t values starting at src. The total is accumulated
// in an unsigned int, so it wraps on overflow.
static unsigned int mb_ss_ref(const int16_t *src) {
  const int16_t *const end = src + 256;
  unsigned int total = 0;
  for (const int16_t *p = src; p != end; ++p) {
    const int coeff = *p;
    total += coeff * coeff;
  }
  return total;
}
|
||||
|
||||
static unsigned int variance_ref(const uint8_t *ref, const uint8_t *src,
|
||||
int l2w, int l2h, unsigned int *sse_ptr) {
|
||||
int se = 0;
|
||||
|
@ -76,6 +84,50 @@ static unsigned int subpel_variance_ref(const uint8_t *ref, const uint8_t *src,
|
|||
return sse - (((int64_t) se * se) >> (l2w + l2h));
|
||||
}
|
||||
|
||||
typedef unsigned int (*SumOfSquaresFunction)(const int16_t *src);
|
||||
|
||||
class SumOfSquaresTest : public ::testing::TestWithParam<SumOfSquaresFunction> {
|
||||
public:
|
||||
SumOfSquaresTest() : func_(GetParam()) {}
|
||||
|
||||
virtual ~SumOfSquaresTest() {
|
||||
libvpx_test::ClearSystemState();
|
||||
}
|
||||
|
||||
protected:
|
||||
void ConstTest();
|
||||
void RefTest();
|
||||
|
||||
SumOfSquaresFunction func_;
|
||||
ACMRandom rnd_;
|
||||
};
|
||||
|
||||
void SumOfSquaresTest::ConstTest() {
|
||||
int16_t mem[256];
|
||||
unsigned int res;
|
||||
for (int v = 0; v < 256; ++v) {
|
||||
for (int i = 0; i < 256; ++i) {
|
||||
mem[i] = v;
|
||||
}
|
||||
ASM_REGISTER_STATE_CHECK(res = func_(mem));
|
||||
EXPECT_EQ(256u * (v * v), res);
|
||||
}
|
||||
}
|
||||
|
||||
void SumOfSquaresTest::RefTest() {
|
||||
int16_t mem[256];
|
||||
for (int i = 0; i < 100; ++i) {
|
||||
for (int j = 0; j < 256; ++j) {
|
||||
mem[j] = rnd_.Rand8() - rnd_.Rand8();
|
||||
}
|
||||
|
||||
const unsigned int expected = mb_ss_ref(mem);
|
||||
unsigned int res;
|
||||
ASM_REGISTER_STATE_CHECK(res = func_(mem));
|
||||
EXPECT_EQ(expected, res);
|
||||
}
|
||||
}
|
||||
|
||||
template<typename VarianceFunctionType>
|
||||
class VarianceTest
|
||||
: public ::testing::TestWithParam<tuple<int, int, VarianceFunctionType> > {
|
||||
|
@ -362,6 +414,13 @@ INSTANTIATE_TEST_CASE_P(
|
|||
namespace vp9 {
|
||||
|
||||
#if CONFIG_VP9_ENCODER
|
||||
|
||||
// Value-parameterized test bodies: each one forwards to the matching fixture
// method and runs once per function registered for SumOfSquaresTest.
TEST_P(SumOfSquaresTest, Const) { ConstTest(); }
|
||||
TEST_P(SumOfSquaresTest, Ref) { RefTest(); }
|
||||
|
||||
// Register the plain C implementation under the "C" instantiation prefix.
INSTANTIATE_TEST_CASE_P(C, SumOfSquaresTest,
|
||||
::testing::Values(vp9_get_mb_ss_c));
|
||||
|
||||
// Concrete fixtures for VP9: the generic variance test templates instantiated
// with the VP9 function-pointer types.
typedef VarianceTest<vp9_variance_fn_t> VP9VarianceTest;
|
||||
typedef SubpelVarianceTest<vp9_subpixvariance_fn_t> VP9SubpelVarianceTest;
|
||||
typedef SubpelVarianceTest<vp9_subp_avg_variance_fn_t> VP9SubpelAvgVarianceTest;
|
||||
|
@ -487,6 +546,10 @@ INSTANTIATE_TEST_CASE_P(
|
|||
|
||||
#if HAVE_SSE2
|
||||
#if CONFIG_USE_X86INC
|
||||
|
||||
// Register the SSE2 implementation under the "SSE2" instantiation prefix.
// NOTE(review): this registration sits inside the CONFIG_USE_X86INC guard
// opened just above; if vp9_get_mb_ss_sse2 no longer depends on x86inc, the
// guard may hide this test unnecessarily -- confirm.
INSTANTIATE_TEST_CASE_P(SSE2, SumOfSquaresTest,
|
||||
::testing::Values(vp9_get_mb_ss_sse2));
|
||||
|
||||
// Typed aliases for the SSE2 variance functions, one per block size
// (presumably consumed by the test instantiations that follow -- not visible
// here).
const vp9_variance_fn_t variance4x4_sse2 = vp9_variance4x4_sse2;
|
||||
const vp9_variance_fn_t variance4x8_sse2 = vp9_variance4x8_sse2;
|
||||
const vp9_variance_fn_t variance8x4_sse2 = vp9_variance8x4_sse2;
|
||||
|
|
|
@ -103,8 +103,9 @@ static void var_filter_block2d_bil_second_pass(const uint16_t *src_ptr,
|
|||
unsigned int vp9_get_mb_ss_c(const int16_t *src_ptr) {
|
||||
unsigned int i, sum = 0;
|
||||
|
||||
for (i = 0; i < 256; i++)
|
||||
for (i = 0; i < 256; ++i) {
|
||||
sum += src_ptr[i] * src_ptr[i];
|
||||
}
|
||||
|
||||
return sum;
|
||||
}
|
||||
|
|
|
@ -1,69 +0,0 @@
|
|||
;
|
||||
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
|
||||
;
|
||||
; Use of this source code is governed by a BSD-style license
|
||||
; that can be found in the LICENSE file in the root of the source
|
||||
; tree. An additional intellectual property rights grant can be found
|
||||
; in the file PATENTS. All contributing project authors may
|
||||
; be found in the AUTHORS file in the root of the source tree.
|
||||
;
|
||||
|
||||
|
||||
%include "vpx_ports/x86_abi_support.asm"
|
||||
|
||||
;unsigned int vp9_get_mb_ss_sse2
;(
;    short *src_ptr
;)
;
; Returns the sum of squares of the 256 16-bit values at src_ptr:
; 8 loop iterations, each consuming 64 bytes (four XMM registers of 8 words).
; The loads use movdqa, so src_ptr must be 16-byte aligned.
global sym(vp9_get_mb_ss_sse2) PRIVATE
sym(vp9_get_mb_ss_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 1
    GET_GOT     rbx
    push        rsi
    push        rdi
    sub         rsp, 16
    ; end prolog

    mov         rax, arg(0)             ;[src_ptr]
    mov         rcx, 8                  ; iteration count
    pxor        xmm4, xmm4              ; xmm4 = four 32-bit running sums

.NEXTROW:
    movdqa      xmm0, [rax]
    movdqa      xmm1, [rax+16]
    movdqa      xmm2, [rax+32]
    movdqa      xmm3, [rax+48]
    pmaddwd     xmm0, xmm0              ; square each word, add adjacent pairs
    pmaddwd     xmm1, xmm1
    pmaddwd     xmm2, xmm2
    pmaddwd     xmm3, xmm3

    paddd       xmm0, xmm1
    paddd       xmm2, xmm3
    paddd       xmm4, xmm0
    paddd       xmm4, xmm2

    add         rax, 0x40
    dec         rcx
    ; dec does not update CF, so branch on ZF alone (jnz) instead of ja, which
    ; would also consult the carry left behind by the add above.
    jnz         .NEXTROW

    ; fold the four dword sums into the low dword
    movdqa      xmm3, xmm4
    psrldq      xmm4, 8
    paddd       xmm4, xmm3
    movdqa      xmm3, xmm4
    psrldq      xmm4, 4
    paddd       xmm4, xmm3
    movq        rax, xmm4               ; low 32 bits hold the total

    ; begin epilog
    add         rsp, 16
    pop         rdi
    pop         rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret
|
|
@ -19,6 +19,21 @@ typedef unsigned int (*variance_fn_t) (const unsigned char *src, int src_stride,
|
|||
const unsigned char *ref, int ref_stride,
|
||||
unsigned int *sse, int *sum);
|
||||
|
||||
unsigned int vp9_get_mb_ss_sse2(const int16_t *src) {
|
||||
__m128i vsum = _mm_setzero_si128();
|
||||
int i;
|
||||
|
||||
for (i = 0; i < 32; ++i) {
|
||||
const __m128i v = _mm_loadu_si128((const __m128i *)src);
|
||||
vsum = _mm_add_epi32(vsum, _mm_madd_epi16(v, v));
|
||||
src += 8;
|
||||
}
|
||||
|
||||
vsum = _mm_add_epi32(vsum, _mm_srli_si128(vsum, 8));
|
||||
vsum = _mm_add_epi32(vsum, _mm_srli_si128(vsum, 4));
|
||||
return _mm_cvtsi128_si32(vsum);
|
||||
}
|
||||
|
||||
#define READ64(p, stride, i) \
|
||||
_mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const uint32_t *)(p + i * stride)), \
|
||||
_mm_cvtsi32_si128(*(const uint32_t *)(p + (i + 1) * stride)))
|
||||
|
|
|
@ -93,7 +93,6 @@ VP9_CX_SRCS-yes += encoder/vp9_temporal_filter.h
|
|||
VP9_CX_SRCS-yes += encoder/vp9_mbgraph.c
|
||||
VP9_CX_SRCS-yes += encoder/vp9_mbgraph.h
|
||||
|
||||
VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_variance_impl_sse2.asm
|
||||
VP9_CX_SRCS-$(HAVE_AVX2) += encoder/x86/vp9_variance_impl_intrin_avx2.c
|
||||
VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_sad4d_sse2.asm
|
||||
VP9_CX_SRCS-$(HAVE_AVX2) += encoder/x86/vp9_sad4d_intrin_avx2.c
|
||||
|
|
Загрузка…
Ссылка в новой задаче