Replacing vp9_get_mb_ss_sse2 asm implementation with intrinsics.
Change-Id: Ib4f5dd733eb2939b108070a01e83da5d9990bac0
This commit is contained in:
Родитель
89963bf586
Коммит
1f19ebbab6
|
@ -35,6 +35,14 @@ using ::std::tr1::make_tuple;
|
||||||
using ::std::tr1::tuple;
|
using ::std::tr1::tuple;
|
||||||
using libvpx_test::ACMRandom;
|
using libvpx_test::ACMRandom;
|
||||||
|
|
||||||
|
// Reference sum of squares used to check the optimized implementations.
//
// Reads exactly 256 consecutive int16_t values starting at |src| (presumably
// one 16x16 block, going by the "mb" in the name) and returns the sum of
// their squares.
//
// Each product is evaluated in int, where the largest possible value
// (-32768 * -32768 = 2^30) still fits, and is then accumulated into an
// unsigned total, so the result is the true sum reduced modulo 2^32.
static unsigned int mb_ss_ref(const int16_t *src) {
  unsigned int res = 0;
  for (int i = 0; i < 256; ++i) {
    res += src[i] * src[i];
  }
  return res;
}
|
||||||
|
|
||||||
static unsigned int variance_ref(const uint8_t *ref, const uint8_t *src,
|
static unsigned int variance_ref(const uint8_t *ref, const uint8_t *src,
|
||||||
int l2w, int l2h, unsigned int *sse_ptr) {
|
int l2w, int l2h, unsigned int *sse_ptr) {
|
||||||
int se = 0;
|
int se = 0;
|
||||||
|
@ -76,6 +84,50 @@ static unsigned int subpel_variance_ref(const uint8_t *ref, const uint8_t *src,
|
||||||
return sse - (((int64_t) se * se) >> (l2w + l2h));
|
return sse - (((int64_t) se * se) >> (l2w + l2h));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
typedef unsigned int (*SumOfSquaresFunction)(const int16_t *src);
|
||||||
|
|
||||||
|
// Value-parameterized fixture for sum-of-squares kernels. The test parameter
// is the function under test; it is captured once in func_ at construction.
class SumOfSquaresTest : public ::testing::TestWithParam<SumOfSquaresFunction> {
 public:
  SumOfSquaresTest() : func_(GetParam()) {}

  // Calls libvpx_test::ClearSystemState() when the fixture is torn down.
  // NOTE(review): presumably this resets CPU state (e.g. MMX/x87) that a SIMD
  // kernel may have left behind - confirm against the helper's definition.
  virtual ~SumOfSquaresTest() {
    libvpx_test::ClearSystemState();
  }

 protected:
  // Feeds buffers filled with a single constant value to func_.
  void ConstTest();
  // Compares func_ against mb_ss_ref() on random buffers.
  void RefTest();

  SumOfSquaresFunction func_;  // Implementation under test.
  ACMRandom rnd_;              // Random source used by RefTest().
};
|
||||||
|
|
||||||
|
void SumOfSquaresTest::ConstTest() {
|
||||||
|
int16_t mem[256];
|
||||||
|
unsigned int res;
|
||||||
|
for (int v = 0; v < 256; ++v) {
|
||||||
|
for (int i = 0; i < 256; ++i) {
|
||||||
|
mem[i] = v;
|
||||||
|
}
|
||||||
|
ASM_REGISTER_STATE_CHECK(res = func_(mem));
|
||||||
|
EXPECT_EQ(256u * (v * v), res);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
void SumOfSquaresTest::RefTest() {
|
||||||
|
int16_t mem[256];
|
||||||
|
for (int i = 0; i < 100; ++i) {
|
||||||
|
for (int j = 0; j < 256; ++j) {
|
||||||
|
mem[j] = rnd_.Rand8() - rnd_.Rand8();
|
||||||
|
}
|
||||||
|
|
||||||
|
const unsigned int expected = mb_ss_ref(mem);
|
||||||
|
unsigned int res;
|
||||||
|
ASM_REGISTER_STATE_CHECK(res = func_(mem));
|
||||||
|
EXPECT_EQ(expected, res);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
template<typename VarianceFunctionType>
|
template<typename VarianceFunctionType>
|
||||||
class VarianceTest
|
class VarianceTest
|
||||||
: public ::testing::TestWithParam<tuple<int, int, VarianceFunctionType> > {
|
: public ::testing::TestWithParam<tuple<int, int, VarianceFunctionType> > {
|
||||||
|
@ -362,6 +414,13 @@ INSTANTIATE_TEST_CASE_P(
|
||||||
namespace vp9 {
|
namespace vp9 {
|
||||||
|
|
||||||
#if CONFIG_VP9_ENCODER
|
#if CONFIG_VP9_ENCODER
|
||||||
|
|
||||||
|
// Sum-of-squares tests; each one runs once per instantiated implementation.
TEST_P(SumOfSquaresTest, Const) { ConstTest(); }
TEST_P(SumOfSquaresTest, Ref) { RefTest(); }

// Plain C implementation.
INSTANTIATE_TEST_CASE_P(C, SumOfSquaresTest,
                        ::testing::Values(vp9_get_mb_ss_c));
|
||||||
|
|
||||||
typedef VarianceTest<vp9_variance_fn_t> VP9VarianceTest;
|
typedef VarianceTest<vp9_variance_fn_t> VP9VarianceTest;
|
||||||
typedef SubpelVarianceTest<vp9_subpixvariance_fn_t> VP9SubpelVarianceTest;
|
typedef SubpelVarianceTest<vp9_subpixvariance_fn_t> VP9SubpelVarianceTest;
|
||||||
typedef SubpelVarianceTest<vp9_subp_avg_variance_fn_t> VP9SubpelAvgVarianceTest;
|
typedef SubpelVarianceTest<vp9_subp_avg_variance_fn_t> VP9SubpelAvgVarianceTest;
|
||||||
|
@ -487,6 +546,10 @@ INSTANTIATE_TEST_CASE_P(
|
||||||
|
|
||||||
#if HAVE_SSE2
|
#if HAVE_SSE2
|
||||||
#if CONFIG_USE_X86INC
|
#if CONFIG_USE_X86INC
|
||||||
|
|
||||||
|
// SSE2 implementation of the sum-of-squares kernel.
// NOTE(review): the surrounding diff context places this instantiation inside
// "#if CONFIG_USE_X86INC". If vp9_get_mb_ss_sse2 is now built from intrinsics
// it may not need x86inc, in which case this could move up to the enclosing
// HAVE_SSE2 guard - confirm against the build rules.
INSTANTIATE_TEST_CASE_P(SSE2, SumOfSquaresTest,
                        ::testing::Values(vp9_get_mb_ss_sse2));
|
||||||
|
|
||||||
const vp9_variance_fn_t variance4x4_sse2 = vp9_variance4x4_sse2;
|
const vp9_variance_fn_t variance4x4_sse2 = vp9_variance4x4_sse2;
|
||||||
const vp9_variance_fn_t variance4x8_sse2 = vp9_variance4x8_sse2;
|
const vp9_variance_fn_t variance4x8_sse2 = vp9_variance4x8_sse2;
|
||||||
const vp9_variance_fn_t variance8x4_sse2 = vp9_variance8x4_sse2;
|
const vp9_variance_fn_t variance8x4_sse2 = vp9_variance8x4_sse2;
|
||||||
|
|
|
@ -103,8 +103,9 @@ static void var_filter_block2d_bil_second_pass(const uint16_t *src_ptr,
|
||||||
// Returns the sum of squares of the 256 int16_t values starting at src_ptr.
//
// Each product is computed in int (the largest, -32768 * -32768 = 2^30, still
// fits) and then added to an unsigned accumulator, so the result is the true
// sum reduced modulo 2^32.
unsigned int vp9_get_mb_ss_c(const int16_t *src_ptr) {
  unsigned int i, sum = 0;

  for (i = 0; i < 256; ++i) {
    sum += src_ptr[i] * src_ptr[i];
  }

  return sum;
}
|
||||||
|
|
|
@ -1,69 +0,0 @@
|
||||||
;
|
|
||||||
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
|
|
||||||
;
|
|
||||||
; Use of this source code is governed by a BSD-style license
|
|
||||||
; that can be found in the LICENSE file in the root of the source
|
|
||||||
; tree. An additional intellectual property rights grant can be found
|
|
||||||
; in the file PATENTS. All contributing project authors may
|
|
||||||
; be found in the AUTHORS file in the root of the source tree.
|
|
||||||
;
|
|
||||||
|
|
||||||
|
|
||||||
%include "vpx_ports/x86_abi_support.asm"
|
|
||||||
|
|
||||||
;unsigned int vp9_get_mb_ss_sse2
;(
;    short *src_ptr
;)
; Returns the sum of squares of 256 16-bit values starting at src_ptr.
; The loop runs 8 times and consumes 64 bytes (32 values) per pass.
; src_ptr must be 16-byte aligned because every load is a movdqa.
global sym(vp9_get_mb_ss_sse2) PRIVATE
sym(vp9_get_mb_ss_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 1
    GET_GOT     rbx
    push rsi
    push rdi
    sub         rsp, 16
    ; end prolog


        mov         rax, arg(0) ;[src_ptr]
        mov         rcx, 8                  ; pass counter
        pxor        xmm4, xmm4              ; four 32-bit partial sums

.NEXTROW:
        movdqa      xmm0, [rax]
        movdqa      xmm1, [rax+16]
        movdqa      xmm2, [rax+32]
        movdqa      xmm3, [rax+48]
        ; pmaddwd x, x squares each word and adds adjacent pairs into dwords
        pmaddwd     xmm0, xmm0
        pmaddwd     xmm1, xmm1
        pmaddwd     xmm2, xmm2
        pmaddwd     xmm3, xmm3

        paddd       xmm0, xmm1
        paddd       xmm2, xmm3
        paddd       xmm4, xmm0
        paddd       xmm4, xmm2

        add         rax, 0x40
        dec         rcx
        ; dec updates ZF but leaves CF untouched, so branch on ZF alone
        ; instead of relying on the CF left behind by the add above.
        jnz         .NEXTROW

        ; horizontal add of the four dword lanes into lane 0
        movdqa      xmm3,xmm4
        psrldq      xmm4,8
        paddd       xmm4,xmm3
        movdqa      xmm3,xmm4
        psrldq      xmm4,4
        paddd       xmm4,xmm3
        ; the prototype returns unsigned int, so only the low 32 bits matter
        movq        rax,xmm4


    ; begin epilog
    add rsp, 16
    pop rdi
    pop rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret
|
|
|
@ -19,6 +19,21 @@ typedef unsigned int (*variance_fn_t) (const unsigned char *src, int src_stride,
|
||||||
const unsigned char *ref, int ref_stride,
|
const unsigned char *ref, int ref_stride,
|
||||||
unsigned int *sse, int *sum);
|
unsigned int *sse, int *sum);
|
||||||
|
|
||||||
|
// SSE2 sum of squares of the 256 int16_t values starting at src.
//
// Loads are unaligned (_mm_loadu_si128), so src has no alignment requirement.
// All additions are 32-bit and wrap, so the result is the true sum reduced
// modulo 2^32.
unsigned int vp9_get_mb_ss_sse2(const int16_t *src) {
  // Four 32-bit partial sums.
  __m128i vsum = _mm_setzero_si128();
  int i;

  // 32 iterations x 8 values = 256 values.
  for (i = 0; i < 32; ++i) {
    const __m128i v = _mm_loadu_si128((const __m128i *)src);
    // madd(v, v) squares each 16-bit lane and adds adjacent pairs into
    // 32-bit lanes.
    vsum = _mm_add_epi32(vsum, _mm_madd_epi16(v, v));
    src += 8;
  }

  // Horizontal add: fold the upper 64 bits onto the lower, then lane 1 onto
  // lane 0, leaving the total in the lowest 32-bit lane.
  vsum = _mm_add_epi32(vsum, _mm_srli_si128(vsum, 8));
  vsum = _mm_add_epi32(vsum, _mm_srli_si128(vsum, 4));
  // _mm_cvtsi128_si32 yields int; make the reinterpretation as unsigned
  // explicit.
  return (unsigned int)_mm_cvtsi128_si32(vsum);
}
|
||||||
|
|
||||||
// Loads 4 bytes from row i and 4 bytes from row i + 1 of the byte buffer p
// (rows are stride bytes apart) and interleaves them byte-wise into the low
// 64 bits of an __m128i. Every macro argument is parenthesized so that
// expression arguments such as "i + 1" address the intended row.
#define READ64(p, stride, i)                                        \
  _mm_unpacklo_epi8(                                                \
      _mm_cvtsi32_si128(*(const uint32_t *)((p) + (i) * (stride))), \
      _mm_cvtsi32_si128(*(const uint32_t *)((p) + ((i) + 1) * (stride))))
|
||||||
|
|
|
@ -93,7 +93,6 @@ VP9_CX_SRCS-yes += encoder/vp9_temporal_filter.h
|
||||||
VP9_CX_SRCS-yes += encoder/vp9_mbgraph.c
|
VP9_CX_SRCS-yes += encoder/vp9_mbgraph.c
|
||||||
VP9_CX_SRCS-yes += encoder/vp9_mbgraph.h
|
VP9_CX_SRCS-yes += encoder/vp9_mbgraph.h
|
||||||
|
|
||||||
VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_variance_impl_sse2.asm
|
|
||||||
VP9_CX_SRCS-$(HAVE_AVX2) += encoder/x86/vp9_variance_impl_intrin_avx2.c
|
VP9_CX_SRCS-$(HAVE_AVX2) += encoder/x86/vp9_variance_impl_intrin_avx2.c
|
||||||
VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_sad4d_sse2.asm
|
VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_sad4d_sse2.asm
|
||||||
VP9_CX_SRCS-$(HAVE_AVX2) += encoder/x86/vp9_sad4d_intrin_avx2.c
|
VP9_CX_SRCS-$(HAVE_AVX2) += encoder/x86/vp9_sad4d_intrin_avx2.c
|
||||||
|
|
Загрузка…
Ссылка в новой задаче