Widen vp9_satd() to return int, add a NEON implementation and unit tests.

vp9_satd_c() used to cast its sum to int16_t even though its own comment
puts the result at 26 bits, and the SSE2 version accumulated in 16-bit
lanes. This patch changes the prototype to return int, rewrites the SSE2
kernel to accumulate in 32-bit lanes, adds vp9_satd_neon(), and adds
SatdTest (lengths 16/64/256/1024) for the C, SSE2 and NEON versions.

diff --git a/test/vp9_avg_test.cc b/test/vp9_avg_test.cc
index d38313116..290bdc75e 100644
--- a/test/vp9_avg_test.cc
+++ b/test/vp9_avg_test.cc
@@ -194,6 +194,48 @@ class IntProColTest
   int16_t sum_c_;
 };
 
+typedef int (*SatdFunc)(const int16_t *coeffs, int length);
+typedef std::tr1::tuple<int, SatdFunc> SatdTestParam;
+
+class SatdTest
+    : public ::testing::Test,
+      public ::testing::WithParamInterface<SatdTestParam> {
+ protected:
+  virtual void SetUp() {
+    satd_size_ = GET_PARAM(0);
+    satd_func_ = GET_PARAM(1);
+    rnd_.Reset(ACMRandom::DeterministicSeed());
+    src_ = reinterpret_cast<int16_t *>(
+        vpx_memalign(16, sizeof(*src_) * satd_size_));
+    ASSERT_TRUE(src_ != NULL);
+  }
+
+  virtual void TearDown() {
+    libvpx_test::ClearSystemState();
+    vpx_free(src_);
+  }
+
+  void FillConstant(const int16_t val) {
+    for (int i = 0; i < satd_size_; ++i) src_[i] = val;
+  }
+
+  void FillRandom() {
+    for (int i = 0; i < satd_size_; ++i) src_[i] = rnd_.Rand16();
+  }
+
+  void Check(const int expected) {
+    int total;
+    ASM_REGISTER_STATE_CHECK(total = satd_func_(src_, satd_size_));
+    EXPECT_EQ(expected, total);
+  }
+
+  int satd_size_;
+
+ private:
+  int16_t *src_;
+  SatdFunc satd_func_;
+  ACMRandom rnd_;
+};
 
 uint8_t* AverageTestBase::source_data_ = NULL;
 
@@ -246,6 +288,36 @@ TEST_P(IntProColTest, Random) {
   RunComparison();
 }
 
+
+TEST_P(SatdTest, MinValue) {
+  const int kMin = -32640;
+  const int expected = -kMin * satd_size_;
+  FillConstant(kMin);
+  Check(expected);
+}
+
+TEST_P(SatdTest, MaxValue) {
+  const int kMax = 32640;
+  const int expected = kMax * satd_size_;
+  FillConstant(kMax);
+  Check(expected);
+}
+
+TEST_P(SatdTest, Random) {
+  int expected;
+  switch (satd_size_) {
+    case 16: expected = 205298; break;
+    case 64: expected = 1113950; break;
+    case 256: expected = 4268415; break;
+    case 1024: expected = 16954082; break;
+    default:
+      FAIL() << "Invalid satd size (" << satd_size_
+          << ") valid: 16/64/256/1024";
+  }
+  FillRandom();
+  Check(expected);
+}
+
 using std::tr1::make_tuple;
 
 INSTANTIATE_TEST_CASE_P(
@@ -254,6 +326,14 @@ INSTANTIATE_TEST_CASE_P(
         make_tuple(16, 16, 1, 8, &vp9_avg_8x8_c),
         make_tuple(16, 16, 1, 4, &vp9_avg_4x4_c)));
 
+INSTANTIATE_TEST_CASE_P(
+    C, SatdTest,
+    ::testing::Values(
+        make_tuple(16, &vp9_satd_c),
+        make_tuple(64, &vp9_satd_c),
+        make_tuple(256, &vp9_satd_c),
+        make_tuple(1024, &vp9_satd_c)));
+
 #if HAVE_SSE2
 INSTANTIATE_TEST_CASE_P(
     SSE2, AverageTest,
@@ -276,6 +356,14 @@ INSTANTIATE_TEST_CASE_P(
         make_tuple(16, &vp9_int_pro_col_sse2, &vp9_int_pro_col_c),
         make_tuple(32, &vp9_int_pro_col_sse2, &vp9_int_pro_col_c),
         make_tuple(64, &vp9_int_pro_col_sse2, &vp9_int_pro_col_c)));
+
+INSTANTIATE_TEST_CASE_P(
+    SSE2, SatdTest,
+    ::testing::Values(
+        make_tuple(16, &vp9_satd_sse2),
+        make_tuple(64, &vp9_satd_sse2),
+        make_tuple(256, &vp9_satd_sse2),
+        make_tuple(1024, &vp9_satd_sse2)));
 #endif
 
 #if HAVE_NEON
@@ -297,6 +385,14 @@ INSTANTIATE_TEST_CASE_P(
         make_tuple(16, &vp9_int_pro_col_neon, &vp9_int_pro_col_c),
         make_tuple(32, &vp9_int_pro_col_neon, &vp9_int_pro_col_c),
         make_tuple(64, &vp9_int_pro_col_neon, &vp9_int_pro_col_c)));
+
+INSTANTIATE_TEST_CASE_P(
+    NEON, SatdTest,
+    ::testing::Values(
+        make_tuple(16, &vp9_satd_neon),
+        make_tuple(64, &vp9_satd_neon),
+        make_tuple(256, &vp9_satd_neon),
+        make_tuple(1024, &vp9_satd_neon)));
 #endif
 
 #if HAVE_MSA
diff --git a/vp9/common/vp9_rtcd_defs.pl b/vp9/common/vp9_rtcd_defs.pl
index 890b63821..8fe6503aa 100644
--- a/vp9/common/vp9_rtcd_defs.pl
+++ b/vp9/common/vp9_rtcd_defs.pl
@@ -209,8 +209,8 @@ specialize qw/vp9_hadamard_8x8 sse2/, "$ssse3_x86_64_x86inc";
 add_proto qw/void vp9_hadamard_16x16/, "int16_t const *src_diff, int src_stride, int16_t *coeff";
 specialize qw/vp9_hadamard_16x16 sse2/;
 
-add_proto qw/int16_t vp9_satd/, "const int16_t *coeff, int length";
-specialize qw/vp9_satd sse2/;
+add_proto qw/int vp9_satd/, "const int16_t *coeff, int length";
+specialize qw/vp9_satd sse2 neon/;
 
 add_proto qw/void vp9_int_pro_row/, "int16_t *hbuf, uint8_t const *ref, const int ref_stride, const int height";
 specialize qw/vp9_int_pro_row sse2 neon/;
diff --git a/vp9/encoder/arm/neon/vp9_avg_neon.c b/vp9/encoder/arm/neon/vp9_avg_neon.c
index d569ec95d..5996bd426 100644
--- a/vp9/encoder/arm/neon/vp9_avg_neon.c
+++ b/vp9/encoder/arm/neon/vp9_avg_neon.c
@@ -50,6 +50,33 @@ unsigned int vp9_avg_8x8_neon(const uint8_t *s, int p) {
   return (horizontal_add_u16x8(v_sum) + 32) >> 6;
 }
 
+// coeff: 16 bits, dynamic range [-32640, 32640].
+// length: value range {16, 64, 256, 1024}.
+int vp9_satd_neon(const int16_t *coeff, int length) {
+  const int16x4_t zero = vdup_n_s16(0);
+  int32x4_t accum = vdupq_n_s32(0);
+
+  do {
+    const int16x8_t src0 = vld1q_s16(coeff);
+    const int16x8_t src8 = vld1q_s16(coeff + 8);
+    accum = vabal_s16(accum, vget_low_s16(src0), zero);
+    accum = vabal_s16(accum, vget_high_s16(src0), zero);
+    accum = vabal_s16(accum, vget_low_s16(src8), zero);
+    accum = vabal_s16(accum, vget_high_s16(src8), zero);
+    length -= 16;
+    coeff += 16;
+  } while (length != 0);
+
+  {
+    // satd: 26 bits, dynamic range [-32640 * 1024, 32640 * 1024]
+    const int64x2_t s0 = vpaddlq_s32(accum);  // cascading summation of 'accum'.
+    const int32x2_t s1 = vadd_s32(vreinterpret_s32_s64(vget_low_s64(s0)),
+                                  vreinterpret_s32_s64(vget_high_s64(s0)));
+    const int satd = vget_lane_s32(s1, 0);
+    return satd;
+  }
+}
+
 void vp9_int_pro_row_neon(int16_t hbuf[16], uint8_t const *ref,
                           const int ref_stride, const int height) {
   int i;
diff --git a/vp9/encoder/vp9_avg.c b/vp9/encoder/vp9_avg.c
index a9a4c3050..7baa09ae5 100644
--- a/vp9/encoder/vp9_avg.c
+++ b/vp9/encoder/vp9_avg.c
@@ -117,14 +117,14 @@ void vp9_hadamard_16x16_c(int16_t const *src_diff, int src_stride,
 
 // coeff: 16 bits, dynamic range [-32640, 32640].
 // length: value range {16, 64, 256, 1024}.
-int16_t vp9_satd_c(const int16_t *coeff, int length) {
+int vp9_satd_c(const int16_t *coeff, int length) {
   int i;
   int satd = 0;
   for (i = 0; i < length; ++i)
     satd += abs(coeff[i]);
 
   // satd: 26 bits, dynamic range [-32640 * 1024, 32640 * 1024]
-  return (int16_t)satd;
+  return satd;
 }
 
 // Integer projection onto row vectors.
diff --git a/vp9/encoder/vp9_pickmode.c b/vp9/encoder/vp9_pickmode.c
index 8aafae1d4..938a527c5 100644
--- a/vp9/encoder/vp9_pickmode.c
+++ b/vp9/encoder/vp9_pickmode.c
@@ -673,7 +673,7 @@ static void block_yrd(VP9_COMP *cpi, MACROBLOCK *x, int *rate, int64_t *dist,
         if (*eob == 1)
           *rate += (int)abs(qcoeff[0]);
         else if (*eob > 1)
-          *rate += (int)vp9_satd((const int16_t *)qcoeff, step << 4);
+          *rate += vp9_satd((const int16_t *)qcoeff, step << 4);
 
         *dist += vp9_block_error_fp(coeff, dqcoeff, step << 4) >> shift;
       }
diff --git a/vp9/encoder/x86/vp9_avg_intrin_sse2.c b/vp9/encoder/x86/vp9_avg_intrin_sse2.c
index 4531d794a..441487130 100644
--- a/vp9/encoder/x86/vp9_avg_intrin_sse2.c
+++ b/vp9/encoder/x86/vp9_avg_intrin_sse2.c
@@ -283,31 +283,30 @@ void vp9_hadamard_16x16_sse2(int16_t const *src_diff, int src_stride,
   }
 }
 
-int16_t vp9_satd_sse2(const int16_t *coeff, int length) {
+int vp9_satd_sse2(const int16_t *coeff, int length) {
   int i;
-  __m128i sum = _mm_load_si128((const __m128i *)coeff);
-  __m128i sign = _mm_srai_epi16(sum, 15);
-  __m128i val = _mm_xor_si128(sum, sign);
-  sum = _mm_sub_epi16(val, sign);
-  coeff += 8;
+  const __m128i zero = _mm_setzero_si128();
+  __m128i accum = zero;
 
-  for (i = 8; i < length; i += 8) {
-    __m128i src_line = _mm_load_si128((const __m128i *)coeff);
-    sign = _mm_srai_epi16(src_line, 15);
-    val = _mm_xor_si128(src_line, sign);
-    val = _mm_sub_epi16(val, sign);
-    sum = _mm_add_epi16(sum, val);
+  for (i = 0; i < length; i += 8) {
+    const __m128i src_line = _mm_load_si128((const __m128i *)coeff);
+    const __m128i inv = _mm_sub_epi16(zero, src_line);
+    const __m128i abs = _mm_max_epi16(src_line, inv);  // abs(src_line)
+    const __m128i abs_lo = _mm_unpacklo_epi16(abs, zero);
+    const __m128i abs_hi = _mm_unpackhi_epi16(abs, zero);
+    const __m128i sum = _mm_add_epi32(abs_lo, abs_hi);
+    accum = _mm_add_epi32(accum, sum);
     coeff += 8;
   }
 
-  val = _mm_srli_si128(sum, 8);
-  sum = _mm_add_epi16(sum, val);
-  val = _mm_srli_epi64(sum, 32);
-  sum = _mm_add_epi16(sum, val);
-  val = _mm_srli_epi32(sum, 16);
-  sum = _mm_add_epi16(sum, val);
+  {  // cascading summation of accum
+    __m128i hi = _mm_srli_si128(accum, 8);
+    accum = _mm_add_epi32(accum, hi);
+    hi = _mm_srli_epi64(accum, 32);
+    accum = _mm_add_epi32(accum, hi);
+  }
 
-  return _mm_extract_epi16(sum, 0);
+  return _mm_cvtsi128_si32(accum);
 }
 
 void vp9_int_pro_row_sse2(int16_t *hbuf, uint8_t const*ref,