From 1e6a32f1af8066fd0b718b11f00cb09104280f49 Mon Sep 17 00:00:00 2001 From: "Ronald S. Bultje" Date: Thu, 20 Jun 2013 15:59:48 -0700 Subject: [PATCH] SSE2/SSSE3 optimizations and unit test for sub_pixel_avg_variance(). Encoding of bus @ 1500kbps (first 50 frames) goes from 3min57 to 3min35, i.e. approximately a 10.5% speedup. Note that the SIMD versions which use a bilinear filter (x_offset & 7 || y_offset & 7) aren't perfectly interleaved, and can probably be improved further in the future. I've marked this with a few TODOs/FIXMEs in the code. Change-Id: I5c9e900c0f0d32e431a50fecae213b510b2549f9 --- test/variance_test.cc | 186 +++++++++++++++- vp9/common/vp9_rtcd_defs.sh | 26 +-- vp9/encoder/x86/vp9_subpel_variance.asm | 285 +++++++++++++++++++++--- vp9/encoder/x86/vp9_variance_sse2.c | 109 +++++++-- 4 files changed, 543 insertions(+), 63 deletions(-) diff --git a/test/variance_test.cc b/test/variance_test.cc index e7037d9d6..0d53bd8f8 100644 --- a/test/variance_test.cc +++ b/test/variance_test.cc @@ -76,6 +76,34 @@ static unsigned int subpel_variance_ref(const uint8_t *ref, const uint8_t *src, return sse - (((int64_t) se * se) >> (l2w + l2h)); } +static unsigned int subpel_avg_variance_ref(const uint8_t *ref, + const uint8_t *src, + const uint8_t *second_pred, + int l2w, int l2h, + int xoff, int yoff, + unsigned int *sse_ptr) { + int se = 0; + unsigned int sse = 0; + const int w = 1 << l2w, h = 1 << l2h; + for (int y = 0; y < h; y++) { + for (int x = 0; x < w; x++) { + // bilinear interpolation at a 16th pel step + const int a1 = ref[(w + 1) * (y + 0) + x + 0]; + const int a2 = ref[(w + 1) * (y + 0) + x + 1]; + const int b1 = ref[(w + 1) * (y + 1) + x + 0]; + const int b2 = ref[(w + 1) * (y + 1) + x + 1]; + const int a = a1 + (((a2 - a1) * xoff + 8) >> 4); + const int b = b1 + (((b2 - b1) * xoff + 8) >> 4); + const int r = a + (((b - a) * yoff + 8) >> 4); + int diff = ((r + second_pred[w * y + x] + 1) >> 1) - src[w * y + x]; + se += diff; + sse += diff * diff; + 
} + } + *sse_ptr = sse; + return sse - (((int64_t) se * se) >> (l2w + l2h)); +} + template<typename VarianceFunctionType> class VarianceTest : public ::testing::TestWithParam<tuple<int, int, VarianceFunctionType> > { @@ -174,6 +202,7 @@ class SubpelVarianceTest : rnd(ACMRandom::DeterministicSeed()); block_size_ = width_ * height_; src_ = new uint8_t[block_size_]; + sec_ = new uint8_t[block_size_]; ref_ = new uint8_t[block_size_ + width_ + height_ + 1]; ASSERT_TRUE(src_ != NULL); ASSERT_TRUE(ref_ != NULL); @@ -182,14 +211,16 @@ class SubpelVarianceTest : virtual void TearDown() { delete[] src_; delete[] ref_; + delete[] sec_; } protected: void RefTest(); ACMRandom rnd; - uint8_t* src_; - uint8_t* ref_; + uint8_t *src_; + uint8_t *ref_; + uint8_t *sec_; int width_, log2width_; int height_, log2height_; int block_size_; @@ -217,6 +248,29 @@ void SubpelVarianceTest<vp9_subpixvariance_fn_t>::RefTest() { } } +template<> +void SubpelVarianceTest<vp9_subp_avg_variance_fn_t>::RefTest() { + for (int x = 0; x < 16; ++x) { + for (int y = 0; y < 16; ++y) { + for (int j = 0; j < block_size_; j++) { + src_[j] = rnd.Rand8(); + sec_[j] = rnd.Rand8(); + } + for (int j = 0; j < block_size_ + width_ + height_ + 1; j++) { + ref_[j] = rnd.Rand8(); + } + unsigned int sse1, sse2; + const unsigned int var1 = subpel_variance_(ref_, width_ + 1, x, y, + src_, width_, &sse1, sec_); + const unsigned int var2 = subpel_avg_variance_ref(ref_, src_, sec_, + log2width_, log2height_, + x, y, &sse2); + EXPECT_EQ(sse1, sse2) << "at position " << x << ", " << y; + EXPECT_EQ(var1, var2) << "at position " << x << ", " << y; + } + } +} + // ----------------------------------------------------------------------------- // VP8 test cases.
@@ -283,10 +337,12 @@ namespace vp9 { #if CONFIG_VP9_ENCODER typedef VarianceTest<vp9_variance_fn_t> VP9VarianceTest; typedef SubpelVarianceTest<vp9_subpixvariance_fn_t> VP9SubpelVarianceTest; +typedef SubpelVarianceTest<vp9_subp_avg_variance_fn_t> VP9SubpelAvgVarianceTest; TEST_P(VP9VarianceTest, Zero) { ZeroTest(); } TEST_P(VP9VarianceTest, Ref) { RefTest(); } TEST_P(VP9SubpelVarianceTest, Ref) { RefTest(); } +TEST_P(VP9SubpelAvgVarianceTest, Ref) { RefTest(); } TEST_P(VP9VarianceTest, OneQuarter) { OneQuarterTest(); } const vp9_variance_fn_t variance4x4_c = vp9_variance4x4_c; @@ -360,6 +416,48 @@ INSTANTIATE_TEST_CASE_P( make_tuple(6, 5, subpel_variance64x32_c), make_tuple(6, 6, subpel_variance64x64_c))); +const vp9_subp_avg_variance_fn_t subpel_avg_variance4x4_c = + vp9_sub_pixel_avg_variance4x4_c; +const vp9_subp_avg_variance_fn_t subpel_avg_variance4x8_c = + vp9_sub_pixel_avg_variance4x8_c; +const vp9_subp_avg_variance_fn_t subpel_avg_variance8x4_c = + vp9_sub_pixel_avg_variance8x4_c; +const vp9_subp_avg_variance_fn_t subpel_avg_variance8x8_c = + vp9_sub_pixel_avg_variance8x8_c; +const vp9_subp_avg_variance_fn_t subpel_avg_variance8x16_c = + vp9_sub_pixel_avg_variance8x16_c; +const vp9_subp_avg_variance_fn_t subpel_avg_variance16x8_c = + vp9_sub_pixel_avg_variance16x8_c; +const vp9_subp_avg_variance_fn_t subpel_avg_variance16x16_c = + vp9_sub_pixel_avg_variance16x16_c; +const vp9_subp_avg_variance_fn_t subpel_avg_variance16x32_c = + vp9_sub_pixel_avg_variance16x32_c; +const vp9_subp_avg_variance_fn_t subpel_avg_variance32x16_c = + vp9_sub_pixel_avg_variance32x16_c; +const vp9_subp_avg_variance_fn_t subpel_avg_variance32x32_c = + vp9_sub_pixel_avg_variance32x32_c; +const vp9_subp_avg_variance_fn_t subpel_avg_variance32x64_c = + vp9_sub_pixel_avg_variance32x64_c; +const vp9_subp_avg_variance_fn_t subpel_avg_variance64x32_c = + vp9_sub_pixel_avg_variance64x32_c; +const vp9_subp_avg_variance_fn_t subpel_avg_variance64x64_c = + vp9_sub_pixel_avg_variance64x64_c; +INSTANTIATE_TEST_CASE_P( + C, VP9SubpelAvgVarianceTest, +
::testing::Values(make_tuple(2, 2, subpel_avg_variance4x4_c), + make_tuple(2, 3, subpel_avg_variance4x8_c), + make_tuple(3, 2, subpel_avg_variance8x4_c), + make_tuple(3, 3, subpel_avg_variance8x8_c), + make_tuple(3, 4, subpel_avg_variance8x16_c), + make_tuple(4, 3, subpel_avg_variance16x8_c), + make_tuple(4, 4, subpel_avg_variance16x16_c), + make_tuple(4, 5, subpel_avg_variance16x32_c), + make_tuple(5, 4, subpel_avg_variance32x16_c), + make_tuple(5, 5, subpel_avg_variance32x32_c), + make_tuple(5, 6, subpel_avg_variance32x64_c), + make_tuple(6, 5, subpel_avg_variance64x32_c), + make_tuple(6, 6, subpel_avg_variance64x64_c))); + #if HAVE_MMX const vp9_variance_fn_t variance4x4_mmx = vp9_variance4x4_mmx; const vp9_variance_fn_t variance8x8_mmx = vp9_variance8x8_mmx; @@ -446,6 +544,48 @@ INSTANTIATE_TEST_CASE_P( make_tuple(5, 6, subpel_variance32x64_sse2), make_tuple(6, 5, subpel_variance64x32_sse2), make_tuple(6, 6, subpel_variance64x64_sse2))); + +const vp9_subp_avg_variance_fn_t subpel_avg_variance4x4_sse = + vp9_sub_pixel_avg_variance4x4_sse; +const vp9_subp_avg_variance_fn_t subpel_avg_variance4x8_sse = + vp9_sub_pixel_avg_variance4x8_sse; +const vp9_subp_avg_variance_fn_t subpel_avg_variance8x4_sse2 = + vp9_sub_pixel_avg_variance8x4_sse2; +const vp9_subp_avg_variance_fn_t subpel_avg_variance8x8_sse2 = + vp9_sub_pixel_avg_variance8x8_sse2; +const vp9_subp_avg_variance_fn_t subpel_avg_variance8x16_sse2 = + vp9_sub_pixel_avg_variance8x16_sse2; +const vp9_subp_avg_variance_fn_t subpel_avg_variance16x8_sse2 = + vp9_sub_pixel_avg_variance16x8_sse2; +const vp9_subp_avg_variance_fn_t subpel_avg_variance16x16_sse2 = + vp9_sub_pixel_avg_variance16x16_sse2; +const vp9_subp_avg_variance_fn_t subpel_avg_variance16x32_sse2 = + vp9_sub_pixel_avg_variance16x32_sse2; +const vp9_subp_avg_variance_fn_t subpel_avg_variance32x16_sse2 = + vp9_sub_pixel_avg_variance32x16_sse2; +const vp9_subp_avg_variance_fn_t subpel_avg_variance32x32_sse2 = + vp9_sub_pixel_avg_variance32x32_sse2; 
+const vp9_subp_avg_variance_fn_t subpel_avg_variance32x64_sse2 = + vp9_sub_pixel_avg_variance32x64_sse2; +const vp9_subp_avg_variance_fn_t subpel_avg_variance64x32_sse2 = + vp9_sub_pixel_avg_variance64x32_sse2; +const vp9_subp_avg_variance_fn_t subpel_avg_variance64x64_sse2 = + vp9_sub_pixel_avg_variance64x64_sse2; +INSTANTIATE_TEST_CASE_P( + SSE2, VP9SubpelAvgVarianceTest, + ::testing::Values(make_tuple(2, 2, subpel_avg_variance4x4_sse), + make_tuple(2, 3, subpel_avg_variance4x8_sse), + make_tuple(3, 2, subpel_avg_variance8x4_sse2), + make_tuple(3, 3, subpel_avg_variance8x8_sse2), + make_tuple(3, 4, subpel_avg_variance8x16_sse2), + make_tuple(4, 3, subpel_avg_variance16x8_sse2), + make_tuple(4, 4, subpel_avg_variance16x16_sse2), + make_tuple(4, 5, subpel_avg_variance16x32_sse2), + make_tuple(5, 4, subpel_avg_variance32x16_sse2), + make_tuple(5, 5, subpel_avg_variance32x32_sse2), + make_tuple(5, 6, subpel_avg_variance32x64_sse2), + make_tuple(6, 5, subpel_avg_variance64x32_sse2), + make_tuple(6, 6, subpel_avg_variance64x64_sse2))); #endif #if HAVE_SSSE3 @@ -490,6 +630,48 @@ INSTANTIATE_TEST_CASE_P( make_tuple(5, 6, subpel_variance32x64_ssse3), make_tuple(6, 5, subpel_variance64x32_ssse3), make_tuple(6, 6, subpel_variance64x64_ssse3))); + +const vp9_subp_avg_variance_fn_t subpel_avg_variance4x4_ssse3 = + vp9_sub_pixel_avg_variance4x4_ssse3; +const vp9_subp_avg_variance_fn_t subpel_avg_variance4x8_ssse3 = + vp9_sub_pixel_avg_variance4x8_ssse3; +const vp9_subp_avg_variance_fn_t subpel_avg_variance8x4_ssse3 = + vp9_sub_pixel_avg_variance8x4_ssse3; +const vp9_subp_avg_variance_fn_t subpel_avg_variance8x8_ssse3 = + vp9_sub_pixel_avg_variance8x8_ssse3; +const vp9_subp_avg_variance_fn_t subpel_avg_variance8x16_ssse3 = + vp9_sub_pixel_avg_variance8x16_ssse3; +const vp9_subp_avg_variance_fn_t subpel_avg_variance16x8_ssse3 = + vp9_sub_pixel_avg_variance16x8_ssse3; +const vp9_subp_avg_variance_fn_t subpel_avg_variance16x16_ssse3 = + vp9_sub_pixel_avg_variance16x16_ssse3; 
+const vp9_subp_avg_variance_fn_t subpel_avg_variance16x32_ssse3 = + vp9_sub_pixel_avg_variance16x32_ssse3; +const vp9_subp_avg_variance_fn_t subpel_avg_variance32x16_ssse3 = + vp9_sub_pixel_avg_variance32x16_ssse3; +const vp9_subp_avg_variance_fn_t subpel_avg_variance32x32_ssse3 = + vp9_sub_pixel_avg_variance32x32_ssse3; +const vp9_subp_avg_variance_fn_t subpel_avg_variance32x64_ssse3 = + vp9_sub_pixel_avg_variance32x64_ssse3; +const vp9_subp_avg_variance_fn_t subpel_avg_variance64x32_ssse3 = + vp9_sub_pixel_avg_variance64x32_ssse3; +const vp9_subp_avg_variance_fn_t subpel_avg_variance64x64_ssse3 = + vp9_sub_pixel_avg_variance64x64_ssse3; +INSTANTIATE_TEST_CASE_P( + SSSE3, VP9SubpelAvgVarianceTest, + ::testing::Values(make_tuple(2, 2, subpel_avg_variance4x4_ssse3), + make_tuple(2, 3, subpel_avg_variance4x8_ssse3), + make_tuple(3, 2, subpel_avg_variance8x4_ssse3), + make_tuple(3, 3, subpel_avg_variance8x8_ssse3), + make_tuple(3, 4, subpel_avg_variance8x16_ssse3), + make_tuple(4, 3, subpel_avg_variance16x8_ssse3), + make_tuple(4, 4, subpel_avg_variance16x16_ssse3), + make_tuple(4, 5, subpel_avg_variance16x32_ssse3), + make_tuple(5, 4, subpel_avg_variance32x16_ssse3), + make_tuple(5, 5, subpel_avg_variance32x32_ssse3), + make_tuple(5, 6, subpel_avg_variance32x64_ssse3), + make_tuple(6, 5, subpel_avg_variance64x32_ssse3), + make_tuple(6, 6, subpel_avg_variance64x64_ssse3))); #endif #endif // CONFIG_VP9_ENCODER diff --git a/vp9/common/vp9_rtcd_defs.sh b/vp9/common/vp9_rtcd_defs.sh index 575b619a1..8e24eed0f 100644 --- a/vp9/common/vp9_rtcd_defs.sh +++ b/vp9/common/vp9_rtcd_defs.sh @@ -269,81 +269,81 @@ prototype unsigned int vp9_sub_pixel_variance64x64 "const uint8_t *src_ptr, int specialize vp9_sub_pixel_variance64x64 sse2 ssse3 prototype unsigned int vp9_sub_pixel_avg_variance64x64 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred" -specialize 
vp9_sub_pixel_avg_variance64x64 +specialize vp9_sub_pixel_avg_variance64x64 sse2 ssse3 prototype unsigned int vp9_sub_pixel_variance32x64 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse" specialize vp9_sub_pixel_variance32x64 sse2 ssse3 prototype unsigned int vp9_sub_pixel_avg_variance32x64 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred" -specialize vp9_sub_pixel_avg_variance32x64 +specialize vp9_sub_pixel_avg_variance32x64 sse2 ssse3 prototype unsigned int vp9_sub_pixel_variance64x32 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse" specialize vp9_sub_pixel_variance64x32 sse2 ssse3 prototype unsigned int vp9_sub_pixel_avg_variance64x32 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred" -specialize vp9_sub_pixel_avg_variance64x32 +specialize vp9_sub_pixel_avg_variance64x32 sse2 ssse3 prototype unsigned int vp9_sub_pixel_variance32x16 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse" specialize vp9_sub_pixel_variance32x16 sse2 ssse3 prototype unsigned int vp9_sub_pixel_avg_variance32x16 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred" -specialize vp9_sub_pixel_avg_variance32x16 +specialize vp9_sub_pixel_avg_variance32x16 sse2 ssse3 prototype unsigned int vp9_sub_pixel_variance16x32 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse" specialize vp9_sub_pixel_variance16x32 sse2 ssse3 prototype unsigned int vp9_sub_pixel_avg_variance16x32 
"const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred" -specialize vp9_sub_pixel_avg_variance16x32 +specialize vp9_sub_pixel_avg_variance16x32 sse2 ssse3 prototype unsigned int vp9_sub_pixel_variance32x32 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse" specialize vp9_sub_pixel_variance32x32 sse2 ssse3 prototype unsigned int vp9_sub_pixel_avg_variance32x32 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred" -specialize vp9_sub_pixel_avg_variance32x32 +specialize vp9_sub_pixel_avg_variance32x32 sse2 ssse3 prototype unsigned int vp9_sub_pixel_variance16x16 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse" specialize vp9_sub_pixel_variance16x16 sse2 ssse3 prototype unsigned int vp9_sub_pixel_avg_variance16x16 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred" -specialize vp9_sub_pixel_avg_variance16x16 +specialize vp9_sub_pixel_avg_variance16x16 sse2 ssse3 prototype unsigned int vp9_sub_pixel_variance8x16 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse" specialize vp9_sub_pixel_variance8x16 sse2 ssse3 prototype unsigned int vp9_sub_pixel_avg_variance8x16 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred" -specialize vp9_sub_pixel_avg_variance8x16 +specialize vp9_sub_pixel_avg_variance8x16 sse2 ssse3 prototype unsigned int vp9_sub_pixel_variance16x8 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const 
uint8_t *ref_ptr, int ref_stride, unsigned int *sse" specialize vp9_sub_pixel_variance16x8 sse2 ssse3 prototype unsigned int vp9_sub_pixel_avg_variance16x8 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred" -specialize vp9_sub_pixel_avg_variance16x8 +specialize vp9_sub_pixel_avg_variance16x8 sse2 ssse3 prototype unsigned int vp9_sub_pixel_variance8x8 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse" specialize vp9_sub_pixel_variance8x8 sse2 ssse3 prototype unsigned int vp9_sub_pixel_avg_variance8x8 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred" -specialize vp9_sub_pixel_avg_variance8x8 +specialize vp9_sub_pixel_avg_variance8x8 sse2 ssse3 # TODO(jingning): need to convert 8x4/4x8 functions into mmx/sse form prototype unsigned int vp9_sub_pixel_variance8x4 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse" specialize vp9_sub_pixel_variance8x4 sse2 ssse3 prototype unsigned int vp9_sub_pixel_avg_variance8x4 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred" -specialize vp9_sub_pixel_avg_variance8x4 +specialize vp9_sub_pixel_avg_variance8x4 sse2 ssse3 prototype unsigned int vp9_sub_pixel_variance4x8 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse" specialize vp9_sub_pixel_variance4x8 sse ssse3 prototype unsigned int vp9_sub_pixel_avg_variance4x8 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred" -specialize 
vp9_sub_pixel_avg_variance4x8 +specialize vp9_sub_pixel_avg_variance4x8 sse ssse3 prototype unsigned int vp9_sub_pixel_variance4x4 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse" specialize vp9_sub_pixel_variance4x4 sse ssse3 #vp9_sub_pixel_variance4x4_sse2=vp9_sub_pixel_variance4x4_wmt prototype unsigned int vp9_sub_pixel_avg_variance4x4 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred" -specialize vp9_sub_pixel_avg_variance4x4 +specialize vp9_sub_pixel_avg_variance4x4 sse ssse3 prototype unsigned int vp9_sad64x64 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad" specialize vp9_sad64x64 sse2 diff --git a/vp9/encoder/x86/vp9_subpel_variance.asm b/vp9/encoder/x86/vp9_subpel_variance.asm index 35014cec8..3f883ccdc 100644 --- a/vp9/encoder/x86/vp9_subpel_variance.asm +++ b/vp9/encoder/x86/vp9_subpel_variance.asm @@ -116,7 +116,7 @@ bilin_filter_m_ssse3: times 8 db 16, 0 RET %endmacro -%macro SUBPEL_VARIANCE 1 ; W +%macro SUBPEL_VARIANCE 1-2 0 ; W %if cpuflag(ssse3) %define bilin_filter_m bilin_filter_m_ssse3 %define filter_idx_shift 4 @@ -128,12 +128,38 @@ bilin_filter_m_ssse3: times 8 db 16, 0 ; 11, not 13, if the registers are ordered correctly. 
May make a minor speed ; difference on Win64 %ifdef PIC +%if %2 == 1 ; avg +cglobal sub_pixel_avg_variance%1xh, 9, 10, 13, src, src_stride, \ + x_offset, y_offset, \ + dst, dst_stride, \ + sec, sec_stride, height, sse +%define sec_str sec_strideq +%else cglobal sub_pixel_variance%1xh, 7, 8, 13, src, src_stride, x_offset, y_offset, \ dst, dst_stride, height, sse +%endif +%define h heightd %define bilin_filter sseq %else +%if %2 == 1 ; avg +cglobal sub_pixel_avg_variance%1xh, 7 + 2 * ARCH_X86_64, \ + 7 + 2 * ARCH_X86_64, 13, src, src_stride, \ + x_offset, y_offset, \ + dst, dst_stride, \ + sec, sec_stride, \ + height, sse +%if ARCH_X86_64 +%define h heightd +%define sec_str sec_strideq +%else +%define h dword heightm +%define sec_str sec_stridemp +%endif +%else cglobal sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, x_offset, y_offset, \ dst, dst_stride, height, sse +%define h heightd +%endif %define bilin_filter bilin_filter_m %endif ASSERT %1 <= 16 ; m6 overflows if w > 16 @@ -143,7 +169,10 @@ cglobal sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, x_offset, y_offset, \ ; could perhaps use it for something more productive then pxor m5, m5 ; dedicated zero register %if %1 < 16 - sar heightd, 1 + sar h, 1 +%if %2 == 1 ; avg + shl sec_str, 1 +%endif %endif ; FIXME(rbultje) replace by jumptable? 
@@ -158,30 +187,55 @@ cglobal sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, x_offset, y_offset, \ %if %1 == 16 movu m0, [srcq] mova m1, [dstq] - punpckhbw m2, m0, m5 - punpcklbw m0, m5 +%if %2 == 1 ; avg + pavgb m0, [secq] punpckhbw m3, m1, m5 punpcklbw m1, m5 +%endif + punpckhbw m2, m0, m5 + punpcklbw m0, m5 +%if %2 == 0 ; !avg + punpckhbw m3, m1, m5 + punpcklbw m1, m5 +%endif SUM_SSE m0, m1, m2, m3, m6, m7 add srcq, src_strideq add dstq, dst_strideq - dec heightd %else ; %1 < 16 movh m0, [srcq] +%if %2 == 1 ; avg +%if mmsize == 16 + movhps m0, [srcq+src_strideq] +%else ; mmsize == 8 + punpckldq m0, [srcq+src_strideq] +%endif +%else ; !avg movh m2, [srcq+src_strideq] +%endif movh m1, [dstq] movh m3, [dstq+dst_strideq] +%if %2 == 1 ; avg + pavgb m0, [secq] + punpcklbw m3, m5 + punpcklbw m1, m5 + punpckhbw m2, m0, m5 + punpcklbw m0, m5 +%else ; !avg punpcklbw m0, m5 punpcklbw m2, m5 punpcklbw m3, m5 punpcklbw m1, m5 +%endif SUM_SSE m0, m1, m2, m3, m6, m7 lea srcq, [srcq+src_strideq*2] lea dstq, [dstq+dst_strideq*2] - dec heightd %endif +%if %2 == 1 ; avg + add secq, sec_str +%endif + dec h jg .x_zero_y_zero_loop STORE_AND_RET @@ -196,18 +250,40 @@ cglobal sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, x_offset, y_offset, \ movu m4, [srcq+src_strideq] mova m1, [dstq] pavgb m0, m4 + punpckhbw m3, m1, m5 +%if %2 == 1 ; avg + pavgb m0, [secq] +%endif + punpcklbw m1, m5 punpckhbw m2, m0, m5 punpcklbw m0, m5 - punpckhbw m3, m1, m5 - punpcklbw m1, m5 SUM_SSE m0, m1, m2, m3, m6, m7 add srcq, src_strideq add dstq, dst_strideq - dec heightd %else ; %1 < 16 movh m0, [srcq] movh m2, [srcq+src_strideq] +%if %2 == 1 ; avg +%if mmsize == 16 + movhps m2, [srcq+src_strideq*2] +%else ; mmsize == 8 + punpckldq m2, [srcq+src_strideq*2] +%endif + movh m1, [dstq] +%if mmsize == 16 + movlhps m0, m2 +%else ; mmsize == 8 + punpckldq m0, m2 +%endif + movh m3, [dstq+dst_strideq] + pavgb m0, m2 + punpcklbw m1, m5 + pavgb m0, [secq] + punpcklbw m3, m5 + punpckhbw m2, m0, m5 + punpcklbw 
m0, m5 +%else ; !avg movh m4, [srcq+src_strideq*2] movh m1, [dstq] pavgb m0, m2 @@ -217,12 +293,16 @@ cglobal sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, x_offset, y_offset, \ punpcklbw m2, m5 punpcklbw m3, m5 punpcklbw m1, m5 +%endif SUM_SSE m0, m1, m2, m3, m6, m7 lea srcq, [srcq+src_strideq*2] lea dstq, [dstq+dst_strideq*2] - dec heightd %endif +%if %2 == 1 ; avg + add secq, sec_str +%endif + dec h jg .x_zero_y_half_loop STORE_AND_RET @@ -280,13 +360,19 @@ cglobal sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, x_offset, y_offset, \ %endif psraw m2, 4 psraw m0, 4 +%if %2 == 1 ; avg + ; FIXME(rbultje) pipeline + packuswb m0, m2 + pavgb m0, [secq] + punpckhbw m2, m0, m5 + punpcklbw m0, m5 +%endif punpckhbw m3, m1, m5 punpcklbw m1, m5 SUM_SSE m0, m1, m2, m3, m6, m7 add srcq, src_strideq add dstq, dst_strideq - dec heightd %else ; %1 < 16 movh m0, [srcq] movh m2, [srcq+src_strideq] @@ -318,13 +404,23 @@ cglobal sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, x_offset, y_offset, \ %endif psraw m0, 4 psraw m2, 4 +%if %2 == 1 ; avg + ; FIXME(rbultje) pipeline + packuswb m0, m2 + pavgb m0, [secq] + punpckhbw m2, m0, m5 + punpcklbw m0, m5 +%endif punpcklbw m1, m5 SUM_SSE m0, m1, m2, m3, m6, m7 lea srcq, [srcq+src_strideq*2] lea dstq, [dstq+dst_strideq*2] - dec heightd %endif +%if %2 == 1 ; avg + add secq, sec_str +%endif + dec h jg .x_zero_y_other_loop %undef filter_y_a %undef filter_y_b @@ -345,18 +441,37 @@ cglobal sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, x_offset, y_offset, \ movu m4, [srcq+1] mova m1, [dstq] pavgb m0, m4 + punpckhbw m3, m1, m5 +%if %2 == 1 ; avg + pavgb m0, [secq] +%endif + punpcklbw m1, m5 punpckhbw m2, m0, m5 punpcklbw m0, m5 - punpckhbw m3, m1, m5 - punpcklbw m1, m5 SUM_SSE m0, m1, m2, m3, m6, m7 add srcq, src_strideq add dstq, dst_strideq - dec heightd %else ; %1 < 16 movh m0, [srcq] movh m4, [srcq+1] +%if %2 == 1 ; avg +%if mmsize == 16 + movhps m0, [srcq+src_strideq] + movhps m4, [srcq+src_strideq+1] +%else ; mmsize == 8 + 
punpckldq m0, [srcq+src_strideq] + punpckldq m4, [srcq+src_strideq+1] +%endif + movh m1, [dstq] + movh m3, [dstq+dst_strideq] + pavgb m0, m4 + punpcklbw m3, m5 + pavgb m0, [secq] + punpcklbw m1, m5 + punpckhbw m2, m0, m5 + punpcklbw m0, m5 +%else ; !avg movh m2, [srcq+src_strideq] movh m1, [dstq] pavgb m0, m4 @@ -367,12 +482,16 @@ cglobal sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, x_offset, y_offset, \ punpcklbw m2, m5 punpcklbw m3, m5 punpcklbw m1, m5 +%endif SUM_SSE m0, m1, m2, m3, m6, m7 lea srcq, [srcq+src_strideq*2] lea dstq, [dstq+dst_strideq*2] - dec heightd %endif +%if %2 == 1 ; avg + add secq, sec_str +%endif + dec h jg .x_half_y_zero_loop STORE_AND_RET @@ -391,17 +510,23 @@ cglobal sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, x_offset, y_offset, \ movu m3, [srcq+1] mova m1, [dstq] pavgb m4, m3 + punpckhbw m3, m1, m5 pavgb m0, m4 +%if %2 == 1 ; avg + punpcklbw m1, m5 + pavgb m0, [secq] + punpckhbw m2, m0, m5 + punpcklbw m0, m5 +%else punpckhbw m2, m0, m5 punpcklbw m0, m5 - punpckhbw m3, m1, m5 punpcklbw m1, m5 +%endif SUM_SSE m0, m1, m2, m3, m6, m7 mova m0, m4 add srcq, src_strideq add dstq, dst_strideq - dec heightd %else ; %1 < 16 movh m0, [srcq] movh m3, [srcq+1] @@ -410,6 +535,31 @@ cglobal sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, x_offset, y_offset, \ .x_half_y_half_loop: movh m2, [srcq] movh m3, [srcq+1] +%if %2 == 1 ; avg +%if mmsize == 16 + movhps m2, [srcq+src_strideq] + movhps m3, [srcq+src_strideq+1] +%else + punpckldq m2, [srcq+src_strideq] + punpckldq m3, [srcq+src_strideq+1] +%endif + pavgb m2, m3 +%if mmsize == 16 + movlhps m0, m2 + movhlps m4, m2 +%else ; mmsize == 8 + punpckldq m0, m2 + pshufw m4, m2, 0xe +%endif + movh m1, [dstq] + pavgb m0, m2 + movh m3, [dstq+dst_strideq] + pavgb m0, [secq] + punpcklbw m3, m5 + punpcklbw m1, m5 + punpckhbw m2, m0, m5 + punpcklbw m0, m5 +%else ; !avg movh m4, [srcq+src_strideq] movh m1, [srcq+src_strideq+1] pavgb m2, m3 @@ -422,13 +572,17 @@ cglobal sub_pixel_variance%1xh, 7, 7, 
13, src, src_stride, x_offset, y_offset, \ punpcklbw m2, m5 punpcklbw m3, m5 punpcklbw m1, m5 +%endif SUM_SSE m0, m1, m2, m3, m6, m7 mova m0, m4 lea srcq, [srcq+src_strideq*2] lea dstq, [dstq+dst_strideq*2] - dec heightd %endif +%if %2 == 1 ; avg + add secq, sec_str +%endif + dec h jg .x_half_y_half_loop STORE_AND_RET @@ -488,13 +642,19 @@ cglobal sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, x_offset, y_offset, \ %endif punpckhbw m3, m1, m5 psraw m0, 4 +%if %2 == 1 ; avg + ; FIXME(rbultje) pipeline + packuswb m0, m2 + pavgb m0, [secq] + punpckhbw m2, m0, m5 + punpcklbw m0, m5 +%endif punpcklbw m1, m5 SUM_SSE m0, m1, m2, m3, m6, m7 mova m0, m4 add srcq, src_strideq add dstq, dst_strideq - dec heightd %else ; %1 < 16 movh m0, [srcq] movh m3, [srcq+1] @@ -536,14 +696,24 @@ cglobal sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, x_offset, y_offset, \ %endif psraw m0, 4 psraw m2, 4 +%if %2 == 1 ; avg + ; FIXME(rbultje) pipeline + packuswb m0, m2 + pavgb m0, [secq] + punpckhbw m2, m0, m5 + punpcklbw m0, m5 +%endif punpcklbw m1, m5 SUM_SSE m0, m1, m2, m3, m6, m7 mova m0, m4 lea srcq, [srcq+src_strideq*2] lea dstq, [dstq+dst_strideq*2] - dec heightd %endif +%if %2 == 1 ; avg + add secq, sec_str +%endif + dec h jg .x_half_y_other_loop %undef filter_y_a %undef filter_y_b @@ -602,13 +772,19 @@ cglobal sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, x_offset, y_offset, \ %endif psraw m2, 4 psraw m0, 4 +%if %2 == 1 ; avg + ; FIXME(rbultje) pipeline + packuswb m0, m2 + pavgb m0, [secq] + punpckhbw m2, m0, m5 + punpcklbw m0, m5 +%endif punpckhbw m3, m1, m5 punpcklbw m1, m5 SUM_SSE m0, m1, m2, m3, m6, m7 add srcq, src_strideq add dstq, dst_strideq - dec heightd %else ; %1 < 16 movh m0, [srcq] movh m1, [srcq+1] @@ -642,13 +818,23 @@ cglobal sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, x_offset, y_offset, \ %endif psraw m0, 4 psraw m2, 4 +%if %2 == 1 ; avg + ; FIXME(rbultje) pipeline + packuswb m0, m2 + pavgb m0, [secq] + punpckhbw m2, m0, m5 + punpcklbw m0, m5 
+%endif punpcklbw m1, m5 SUM_SSE m0, m1, m2, m3, m6, m7 lea srcq, [srcq+src_strideq*2] lea dstq, [dstq+dst_strideq*2] - dec heightd %endif +%if %2 == 1 ; avg + add secq, sec_str +%endif + dec h jg .x_other_y_zero_loop %undef filter_x_a %undef filter_x_b @@ -724,8 +910,6 @@ cglobal sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, x_offset, y_offset, \ pavgb m0, m4 punpckhbw m3, m1, m5 punpcklbw m1, m5 - punpckhbw m2, m0, m5 - punpcklbw m0, m5 %else punpckhbw m2, m4, m5 punpckhbw m1, m3, m5 @@ -750,15 +934,18 @@ cglobal sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, x_offset, y_offset, \ packuswb m4, m2 punpcklbw m1, m5 pavgb m0, m4 +%endif +%if %2 == 1 ; avg + ; FIXME(rbultje) pipeline + pavgb m0, [secq] +%endif punpckhbw m2, m0, m5 punpcklbw m0, m5 -%endif SUM_SSE m0, m1, m2, m3, m6, m7 mova m0, m4 add srcq, src_strideq add dstq, dst_strideq - dec heightd %else ; %1 < 16 movh m0, [srcq] movh m1, [srcq+1] @@ -810,6 +997,13 @@ cglobal sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, x_offset, y_offset, \ psraw m4, 4 pavgw m0, m2 pavgw m2, m4 +%if %2 == 1 ; avg + ; FIXME(rbultje) pipeline - also consider going to bytes here + packuswb m0, m2 + pavgb m0, [secq] + punpckhbw m2, m0, m5 + punpcklbw m0, m5 +%endif punpcklbw m3, m5 punpcklbw m1, m5 SUM_SSE m0, m1, m2, m3, m6, m7 @@ -817,8 +1011,11 @@ cglobal sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, x_offset, y_offset, \ lea srcq, [srcq+src_strideq*2] lea dstq, [dstq+dst_strideq*2] - dec heightd %endif +%if %2 == 1 ; avg + add secq, sec_str +%endif + dec h jg .x_other_y_half_loop %undef filter_x_a %undef filter_x_b @@ -941,13 +1138,19 @@ cglobal sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, x_offset, y_offset, \ punpckhbw m3, m1, m5 psraw m0, 4 punpcklbw m1, m5 +%endif +%if %2 == 1 ; avg + ; FIXME(rbultje) pipeline + packuswb m0, m2 + pavgb m0, [secq] + punpckhbw m2, m0, m5 + punpcklbw m0, m5 %endif SUM_SSE m0, m1, m2, m3, m6, m7 mova m0, m4 add srcq, src_strideq add dstq, dst_strideq - dec heightd 
%else ; %1 < 16 movh m0, [srcq] movh m1, [srcq+1] @@ -1025,14 +1228,24 @@ cglobal sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, x_offset, y_offset, \ psraw m2, 4 punpcklbw m3, m5 punpcklbw m1, m5 +%endif +%if %2 == 1 ; avg + ; FIXME(rbultje) pipeline + packuswb m0, m2 + pavgb m0, [secq] + punpckhbw m2, m0, m5 + punpcklbw m0, m5 %endif SUM_SSE m0, m1, m2, m3, m6, m7 mova m0, m4 lea srcq, [srcq+src_strideq*2] lea dstq, [dstq+dst_strideq*2] - dec heightd %endif +%if %2 == 1 ; avg + add secq, sec_str +%endif + dec h jg .x_other_y_other_loop %undef filter_x_a %undef filter_x_b @@ -1059,3 +1272,15 @@ SUBPEL_VARIANCE 4 INIT_XMM ssse3 SUBPEL_VARIANCE 8 SUBPEL_VARIANCE 16 + +INIT_MMX sse +SUBPEL_VARIANCE 4, 1 +INIT_XMM sse2 +SUBPEL_VARIANCE 8, 1 +SUBPEL_VARIANCE 16, 1 + +INIT_MMX ssse3 +SUBPEL_VARIANCE 4, 1 +INIT_XMM ssse3 +SUBPEL_VARIANCE 8, 1 +SUBPEL_VARIANCE 16, 1 diff --git a/vp9/encoder/x86/vp9_variance_sse2.c b/vp9/encoder/x86/vp9_variance_sse2.c index 68c805e23..b4ff8509c 100644 --- a/vp9/encoder/x86/vp9_variance_sse2.c +++ b/vp9/encoder/x86/vp9_variance_sse2.c @@ -343,29 +343,22 @@ unsigned int vp9_variance32x64_sse2(const uint8_t *src_ptr, return (var - (((int64_t)avg * avg) >> 11)); } +#define DECL(w, opt) \ +int vp9_sub_pixel_variance##w##xh_##opt(const uint8_t *src, \ + ptrdiff_t src_stride, \ + int x_offset, int y_offset, \ + const uint8_t *dst, \ + ptrdiff_t dst_stride, \ + int height, unsigned int *sse) #define DECLS(opt1, opt2) \ -int vp9_sub_pixel_variance4xh_##opt2(const uint8_t *src, \ - ptrdiff_t src_stride, \ - int x_offset, int y_offset, \ - const uint8_t *dst, \ - ptrdiff_t dst_stride, \ - int height, unsigned int *sse); \ -int vp9_sub_pixel_variance8xh_##opt1(const uint8_t *src, \ - ptrdiff_t src_stride, \ - int x_offset, int y_offset, \ - const uint8_t *dst, \ - ptrdiff_t dst_stride, \ - int height, unsigned int *sse); \ -int vp9_sub_pixel_variance16xh_##opt1(const uint8_t *src, \ - ptrdiff_t src_stride, \ - int x_offset, int y_offset, \ - 
const uint8_t *dst, \ - ptrdiff_t dst_stride, \ - int height, unsigned int *sse) +DECL(4, opt2); \ +DECL(8, opt1); \ +DECL(16, opt1) DECLS(sse2, sse); DECLS(ssse3, ssse3); #undef DECLS +#undef DECL #define FN(w, h, wf, wlog2, hlog2, opt, cast) \ unsigned int vp9_sub_pixel_variance##w##x##h##_##opt(const uint8_t *src, \ @@ -427,6 +420,86 @@ FNS(ssse3, ssse3); #undef FNS #undef FN +#define DECL(w, opt) \ +int vp9_sub_pixel_avg_variance##w##xh_##opt(const uint8_t *src, \ + ptrdiff_t src_stride, \ + int x_offset, int y_offset, \ + const uint8_t *dst, \ + ptrdiff_t dst_stride, \ + const uint8_t *sec, \ + ptrdiff_t sec_stride, \ + int height, unsigned int *sse) +#define DECLS(opt1, opt2) \ +DECL(4, opt2); \ +DECL(8, opt1); \ +DECL(16, opt1) + +DECLS(sse2, sse); +DECLS(ssse3, ssse3); +#undef DECL +#undef DECLS + +#define FN(w, h, wf, wlog2, hlog2, opt, cast) \ +unsigned int vp9_sub_pixel_avg_variance##w##x##h##_##opt(const uint8_t *src, \ + int src_stride, \ + int x_offset, \ + int y_offset, \ + const uint8_t *dst, \ + int dst_stride, \ + unsigned int *sseptr, \ + const uint8_t *sec) { \ + unsigned int sse; \ + int se = vp9_sub_pixel_avg_variance##wf##xh_##opt(src, src_stride, x_offset, \ + y_offset, dst, dst_stride, \ + sec, w, h, &sse); \ + if (w > wf) { \ + unsigned int sse2; \ + int se2 = vp9_sub_pixel_avg_variance##wf##xh_##opt(src + 16, src_stride, \ + x_offset, y_offset, \ + dst + 16, dst_stride, \ + sec + 16, w, h, &sse2); \ + se += se2; \ + sse += sse2; \ + if (w > wf * 2) { \ + se2 = vp9_sub_pixel_avg_variance##wf##xh_##opt(src + 32, src_stride, \ + x_offset, y_offset, \ + dst + 32, dst_stride, \ + sec + 32, w, h, &sse2); \ + se += se2; \ + sse += sse2; \ + se2 = vp9_sub_pixel_avg_variance##wf##xh_##opt(src + 48, src_stride, \ + x_offset, y_offset, \ + dst + 48, dst_stride, \ + sec + 48, w, h, &sse2); \ + se += se2; \ + sse += sse2; \ + } \ + } \ + *sseptr = sse; \ + return sse - ((cast se * se) >> (wlog2 + hlog2)); \ +} + +#define FNS(opt1, opt2) \ +FN(64, 64, 
16, 6, 6, opt1, (int64_t)); \ +FN(64, 32, 16, 6, 5, opt1, (int64_t)); \ +FN(32, 64, 16, 5, 6, opt1, (int64_t)); \ +FN(32, 32, 16, 5, 5, opt1, (int64_t)); \ +FN(32, 16, 16, 5, 4, opt1, (int64_t)); \ +FN(16, 32, 16, 4, 5, opt1, (int64_t)); \ +FN(16, 16, 16, 4, 4, opt1, (unsigned int)); \ +FN(16, 8, 16, 4, 3, opt1,); \ +FN(8, 16, 8, 3, 4, opt1,); \ +FN(8, 8, 8, 3, 3, opt1,); \ +FN(8, 4, 8, 3, 2, opt1,); \ +FN(4, 8, 4, 2, 3, opt2,); \ +FN(4, 4, 4, 2, 2, opt2,) + +FNS(sse2, sse); +FNS(ssse3, ssse3); + +#undef FNS +#undef FN + unsigned int vp9_variance_halfpixvar16x16_h_wmt( const unsigned char *src_ptr, int src_pixels_per_line,