From d39448e2d408518a5e8dc7b499caeef23e69d0f4 Mon Sep 17 00:00:00 2001 From: Scott LaVarnway Date: Thu, 31 Jul 2014 08:00:36 -0700 Subject: [PATCH] Neon version of vp9_sub_pixel_variance32x32(), vp9_variance32x32(), and vp9_get32x32var(). Change-Id: I8137e2540e50984744da59ae3a41e94f8af4a548 --- test/variance_test.cc | 9 +++- vp9/common/vp9_rtcd_defs.pl | 4 +- vp9/encoder/arm/neon/vp9_variance_neon.c | 63 +++++++++++++++++++----- 3 files changed, 60 insertions(+), 16 deletions(-) diff --git a/test/variance_test.cc b/test/variance_test.cc index 83b7435e6..2b15ca674 100644 --- a/test/variance_test.cc +++ b/test/variance_test.cc @@ -758,15 +758,20 @@ INSTANTIATE_TEST_CASE_P( #endif // HAVE_AVX2 #if HAVE_NEON const vp9_variance_fn_t variance16x16_neon = vp9_variance16x16_neon; +const vp9_variance_fn_t variance32x32_neon = vp9_variance32x32_neon; INSTANTIATE_TEST_CASE_P( NEON, VP9VarianceTest, - ::testing::Values(make_tuple(4, 4, variance16x16_neon))); + ::testing::Values(make_tuple(4, 4, variance16x16_neon), + make_tuple(5, 5, variance32x32_neon))); const vp9_subpixvariance_fn_t subpel_variance16x16_neon = vp9_sub_pixel_variance16x16_neon; +const vp9_subpixvariance_fn_t subpel_variance32x32_neon = + vp9_sub_pixel_variance32x32_neon; INSTANTIATE_TEST_CASE_P( NEON, VP9SubpelVarianceTest, - ::testing::Values(make_tuple(4, 4, subpel_variance16x16_neon))); + ::testing::Values(make_tuple(4, 4, subpel_variance16x16_neon), + make_tuple(5, 5, subpel_variance32x32_neon))); #endif // HAVE_NEON #endif // CONFIG_VP9_ENCODER diff --git a/vp9/common/vp9_rtcd_defs.pl b/vp9/common/vp9_rtcd_defs.pl index 3b1ca161d..1fce0f4e4 100644 --- a/vp9/common/vp9_rtcd_defs.pl +++ b/vp9/common/vp9_rtcd_defs.pl @@ -414,7 +414,7 @@ add_proto qw/unsigned int vp9_variance32x64/, "const uint8_t *src_ptr, int sourc specialize qw/vp9_variance32x64/, "$sse2_x86inc"; add_proto qw/unsigned int vp9_variance32x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; -specialize qw/vp9_variance32x32 avx2/, "$sse2_x86inc"; +specialize qw/vp9_variance32x32 avx2 neon/, "$sse2_x86inc"; add_proto qw/unsigned int vp9_variance64x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; specialize qw/vp9_variance64x64 avx2/, "$sse2_x86inc"; @@ -477,7 +477,7 @@ add_proto qw/unsigned int vp9_sub_pixel_avg_variance16x32/, "const uint8_t *src_ specialize qw/vp9_sub_pixel_avg_variance16x32/, "$sse2_x86inc", "$ssse3_x86inc"; add_proto qw/unsigned int vp9_sub_pixel_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; -specialize qw/vp9_sub_pixel_variance32x32/, "$sse2_x86inc", "$ssse3_x86inc"; +specialize qw/vp9_sub_pixel_variance32x32 neon/, "$sse2_x86inc", "$ssse3_x86inc"; add_proto qw/unsigned int vp9_sub_pixel_avg_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"; specialize qw/vp9_sub_pixel_avg_variance32x32/, "$sse2_x86inc", "$ssse3_x86inc"; diff --git a/vp9/encoder/arm/neon/vp9_variance_neon.c b/vp9/encoder/arm/neon/vp9_variance_neon.c index f6871188b..4a3208289 100644 --- a/vp9/encoder/arm/neon/vp9_variance_neon.c +++ b/vp9/encoder/arm/neon/vp9_variance_neon.c @@ -19,9 +19,13 @@ #include "vp9/encoder/vp9_variance.h" +enum { kAlign16 = 16 }; enum { kWidth16 = 16 }; enum { kHeight16 = 16 }; enum { kHeight16PlusOne = 17 }; +enum { kWidth32 = 32 }; +enum { kHeight32 = 32 }; +enum { kHeight32PlusOne = 33 }; enum { kPixelStepOne = 1 }; static INLINE int horizontal_add_s16x8(const int16x8_t v_16x8) { @@ -93,17 +97,19 @@ static void var_filter_block2d_bil_w16(const uint8_t *src_ptr, const int16_t *vp9_filter) { const uint8x8_t f0 = vmov_n_u8((uint8_t)vp9_filter[0]); const uint8x8_t f1 = vmov_n_u8((uint8_t)vp9_filter[1]); - unsigned int i; + unsigned int i, j; for (i = 0; i < output_height; ++i) { - const uint8x16_t src_0 = vld1q_u8(&src_ptr[0]); - const uint8x16_t src_1 = vld1q_u8(&src_ptr[pixel_step]); - const uint16x8_t a = vmull_u8(vget_low_u8(src_0), f0); - const uint16x8_t b = vmlal_u8(a, vget_low_u8(src_1), f1); - const uint8x8_t out_lo = vrshrn_n_u16(b, FILTER_BITS); - const uint16x8_t c = vmull_u8(vget_high_u8(src_0), f0); - const uint16x8_t d = vmlal_u8(c, vget_high_u8(src_1), f1); - const uint8x8_t out_hi = vrshrn_n_u16(d, FILTER_BITS); - vst1q_u8(&output_ptr[0], vcombine_u8(out_lo, out_hi)); + for (j = 0; j < output_width; j += 16) { + const uint8x16_t src_0 = vld1q_u8(&src_ptr[j]); + const uint8x16_t src_1 = vld1q_u8(&src_ptr[j + pixel_step]); + const uint16x8_t a = vmull_u8(vget_low_u8(src_0), f0); + const uint16x8_t b = vmlal_u8(a, vget_low_u8(src_1), f1); + const uint8x8_t out_lo = vrshrn_n_u16(b, FILTER_BITS); + const uint16x8_t c = vmull_u8(vget_high_u8(src_0), f0); + const uint16x8_t d = vmlal_u8(c, vget_high_u8(src_1), f1); + const uint8x8_t out_hi = vrshrn_n_u16(d, FILTER_BITS); + vst1q_u8(&output_ptr[j], vcombine_u8(out_lo, out_hi)); + } // Next row... src_ptr += src_pixels_per_line; output_ptr += output_width; @@ -117,8 +123,8 @@ unsigned int vp9_sub_pixel_variance16x16_neon(const uint8_t *src, const uint8_t *dst, int dst_stride, unsigned int *sse) { - DECLARE_ALIGNED_ARRAY(kWidth16, uint8_t, temp2, kHeight16 * kWidth16); - DECLARE_ALIGNED_ARRAY(kWidth16, uint8_t, fdata3, kHeight16PlusOne * kWidth16); + DECLARE_ALIGNED_ARRAY(kAlign16, uint8_t, temp2, kHeight16 * kWidth16); + DECLARE_ALIGNED_ARRAY(kAlign16, uint8_t, fdata3, kHeight16PlusOne * kWidth16); var_filter_block2d_bil_w16(src, fdata3, src_stride, kPixelStepOne, kHeight16PlusOne, kWidth16, @@ -127,3 +133,36 @@ unsigned int vp9_sub_pixel_variance16x16_neon(const uint8_t *src, kWidth16, BILINEAR_FILTERS_2TAP(yoffset)); return vp9_variance16x16_neon(temp2, kWidth16, dst, dst_stride, sse); } + +void vp9_get32x32var_neon(const uint8_t *src_ptr, int source_stride, + const uint8_t *ref_ptr, int ref_stride, + unsigned int *sse, int *sum) { + variance_neon_w8(src_ptr, source_stride, ref_ptr, ref_stride, kWidth32, + kHeight32, sse, sum); +} + +unsigned int vp9_variance32x32_neon(const uint8_t *a, int a_stride, + const uint8_t *b, int b_stride, + unsigned int *sse) { + int sum; + variance_neon_w8(a, a_stride, b, b_stride, kWidth32, kHeight32, sse, &sum); + return *sse - (((int64_t)sum * sum) / (kWidth32 * kHeight32)); +} + +unsigned int vp9_sub_pixel_variance32x32_neon(const uint8_t *src, + int src_stride, + int xoffset, + int yoffset, + const uint8_t *dst, + int dst_stride, + unsigned int *sse) { + DECLARE_ALIGNED_ARRAY(kAlign16, uint8_t, temp2, kHeight32 * kWidth32); + DECLARE_ALIGNED_ARRAY(kAlign16, uint8_t, fdata3, kHeight32PlusOne * kWidth32); + + var_filter_block2d_bil_w16(src, fdata3, src_stride, kPixelStepOne, + kHeight32PlusOne, kWidth32, + BILINEAR_FILTERS_2TAP(xoffset)); + var_filter_block2d_bil_w16(fdata3, temp2, kWidth32, kWidth32, kHeight32, + kWidth32, BILINEAR_FILTERS_2TAP(yoffset)); + return vp9_variance32x32_neon(temp2, kWidth32, dst, dst_stride, sse); +}