diff --git a/test/convolve_test.cc b/test/convolve_test.cc index 5b4a20eb2..1724db36f 100644 --- a/test/convolve_test.cc +++ b/test/convolve_test.cc @@ -646,26 +646,6 @@ INSTANTIATE_TEST_CASE_P(SSSE3, ConvolveTest, ::testing::Values( #endif #if HAVE_AVX2 -// TODO(jzern): these prototypes can be removed after the avx2 versions are -// reenabled in vp9_rtcd_defs.pl. -extern "C" { -void vp9_convolve8_vert_avx2(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, - int w, int h); -void vp9_convolve8_horiz_avx2(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, - int w, int h); -void vp9_convolve8_avx2(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, - int w, int h); -} - const ConvolveFunctions convolve8_avx2( vp9_convolve8_horiz_avx2, vp9_convolve8_avg_horiz_ssse3, vp9_convolve8_vert_avx2, vp9_convolve8_avg_vert_ssse3, @@ -676,9 +656,7 @@ INSTANTIATE_TEST_CASE_P(AVX2, ConvolveTest, ::testing::Values( make_tuple(8, 4, &convolve8_avx2), make_tuple(4, 8, &convolve8_avx2), make_tuple(8, 8, &convolve8_avx2), - make_tuple(8, 16, &convolve8_avx2))); - -INSTANTIATE_TEST_CASE_P(DISABLED_AVX2, ConvolveTest, ::testing::Values( + make_tuple(8, 16, &convolve8_avx2), make_tuple(16, 8, &convolve8_avx2), make_tuple(16, 16, &convolve8_avx2), make_tuple(32, 16, &convolve8_avx2), diff --git a/vp9/common/vp9_rtcd_defs.pl b/vp9/common/vp9_rtcd_defs.pl index 708f41b87..92f93184a 100644 --- a/vp9/common/vp9_rtcd_defs.pl +++ b/vp9/common/vp9_rtcd_defs.pl @@ -305,15 +305,15 @@ specialize qw/vp9_convolve_avg neon_asm dspr2/, "$sse2_x86inc"; $vp9_convolve_avg_neon_asm=vp9_convolve_avg_neon; add_proto qw/void vp9_convolve8/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"; -specialize qw/vp9_convolve8 sse2 ssse3 neon_asm dspr2/; +specialize qw/vp9_convolve8 sse2 ssse3 neon_asm dspr2 avx2/; $vp9_convolve8_neon_asm=vp9_convolve8_neon; add_proto qw/void vp9_convolve8_horiz/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"; -specialize qw/vp9_convolve8_horiz sse2 ssse3 neon_asm dspr2/; +specialize qw/vp9_convolve8_horiz sse2 ssse3 neon_asm dspr2 avx2/; $vp9_convolve8_horiz_neon_asm=vp9_convolve8_horiz_neon; add_proto qw/void vp9_convolve8_vert/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"; -specialize qw/vp9_convolve8_vert sse2 ssse3 neon_asm dspr2/; +specialize qw/vp9_convolve8_vert sse2 ssse3 neon_asm dspr2 avx2/; $vp9_convolve8_vert_neon_asm=vp9_convolve8_vert_neon; add_proto qw/void vp9_convolve8_avg/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"; diff --git a/vp9/common/x86/vp9_subpixel_8t_intrin_avx2.c b/vp9/common/x86/vp9_subpixel_8t_intrin_avx2.c index d109e136a..3bc7d3918 100644 --- a/vp9/common/x86/vp9_subpixel_8t_intrin_avx2.c +++ b/vp9/common/x86/vp9_subpixel_8t_intrin_avx2.c @@ -307,7 +307,7 @@ void vp9_filter_block1d16_v8_avx2(unsigned char *src_ptr, __m256i addFilterReg64; __m256i srcReg32b1, srcReg32b2, srcReg32b3, srcReg32b4, srcReg32b5; __m256i srcReg32b6, srcReg32b7, srcReg32b8, srcReg32b9, srcReg32b10; - __m256i srcReg32b11, srcReg32b12, srcReg32b13, filtersReg32; + __m256i srcReg32b11, srcReg32b12, filtersReg32; __m256i firstFilters, secondFilters, thirdFilters, forthFilters; unsigned int i; unsigned int src_stride, dst_stride; @@ -409,35 +409,35 @@ void vp9_filter_block1d16_v8_avx2(unsigned char *src_ptr, // multiply 2 adjacent elements with the filter and add the result srcReg32b10 = _mm256_maddubs_epi16(srcReg32b10, firstFilters); srcReg32b6 = _mm256_maddubs_epi16(srcReg32b4, forthFilters); - srcReg32b1 = _mm256_maddubs_epi16(srcReg32b1, firstFilters); - srcReg32b8 = _mm256_maddubs_epi16(srcReg32b7, forthFilters); // add and saturate the results together srcReg32b10 = _mm256_adds_epi16(srcReg32b10, srcReg32b6); - srcReg32b1 = _mm256_adds_epi16(srcReg32b1, srcReg32b8); - // multiply 2 adjacent elements with the filter and add the result srcReg32b8 = _mm256_maddubs_epi16(srcReg32b11, secondFilters); - srcReg32b6 = _mm256_maddubs_epi16(srcReg32b3, secondFilters); - - // multiply 2 adjacent elements with the filter and add the result srcReg32b12 = _mm256_maddubs_epi16(srcReg32b2, thirdFilters); - srcReg32b13 = _mm256_maddubs_epi16(srcReg32b5, thirdFilters); - // add and saturate the results together srcReg32b10 = _mm256_adds_epi16(srcReg32b10, _mm256_min_epi16(srcReg32b8, srcReg32b12)); - srcReg32b1 = _mm256_adds_epi16(srcReg32b1, - _mm256_min_epi16(srcReg32b6, srcReg32b13)); - - // add and saturate the results together srcReg32b10 = _mm256_adds_epi16(srcReg32b10, _mm256_max_epi16(srcReg32b8, srcReg32b12)); - srcReg32b1 = _mm256_adds_epi16(srcReg32b1, - _mm256_max_epi16(srcReg32b6, srcReg32b13)); + // multiply 2 adjacent elements with the filter and add the result + srcReg32b1 = _mm256_maddubs_epi16(srcReg32b1, firstFilters); + srcReg32b6 = _mm256_maddubs_epi16(srcReg32b7, forthFilters); + + srcReg32b1 = _mm256_adds_epi16(srcReg32b1, srcReg32b6); + + // multiply 2 adjacent elements with the filter and add the result + srcReg32b8 = _mm256_maddubs_epi16(srcReg32b3, secondFilters); + srcReg32b12 = _mm256_maddubs_epi16(srcReg32b5, thirdFilters); + + // add and saturate the results together + srcReg32b1 = _mm256_adds_epi16(srcReg32b1, + _mm256_min_epi16(srcReg32b8, srcReg32b12)); + srcReg32b1 = _mm256_adds_epi16(srcReg32b1, + _mm256_max_epi16(srcReg32b8, srcReg32b12)); srcReg32b10 = _mm256_adds_epi16(srcReg32b10, addFilterReg64); srcReg32b1 = _mm256_adds_epi16(srcReg32b1, addFilterReg64);