Merge "Fix bug 804"
This commit is contained in:
Commit
23c88870ec
|
@ -646,26 +646,6 @@ INSTANTIATE_TEST_CASE_P(SSSE3, ConvolveTest, ::testing::Values(
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#if HAVE_AVX2
|
#if HAVE_AVX2
|
||||||
// TODO(jzern): these prototypes can be removed after the avx2 versions are
|
|
||||||
// reenabled in vp9_rtcd_defs.pl.
|
|
||||||
extern "C" {
|
|
||||||
void vp9_convolve8_vert_avx2(const uint8_t *src, ptrdiff_t src_stride,
|
|
||||||
uint8_t *dst, ptrdiff_t dst_stride,
|
|
||||||
const int16_t *filter_x, int x_step_q4,
|
|
||||||
const int16_t *filter_y, int y_step_q4,
|
|
||||||
int w, int h);
|
|
||||||
void vp9_convolve8_horiz_avx2(const uint8_t *src, ptrdiff_t src_stride,
|
|
||||||
uint8_t *dst, ptrdiff_t dst_stride,
|
|
||||||
const int16_t *filter_x, int x_step_q4,
|
|
||||||
const int16_t *filter_y, int y_step_q4,
|
|
||||||
int w, int h);
|
|
||||||
void vp9_convolve8_avx2(const uint8_t *src, ptrdiff_t src_stride,
|
|
||||||
uint8_t *dst, ptrdiff_t dst_stride,
|
|
||||||
const int16_t *filter_x, int x_step_q4,
|
|
||||||
const int16_t *filter_y, int y_step_q4,
|
|
||||||
int w, int h);
|
|
||||||
}
|
|
||||||
|
|
||||||
const ConvolveFunctions convolve8_avx2(
|
const ConvolveFunctions convolve8_avx2(
|
||||||
vp9_convolve8_horiz_avx2, vp9_convolve8_avg_horiz_ssse3,
|
vp9_convolve8_horiz_avx2, vp9_convolve8_avg_horiz_ssse3,
|
||||||
vp9_convolve8_vert_avx2, vp9_convolve8_avg_vert_ssse3,
|
vp9_convolve8_vert_avx2, vp9_convolve8_avg_vert_ssse3,
|
||||||
|
@ -676,9 +656,7 @@ INSTANTIATE_TEST_CASE_P(AVX2, ConvolveTest, ::testing::Values(
|
||||||
make_tuple(8, 4, &convolve8_avx2),
|
make_tuple(8, 4, &convolve8_avx2),
|
||||||
make_tuple(4, 8, &convolve8_avx2),
|
make_tuple(4, 8, &convolve8_avx2),
|
||||||
make_tuple(8, 8, &convolve8_avx2),
|
make_tuple(8, 8, &convolve8_avx2),
|
||||||
make_tuple(8, 16, &convolve8_avx2)));
|
make_tuple(8, 16, &convolve8_avx2),
|
||||||
|
|
||||||
INSTANTIATE_TEST_CASE_P(DISABLED_AVX2, ConvolveTest, ::testing::Values(
|
|
||||||
make_tuple(16, 8, &convolve8_avx2),
|
make_tuple(16, 8, &convolve8_avx2),
|
||||||
make_tuple(16, 16, &convolve8_avx2),
|
make_tuple(16, 16, &convolve8_avx2),
|
||||||
make_tuple(32, 16, &convolve8_avx2),
|
make_tuple(32, 16, &convolve8_avx2),
|
||||||
|
|
|
@ -305,15 +305,15 @@ specialize qw/vp9_convolve_avg neon_asm dspr2/, "$sse2_x86inc";
|
||||||
$vp9_convolve_avg_neon_asm=vp9_convolve_avg_neon;
|
$vp9_convolve_avg_neon_asm=vp9_convolve_avg_neon;
|
||||||
|
|
||||||
add_proto qw/void vp9_convolve8/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
|
add_proto qw/void vp9_convolve8/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
|
||||||
specialize qw/vp9_convolve8 sse2 ssse3 neon_asm dspr2/;
|
specialize qw/vp9_convolve8 sse2 ssse3 neon_asm dspr2 avx2/;
|
||||||
$vp9_convolve8_neon_asm=vp9_convolve8_neon;
|
$vp9_convolve8_neon_asm=vp9_convolve8_neon;
|
||||||
|
|
||||||
add_proto qw/void vp9_convolve8_horiz/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
|
add_proto qw/void vp9_convolve8_horiz/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
|
||||||
specialize qw/vp9_convolve8_horiz sse2 ssse3 neon_asm dspr2/;
|
specialize qw/vp9_convolve8_horiz sse2 ssse3 neon_asm dspr2 avx2/;
|
||||||
$vp9_convolve8_horiz_neon_asm=vp9_convolve8_horiz_neon;
|
$vp9_convolve8_horiz_neon_asm=vp9_convolve8_horiz_neon;
|
||||||
|
|
||||||
add_proto qw/void vp9_convolve8_vert/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
|
add_proto qw/void vp9_convolve8_vert/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
|
||||||
specialize qw/vp9_convolve8_vert sse2 ssse3 neon_asm dspr2/;
|
specialize qw/vp9_convolve8_vert sse2 ssse3 neon_asm dspr2 avx2/;
|
||||||
$vp9_convolve8_vert_neon_asm=vp9_convolve8_vert_neon;
|
$vp9_convolve8_vert_neon_asm=vp9_convolve8_vert_neon;
|
||||||
|
|
||||||
add_proto qw/void vp9_convolve8_avg/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
|
add_proto qw/void vp9_convolve8_avg/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
|
||||||
|
|
|
@ -307,7 +307,7 @@ void vp9_filter_block1d16_v8_avx2(unsigned char *src_ptr,
|
||||||
__m256i addFilterReg64;
|
__m256i addFilterReg64;
|
||||||
__m256i srcReg32b1, srcReg32b2, srcReg32b3, srcReg32b4, srcReg32b5;
|
__m256i srcReg32b1, srcReg32b2, srcReg32b3, srcReg32b4, srcReg32b5;
|
||||||
__m256i srcReg32b6, srcReg32b7, srcReg32b8, srcReg32b9, srcReg32b10;
|
__m256i srcReg32b6, srcReg32b7, srcReg32b8, srcReg32b9, srcReg32b10;
|
||||||
__m256i srcReg32b11, srcReg32b12, srcReg32b13, filtersReg32;
|
__m256i srcReg32b11, srcReg32b12, filtersReg32;
|
||||||
__m256i firstFilters, secondFilters, thirdFilters, forthFilters;
|
__m256i firstFilters, secondFilters, thirdFilters, forthFilters;
|
||||||
unsigned int i;
|
unsigned int i;
|
||||||
unsigned int src_stride, dst_stride;
|
unsigned int src_stride, dst_stride;
|
||||||
|
@ -409,35 +409,35 @@ void vp9_filter_block1d16_v8_avx2(unsigned char *src_ptr,
|
||||||
// multiply 2 adjacent elements with the filter and add the result
|
// multiply 2 adjacent elements with the filter and add the result
|
||||||
srcReg32b10 = _mm256_maddubs_epi16(srcReg32b10, firstFilters);
|
srcReg32b10 = _mm256_maddubs_epi16(srcReg32b10, firstFilters);
|
||||||
srcReg32b6 = _mm256_maddubs_epi16(srcReg32b4, forthFilters);
|
srcReg32b6 = _mm256_maddubs_epi16(srcReg32b4, forthFilters);
|
||||||
srcReg32b1 = _mm256_maddubs_epi16(srcReg32b1, firstFilters);
|
|
||||||
srcReg32b8 = _mm256_maddubs_epi16(srcReg32b7, forthFilters);
|
|
||||||
|
|
||||||
// add and saturate the results together
|
// add and saturate the results together
|
||||||
srcReg32b10 = _mm256_adds_epi16(srcReg32b10, srcReg32b6);
|
srcReg32b10 = _mm256_adds_epi16(srcReg32b10, srcReg32b6);
|
||||||
srcReg32b1 = _mm256_adds_epi16(srcReg32b1, srcReg32b8);
|
|
||||||
|
|
||||||
|
|
||||||
// multiply 2 adjacent elements with the filter and add the result
|
// multiply 2 adjacent elements with the filter and add the result
|
||||||
srcReg32b8 = _mm256_maddubs_epi16(srcReg32b11, secondFilters);
|
srcReg32b8 = _mm256_maddubs_epi16(srcReg32b11, secondFilters);
|
||||||
srcReg32b6 = _mm256_maddubs_epi16(srcReg32b3, secondFilters);
|
|
||||||
|
|
||||||
// multiply 2 adjacent elements with the filter and add the result
|
|
||||||
srcReg32b12 = _mm256_maddubs_epi16(srcReg32b2, thirdFilters);
|
srcReg32b12 = _mm256_maddubs_epi16(srcReg32b2, thirdFilters);
|
||||||
srcReg32b13 = _mm256_maddubs_epi16(srcReg32b5, thirdFilters);
|
|
||||||
|
|
||||||
|
|
||||||
// add and saturate the results together
|
// add and saturate the results together
|
||||||
srcReg32b10 = _mm256_adds_epi16(srcReg32b10,
|
srcReg32b10 = _mm256_adds_epi16(srcReg32b10,
|
||||||
_mm256_min_epi16(srcReg32b8, srcReg32b12));
|
_mm256_min_epi16(srcReg32b8, srcReg32b12));
|
||||||
srcReg32b1 = _mm256_adds_epi16(srcReg32b1,
|
|
||||||
_mm256_min_epi16(srcReg32b6, srcReg32b13));
|
|
||||||
|
|
||||||
// add and saturate the results together
|
|
||||||
srcReg32b10 = _mm256_adds_epi16(srcReg32b10,
|
srcReg32b10 = _mm256_adds_epi16(srcReg32b10,
|
||||||
_mm256_max_epi16(srcReg32b8, srcReg32b12));
|
_mm256_max_epi16(srcReg32b8, srcReg32b12));
|
||||||
srcReg32b1 = _mm256_adds_epi16(srcReg32b1,
|
|
||||||
_mm256_max_epi16(srcReg32b6, srcReg32b13));
|
|
||||||
|
|
||||||
|
// multiply 2 adjacent elements with the filter and add the result
|
||||||
|
srcReg32b1 = _mm256_maddubs_epi16(srcReg32b1, firstFilters);
|
||||||
|
srcReg32b6 = _mm256_maddubs_epi16(srcReg32b7, forthFilters);
|
||||||
|
|
||||||
|
srcReg32b1 = _mm256_adds_epi16(srcReg32b1, srcReg32b6);
|
||||||
|
|
||||||
|
// multiply 2 adjacent elements with the filter and add the result
|
||||||
|
srcReg32b8 = _mm256_maddubs_epi16(srcReg32b3, secondFilters);
|
||||||
|
srcReg32b12 = _mm256_maddubs_epi16(srcReg32b5, thirdFilters);
|
||||||
|
|
||||||
|
// add and saturate the results together
|
||||||
|
srcReg32b1 = _mm256_adds_epi16(srcReg32b1,
|
||||||
|
_mm256_min_epi16(srcReg32b8, srcReg32b12));
|
||||||
|
srcReg32b1 = _mm256_adds_epi16(srcReg32b1,
|
||||||
|
_mm256_max_epi16(srcReg32b8, srcReg32b12));
|
||||||
|
|
||||||
srcReg32b10 = _mm256_adds_epi16(srcReg32b10, addFilterReg64);
|
srcReg32b10 = _mm256_adds_epi16(srcReg32b10, addFilterReg64);
|
||||||
srcReg32b1 = _mm256_adds_epi16(srcReg32b1, addFilterReg64);
|
srcReg32b1 = _mm256_adds_epi16(srcReg32b1, addFilterReg64);
|
||||||
|
|
Loading…
Reference in new issue