Merge "Fix decoder mismatch in sub-pixel AVX2 intrinsic filters"
This commit is contained in:
Commit
a591ac9e5a
|
@@ -111,21 +111,21 @@ void vp9_filter_block1d16_h8_avx2(unsigned char *src_ptr,
|
|||
|
||||
// filter the source buffer
|
||||
srcRegFilt32b1_1= _mm256_shuffle_epi8(srcReg32b1, filt1Reg);
|
||||
-srcRegFilt32b2= _mm256_shuffle_epi8(srcReg32b1, filt2Reg);
|
||||
+srcRegFilt32b2= _mm256_shuffle_epi8(srcReg32b1, filt4Reg);
|
||||
|
||||
// multiply 2 adjacent elements with the filter and add the result
|
||||
srcRegFilt32b1_1 = _mm256_maddubs_epi16(srcRegFilt32b1_1, firstFilters);
|
||||
-srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, secondFilters);
|
||||
+srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, forthFilters);
|
||||
|
||||
// add and saturate the results together
|
||||
srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, srcRegFilt32b2);
|
||||
|
||||
// filter the source buffer
|
||||
-srcRegFilt32b3= _mm256_shuffle_epi8(srcReg32b1, filt4Reg);
|
||||
+srcRegFilt32b3= _mm256_shuffle_epi8(srcReg32b1, filt2Reg);
|
||||
srcRegFilt32b2= _mm256_shuffle_epi8(srcReg32b1, filt3Reg);
|
||||
|
||||
// multiply 2 adjacent elements with the filter and add the result
|
||||
-srcRegFilt32b3 = _mm256_maddubs_epi16(srcRegFilt32b3, forthFilters);
|
||||
+srcRegFilt32b3 = _mm256_maddubs_epi16(srcRegFilt32b3, secondFilters);
|
||||
srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, thirdFilters);
|
||||
|
||||
// add and saturate the results together
|
||||
|
@@ -146,21 +146,21 @@ void vp9_filter_block1d16_h8_avx2(unsigned char *src_ptr,
|
|||
|
||||
// filter the source buffer
|
||||
srcRegFilt32b2_1 = _mm256_shuffle_epi8(srcReg32b2, filt1Reg);
|
||||
-srcRegFilt32b2 = _mm256_shuffle_epi8(srcReg32b2, filt2Reg);
|
||||
+srcRegFilt32b2 = _mm256_shuffle_epi8(srcReg32b2, filt4Reg);
|
||||
|
||||
// multiply 2 adjacent elements with the filter and add the result
|
||||
srcRegFilt32b2_1 = _mm256_maddubs_epi16(srcRegFilt32b2_1, firstFilters);
|
||||
-srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, secondFilters);
|
||||
+srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, forthFilters);
|
||||
|
||||
// add and saturate the results together
|
||||
srcRegFilt32b2_1 = _mm256_adds_epi16(srcRegFilt32b2_1, srcRegFilt32b2);
|
||||
|
||||
// filter the source buffer
|
||||
-srcRegFilt32b3= _mm256_shuffle_epi8(srcReg32b2, filt4Reg);
|
||||
+srcRegFilt32b3= _mm256_shuffle_epi8(srcReg32b2, filt2Reg);
|
||||
srcRegFilt32b2= _mm256_shuffle_epi8(srcReg32b2, filt3Reg);
|
||||
|
||||
// multiply 2 adjacent elements with the filter and add the result
|
||||
-srcRegFilt32b3 = _mm256_maddubs_epi16(srcRegFilt32b3, forthFilters);
|
||||
+srcRegFilt32b3 = _mm256_maddubs_epi16(srcRegFilt32b3, secondFilters);
|
||||
srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, thirdFilters);
|
||||
|
||||
// add and saturate the results together
|
||||
|
@@ -208,26 +208,26 @@ void vp9_filter_block1d16_h8_avx2(unsigned char *src_ptr,
|
|||
srcRegFilt1_1 = _mm_shuffle_epi8(srcReg1,
|
||||
_mm256_castsi256_si128(filt1Reg));
|
||||
srcRegFilt2 = _mm_shuffle_epi8(srcReg1,
|
||||
-_mm256_castsi256_si128(filt2Reg));
|
||||
+_mm256_castsi256_si128(filt4Reg));
|
||||
|
||||
// multiply 2 adjacent elements with the filter and add the result
|
||||
srcRegFilt1_1 = _mm_maddubs_epi16(srcRegFilt1_1,
|
||||
_mm256_castsi256_si128(firstFilters));
|
||||
srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2,
|
||||
-_mm256_castsi256_si128(secondFilters));
|
||||
+_mm256_castsi256_si128(forthFilters));
|
||||
|
||||
// add and saturate the results together
|
||||
srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1, srcRegFilt2);
|
||||
|
||||
// filter the source buffer
|
||||
srcRegFilt3= _mm_shuffle_epi8(srcReg1,
|
||||
-_mm256_castsi256_si128(filt4Reg));
|
||||
+_mm256_castsi256_si128(filt2Reg));
|
||||
srcRegFilt2= _mm_shuffle_epi8(srcReg1,
|
||||
_mm256_castsi256_si128(filt3Reg));
|
||||
|
||||
// multiply 2 adjacent elements with the filter and add the result
|
||||
srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3,
|
||||
-_mm256_castsi256_si128(forthFilters));
|
||||
+_mm256_castsi256_si128(secondFilters));
|
||||
srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2,
|
||||
_mm256_castsi256_si128(thirdFilters));
|
||||
|
||||
|
@@ -247,26 +247,26 @@ void vp9_filter_block1d16_h8_avx2(unsigned char *src_ptr,
|
|||
srcRegFilt2_1 = _mm_shuffle_epi8(srcReg2,
|
||||
_mm256_castsi256_si128(filt1Reg));
|
||||
srcRegFilt2 = _mm_shuffle_epi8(srcReg2,
|
||||
-_mm256_castsi256_si128(filt2Reg));
|
||||
+_mm256_castsi256_si128(filt4Reg));
|
||||
|
||||
// multiply 2 adjacent elements with the filter and add the result
|
||||
srcRegFilt2_1 = _mm_maddubs_epi16(srcRegFilt2_1,
|
||||
_mm256_castsi256_si128(firstFilters));
|
||||
srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2,
|
||||
-_mm256_castsi256_si128(secondFilters));
|
||||
+_mm256_castsi256_si128(forthFilters));
|
||||
|
||||
// add and saturate the results together
|
||||
srcRegFilt2_1 = _mm_adds_epi16(srcRegFilt2_1, srcRegFilt2);
|
||||
|
||||
// filter the source buffer
|
||||
srcRegFilt3 = _mm_shuffle_epi8(srcReg2,
|
||||
-_mm256_castsi256_si128(filt4Reg));
|
||||
+_mm256_castsi256_si128(filt2Reg));
|
||||
srcRegFilt2 = _mm_shuffle_epi8(srcReg2,
|
||||
_mm256_castsi256_si128(filt3Reg));
|
||||
|
||||
// multiply 2 adjacent elements with the filter and add the result
|
||||
srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3,
|
||||
-_mm256_castsi256_si128(forthFilters));
|
||||
+_mm256_castsi256_si128(secondFilters));
|
||||
srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2,
|
||||
_mm256_castsi256_si128(thirdFilters));
|
||||
|
||||
|
|
Loading…
Reference in new issue