From 8404253f81a91a3fe077f17a1b6ef9452a6edd48 Mon Sep 17 00:00:00 2001 From: Yi Luo Date: Fri, 24 Jun 2016 17:29:21 -0700 Subject: [PATCH] Fix bugs in convolution filter optimization - Fix the over-writing bug in horizontal filtering as width = 2. - Fix 10-tap vertical filtering which no longer reads one row of pixel above the block. - Fix 10-tap filter zero padding. - Encoder speed slow down ~4.0%, compared to, 81ad953 Convolution vertical filter SSSE3 optimization Change-Id: I9bb294a4529300081c29bf284e6bc6eb081cc536 --- test/vp10_convolve_optimz_test.cc | 7 +- vp10/common/vp10_convolve.c | 6 +- vp10/common/x86/vp10_convolve_filters_ssse3.c | 60 ++--- vp10/common/x86/vp10_convolve_ssse3.c | 245 +++++++++--------- 4 files changed, 157 insertions(+), 161 deletions(-) diff --git a/test/vp10_convolve_optimz_test.cc b/test/vp10_convolve_optimz_test.cc index 04c4321d3..0d4beb042 100644 --- a/test/vp10_convolve_optimz_test.cc +++ b/test/vp10_convolve_optimz_test.cc @@ -39,6 +39,7 @@ const size_t maxHeight = 256; const size_t maxBlockSize = maxWidth * maxHeight; const int horizOffset = 32; const int vertiOffset = 32; +const size_t testMaxBlk = 128; const int stride = 128; const int x_step_q4 = 16; @@ -142,7 +143,7 @@ void VP10ConvolveOptimzTest::DiffFilterBuffer(const uint8_t *buf, } void VP10ConvolveOptimzTest::RunHorizFilterBitExactCheck() { - PrepFilterBuffer(src_, src_ref_, dst_, dst_ref_, width_, height_); + PrepFilterBuffer(src_, src_ref_, dst_, dst_ref_, testMaxBlk, testMaxBlk); InterpFilterParams filter_params = vp10_get_interp_filter_params(filter_); @@ -159,7 +160,7 @@ void VP10ConvolveOptimzTest::RunHorizFilterBitExactCheck() { // and test again. int intermediate_height = (((height_ - 1) * 16 + subpel_) >> SUBPEL_BITS) + filter_params.taps; - PrepFilterBuffer(src_, src_ref_, dst_, dst_ref_, width_, height_); + PrepFilterBuffer(src_, src_ref_, dst_, dst_ref_, testMaxBlk, testMaxBlk); vp10_convolve_horiz_c(src_ref_, stride, dst_ref_, stride, width_, intermediate_height, filter_params, subpel_, x_step_q4, @@ -174,7 +175,7 @@ void VP10ConvolveOptimzTest::RunHorizFilterBitExactCheck() { } void VP10ConvolveOptimzTest::RunVertFilterBitExactCheck() { - PrepFilterBuffer(src_, src_ref_, dst_, dst_ref_, width_, height_); + PrepFilterBuffer(src_, src_ref_, dst_, dst_ref_, testMaxBlk, testMaxBlk); InterpFilterParams filter_params = vp10_get_interp_filter_params(filter_); diff --git a/vp10/common/vp10_convolve.c b/vp10/common/vp10_convolve.c index 1abd15925..2026df198 100644 --- a/vp10/common/vp10_convolve.c +++ b/vp10/common/vp10_convolve.c @@ -139,7 +139,7 @@ void vp10_convolve(const uint8_t *src, int src_stride, uint8_t *dst, // temp's size is set to (maximum possible intermediate_height) * // MAX_BLOCK_WIDTH uint8_t temp[((((MAX_BLOCK_HEIGHT - 1) * MAX_STEP + 15) >> SUBPEL_BITS) + - MAX_FILTER_TAP + 1) * + MAX_FILTER_TAP) * MAX_BLOCK_WIDTH]; int temp_stride = MAX_BLOCK_WIDTH; #if CONFIG_DUAL_FILTER @@ -164,7 +164,7 @@ void vp10_convolve(const uint8_t *src, int src_stride, uint8_t *dst, assert(filter_params.taps <= MAX_FILTER_TAP); vp10_convolve_horiz(src - src_stride * (filter_size / 2 - 1), src_stride, - temp + temp_stride, temp_stride, w, intermediate_height, + temp, temp_stride, w, intermediate_height, filter_params, subpel_x_q4, x_step_q4, 0); #if CONFIG_DUAL_FILTER @@ -175,7 +175,7 @@ void vp10_convolve(const uint8_t *src, int src_stride, uint8_t *dst, filter_size = filter_params.taps; assert(filter_params.taps <= MAX_FILTER_TAP); - vp10_convolve_vert(temp + temp_stride * (filter_size / 2), temp_stride, + vp10_convolve_vert(temp + temp_stride * (filter_size / 2 - 1), temp_stride, dst, dst_stride, w, h, filter_params, subpel_y_q4, y_step_q4, ref_idx); } diff --git a/vp10/common/x86/vp10_convolve_filters_ssse3.c b/vp10/common/x86/vp10_convolve_filters_ssse3.c index 3be79ff16..410da8914 100644 --- a/vp10/common/x86/vp10_convolve_filters_ssse3.c +++ b/vp10/common/x86/vp10_convolve_filters_ssse3.c @@ -159,64 +159,64 @@ DECLARE_ALIGNED(16, const int8_t, DECLARE_ALIGNED(16, const int8_t, sub_pel_filters_10sharp_signal_dir[15][2][16]) = { { - {0, 0, -1, 3, -6, 127, 8, -4, 2, -1, 0, 0, 0, 0, 0, 0}, - {0, 0, 0, -1, 3, -6, 127, 8, -4, 2, -1, 0, 0, 0, 0}, + {0, 0, -1, 3, -6, 127, 8, -4, 2, -1, 0, 0, 0, 0, 0, 0}, + {0, 0, 0, 0, -1, 3, -6, 127, 8, -4, 2, -1, 0, 0, 0, 0}, }, { - {0, 1, -2, 5, -12, 124, 18, -7, 3, -2, 0, 0, 0, 0, 0, 0}, - {0, 0, 1, -2, 5, -12, 124, 18, -7, 3, -2, 0, 0, 0, 0}, + {0, 1, -2, 5, -12, 124, 18, -7, 3, -2, 0, 0, 0, 0, 0, 0}, + {0, 0, 0, 1, -2, 5, -12, 124, 18, -7, 3, -2, 0, 0, 0, 0}, }, { - {0, 1, -3, 7, -17, 119, 28, -11, 5, -2, 1, 0, 0, 0, 0, 0}, - {0, 0, 1, -3, 7, -17, 119, 28, -11, 5, -2, 1, 0, 0, 0}, + {0, 1, -3, 7, -17, 119, 28, -11, 5, -2, 1, 0, 0, 0, 0, 0}, + {0, 0, 0, 1, -3, 7, -17, 119, 28, -11, 5, -2, 1, 0, 0, 0}, }, { - {0, 1, -4, 8, -20, 114, 38, -14, 7, -3, 1, 0, 0, 0, 0, 0}, - {0, 0, 1, -4, 8, -20, 114, 38, -14, 7, -3, 1, 0, 0, 0}, + {0, 1, -4, 8, -20, 114, 38, -14, 7, -3, 1, 0, 0, 0, 0, 0}, + {0, 0, 0, 1, -4, 8, -20, 114, 38, -14, 7, -3, 1, 0, 0, 0}, }, { - {0, 1, -4, 9, -22, 107, 49, -17, 8, -4, 1, 0, 0, 0, 0, 0}, - {0, 0, 1, -4, 9, -22, 107, 49, -17, 8, -4, 1, 0, 0, 0}, + {0, 1, -4, 9, -22, 107, 49, -17, 8, -4, 1, 0, 0, 0, 0, 0}, + {0, 0, 0, 1, -4, 9, -22, 107, 49, -17, 8, -4, 1, 0, 0, 0}, }, { - {0, 2, -5, 10, -24, 99, 59, -20, 9, -4, 2, 0, 0, 0, 0, 0}, - {0, 0, 2, -5, 10, -24, 99, 59, -20, 9, -4, 2, 0, 0, 0}, + {0, 2, -5, 10, -24, 99, 59, -20, 9, -4, 2, 0, 0, 0, 0, 0}, + {0, 0, 0, 2, -5, 10, -24, 99, 59, -20, 9, -4, 2, 0, 0, 0}, }, { - {0, 2, -5, 10, -24, 90, 70, -22, 10, -5, 2, 0, 0, 0, 0, 0}, - {0, 0, 2, -5, 10, -24, 90, 70, -22, 10, -5, 2, 0, 0, 0}, + {0, 2, -5, 10, -24, 90, 70, -22, 10, -5, 2, 0, 0, 0, 0, 0}, + {0, 0, 0, 2, -5, 10, -24, 90, 70, -22, 10, -5, 2, 0, 0, 0}, }, { - {0, 2, -5, 10, -23, 80, 80, -23, 10, -5, 2, 0, 0, 0, 0, 0}, - {0, 0, 2, -5, 10, -23, 80, 80, -23, 10, -5, 2, 0, 0, 0}, + {0, 2, -5, 10, -23, 80, 80, -23, 10, -5, 2, 0, 0, 0, 0, 0}, + {0, 0, 0, 2, -5, 10, -23, 80, 80, -23, 10, -5, 2, 0, 0, 0}, }, { - {0, 2, -5, 10, -22, 70, 90, -24, 10, -5, 2, 0, 0, 0, 0, 0}, - {0, 0, 2, -5, 10, -22, 70, 90, -24, 10, -5, 2, 0, 0, 0}, + {0, 2, -5, 10, -22, 70, 90, -24, 10, -5, 2, 0, 0, 0, 0, 0}, + {0, 0, 0, 2, -5, 10, -22, 70, 90, -24, 10, -5, 2, 0, 0, 0}, }, { - {0, 2, -4, 9, -20, 59, 99, -24, 10, -5, 2, 0, 0, 0, 0, 0}, - {0, 0, 2, -4, 9, -20, 59, 99, -24, 10, -5, 2, 0, 0, 0}, + {0, 2, -4, 9, -20, 59, 99, -24, 10, -5, 2, 0, 0, 0, 0, 0}, + {0, 0, 0, 2, -4, 9, -20, 59, 99, -24, 10, -5, 2, 0, 0, 0}, }, { - {0, 1, -4, 8, -17, 49, 107, -22, 9, -4, 1, 0, 0, 0, 0, 0}, - {0, 0, 1, -4, 8, -17, 49, 107, -22, 9, -4, 1, 0, 0, 0}, + {0, 1, -4, 8, -17, 49, 107, -22, 9, -4, 1, 0, 0, 0, 0, 0}, + {0, 0, 0, 1, -4, 8, -17, 49, 107, -22, 9, -4, 1, 0, 0, 0}, }, { - {0, 1, -3, 7, -14, 38, 114, -20, 8, -4, 1, 0, 0, 0, 0, 0}, - {0, 0, 1, -3, 7, -14, 38, 114, -20, 8, -4, 1, 0, 0, 0}, + {0, 1, -3, 7, -14, 38, 114, -20, 8, -4, 1, 0, 0, 0, 0, 0}, + {0, 0, 0, 1, -3, 7, -14, 38, 114, -20, 8, -4, 1, 0, 0, 0}, }, { - {0, 1, -2, 5, -11, 28, 119, -17, 7, -3, 1, 0, 0, 0, 0, 0}, - {0, 0, 1, -2, 5, -11, 28, 119, -17, 7, -3, 1, 0, 0, 0}, + {0, 1, -2, 5, -11, 28, 119, -17, 7, -3, 1, 0, 0, 0, 0, 0}, + {0, 0, 0, 1, -2, 5, -11, 28, 119, -17, 7, -3, 1, 0, 0, 0}, }, { - {0, 0, -2, 3, -7, 18, 124, -12, 5, -2, 1, 0, 0, 0, 0, 0}, - {0, 0, 0, -2, 3, -7, 18, 124, -12, 5, -2, 1, 0, 0, 0}, + {0, 0, -2, 3, -7, 18, 124, -12, 5, -2, 1, 0, 0, 0, 0, 0}, + {0, 0, 0, 0, -2, 3, -7, 18, 124, -12, 5, -2, 1, 0, 0, 0}, }, { - {0, 0, -1, 2, -4, 8, 127, -6, 3, -1, 0, 0, 0, 0, 0, 0}, - {0, 0, 0, -1, 2, -4, 8, 127, -6, 3, -1, 0, 0, 0, 0}, + {0, 0, -1, 2, -4, 8, 127, -6, 3, -1, 0, 0, 0, 0, 0, 0}, + {0, 0, 0, 0, -1, 2, -4, 8, 127, -6, 3, -1, 0, 0, 0, 0}, }, }; #endif // CONFIG_EXT_INTERP diff --git a/vp10/common/x86/vp10_convolve_ssse3.c b/vp10/common/x86/vp10_convolve_ssse3.c index 472990e82..75520c9ac 100644 --- a/vp10/common/x86/vp10_convolve_ssse3.c +++ b/vp10/common/x86/vp10_convolve_ssse3.c @@ -37,15 +37,7 @@ static INLINE void transpose_4x8(const __m128i *in, __m128i *out) { // they're zero vectors. } -typedef void (*store_pixel_t)(const __m128i *x, uint8_t *src, uint8_t *dst); - -static INLINE void store_4_pixel_only(const __m128i *x, uint8_t *src, - uint8_t *dst) { - __m128i u; - (void)src; - u = _mm_packus_epi16(*x, *x); - *(int *)dst = _mm_cvtsi128_si32(u); -} +typedef void (*store_pixel_t)(const __m128i *x, uint8_t *dst); static INLINE __m128i accumulate_store(const __m128i *x, uint8_t *src) { const __m128i zero = _mm_setzero_si128(); @@ -59,9 +51,30 @@ static INLINE __m128i accumulate_store(const __m128i *x, uint8_t *src) { return y; } -static INLINE void accumulate_store_4_pixel(const __m128i *x, uint8_t *src, - uint8_t *dst) { - __m128i y = accumulate_store(x, src); +static INLINE void store_2_pixel_only(const __m128i *x, uint8_t *dst) { + uint32_t temp; + __m128i u = _mm_packus_epi16(*x, *x); + temp = _mm_cvtsi128_si32(u); + *(uint16_t *)dst = (uint16_t)temp; +} + +static INLINE void accumulate_store_2_pixel(const __m128i *x, uint8_t *dst) { + uint32_t temp; + __m128i y = accumulate_store(x, dst); + temp = _mm_cvtsi128_si32(y); + *(uint16_t *)dst = (uint16_t)temp; +} + +static store_pixel_t store2pixelTab[2] = { + store_2_pixel_only, accumulate_store_2_pixel}; + +static INLINE void store_4_pixel_only(const __m128i *x, uint8_t *dst) { + __m128i u = _mm_packus_epi16(*x, *x); + *(int *)dst = _mm_cvtsi128_si32(u); +} + +static INLINE void accumulate_store_4_pixel(const __m128i *x, uint8_t *dst) { + __m128i y = accumulate_store(x, dst); *(int *)dst = _mm_cvtsi128_si32(y); } @@ -69,12 +82,12 @@ static store_pixel_t store4pixelTab[2] = { store_4_pixel_only, accumulate_store_4_pixel}; void horiz_w4_ssse3(const uint8_t *src, const __m128i *f, - int tapsNum, store_pixel_t store_func, uint8_t *dst, - uint8_t *buf) { + int tapsNum, store_pixel_t store_func, uint8_t *dst) { __m128i sumPairRow[4]; __m128i sumPairCol[8]; __m128i pixel; const __m128i k_256 = _mm_set1_epi16(1 << 8); + const __m128i zero = _mm_setzero_si128(); if (10 == tapsNum) { src -= 1; @@ -103,52 +116,54 @@ void horiz_w4_ssse3(const uint8_t *src, const __m128i *f, sumPairRow[0] = _mm_adds_epi16(sumPairRow[0], sumPairRow[3]); sumPairRow[1] = _mm_mulhrs_epi16(sumPairRow[0], k_256); + sumPairRow[1] = _mm_packus_epi16(sumPairRow[1], sumPairRow[1]); + sumPairRow[1] = _mm_unpacklo_epi8(sumPairRow[1], zero); - store_func(&sumPairRow[1], dst, buf); + store_func(&sumPairRow[1], dst); } void horiz_w8_ssse3(const uint8_t *src, const __m128i *f, int tapsNum, - store_pixel_t store, uint8_t *dst, uint8_t *buf) { - horiz_w4_ssse3(src, f, tapsNum, store, dst, buf); + store_pixel_t store, uint8_t *buf) { + horiz_w4_ssse3(src, f, tapsNum, store, buf); src += 4; buf += 4; - horiz_w4_ssse3(src, f, tapsNum, store, dst, buf); + horiz_w4_ssse3(src, f, tapsNum, store, buf); } void horiz_w16_ssse3(const uint8_t *src, const __m128i *f, int tapsNum, - store_pixel_t store, uint8_t *dst, uint8_t *buf) { - horiz_w8_ssse3(src, f, tapsNum, store, dst, buf); + store_pixel_t store, uint8_t *buf) { + horiz_w8_ssse3(src, f, tapsNum, store, buf); src += 8; buf += 8; - horiz_w8_ssse3(src, f, tapsNum, store, dst, buf); + horiz_w8_ssse3(src, f, tapsNum, store, buf); } void horiz_w32_ssse3(const uint8_t *src, const __m128i *f, int tapsNum, - store_pixel_t store, uint8_t *dst, uint8_t *buf) { - horiz_w16_ssse3(src, f, tapsNum, store, dst, buf); + store_pixel_t store, uint8_t *buf) { + horiz_w16_ssse3(src, f, tapsNum, store, buf); src += 16; buf += 16; - horiz_w16_ssse3(src, f, tapsNum, store, dst, buf); + horiz_w16_ssse3(src, f, tapsNum, store, buf); } void horiz_w64_ssse3(const uint8_t *src, const __m128i *f, int tapsNum, - store_pixel_t store, uint8_t *dst, uint8_t *buf) { - horiz_w32_ssse3(src, f, tapsNum, store, dst, buf); + store_pixel_t store, uint8_t *buf) { + horiz_w32_ssse3(src, f, tapsNum, store, buf); src += 32; buf += 32; - horiz_w32_ssse3(src, f, tapsNum, store, dst, buf); + horiz_w32_ssse3(src, f, tapsNum, store, buf); } void horiz_w128_ssse3(const uint8_t *src, const __m128i *f, int tapsNum, - store_pixel_t store, uint8_t *dst, uint8_t *buf) { - horiz_w64_ssse3(src, f, tapsNum, store, dst, buf); + store_pixel_t store, uint8_t *buf) { + horiz_w64_ssse3(src, f, tapsNum, store, buf); src += 64; buf += 64; - horiz_w64_ssse3(src, f, tapsNum, store, dst, buf); + horiz_w64_ssse3(src, f, tapsNum, store, buf); } static void (*horizTab[6])(const uint8_t *, const __m128i *, int, - store_pixel_t, uint8_t *, uint8_t *) = { + store_pixel_t, uint8_t *) = { horiz_w4_ssse3, horiz_w8_ssse3, horiz_w16_ssse3, @@ -158,26 +173,28 @@ static void (*horizTab[6])(const uint8_t *, const __m128i *, int, }; void filter_horiz_ssse3(const uint8_t *src, __m128i *f, int tapsNum, int width, - store_pixel_t store, uint8_t *dst, uint8_t *buffer) { + store_pixel_t store, uint8_t *dst) { switch (width) { + // Note: + // For width=2 and 4, store function must be different case 2: case 4: - horizTab[0](src, f, tapsNum, store, dst, buffer); + horizTab[0](src, f, tapsNum, store, dst); break; case 8: - horizTab[1](src, f, tapsNum, store, dst, buffer); + horizTab[1](src, f, tapsNum, store, dst); break; case 16: - horizTab[2](src, f, tapsNum, store, dst, buffer); + horizTab[2](src, f, tapsNum, store, dst); break; case 32: - horizTab[3](src, f, tapsNum, store, dst, buffer); + horizTab[3](src, f, tapsNum, store, dst); break; case 64: - horizTab[4](src, f, tapsNum, store, dst, buffer); + horizTab[4](src, f, tapsNum, store, dst); break; case 128: - horizTab[5](src, f, tapsNum, store, dst, buffer); + horizTab[5](src, f, tapsNum, store, dst); break; default: assert(0); @@ -625,6 +642,7 @@ void vp10_convolve_horiz_ssse3(const uint8_t *src, int src_stride, uint8_t *dst, __m128i horf[2]; SubpelFilterCoeffs hCoeffs, vCoeffs; const uint8_t *src_ptr; + store_pixel_t store2p = store2pixelTab[avg]; store_pixel_t store4p = store4pixelTab[avg]; transpose_to_dst_t transpose_4x4 = trans4x4Tab[avg]; transpose_to_dst_t transpose_8x8 = trans8x8Tab[avg]; @@ -686,114 +704,97 @@ void vp10_convolve_horiz_ssse3(const uint8_t *src, int src_stride, uint8_t *dst, } while (count < block_height); for (i = 0; i < block_residu; ++i) { - filter_horiz_ssse3(src_ptr, horf, tapsNum, w, store4p, dst, dst); + filter_horiz_ssse3(src_ptr, horf, tapsNum, w, store4p, dst); src_ptr += src_stride; dst += dst_stride; } } else { - // 4-pixels parallel - block_height = h >> 2; - block_residu = h & 3; + if (w > 2) { + // 4-pixels parallel + block_height = h >> 2; + block_residu = h & 3; - do { - for (col = 0; col < w; col += 4) { - for (i = 0; i < 4; ++i) { - filter_horiz_v4p_ssse3(src_ptr, src_stride, verf, tapsNum, - temp + (i * 4)); - src_ptr += 1; + do { + for (col = 0; col < w; col += 4) { + for (i = 0; i < 4; ++i) { + filter_horiz_v4p_ssse3(src_ptr, src_stride, verf, tapsNum, + temp + (i * 4)); + src_ptr += 1; + } + transpose_4x4(temp, 4, dst + col, dst_stride); } - transpose_4x4(temp, 4, dst + col, dst_stride); - } - count++; - src_ptr = src + count * src_stride * 4; - dst += dst_stride * 4; - } while (count < block_height); + count++; + src_ptr = src + count * src_stride * 4; + dst += dst_stride * 4; + } while (count < block_height); - for (i = 0; i < block_residu; ++i) { - filter_horiz_ssse3(src_ptr, horf, tapsNum, w, store4p, dst, dst); - src_ptr += src_stride; - dst += dst_stride; + for (i = 0; i < block_residu; ++i) { + filter_horiz_ssse3(src_ptr, horf, tapsNum, w, store4p, dst); + src_ptr += src_stride; + dst += dst_stride; + } + } else { + for (i = 0; i < h; i++) { + filter_horiz_ssse3(src_ptr, horf, tapsNum, w, store2p, dst); + src_ptr += src_stride; + dst += dst_stride; + } } } } // Vertical convolution filtering -static INLINE void store_2_pixel_only(const __m128i *x, uint8_t *src, - uint8_t *dst) { - __m128i u; - uint32_t temp; - (void)src; - u = _mm_packus_epi16(*x, *x); - temp = _mm_cvtsi128_si32(u); - *(uint16_t *)dst = (uint16_t)temp; -} - -static INLINE void accumulate_store_2_pixel(const __m128i *x, uint8_t *src, - uint8_t *dst) { - uint32_t temp; - __m128i y = accumulate_store(x, src); - temp = _mm_cvtsi128_si32(y); - *(uint16_t *)dst = (uint16_t)temp; -} - -static INLINE void store_8_pixel_only(const __m128i *x, uint8_t *src, - uint8_t *dst) { - __m128i u; - (void)src; - u = _mm_packus_epi16(*x, *x); +static INLINE void store_8_pixel_only(const __m128i *x, uint8_t *dst) { + __m128i u = _mm_packus_epi16(*x, *x); _mm_storel_epi64((__m128i *)dst, u); } -static INLINE void accumulate_store_8_pixel(const __m128i *x, uint8_t *src, - uint8_t *dst) { - __m128i y = accumulate_store(x, src); +static INLINE void accumulate_store_8_pixel(const __m128i *x, uint8_t *dst) { + __m128i y = accumulate_store(x, dst); _mm_storel_epi64((__m128i *)dst, y); } static store_pixel_t store8pixelTab[2] = { store_8_pixel_only, accumulate_store_8_pixel}; -static store_pixel_t store2pixelTab[2] = { - store_2_pixel_only, accumulate_store_2_pixel}; - static __m128i filter_vert_ssse3(const uint8_t *src, int src_stride, - __m128i *f) { + int tapsNum, __m128i *f) { + __m128i s[12]; const __m128i k_256 = _mm_set1_epi16(1 << 8); const __m128i zero = _mm_setzero_si128(); __m128i min_x2x3, max_x2x3, sum; + int i = 0; + int r = 0; - __m128i s0 = _mm_loadu_si128((__m128i const *)(src)); - __m128i s1 = _mm_loadu_si128((__m128i const *)(src + src_stride)); - __m128i s2 = _mm_loadu_si128((__m128i const *)(src + 2 * src_stride)); - __m128i s3 = _mm_loadu_si128((__m128i const *)(src + 3 * src_stride)); - __m128i s4 = _mm_loadu_si128((__m128i const *)(src + 4 * src_stride)); - __m128i s5 = _mm_loadu_si128((__m128i const *)(src + 5 * src_stride)); - __m128i s6 = _mm_loadu_si128((__m128i const *)(src + 6 * src_stride)); - __m128i s7 = _mm_loadu_si128((__m128i const *)(src + 7 * src_stride)); - __m128i s8 = _mm_loadu_si128((__m128i const *)(src + 8 * src_stride)); - __m128i s9 = _mm_loadu_si128((__m128i const *)(src + 9 * src_stride)); - __m128i s10 = _mm_loadu_si128((__m128i const *)(src + 10 * src_stride)); - __m128i s11 = _mm_loadu_si128((__m128i const *)(src + 11 * src_stride)); + if (10 == tapsNum) { + i += 1; + s[0] = zero; + } + while (i < 12) { + s[i] = _mm_loadu_si128((__m128i const *)(src + r * src_stride)); + i += 1; + r += 1; + } - s0 = _mm_unpacklo_epi8(s0, s1); - s2 = _mm_unpacklo_epi8(s2, s3); - s4 = _mm_unpacklo_epi8(s4, s5); - s6 = _mm_unpacklo_epi8(s6, s7); - s8 = _mm_unpacklo_epi8(s8, s9); - s10 = _mm_unpacklo_epi8(s10, s11); + s[0] = _mm_unpacklo_epi8(s[0], s[1]); + s[2] = _mm_unpacklo_epi8(s[2], s[3]); + s[4] = _mm_unpacklo_epi8(s[4], s[5]); + s[6] = _mm_unpacklo_epi8(s[6], s[7]); + s[8] = _mm_unpacklo_epi8(s[8], s[9]); + s[10] = _mm_unpacklo_epi8(s[10], s[11]); - s0 = _mm_maddubs_epi16(s0, f[0]); - s2 = _mm_maddubs_epi16(s2, f[1]); - s4 = _mm_maddubs_epi16(s4, f[2]); - s6 = _mm_maddubs_epi16(s6, f[3]); - s8 = _mm_maddubs_epi16(s8, f[4]); - s10 = _mm_maddubs_epi16(s10, f[5]); + s[0] = _mm_maddubs_epi16(s[0], f[0]); + s[2] = _mm_maddubs_epi16(s[2], f[1]); + s[4] = _mm_maddubs_epi16(s[4], f[2]); + s[6] = _mm_maddubs_epi16(s[6], f[3]); + s[8] = _mm_maddubs_epi16(s[8], f[4]); + s[10] = _mm_maddubs_epi16(s[10], f[5]); - min_x2x3 = _mm_min_epi16(s4, s6); - max_x2x3 = _mm_max_epi16(s4, s6); - sum = _mm_adds_epi16(s0, s2); - sum = _mm_adds_epi16(sum, s10); - sum = _mm_adds_epi16(sum, s8); + min_x2x3 = _mm_min_epi16(s[4], s[6]); + max_x2x3 = _mm_max_epi16(s[4], s[6]); + sum = _mm_adds_epi16(s[0], s[2]); + sum = _mm_adds_epi16(sum, s[10]); + sum = _mm_adds_epi16(sum, s[8]); sum = _mm_adds_epi16(sum, min_x2x3); sum = _mm_adds_epi16(sum, max_x2x3); @@ -808,14 +809,8 @@ static void filter_vert_horiz_parallel_ssse3(const uint8_t *src, int src_stride, __m128i *f, int tapsNum, store_pixel_t store_func, uint8_t *dst) { - __m128i sum; - - if (10 == tapsNum) { - src -= src_stride; - } - - sum = filter_vert_ssse3(src, src_stride, f); - store_func(&sum, dst, dst); + __m128i sum = filter_vert_ssse3(src, src_stride, tapsNum, f); + store_func(&sum, dst); } void filter_vert_compute_small(const uint8_t *src, int src_stride, __m128i *f,