Merge "Fix bugs in convolution filter optimization" into nextgenv2

This commit is contained in:
Yi Luo 2016-06-27 21:33:44 +00:00 коммит произвёл Gerrit Code Review
Родитель cc16c5d805 8404253f81
Коммит dd2064a0ac
4 изменённых файлов: 157 добавлений и 161 удалений

Просмотреть файл

@ -39,6 +39,7 @@ const size_t maxHeight = 256;
const size_t maxBlockSize = maxWidth * maxHeight;
const int horizOffset = 32;
const int vertiOffset = 32;
const size_t testMaxBlk = 128;
const int stride = 128;
const int x_step_q4 = 16;
@ -142,7 +143,7 @@ void VP10ConvolveOptimzTest::DiffFilterBuffer(const uint8_t *buf,
}
void VP10ConvolveOptimzTest::RunHorizFilterBitExactCheck() {
PrepFilterBuffer(src_, src_ref_, dst_, dst_ref_, width_, height_);
PrepFilterBuffer(src_, src_ref_, dst_, dst_ref_, testMaxBlk, testMaxBlk);
InterpFilterParams filter_params = vp10_get_interp_filter_params(filter_);
@ -159,7 +160,7 @@ void VP10ConvolveOptimzTest::RunHorizFilterBitExactCheck() {
// and test again.
int intermediate_height =
(((height_ - 1) * 16 + subpel_) >> SUBPEL_BITS) + filter_params.taps;
PrepFilterBuffer(src_, src_ref_, dst_, dst_ref_, width_, height_);
PrepFilterBuffer(src_, src_ref_, dst_, dst_ref_, testMaxBlk, testMaxBlk);
vp10_convolve_horiz_c(src_ref_, stride, dst_ref_, stride, width_,
intermediate_height, filter_params, subpel_, x_step_q4,
@ -174,7 +175,7 @@ void VP10ConvolveOptimzTest::RunHorizFilterBitExactCheck() {
}
void VP10ConvolveOptimzTest::RunVertFilterBitExactCheck() {
PrepFilterBuffer(src_, src_ref_, dst_, dst_ref_, width_, height_);
PrepFilterBuffer(src_, src_ref_, dst_, dst_ref_, testMaxBlk, testMaxBlk);
InterpFilterParams filter_params = vp10_get_interp_filter_params(filter_);

Просмотреть файл

@ -139,7 +139,7 @@ void vp10_convolve(const uint8_t *src, int src_stride, uint8_t *dst,
// temp's size is set to (maximum possible intermediate_height) *
// MAX_BLOCK_WIDTH
uint8_t temp[((((MAX_BLOCK_HEIGHT - 1) * MAX_STEP + 15) >> SUBPEL_BITS) +
MAX_FILTER_TAP + 1) *
MAX_FILTER_TAP) *
MAX_BLOCK_WIDTH];
int temp_stride = MAX_BLOCK_WIDTH;
#if CONFIG_DUAL_FILTER
@ -164,7 +164,7 @@ void vp10_convolve(const uint8_t *src, int src_stride, uint8_t *dst,
assert(filter_params.taps <= MAX_FILTER_TAP);
vp10_convolve_horiz(src - src_stride * (filter_size / 2 - 1), src_stride,
temp + temp_stride, temp_stride, w, intermediate_height,
temp, temp_stride, w, intermediate_height,
filter_params, subpel_x_q4, x_step_q4, 0);
#if CONFIG_DUAL_FILTER
@ -175,7 +175,7 @@ void vp10_convolve(const uint8_t *src, int src_stride, uint8_t *dst,
filter_size = filter_params.taps;
assert(filter_params.taps <= MAX_FILTER_TAP);
vp10_convolve_vert(temp + temp_stride * (filter_size / 2), temp_stride,
vp10_convolve_vert(temp + temp_stride * (filter_size / 2 - 1), temp_stride,
dst, dst_stride, w, h, filter_params,
subpel_y_q4, y_step_q4, ref_idx);
}

Просмотреть файл

@ -159,64 +159,64 @@ DECLARE_ALIGNED(16, const int8_t,
DECLARE_ALIGNED(16, const int8_t,
sub_pel_filters_10sharp_signal_dir[15][2][16]) = {
{
{0, 0, -1, 3, -6, 127, 8, -4, 2, -1, 0, 0, 0, 0, 0, 0},
{0, 0, 0, -1, 3, -6, 127, 8, -4, 2, -1, 0, 0, 0, 0},
{0, 0, -1, 3, -6, 127, 8, -4, 2, -1, 0, 0, 0, 0, 0, 0},
{0, 0, 0, 0, -1, 3, -6, 127, 8, -4, 2, -1, 0, 0, 0, 0},
},
{
{0, 1, -2, 5, -12, 124, 18, -7, 3, -2, 0, 0, 0, 0, 0, 0},
{0, 0, 1, -2, 5, -12, 124, 18, -7, 3, -2, 0, 0, 0, 0},
{0, 1, -2, 5, -12, 124, 18, -7, 3, -2, 0, 0, 0, 0, 0, 0},
{0, 0, 0, 1, -2, 5, -12, 124, 18, -7, 3, -2, 0, 0, 0, 0},
},
{
{0, 1, -3, 7, -17, 119, 28, -11, 5, -2, 1, 0, 0, 0, 0, 0},
{0, 0, 1, -3, 7, -17, 119, 28, -11, 5, -2, 1, 0, 0, 0},
{0, 1, -3, 7, -17, 119, 28, -11, 5, -2, 1, 0, 0, 0, 0, 0},
{0, 0, 0, 1, -3, 7, -17, 119, 28, -11, 5, -2, 1, 0, 0, 0},
},
{
{0, 1, -4, 8, -20, 114, 38, -14, 7, -3, 1, 0, 0, 0, 0, 0},
{0, 0, 1, -4, 8, -20, 114, 38, -14, 7, -3, 1, 0, 0, 0},
{0, 1, -4, 8, -20, 114, 38, -14, 7, -3, 1, 0, 0, 0, 0, 0},
{0, 0, 0, 1, -4, 8, -20, 114, 38, -14, 7, -3, 1, 0, 0, 0},
},
{
{0, 1, -4, 9, -22, 107, 49, -17, 8, -4, 1, 0, 0, 0, 0, 0},
{0, 0, 1, -4, 9, -22, 107, 49, -17, 8, -4, 1, 0, 0, 0},
{0, 1, -4, 9, -22, 107, 49, -17, 8, -4, 1, 0, 0, 0, 0, 0},
{0, 0, 0, 1, -4, 9, -22, 107, 49, -17, 8, -4, 1, 0, 0, 0},
},
{
{0, 2, -5, 10, -24, 99, 59, -20, 9, -4, 2, 0, 0, 0, 0, 0},
{0, 0, 2, -5, 10, -24, 99, 59, -20, 9, -4, 2, 0, 0, 0},
{0, 2, -5, 10, -24, 99, 59, -20, 9, -4, 2, 0, 0, 0, 0, 0},
{0, 0, 0, 2, -5, 10, -24, 99, 59, -20, 9, -4, 2, 0, 0, 0},
},
{
{0, 2, -5, 10, -24, 90, 70, -22, 10, -5, 2, 0, 0, 0, 0, 0},
{0, 0, 2, -5, 10, -24, 90, 70, -22, 10, -5, 2, 0, 0, 0},
{0, 2, -5, 10, -24, 90, 70, -22, 10, -5, 2, 0, 0, 0, 0, 0},
{0, 0, 0, 2, -5, 10, -24, 90, 70, -22, 10, -5, 2, 0, 0, 0},
},
{
{0, 2, -5, 10, -23, 80, 80, -23, 10, -5, 2, 0, 0, 0, 0, 0},
{0, 0, 2, -5, 10, -23, 80, 80, -23, 10, -5, 2, 0, 0, 0},
{0, 2, -5, 10, -23, 80, 80, -23, 10, -5, 2, 0, 0, 0, 0, 0},
{0, 0, 0, 2, -5, 10, -23, 80, 80, -23, 10, -5, 2, 0, 0, 0},
},
{
{0, 2, -5, 10, -22, 70, 90, -24, 10, -5, 2, 0, 0, 0, 0, 0},
{0, 0, 2, -5, 10, -22, 70, 90, -24, 10, -5, 2, 0, 0, 0},
{0, 2, -5, 10, -22, 70, 90, -24, 10, -5, 2, 0, 0, 0, 0, 0},
{0, 0, 0, 2, -5, 10, -22, 70, 90, -24, 10, -5, 2, 0, 0, 0},
},
{
{0, 2, -4, 9, -20, 59, 99, -24, 10, -5, 2, 0, 0, 0, 0, 0},
{0, 0, 2, -4, 9, -20, 59, 99, -24, 10, -5, 2, 0, 0, 0},
{0, 2, -4, 9, -20, 59, 99, -24, 10, -5, 2, 0, 0, 0, 0, 0},
{0, 0, 0, 2, -4, 9, -20, 59, 99, -24, 10, -5, 2, 0, 0, 0},
},
{
{0, 1, -4, 8, -17, 49, 107, -22, 9, -4, 1, 0, 0, 0, 0, 0},
{0, 0, 1, -4, 8, -17, 49, 107, -22, 9, -4, 1, 0, 0, 0},
{0, 1, -4, 8, -17, 49, 107, -22, 9, -4, 1, 0, 0, 0, 0, 0},
{0, 0, 0, 1, -4, 8, -17, 49, 107, -22, 9, -4, 1, 0, 0, 0},
},
{
{0, 1, -3, 7, -14, 38, 114, -20, 8, -4, 1, 0, 0, 0, 0, 0},
{0, 0, 1, -3, 7, -14, 38, 114, -20, 8, -4, 1, 0, 0, 0},
{0, 1, -3, 7, -14, 38, 114, -20, 8, -4, 1, 0, 0, 0, 0, 0},
{0, 0, 0, 1, -3, 7, -14, 38, 114, -20, 8, -4, 1, 0, 0, 0},
},
{
{0, 1, -2, 5, -11, 28, 119, -17, 7, -3, 1, 0, 0, 0, 0, 0},
{0, 0, 1, -2, 5, -11, 28, 119, -17, 7, -3, 1, 0, 0, 0},
{0, 1, -2, 5, -11, 28, 119, -17, 7, -3, 1, 0, 0, 0, 0, 0},
{0, 0, 0, 1, -2, 5, -11, 28, 119, -17, 7, -3, 1, 0, 0, 0},
},
{
{0, 0, -2, 3, -7, 18, 124, -12, 5, -2, 1, 0, 0, 0, 0, 0},
{0, 0, 0, -2, 3, -7, 18, 124, -12, 5, -2, 1, 0, 0, 0},
{0, 0, -2, 3, -7, 18, 124, -12, 5, -2, 1, 0, 0, 0, 0, 0},
{0, 0, 0, 0, -2, 3, -7, 18, 124, -12, 5, -2, 1, 0, 0, 0},
},
{
{0, 0, -1, 2, -4, 8, 127, -6, 3, -1, 0, 0, 0, 0, 0, 0},
{0, 0, 0, -1, 2, -4, 8, 127, -6, 3, -1, 0, 0, 0, 0},
{0, 0, -1, 2, -4, 8, 127, -6, 3, -1, 0, 0, 0, 0, 0, 0},
{0, 0, 0, 0, -1, 2, -4, 8, 127, -6, 3, -1, 0, 0, 0, 0},
},
};
#endif // CONFIG_EXT_INTERP

Просмотреть файл

@ -37,15 +37,7 @@ static INLINE void transpose_4x8(const __m128i *in, __m128i *out) {
// they're zero vectors.
}
typedef void (*store_pixel_t)(const __m128i *x, uint8_t *src, uint8_t *dst);
static INLINE void store_4_pixel_only(const __m128i *x, uint8_t *src,
uint8_t *dst) {
__m128i u;
(void)src;
u = _mm_packus_epi16(*x, *x);
*(int *)dst = _mm_cvtsi128_si32(u);
}
typedef void (*store_pixel_t)(const __m128i *x, uint8_t *dst);
static INLINE __m128i accumulate_store(const __m128i *x, uint8_t *src) {
const __m128i zero = _mm_setzero_si128();
@ -59,9 +51,30 @@ static INLINE __m128i accumulate_store(const __m128i *x, uint8_t *src) {
return y;
}
static INLINE void accumulate_store_4_pixel(const __m128i *x, uint8_t *src,
uint8_t *dst) {
__m128i y = accumulate_store(x, src);
static INLINE void store_2_pixel_only(const __m128i *x, uint8_t *dst) {
uint32_t temp;
__m128i u = _mm_packus_epi16(*x, *x);
temp = _mm_cvtsi128_si32(u);
*(uint16_t *)dst = (uint16_t)temp;
}
static INLINE void accumulate_store_2_pixel(const __m128i *x, uint8_t *dst) {
uint32_t temp;
__m128i y = accumulate_store(x, dst);
temp = _mm_cvtsi128_si32(y);
*(uint16_t *)dst = (uint16_t)temp;
}
static store_pixel_t store2pixelTab[2] = {
store_2_pixel_only, accumulate_store_2_pixel};
static INLINE void store_4_pixel_only(const __m128i *x, uint8_t *dst) {
__m128i u = _mm_packus_epi16(*x, *x);
*(int *)dst = _mm_cvtsi128_si32(u);
}
static INLINE void accumulate_store_4_pixel(const __m128i *x, uint8_t *dst) {
__m128i y = accumulate_store(x, dst);
*(int *)dst = _mm_cvtsi128_si32(y);
}
@ -69,12 +82,12 @@ static store_pixel_t store4pixelTab[2] = {
store_4_pixel_only, accumulate_store_4_pixel};
void horiz_w4_ssse3(const uint8_t *src, const __m128i *f,
int tapsNum, store_pixel_t store_func, uint8_t *dst,
uint8_t *buf) {
int tapsNum, store_pixel_t store_func, uint8_t *dst) {
__m128i sumPairRow[4];
__m128i sumPairCol[8];
__m128i pixel;
const __m128i k_256 = _mm_set1_epi16(1 << 8);
const __m128i zero = _mm_setzero_si128();
if (10 == tapsNum) {
src -= 1;
@ -103,52 +116,54 @@ void horiz_w4_ssse3(const uint8_t *src, const __m128i *f,
sumPairRow[0] = _mm_adds_epi16(sumPairRow[0], sumPairRow[3]);
sumPairRow[1] = _mm_mulhrs_epi16(sumPairRow[0], k_256);
sumPairRow[1] = _mm_packus_epi16(sumPairRow[1], sumPairRow[1]);
sumPairRow[1] = _mm_unpacklo_epi8(sumPairRow[1], zero);
store_func(&sumPairRow[1], dst, buf);
store_func(&sumPairRow[1], dst);
}
void horiz_w8_ssse3(const uint8_t *src, const __m128i *f, int tapsNum,
store_pixel_t store, uint8_t *dst, uint8_t *buf) {
horiz_w4_ssse3(src, f, tapsNum, store, dst, buf);
store_pixel_t store, uint8_t *buf) {
horiz_w4_ssse3(src, f, tapsNum, store, buf);
src += 4;
buf += 4;
horiz_w4_ssse3(src, f, tapsNum, store, dst, buf);
horiz_w4_ssse3(src, f, tapsNum, store, buf);
}
void horiz_w16_ssse3(const uint8_t *src, const __m128i *f, int tapsNum,
store_pixel_t store, uint8_t *dst, uint8_t *buf) {
horiz_w8_ssse3(src, f, tapsNum, store, dst, buf);
store_pixel_t store, uint8_t *buf) {
horiz_w8_ssse3(src, f, tapsNum, store, buf);
src += 8;
buf += 8;
horiz_w8_ssse3(src, f, tapsNum, store, dst, buf);
horiz_w8_ssse3(src, f, tapsNum, store, buf);
}
void horiz_w32_ssse3(const uint8_t *src, const __m128i *f, int tapsNum,
store_pixel_t store, uint8_t *dst, uint8_t *buf) {
horiz_w16_ssse3(src, f, tapsNum, store, dst, buf);
store_pixel_t store, uint8_t *buf) {
horiz_w16_ssse3(src, f, tapsNum, store, buf);
src += 16;
buf += 16;
horiz_w16_ssse3(src, f, tapsNum, store, dst, buf);
horiz_w16_ssse3(src, f, tapsNum, store, buf);
}
void horiz_w64_ssse3(const uint8_t *src, const __m128i *f, int tapsNum,
store_pixel_t store, uint8_t *dst, uint8_t *buf) {
horiz_w32_ssse3(src, f, tapsNum, store, dst, buf);
store_pixel_t store, uint8_t *buf) {
horiz_w32_ssse3(src, f, tapsNum, store, buf);
src += 32;
buf += 32;
horiz_w32_ssse3(src, f, tapsNum, store, dst, buf);
horiz_w32_ssse3(src, f, tapsNum, store, buf);
}
void horiz_w128_ssse3(const uint8_t *src, const __m128i *f, int tapsNum,
store_pixel_t store, uint8_t *dst, uint8_t *buf) {
horiz_w64_ssse3(src, f, tapsNum, store, dst, buf);
store_pixel_t store, uint8_t *buf) {
horiz_w64_ssse3(src, f, tapsNum, store, buf);
src += 64;
buf += 64;
horiz_w64_ssse3(src, f, tapsNum, store, dst, buf);
horiz_w64_ssse3(src, f, tapsNum, store, buf);
}
static void (*horizTab[6])(const uint8_t *, const __m128i *, int,
store_pixel_t, uint8_t *, uint8_t *) = {
store_pixel_t, uint8_t *) = {
horiz_w4_ssse3,
horiz_w8_ssse3,
horiz_w16_ssse3,
@ -158,26 +173,28 @@ static void (*horizTab[6])(const uint8_t *, const __m128i *, int,
};
void filter_horiz_ssse3(const uint8_t *src, __m128i *f, int tapsNum, int width,
store_pixel_t store, uint8_t *dst, uint8_t *buffer) {
store_pixel_t store, uint8_t *dst) {
switch (width) {
// Note:
// For width=2 and 4, store function must be different
case 2:
case 4:
horizTab[0](src, f, tapsNum, store, dst, buffer);
horizTab[0](src, f, tapsNum, store, dst);
break;
case 8:
horizTab[1](src, f, tapsNum, store, dst, buffer);
horizTab[1](src, f, tapsNum, store, dst);
break;
case 16:
horizTab[2](src, f, tapsNum, store, dst, buffer);
horizTab[2](src, f, tapsNum, store, dst);
break;
case 32:
horizTab[3](src, f, tapsNum, store, dst, buffer);
horizTab[3](src, f, tapsNum, store, dst);
break;
case 64:
horizTab[4](src, f, tapsNum, store, dst, buffer);
horizTab[4](src, f, tapsNum, store, dst);
break;
case 128:
horizTab[5](src, f, tapsNum, store, dst, buffer);
horizTab[5](src, f, tapsNum, store, dst);
break;
default:
assert(0);
@ -625,6 +642,7 @@ void vp10_convolve_horiz_ssse3(const uint8_t *src, int src_stride, uint8_t *dst,
__m128i horf[2];
SubpelFilterCoeffs hCoeffs, vCoeffs;
const uint8_t *src_ptr;
store_pixel_t store2p = store2pixelTab[avg];
store_pixel_t store4p = store4pixelTab[avg];
transpose_to_dst_t transpose_4x4 = trans4x4Tab[avg];
transpose_to_dst_t transpose_8x8 = trans8x8Tab[avg];
@ -686,114 +704,97 @@ void vp10_convolve_horiz_ssse3(const uint8_t *src, int src_stride, uint8_t *dst,
} while (count < block_height);
for (i = 0; i < block_residu; ++i) {
filter_horiz_ssse3(src_ptr, horf, tapsNum, w, store4p, dst, dst);
filter_horiz_ssse3(src_ptr, horf, tapsNum, w, store4p, dst);
src_ptr += src_stride;
dst += dst_stride;
}
} else {
// 4-pixels parallel
block_height = h >> 2;
block_residu = h & 3;
if (w > 2) {
// 4-pixels parallel
block_height = h >> 2;
block_residu = h & 3;
do {
for (col = 0; col < w; col += 4) {
for (i = 0; i < 4; ++i) {
filter_horiz_v4p_ssse3(src_ptr, src_stride, verf, tapsNum,
temp + (i * 4));
src_ptr += 1;
do {
for (col = 0; col < w; col += 4) {
for (i = 0; i < 4; ++i) {
filter_horiz_v4p_ssse3(src_ptr, src_stride, verf, tapsNum,
temp + (i * 4));
src_ptr += 1;
}
transpose_4x4(temp, 4, dst + col, dst_stride);
}
transpose_4x4(temp, 4, dst + col, dst_stride);
}
count++;
src_ptr = src + count * src_stride * 4;
dst += dst_stride * 4;
} while (count < block_height);
count++;
src_ptr = src + count * src_stride * 4;
dst += dst_stride * 4;
} while (count < block_height);
for (i = 0; i < block_residu; ++i) {
filter_horiz_ssse3(src_ptr, horf, tapsNum, w, store4p, dst, dst);
src_ptr += src_stride;
dst += dst_stride;
for (i = 0; i < block_residu; ++i) {
filter_horiz_ssse3(src_ptr, horf, tapsNum, w, store4p, dst);
src_ptr += src_stride;
dst += dst_stride;
}
} else {
for (i = 0; i < h; i++) {
filter_horiz_ssse3(src_ptr, horf, tapsNum, w, store2p, dst);
src_ptr += src_stride;
dst += dst_stride;
}
}
}
}
// Vertical convolution filtering
static INLINE void store_2_pixel_only(const __m128i *x, uint8_t *src,
uint8_t *dst) {
__m128i u;
uint32_t temp;
(void)src;
u = _mm_packus_epi16(*x, *x);
temp = _mm_cvtsi128_si32(u);
*(uint16_t *)dst = (uint16_t)temp;
}
static INLINE void accumulate_store_2_pixel(const __m128i *x, uint8_t *src,
uint8_t *dst) {
uint32_t temp;
__m128i y = accumulate_store(x, src);
temp = _mm_cvtsi128_si32(y);
*(uint16_t *)dst = (uint16_t)temp;
}
static INLINE void store_8_pixel_only(const __m128i *x, uint8_t *src,
uint8_t *dst) {
__m128i u;
(void)src;
u = _mm_packus_epi16(*x, *x);
static INLINE void store_8_pixel_only(const __m128i *x, uint8_t *dst) {
__m128i u = _mm_packus_epi16(*x, *x);
_mm_storel_epi64((__m128i *)dst, u);
}
static INLINE void accumulate_store_8_pixel(const __m128i *x, uint8_t *src,
uint8_t *dst) {
__m128i y = accumulate_store(x, src);
static INLINE void accumulate_store_8_pixel(const __m128i *x, uint8_t *dst) {
__m128i y = accumulate_store(x, dst);
_mm_storel_epi64((__m128i *)dst, y);
}
static store_pixel_t store8pixelTab[2] = {
store_8_pixel_only, accumulate_store_8_pixel};
static store_pixel_t store2pixelTab[2] = {
store_2_pixel_only, accumulate_store_2_pixel};
static __m128i filter_vert_ssse3(const uint8_t *src, int src_stride,
__m128i *f) {
int tapsNum, __m128i *f) {
__m128i s[12];
const __m128i k_256 = _mm_set1_epi16(1 << 8);
const __m128i zero = _mm_setzero_si128();
__m128i min_x2x3, max_x2x3, sum;
int i = 0;
int r = 0;
__m128i s0 = _mm_loadu_si128((__m128i const *)(src));
__m128i s1 = _mm_loadu_si128((__m128i const *)(src + src_stride));
__m128i s2 = _mm_loadu_si128((__m128i const *)(src + 2 * src_stride));
__m128i s3 = _mm_loadu_si128((__m128i const *)(src + 3 * src_stride));
__m128i s4 = _mm_loadu_si128((__m128i const *)(src + 4 * src_stride));
__m128i s5 = _mm_loadu_si128((__m128i const *)(src + 5 * src_stride));
__m128i s6 = _mm_loadu_si128((__m128i const *)(src + 6 * src_stride));
__m128i s7 = _mm_loadu_si128((__m128i const *)(src + 7 * src_stride));
__m128i s8 = _mm_loadu_si128((__m128i const *)(src + 8 * src_stride));
__m128i s9 = _mm_loadu_si128((__m128i const *)(src + 9 * src_stride));
__m128i s10 = _mm_loadu_si128((__m128i const *)(src + 10 * src_stride));
__m128i s11 = _mm_loadu_si128((__m128i const *)(src + 11 * src_stride));
if (10 == tapsNum) {
i += 1;
s[0] = zero;
}
while (i < 12) {
s[i] = _mm_loadu_si128((__m128i const *)(src + r * src_stride));
i += 1;
r += 1;
}
s0 = _mm_unpacklo_epi8(s0, s1);
s2 = _mm_unpacklo_epi8(s2, s3);
s4 = _mm_unpacklo_epi8(s4, s5);
s6 = _mm_unpacklo_epi8(s6, s7);
s8 = _mm_unpacklo_epi8(s8, s9);
s10 = _mm_unpacklo_epi8(s10, s11);
s[0] = _mm_unpacklo_epi8(s[0], s[1]);
s[2] = _mm_unpacklo_epi8(s[2], s[3]);
s[4] = _mm_unpacklo_epi8(s[4], s[5]);
s[6] = _mm_unpacklo_epi8(s[6], s[7]);
s[8] = _mm_unpacklo_epi8(s[8], s[9]);
s[10] = _mm_unpacklo_epi8(s[10], s[11]);
s0 = _mm_maddubs_epi16(s0, f[0]);
s2 = _mm_maddubs_epi16(s2, f[1]);
s4 = _mm_maddubs_epi16(s4, f[2]);
s6 = _mm_maddubs_epi16(s6, f[3]);
s8 = _mm_maddubs_epi16(s8, f[4]);
s10 = _mm_maddubs_epi16(s10, f[5]);
s[0] = _mm_maddubs_epi16(s[0], f[0]);
s[2] = _mm_maddubs_epi16(s[2], f[1]);
s[4] = _mm_maddubs_epi16(s[4], f[2]);
s[6] = _mm_maddubs_epi16(s[6], f[3]);
s[8] = _mm_maddubs_epi16(s[8], f[4]);
s[10] = _mm_maddubs_epi16(s[10], f[5]);
min_x2x3 = _mm_min_epi16(s4, s6);
max_x2x3 = _mm_max_epi16(s4, s6);
sum = _mm_adds_epi16(s0, s2);
sum = _mm_adds_epi16(sum, s10);
sum = _mm_adds_epi16(sum, s8);
min_x2x3 = _mm_min_epi16(s[4], s[6]);
max_x2x3 = _mm_max_epi16(s[4], s[6]);
sum = _mm_adds_epi16(s[0], s[2]);
sum = _mm_adds_epi16(sum, s[10]);
sum = _mm_adds_epi16(sum, s[8]);
sum = _mm_adds_epi16(sum, min_x2x3);
sum = _mm_adds_epi16(sum, max_x2x3);
@ -808,14 +809,8 @@ static void filter_vert_horiz_parallel_ssse3(const uint8_t *src, int src_stride,
__m128i *f, int tapsNum,
store_pixel_t store_func,
uint8_t *dst) {
__m128i sum;
if (10 == tapsNum) {
src -= src_stride;
}
sum = filter_vert_ssse3(src, src_stride, f);
store_func(&sum, dst, dst);
__m128i sum = filter_vert_ssse3(src, src_stride, tapsNum, f);
store_func(&sum, dst);
}
void filter_vert_compute_small(const uint8_t *src, int src_stride, __m128i *f,