Merge "Fix bugs in convolution filter optimization" into nextgenv2
This commit is contained in:
Коммит
dd2064a0ac
|
@ -39,6 +39,7 @@ const size_t maxHeight = 256;
|
||||||
const size_t maxBlockSize = maxWidth * maxHeight;
|
const size_t maxBlockSize = maxWidth * maxHeight;
|
||||||
const int horizOffset = 32;
|
const int horizOffset = 32;
|
||||||
const int vertiOffset = 32;
|
const int vertiOffset = 32;
|
||||||
|
const size_t testMaxBlk = 128;
|
||||||
const int stride = 128;
|
const int stride = 128;
|
||||||
const int x_step_q4 = 16;
|
const int x_step_q4 = 16;
|
||||||
|
|
||||||
|
@ -142,7 +143,7 @@ void VP10ConvolveOptimzTest::DiffFilterBuffer(const uint8_t *buf,
|
||||||
}
|
}
|
||||||
|
|
||||||
void VP10ConvolveOptimzTest::RunHorizFilterBitExactCheck() {
|
void VP10ConvolveOptimzTest::RunHorizFilterBitExactCheck() {
|
||||||
PrepFilterBuffer(src_, src_ref_, dst_, dst_ref_, width_, height_);
|
PrepFilterBuffer(src_, src_ref_, dst_, dst_ref_, testMaxBlk, testMaxBlk);
|
||||||
|
|
||||||
InterpFilterParams filter_params = vp10_get_interp_filter_params(filter_);
|
InterpFilterParams filter_params = vp10_get_interp_filter_params(filter_);
|
||||||
|
|
||||||
|
@ -159,7 +160,7 @@ void VP10ConvolveOptimzTest::RunHorizFilterBitExactCheck() {
|
||||||
// and test again.
|
// and test again.
|
||||||
int intermediate_height =
|
int intermediate_height =
|
||||||
(((height_ - 1) * 16 + subpel_) >> SUBPEL_BITS) + filter_params.taps;
|
(((height_ - 1) * 16 + subpel_) >> SUBPEL_BITS) + filter_params.taps;
|
||||||
PrepFilterBuffer(src_, src_ref_, dst_, dst_ref_, width_, height_);
|
PrepFilterBuffer(src_, src_ref_, dst_, dst_ref_, testMaxBlk, testMaxBlk);
|
||||||
|
|
||||||
vp10_convolve_horiz_c(src_ref_, stride, dst_ref_, stride, width_,
|
vp10_convolve_horiz_c(src_ref_, stride, dst_ref_, stride, width_,
|
||||||
intermediate_height, filter_params, subpel_, x_step_q4,
|
intermediate_height, filter_params, subpel_, x_step_q4,
|
||||||
|
@ -174,7 +175,7 @@ void VP10ConvolveOptimzTest::RunHorizFilterBitExactCheck() {
|
||||||
}
|
}
|
||||||
|
|
||||||
void VP10ConvolveOptimzTest::RunVertFilterBitExactCheck() {
|
void VP10ConvolveOptimzTest::RunVertFilterBitExactCheck() {
|
||||||
PrepFilterBuffer(src_, src_ref_, dst_, dst_ref_, width_, height_);
|
PrepFilterBuffer(src_, src_ref_, dst_, dst_ref_, testMaxBlk, testMaxBlk);
|
||||||
|
|
||||||
InterpFilterParams filter_params = vp10_get_interp_filter_params(filter_);
|
InterpFilterParams filter_params = vp10_get_interp_filter_params(filter_);
|
||||||
|
|
||||||
|
|
|
@ -139,7 +139,7 @@ void vp10_convolve(const uint8_t *src, int src_stride, uint8_t *dst,
|
||||||
// temp's size is set to (maximum possible intermediate_height) *
|
// temp's size is set to (maximum possible intermediate_height) *
|
||||||
// MAX_BLOCK_WIDTH
|
// MAX_BLOCK_WIDTH
|
||||||
uint8_t temp[((((MAX_BLOCK_HEIGHT - 1) * MAX_STEP + 15) >> SUBPEL_BITS) +
|
uint8_t temp[((((MAX_BLOCK_HEIGHT - 1) * MAX_STEP + 15) >> SUBPEL_BITS) +
|
||||||
MAX_FILTER_TAP + 1) *
|
MAX_FILTER_TAP) *
|
||||||
MAX_BLOCK_WIDTH];
|
MAX_BLOCK_WIDTH];
|
||||||
int temp_stride = MAX_BLOCK_WIDTH;
|
int temp_stride = MAX_BLOCK_WIDTH;
|
||||||
#if CONFIG_DUAL_FILTER
|
#if CONFIG_DUAL_FILTER
|
||||||
|
@ -164,7 +164,7 @@ void vp10_convolve(const uint8_t *src, int src_stride, uint8_t *dst,
|
||||||
assert(filter_params.taps <= MAX_FILTER_TAP);
|
assert(filter_params.taps <= MAX_FILTER_TAP);
|
||||||
|
|
||||||
vp10_convolve_horiz(src - src_stride * (filter_size / 2 - 1), src_stride,
|
vp10_convolve_horiz(src - src_stride * (filter_size / 2 - 1), src_stride,
|
||||||
temp + temp_stride, temp_stride, w, intermediate_height,
|
temp, temp_stride, w, intermediate_height,
|
||||||
filter_params, subpel_x_q4, x_step_q4, 0);
|
filter_params, subpel_x_q4, x_step_q4, 0);
|
||||||
|
|
||||||
#if CONFIG_DUAL_FILTER
|
#if CONFIG_DUAL_FILTER
|
||||||
|
@ -175,7 +175,7 @@ void vp10_convolve(const uint8_t *src, int src_stride, uint8_t *dst,
|
||||||
filter_size = filter_params.taps;
|
filter_size = filter_params.taps;
|
||||||
assert(filter_params.taps <= MAX_FILTER_TAP);
|
assert(filter_params.taps <= MAX_FILTER_TAP);
|
||||||
|
|
||||||
vp10_convolve_vert(temp + temp_stride * (filter_size / 2), temp_stride,
|
vp10_convolve_vert(temp + temp_stride * (filter_size / 2 - 1), temp_stride,
|
||||||
dst, dst_stride, w, h, filter_params,
|
dst, dst_stride, w, h, filter_params,
|
||||||
subpel_y_q4, y_step_q4, ref_idx);
|
subpel_y_q4, y_step_q4, ref_idx);
|
||||||
}
|
}
|
||||||
|
|
|
@ -160,63 +160,63 @@ DECLARE_ALIGNED(16, const int8_t,
|
||||||
sub_pel_filters_10sharp_signal_dir[15][2][16]) = {
|
sub_pel_filters_10sharp_signal_dir[15][2][16]) = {
|
||||||
{
|
{
|
||||||
{0, 0, -1, 3, -6, 127, 8, -4, 2, -1, 0, 0, 0, 0, 0, 0},
|
{0, 0, -1, 3, -6, 127, 8, -4, 2, -1, 0, 0, 0, 0, 0, 0},
|
||||||
{0, 0, 0, -1, 3, -6, 127, 8, -4, 2, -1, 0, 0, 0, 0},
|
{0, 0, 0, 0, -1, 3, -6, 127, 8, -4, 2, -1, 0, 0, 0, 0},
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
{0, 1, -2, 5, -12, 124, 18, -7, 3, -2, 0, 0, 0, 0, 0, 0},
|
{0, 1, -2, 5, -12, 124, 18, -7, 3, -2, 0, 0, 0, 0, 0, 0},
|
||||||
{0, 0, 1, -2, 5, -12, 124, 18, -7, 3, -2, 0, 0, 0, 0},
|
{0, 0, 0, 1, -2, 5, -12, 124, 18, -7, 3, -2, 0, 0, 0, 0},
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
{0, 1, -3, 7, -17, 119, 28, -11, 5, -2, 1, 0, 0, 0, 0, 0},
|
{0, 1, -3, 7, -17, 119, 28, -11, 5, -2, 1, 0, 0, 0, 0, 0},
|
||||||
{0, 0, 1, -3, 7, -17, 119, 28, -11, 5, -2, 1, 0, 0, 0},
|
{0, 0, 0, 1, -3, 7, -17, 119, 28, -11, 5, -2, 1, 0, 0, 0},
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
{0, 1, -4, 8, -20, 114, 38, -14, 7, -3, 1, 0, 0, 0, 0, 0},
|
{0, 1, -4, 8, -20, 114, 38, -14, 7, -3, 1, 0, 0, 0, 0, 0},
|
||||||
{0, 0, 1, -4, 8, -20, 114, 38, -14, 7, -3, 1, 0, 0, 0},
|
{0, 0, 0, 1, -4, 8, -20, 114, 38, -14, 7, -3, 1, 0, 0, 0},
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
{0, 1, -4, 9, -22, 107, 49, -17, 8, -4, 1, 0, 0, 0, 0, 0},
|
{0, 1, -4, 9, -22, 107, 49, -17, 8, -4, 1, 0, 0, 0, 0, 0},
|
||||||
{0, 0, 1, -4, 9, -22, 107, 49, -17, 8, -4, 1, 0, 0, 0},
|
{0, 0, 0, 1, -4, 9, -22, 107, 49, -17, 8, -4, 1, 0, 0, 0},
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
{0, 2, -5, 10, -24, 99, 59, -20, 9, -4, 2, 0, 0, 0, 0, 0},
|
{0, 2, -5, 10, -24, 99, 59, -20, 9, -4, 2, 0, 0, 0, 0, 0},
|
||||||
{0, 0, 2, -5, 10, -24, 99, 59, -20, 9, -4, 2, 0, 0, 0},
|
{0, 0, 0, 2, -5, 10, -24, 99, 59, -20, 9, -4, 2, 0, 0, 0},
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
{0, 2, -5, 10, -24, 90, 70, -22, 10, -5, 2, 0, 0, 0, 0, 0},
|
{0, 2, -5, 10, -24, 90, 70, -22, 10, -5, 2, 0, 0, 0, 0, 0},
|
||||||
{0, 0, 2, -5, 10, -24, 90, 70, -22, 10, -5, 2, 0, 0, 0},
|
{0, 0, 0, 2, -5, 10, -24, 90, 70, -22, 10, -5, 2, 0, 0, 0},
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
{0, 2, -5, 10, -23, 80, 80, -23, 10, -5, 2, 0, 0, 0, 0, 0},
|
{0, 2, -5, 10, -23, 80, 80, -23, 10, -5, 2, 0, 0, 0, 0, 0},
|
||||||
{0, 0, 2, -5, 10, -23, 80, 80, -23, 10, -5, 2, 0, 0, 0},
|
{0, 0, 0, 2, -5, 10, -23, 80, 80, -23, 10, -5, 2, 0, 0, 0},
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
{0, 2, -5, 10, -22, 70, 90, -24, 10, -5, 2, 0, 0, 0, 0, 0},
|
{0, 2, -5, 10, -22, 70, 90, -24, 10, -5, 2, 0, 0, 0, 0, 0},
|
||||||
{0, 0, 2, -5, 10, -22, 70, 90, -24, 10, -5, 2, 0, 0, 0},
|
{0, 0, 0, 2, -5, 10, -22, 70, 90, -24, 10, -5, 2, 0, 0, 0},
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
{0, 2, -4, 9, -20, 59, 99, -24, 10, -5, 2, 0, 0, 0, 0, 0},
|
{0, 2, -4, 9, -20, 59, 99, -24, 10, -5, 2, 0, 0, 0, 0, 0},
|
||||||
{0, 0, 2, -4, 9, -20, 59, 99, -24, 10, -5, 2, 0, 0, 0},
|
{0, 0, 0, 2, -4, 9, -20, 59, 99, -24, 10, -5, 2, 0, 0, 0},
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
{0, 1, -4, 8, -17, 49, 107, -22, 9, -4, 1, 0, 0, 0, 0, 0},
|
{0, 1, -4, 8, -17, 49, 107, -22, 9, -4, 1, 0, 0, 0, 0, 0},
|
||||||
{0, 0, 1, -4, 8, -17, 49, 107, -22, 9, -4, 1, 0, 0, 0},
|
{0, 0, 0, 1, -4, 8, -17, 49, 107, -22, 9, -4, 1, 0, 0, 0},
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
{0, 1, -3, 7, -14, 38, 114, -20, 8, -4, 1, 0, 0, 0, 0, 0},
|
{0, 1, -3, 7, -14, 38, 114, -20, 8, -4, 1, 0, 0, 0, 0, 0},
|
||||||
{0, 0, 1, -3, 7, -14, 38, 114, -20, 8, -4, 1, 0, 0, 0},
|
{0, 0, 0, 1, -3, 7, -14, 38, 114, -20, 8, -4, 1, 0, 0, 0},
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
{0, 1, -2, 5, -11, 28, 119, -17, 7, -3, 1, 0, 0, 0, 0, 0},
|
{0, 1, -2, 5, -11, 28, 119, -17, 7, -3, 1, 0, 0, 0, 0, 0},
|
||||||
{0, 0, 1, -2, 5, -11, 28, 119, -17, 7, -3, 1, 0, 0, 0},
|
{0, 0, 0, 1, -2, 5, -11, 28, 119, -17, 7, -3, 1, 0, 0, 0},
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
{0, 0, -2, 3, -7, 18, 124, -12, 5, -2, 1, 0, 0, 0, 0, 0},
|
{0, 0, -2, 3, -7, 18, 124, -12, 5, -2, 1, 0, 0, 0, 0, 0},
|
||||||
{0, 0, 0, -2, 3, -7, 18, 124, -12, 5, -2, 1, 0, 0, 0},
|
{0, 0, 0, 0, -2, 3, -7, 18, 124, -12, 5, -2, 1, 0, 0, 0},
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
{0, 0, -1, 2, -4, 8, 127, -6, 3, -1, 0, 0, 0, 0, 0, 0},
|
{0, 0, -1, 2, -4, 8, 127, -6, 3, -1, 0, 0, 0, 0, 0, 0},
|
||||||
{0, 0, 0, -1, 2, -4, 8, 127, -6, 3, -1, 0, 0, 0, 0},
|
{0, 0, 0, 0, -1, 2, -4, 8, 127, -6, 3, -1, 0, 0, 0, 0},
|
||||||
},
|
},
|
||||||
};
|
};
|
||||||
#endif // CONFIG_EXT_INTERP
|
#endif // CONFIG_EXT_INTERP
|
||||||
|
|
|
@ -37,15 +37,7 @@ static INLINE void transpose_4x8(const __m128i *in, __m128i *out) {
|
||||||
// they're zero vectors.
|
// they're zero vectors.
|
||||||
}
|
}
|
||||||
|
|
||||||
typedef void (*store_pixel_t)(const __m128i *x, uint8_t *src, uint8_t *dst);
|
typedef void (*store_pixel_t)(const __m128i *x, uint8_t *dst);
|
||||||
|
|
||||||
static INLINE void store_4_pixel_only(const __m128i *x, uint8_t *src,
|
|
||||||
uint8_t *dst) {
|
|
||||||
__m128i u;
|
|
||||||
(void)src;
|
|
||||||
u = _mm_packus_epi16(*x, *x);
|
|
||||||
*(int *)dst = _mm_cvtsi128_si32(u);
|
|
||||||
}
|
|
||||||
|
|
||||||
static INLINE __m128i accumulate_store(const __m128i *x, uint8_t *src) {
|
static INLINE __m128i accumulate_store(const __m128i *x, uint8_t *src) {
|
||||||
const __m128i zero = _mm_setzero_si128();
|
const __m128i zero = _mm_setzero_si128();
|
||||||
|
@ -59,9 +51,30 @@ static INLINE __m128i accumulate_store(const __m128i *x, uint8_t *src) {
|
||||||
return y;
|
return y;
|
||||||
}
|
}
|
||||||
|
|
||||||
static INLINE void accumulate_store_4_pixel(const __m128i *x, uint8_t *src,
|
static INLINE void store_2_pixel_only(const __m128i *x, uint8_t *dst) {
|
||||||
uint8_t *dst) {
|
uint32_t temp;
|
||||||
__m128i y = accumulate_store(x, src);
|
__m128i u = _mm_packus_epi16(*x, *x);
|
||||||
|
temp = _mm_cvtsi128_si32(u);
|
||||||
|
*(uint16_t *)dst = (uint16_t)temp;
|
||||||
|
}
|
||||||
|
|
||||||
|
static INLINE void accumulate_store_2_pixel(const __m128i *x, uint8_t *dst) {
|
||||||
|
uint32_t temp;
|
||||||
|
__m128i y = accumulate_store(x, dst);
|
||||||
|
temp = _mm_cvtsi128_si32(y);
|
||||||
|
*(uint16_t *)dst = (uint16_t)temp;
|
||||||
|
}
|
||||||
|
|
||||||
|
static store_pixel_t store2pixelTab[2] = {
|
||||||
|
store_2_pixel_only, accumulate_store_2_pixel};
|
||||||
|
|
||||||
|
static INLINE void store_4_pixel_only(const __m128i *x, uint8_t *dst) {
|
||||||
|
__m128i u = _mm_packus_epi16(*x, *x);
|
||||||
|
*(int *)dst = _mm_cvtsi128_si32(u);
|
||||||
|
}
|
||||||
|
|
||||||
|
static INLINE void accumulate_store_4_pixel(const __m128i *x, uint8_t *dst) {
|
||||||
|
__m128i y = accumulate_store(x, dst);
|
||||||
*(int *)dst = _mm_cvtsi128_si32(y);
|
*(int *)dst = _mm_cvtsi128_si32(y);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -69,12 +82,12 @@ static store_pixel_t store4pixelTab[2] = {
|
||||||
store_4_pixel_only, accumulate_store_4_pixel};
|
store_4_pixel_only, accumulate_store_4_pixel};
|
||||||
|
|
||||||
void horiz_w4_ssse3(const uint8_t *src, const __m128i *f,
|
void horiz_w4_ssse3(const uint8_t *src, const __m128i *f,
|
||||||
int tapsNum, store_pixel_t store_func, uint8_t *dst,
|
int tapsNum, store_pixel_t store_func, uint8_t *dst) {
|
||||||
uint8_t *buf) {
|
|
||||||
__m128i sumPairRow[4];
|
__m128i sumPairRow[4];
|
||||||
__m128i sumPairCol[8];
|
__m128i sumPairCol[8];
|
||||||
__m128i pixel;
|
__m128i pixel;
|
||||||
const __m128i k_256 = _mm_set1_epi16(1 << 8);
|
const __m128i k_256 = _mm_set1_epi16(1 << 8);
|
||||||
|
const __m128i zero = _mm_setzero_si128();
|
||||||
|
|
||||||
if (10 == tapsNum) {
|
if (10 == tapsNum) {
|
||||||
src -= 1;
|
src -= 1;
|
||||||
|
@ -103,52 +116,54 @@ void horiz_w4_ssse3(const uint8_t *src, const __m128i *f,
|
||||||
sumPairRow[0] = _mm_adds_epi16(sumPairRow[0], sumPairRow[3]);
|
sumPairRow[0] = _mm_adds_epi16(sumPairRow[0], sumPairRow[3]);
|
||||||
|
|
||||||
sumPairRow[1] = _mm_mulhrs_epi16(sumPairRow[0], k_256);
|
sumPairRow[1] = _mm_mulhrs_epi16(sumPairRow[0], k_256);
|
||||||
|
sumPairRow[1] = _mm_packus_epi16(sumPairRow[1], sumPairRow[1]);
|
||||||
|
sumPairRow[1] = _mm_unpacklo_epi8(sumPairRow[1], zero);
|
||||||
|
|
||||||
store_func(&sumPairRow[1], dst, buf);
|
store_func(&sumPairRow[1], dst);
|
||||||
}
|
}
|
||||||
|
|
||||||
void horiz_w8_ssse3(const uint8_t *src, const __m128i *f, int tapsNum,
|
void horiz_w8_ssse3(const uint8_t *src, const __m128i *f, int tapsNum,
|
||||||
store_pixel_t store, uint8_t *dst, uint8_t *buf) {
|
store_pixel_t store, uint8_t *buf) {
|
||||||
horiz_w4_ssse3(src, f, tapsNum, store, dst, buf);
|
horiz_w4_ssse3(src, f, tapsNum, store, buf);
|
||||||
src += 4;
|
src += 4;
|
||||||
buf += 4;
|
buf += 4;
|
||||||
horiz_w4_ssse3(src, f, tapsNum, store, dst, buf);
|
horiz_w4_ssse3(src, f, tapsNum, store, buf);
|
||||||
}
|
}
|
||||||
|
|
||||||
void horiz_w16_ssse3(const uint8_t *src, const __m128i *f, int tapsNum,
|
void horiz_w16_ssse3(const uint8_t *src, const __m128i *f, int tapsNum,
|
||||||
store_pixel_t store, uint8_t *dst, uint8_t *buf) {
|
store_pixel_t store, uint8_t *buf) {
|
||||||
horiz_w8_ssse3(src, f, tapsNum, store, dst, buf);
|
horiz_w8_ssse3(src, f, tapsNum, store, buf);
|
||||||
src += 8;
|
src += 8;
|
||||||
buf += 8;
|
buf += 8;
|
||||||
horiz_w8_ssse3(src, f, tapsNum, store, dst, buf);
|
horiz_w8_ssse3(src, f, tapsNum, store, buf);
|
||||||
}
|
}
|
||||||
|
|
||||||
void horiz_w32_ssse3(const uint8_t *src, const __m128i *f, int tapsNum,
|
void horiz_w32_ssse3(const uint8_t *src, const __m128i *f, int tapsNum,
|
||||||
store_pixel_t store, uint8_t *dst, uint8_t *buf) {
|
store_pixel_t store, uint8_t *buf) {
|
||||||
horiz_w16_ssse3(src, f, tapsNum, store, dst, buf);
|
horiz_w16_ssse3(src, f, tapsNum, store, buf);
|
||||||
src += 16;
|
src += 16;
|
||||||
buf += 16;
|
buf += 16;
|
||||||
horiz_w16_ssse3(src, f, tapsNum, store, dst, buf);
|
horiz_w16_ssse3(src, f, tapsNum, store, buf);
|
||||||
}
|
}
|
||||||
|
|
||||||
void horiz_w64_ssse3(const uint8_t *src, const __m128i *f, int tapsNum,
|
void horiz_w64_ssse3(const uint8_t *src, const __m128i *f, int tapsNum,
|
||||||
store_pixel_t store, uint8_t *dst, uint8_t *buf) {
|
store_pixel_t store, uint8_t *buf) {
|
||||||
horiz_w32_ssse3(src, f, tapsNum, store, dst, buf);
|
horiz_w32_ssse3(src, f, tapsNum, store, buf);
|
||||||
src += 32;
|
src += 32;
|
||||||
buf += 32;
|
buf += 32;
|
||||||
horiz_w32_ssse3(src, f, tapsNum, store, dst, buf);
|
horiz_w32_ssse3(src, f, tapsNum, store, buf);
|
||||||
}
|
}
|
||||||
|
|
||||||
void horiz_w128_ssse3(const uint8_t *src, const __m128i *f, int tapsNum,
|
void horiz_w128_ssse3(const uint8_t *src, const __m128i *f, int tapsNum,
|
||||||
store_pixel_t store, uint8_t *dst, uint8_t *buf) {
|
store_pixel_t store, uint8_t *buf) {
|
||||||
horiz_w64_ssse3(src, f, tapsNum, store, dst, buf);
|
horiz_w64_ssse3(src, f, tapsNum, store, buf);
|
||||||
src += 64;
|
src += 64;
|
||||||
buf += 64;
|
buf += 64;
|
||||||
horiz_w64_ssse3(src, f, tapsNum, store, dst, buf);
|
horiz_w64_ssse3(src, f, tapsNum, store, buf);
|
||||||
}
|
}
|
||||||
|
|
||||||
static void (*horizTab[6])(const uint8_t *, const __m128i *, int,
|
static void (*horizTab[6])(const uint8_t *, const __m128i *, int,
|
||||||
store_pixel_t, uint8_t *, uint8_t *) = {
|
store_pixel_t, uint8_t *) = {
|
||||||
horiz_w4_ssse3,
|
horiz_w4_ssse3,
|
||||||
horiz_w8_ssse3,
|
horiz_w8_ssse3,
|
||||||
horiz_w16_ssse3,
|
horiz_w16_ssse3,
|
||||||
|
@ -158,26 +173,28 @@ static void (*horizTab[6])(const uint8_t *, const __m128i *, int,
|
||||||
};
|
};
|
||||||
|
|
||||||
void filter_horiz_ssse3(const uint8_t *src, __m128i *f, int tapsNum, int width,
|
void filter_horiz_ssse3(const uint8_t *src, __m128i *f, int tapsNum, int width,
|
||||||
store_pixel_t store, uint8_t *dst, uint8_t *buffer) {
|
store_pixel_t store, uint8_t *dst) {
|
||||||
switch (width) {
|
switch (width) {
|
||||||
|
// Note:
|
||||||
|
// For width=2 and 4, store function must be different
|
||||||
case 2:
|
case 2:
|
||||||
case 4:
|
case 4:
|
||||||
horizTab[0](src, f, tapsNum, store, dst, buffer);
|
horizTab[0](src, f, tapsNum, store, dst);
|
||||||
break;
|
break;
|
||||||
case 8:
|
case 8:
|
||||||
horizTab[1](src, f, tapsNum, store, dst, buffer);
|
horizTab[1](src, f, tapsNum, store, dst);
|
||||||
break;
|
break;
|
||||||
case 16:
|
case 16:
|
||||||
horizTab[2](src, f, tapsNum, store, dst, buffer);
|
horizTab[2](src, f, tapsNum, store, dst);
|
||||||
break;
|
break;
|
||||||
case 32:
|
case 32:
|
||||||
horizTab[3](src, f, tapsNum, store, dst, buffer);
|
horizTab[3](src, f, tapsNum, store, dst);
|
||||||
break;
|
break;
|
||||||
case 64:
|
case 64:
|
||||||
horizTab[4](src, f, tapsNum, store, dst, buffer);
|
horizTab[4](src, f, tapsNum, store, dst);
|
||||||
break;
|
break;
|
||||||
case 128:
|
case 128:
|
||||||
horizTab[5](src, f, tapsNum, store, dst, buffer);
|
horizTab[5](src, f, tapsNum, store, dst);
|
||||||
break;
|
break;
|
||||||
default:
|
default:
|
||||||
assert(0);
|
assert(0);
|
||||||
|
@ -625,6 +642,7 @@ void vp10_convolve_horiz_ssse3(const uint8_t *src, int src_stride, uint8_t *dst,
|
||||||
__m128i horf[2];
|
__m128i horf[2];
|
||||||
SubpelFilterCoeffs hCoeffs, vCoeffs;
|
SubpelFilterCoeffs hCoeffs, vCoeffs;
|
||||||
const uint8_t *src_ptr;
|
const uint8_t *src_ptr;
|
||||||
|
store_pixel_t store2p = store2pixelTab[avg];
|
||||||
store_pixel_t store4p = store4pixelTab[avg];
|
store_pixel_t store4p = store4pixelTab[avg];
|
||||||
transpose_to_dst_t transpose_4x4 = trans4x4Tab[avg];
|
transpose_to_dst_t transpose_4x4 = trans4x4Tab[avg];
|
||||||
transpose_to_dst_t transpose_8x8 = trans8x8Tab[avg];
|
transpose_to_dst_t transpose_8x8 = trans8x8Tab[avg];
|
||||||
|
@ -686,11 +704,12 @@ void vp10_convolve_horiz_ssse3(const uint8_t *src, int src_stride, uint8_t *dst,
|
||||||
} while (count < block_height);
|
} while (count < block_height);
|
||||||
|
|
||||||
for (i = 0; i < block_residu; ++i) {
|
for (i = 0; i < block_residu; ++i) {
|
||||||
filter_horiz_ssse3(src_ptr, horf, tapsNum, w, store4p, dst, dst);
|
filter_horiz_ssse3(src_ptr, horf, tapsNum, w, store4p, dst);
|
||||||
src_ptr += src_stride;
|
src_ptr += src_stride;
|
||||||
dst += dst_stride;
|
dst += dst_stride;
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
|
if (w > 2) {
|
||||||
// 4-pixels parallel
|
// 4-pixels parallel
|
||||||
block_height = h >> 2;
|
block_height = h >> 2;
|
||||||
block_residu = h & 3;
|
block_residu = h & 3;
|
||||||
|
@ -710,90 +729,72 @@ void vp10_convolve_horiz_ssse3(const uint8_t *src, int src_stride, uint8_t *dst,
|
||||||
} while (count < block_height);
|
} while (count < block_height);
|
||||||
|
|
||||||
for (i = 0; i < block_residu; ++i) {
|
for (i = 0; i < block_residu; ++i) {
|
||||||
filter_horiz_ssse3(src_ptr, horf, tapsNum, w, store4p, dst, dst);
|
filter_horiz_ssse3(src_ptr, horf, tapsNum, w, store4p, dst);
|
||||||
|
src_ptr += src_stride;
|
||||||
|
dst += dst_stride;
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
for (i = 0; i < h; i++) {
|
||||||
|
filter_horiz_ssse3(src_ptr, horf, tapsNum, w, store2p, dst);
|
||||||
src_ptr += src_stride;
|
src_ptr += src_stride;
|
||||||
dst += dst_stride;
|
dst += dst_stride;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
|
||||||
// Vertical convolution filtering
|
// Vertical convolution filtering
|
||||||
static INLINE void store_2_pixel_only(const __m128i *x, uint8_t *src,
|
static INLINE void store_8_pixel_only(const __m128i *x, uint8_t *dst) {
|
||||||
uint8_t *dst) {
|
__m128i u = _mm_packus_epi16(*x, *x);
|
||||||
__m128i u;
|
|
||||||
uint32_t temp;
|
|
||||||
(void)src;
|
|
||||||
u = _mm_packus_epi16(*x, *x);
|
|
||||||
temp = _mm_cvtsi128_si32(u);
|
|
||||||
*(uint16_t *)dst = (uint16_t)temp;
|
|
||||||
}
|
|
||||||
|
|
||||||
static INLINE void accumulate_store_2_pixel(const __m128i *x, uint8_t *src,
|
|
||||||
uint8_t *dst) {
|
|
||||||
uint32_t temp;
|
|
||||||
__m128i y = accumulate_store(x, src);
|
|
||||||
temp = _mm_cvtsi128_si32(y);
|
|
||||||
*(uint16_t *)dst = (uint16_t)temp;
|
|
||||||
}
|
|
||||||
|
|
||||||
static INLINE void store_8_pixel_only(const __m128i *x, uint8_t *src,
|
|
||||||
uint8_t *dst) {
|
|
||||||
__m128i u;
|
|
||||||
(void)src;
|
|
||||||
u = _mm_packus_epi16(*x, *x);
|
|
||||||
_mm_storel_epi64((__m128i *)dst, u);
|
_mm_storel_epi64((__m128i *)dst, u);
|
||||||
}
|
}
|
||||||
|
|
||||||
static INLINE void accumulate_store_8_pixel(const __m128i *x, uint8_t *src,
|
static INLINE void accumulate_store_8_pixel(const __m128i *x, uint8_t *dst) {
|
||||||
uint8_t *dst) {
|
__m128i y = accumulate_store(x, dst);
|
||||||
__m128i y = accumulate_store(x, src);
|
|
||||||
_mm_storel_epi64((__m128i *)dst, y);
|
_mm_storel_epi64((__m128i *)dst, y);
|
||||||
}
|
}
|
||||||
|
|
||||||
static store_pixel_t store8pixelTab[2] = {
|
static store_pixel_t store8pixelTab[2] = {
|
||||||
store_8_pixel_only, accumulate_store_8_pixel};
|
store_8_pixel_only, accumulate_store_8_pixel};
|
||||||
|
|
||||||
static store_pixel_t store2pixelTab[2] = {
|
|
||||||
store_2_pixel_only, accumulate_store_2_pixel};
|
|
||||||
|
|
||||||
static __m128i filter_vert_ssse3(const uint8_t *src, int src_stride,
|
static __m128i filter_vert_ssse3(const uint8_t *src, int src_stride,
|
||||||
__m128i *f) {
|
int tapsNum, __m128i *f) {
|
||||||
|
__m128i s[12];
|
||||||
const __m128i k_256 = _mm_set1_epi16(1 << 8);
|
const __m128i k_256 = _mm_set1_epi16(1 << 8);
|
||||||
const __m128i zero = _mm_setzero_si128();
|
const __m128i zero = _mm_setzero_si128();
|
||||||
__m128i min_x2x3, max_x2x3, sum;
|
__m128i min_x2x3, max_x2x3, sum;
|
||||||
|
int i = 0;
|
||||||
|
int r = 0;
|
||||||
|
|
||||||
__m128i s0 = _mm_loadu_si128((__m128i const *)(src));
|
if (10 == tapsNum) {
|
||||||
__m128i s1 = _mm_loadu_si128((__m128i const *)(src + src_stride));
|
i += 1;
|
||||||
__m128i s2 = _mm_loadu_si128((__m128i const *)(src + 2 * src_stride));
|
s[0] = zero;
|
||||||
__m128i s3 = _mm_loadu_si128((__m128i const *)(src + 3 * src_stride));
|
}
|
||||||
__m128i s4 = _mm_loadu_si128((__m128i const *)(src + 4 * src_stride));
|
while (i < 12) {
|
||||||
__m128i s5 = _mm_loadu_si128((__m128i const *)(src + 5 * src_stride));
|
s[i] = _mm_loadu_si128((__m128i const *)(src + r * src_stride));
|
||||||
__m128i s6 = _mm_loadu_si128((__m128i const *)(src + 6 * src_stride));
|
i += 1;
|
||||||
__m128i s7 = _mm_loadu_si128((__m128i const *)(src + 7 * src_stride));
|
r += 1;
|
||||||
__m128i s8 = _mm_loadu_si128((__m128i const *)(src + 8 * src_stride));
|
}
|
||||||
__m128i s9 = _mm_loadu_si128((__m128i const *)(src + 9 * src_stride));
|
|
||||||
__m128i s10 = _mm_loadu_si128((__m128i const *)(src + 10 * src_stride));
|
|
||||||
__m128i s11 = _mm_loadu_si128((__m128i const *)(src + 11 * src_stride));
|
|
||||||
|
|
||||||
s0 = _mm_unpacklo_epi8(s0, s1);
|
s[0] = _mm_unpacklo_epi8(s[0], s[1]);
|
||||||
s2 = _mm_unpacklo_epi8(s2, s3);
|
s[2] = _mm_unpacklo_epi8(s[2], s[3]);
|
||||||
s4 = _mm_unpacklo_epi8(s4, s5);
|
s[4] = _mm_unpacklo_epi8(s[4], s[5]);
|
||||||
s6 = _mm_unpacklo_epi8(s6, s7);
|
s[6] = _mm_unpacklo_epi8(s[6], s[7]);
|
||||||
s8 = _mm_unpacklo_epi8(s8, s9);
|
s[8] = _mm_unpacklo_epi8(s[8], s[9]);
|
||||||
s10 = _mm_unpacklo_epi8(s10, s11);
|
s[10] = _mm_unpacklo_epi8(s[10], s[11]);
|
||||||
|
|
||||||
s0 = _mm_maddubs_epi16(s0, f[0]);
|
s[0] = _mm_maddubs_epi16(s[0], f[0]);
|
||||||
s2 = _mm_maddubs_epi16(s2, f[1]);
|
s[2] = _mm_maddubs_epi16(s[2], f[1]);
|
||||||
s4 = _mm_maddubs_epi16(s4, f[2]);
|
s[4] = _mm_maddubs_epi16(s[4], f[2]);
|
||||||
s6 = _mm_maddubs_epi16(s6, f[3]);
|
s[6] = _mm_maddubs_epi16(s[6], f[3]);
|
||||||
s8 = _mm_maddubs_epi16(s8, f[4]);
|
s[8] = _mm_maddubs_epi16(s[8], f[4]);
|
||||||
s10 = _mm_maddubs_epi16(s10, f[5]);
|
s[10] = _mm_maddubs_epi16(s[10], f[5]);
|
||||||
|
|
||||||
min_x2x3 = _mm_min_epi16(s4, s6);
|
min_x2x3 = _mm_min_epi16(s[4], s[6]);
|
||||||
max_x2x3 = _mm_max_epi16(s4, s6);
|
max_x2x3 = _mm_max_epi16(s[4], s[6]);
|
||||||
sum = _mm_adds_epi16(s0, s2);
|
sum = _mm_adds_epi16(s[0], s[2]);
|
||||||
sum = _mm_adds_epi16(sum, s10);
|
sum = _mm_adds_epi16(sum, s[10]);
|
||||||
sum = _mm_adds_epi16(sum, s8);
|
sum = _mm_adds_epi16(sum, s[8]);
|
||||||
|
|
||||||
sum = _mm_adds_epi16(sum, min_x2x3);
|
sum = _mm_adds_epi16(sum, min_x2x3);
|
||||||
sum = _mm_adds_epi16(sum, max_x2x3);
|
sum = _mm_adds_epi16(sum, max_x2x3);
|
||||||
|
@ -808,14 +809,8 @@ static void filter_vert_horiz_parallel_ssse3(const uint8_t *src, int src_stride,
|
||||||
__m128i *f, int tapsNum,
|
__m128i *f, int tapsNum,
|
||||||
store_pixel_t store_func,
|
store_pixel_t store_func,
|
||||||
uint8_t *dst) {
|
uint8_t *dst) {
|
||||||
__m128i sum;
|
__m128i sum = filter_vert_ssse3(src, src_stride, tapsNum, f);
|
||||||
|
store_func(&sum, dst);
|
||||||
if (10 == tapsNum) {
|
|
||||||
src -= src_stride;
|
|
||||||
}
|
|
||||||
|
|
||||||
sum = filter_vert_ssse3(src, src_stride, f);
|
|
||||||
store_func(&sum, dst, dst);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
void filter_vert_compute_small(const uint8_t *src, int src_stride, __m128i *f,
|
void filter_vert_compute_small(const uint8_t *src, int src_stride, __m128i *f,
|
||||||
|
|
Загрузка…
Ссылка в новой задаче