Turn on SIMD optimization for dual_filter
Let the aom_convolve8_### SIMD implementations support any block width. Turn on SIMD optimization when the interpolation filter types in the two directions are different. This reduces encoding time by 30% when dual_filter and ext_interp are both on. Change-Id: I539dbb2737f01835034b7269656a15b2058fa3cc
This commit is contained in:
Родитель
b968d46ab5
Коммит
7a483cffc8
|
@ -41,12 +41,19 @@ typedef void filter8_1dfunction(const uint8_t *src_ptr, ptrdiff_t src_pitch,
|
||||||
dst += 16; \
|
dst += 16; \
|
||||||
w -= 16; \
|
w -= 16; \
|
||||||
} \
|
} \
|
||||||
if (w == 8) { \
|
while (w >= 8) { \
|
||||||
aom_filter_block1d8_##dir##8_##avg##opt(src_start, src_stride, dst, \
|
aom_filter_block1d8_##dir##8_##avg##opt(src_start, src_stride, dst, \
|
||||||
dst_stride, h, filter); \
|
dst_stride, h, filter); \
|
||||||
} else if (w == 4) { \
|
src += 8; \
|
||||||
|
dst += 8; \
|
||||||
|
w -= 8; \
|
||||||
|
} \
|
||||||
|
while (w >= 4) { \
|
||||||
aom_filter_block1d4_##dir##8_##avg##opt(src_start, src_stride, dst, \
|
aom_filter_block1d4_##dir##8_##avg##opt(src_start, src_stride, dst, \
|
||||||
dst_stride, h, filter); \
|
dst_stride, h, filter); \
|
||||||
|
src += 4; \
|
||||||
|
dst += 4; \
|
||||||
|
w -= 4; \
|
||||||
} \
|
} \
|
||||||
} else { \
|
} else { \
|
||||||
while (w >= 16) { \
|
while (w >= 16) { \
|
||||||
|
@ -56,14 +63,25 @@ typedef void filter8_1dfunction(const uint8_t *src_ptr, ptrdiff_t src_pitch,
|
||||||
dst += 16; \
|
dst += 16; \
|
||||||
w -= 16; \
|
w -= 16; \
|
||||||
} \
|
} \
|
||||||
if (w == 8) { \
|
while (w >= 8) { \
|
||||||
aom_filter_block1d8_##dir##2_##avg##opt(src, src_stride, dst, \
|
aom_filter_block1d8_##dir##2_##avg##opt(src, src_stride, dst, \
|
||||||
dst_stride, h, filter); \
|
dst_stride, h, filter); \
|
||||||
} else if (w == 4) { \
|
src += 8; \
|
||||||
|
dst += 8; \
|
||||||
|
w -= 8; \
|
||||||
|
} \
|
||||||
|
while (w >= 4) { \
|
||||||
aom_filter_block1d4_##dir##2_##avg##opt(src, src_stride, dst, \
|
aom_filter_block1d4_##dir##2_##avg##opt(src, src_stride, dst, \
|
||||||
dst_stride, h, filter); \
|
dst_stride, h, filter); \
|
||||||
|
src += 4; \
|
||||||
|
dst += 4; \
|
||||||
|
w -= 4; \
|
||||||
} \
|
} \
|
||||||
} \
|
} \
|
||||||
|
if (w) { \
|
||||||
|
aom_convolve8_##name##_c(src, src_stride, dst, dst_stride, filter_x, \
|
||||||
|
x_step_q4, filter_y, y_step_q4, w, h); \
|
||||||
|
} \
|
||||||
}
|
}
|
||||||
|
|
||||||
#define FUN_CONV_2D(avg, opt) \
|
#define FUN_CONV_2D(avg, opt) \
|
||||||
|
|
|
@ -12,6 +12,7 @@
|
||||||
#include <assert.h>
|
#include <assert.h>
|
||||||
#include <string.h>
|
#include <string.h>
|
||||||
|
|
||||||
|
#include "./aom_dsp_rtcd.h"
|
||||||
#include "./av1_rtcd.h"
|
#include "./av1_rtcd.h"
|
||||||
#include "av1/common/convolve.h"
|
#include "av1/common/convolve.h"
|
||||||
#include "av1/common/filter.h"
|
#include "av1/common/filter.h"
|
||||||
|
@ -104,6 +105,45 @@ static void convolve_copy(const uint8_t *src, int src_stride, uint8_t *dst,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
void av1_convolve_horiz_facade(const uint8_t *src, int src_stride, uint8_t *dst,
|
||||||
|
int dst_stride, int w, int h,
|
||||||
|
const InterpFilterParams filter_params,
|
||||||
|
const int subpel_x_q4, int x_step_q4, int avg) {
|
||||||
|
if (filter_params.taps == SUBPEL_TAPS) {
|
||||||
|
const int16_t *filter_x =
|
||||||
|
av1_get_interp_filter_subpel_kernel(filter_params, subpel_x_q4);
|
||||||
|
if (avg == 0)
|
||||||
|
aom_convolve8_horiz(src, src_stride, dst, dst_stride, filter_x, x_step_q4,
|
||||||
|
NULL, -1, w, h);
|
||||||
|
else
|
||||||
|
aom_convolve8_avg_horiz(src, src_stride, dst, dst_stride, filter_x,
|
||||||
|
x_step_q4, NULL, -1, w, h);
|
||||||
|
} else {
|
||||||
|
av1_convolve_horiz(src, src_stride, dst, dst_stride, w, h, filter_params,
|
||||||
|
subpel_x_q4, x_step_q4, avg);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
void av1_convolve_vert_facade(const uint8_t *src, int src_stride, uint8_t *dst,
|
||||||
|
int dst_stride, int w, int h,
|
||||||
|
const InterpFilterParams filter_params,
|
||||||
|
const int subpel_y_q4, int y_step_q4, int avg) {
|
||||||
|
if (filter_params.taps == SUBPEL_TAPS) {
|
||||||
|
const int16_t *filter_y =
|
||||||
|
av1_get_interp_filter_subpel_kernel(filter_params, subpel_y_q4);
|
||||||
|
if (avg == 0) {
|
||||||
|
aom_convolve8_vert(src, src_stride, dst, dst_stride, NULL, -1, filter_y,
|
||||||
|
y_step_q4, w, h);
|
||||||
|
} else {
|
||||||
|
aom_convolve8_avg_vert(src, src_stride, dst, dst_stride, NULL, -1,
|
||||||
|
filter_y, y_step_q4, w, h);
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
av1_convolve_vert(src, src_stride, dst, dst_stride, w, h, filter_params,
|
||||||
|
subpel_y_q4, y_step_q4, avg);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
void av1_convolve(const uint8_t *src, int src_stride, uint8_t *dst,
|
void av1_convolve(const uint8_t *src, int src_stride, uint8_t *dst,
|
||||||
int dst_stride, int w, int h,
|
int dst_stride, int w, int h,
|
||||||
#if CONFIG_DUAL_FILTER
|
#if CONFIG_DUAL_FILTER
|
||||||
|
@ -146,11 +186,12 @@ void av1_convolve(const uint8_t *src, int src_stride, uint8_t *dst,
|
||||||
av1_convolve_vert(src, src_stride, dst, dst_stride, w, h, filter_params,
|
av1_convolve_vert(src, src_stride, dst, dst_stride, w, h, filter_params,
|
||||||
subpel_y_q4, y_step_q4, ref_idx);
|
subpel_y_q4, y_step_q4, ref_idx);
|
||||||
} else {
|
} else {
|
||||||
// temp's size is set to (maximum possible intermediate height or width) *
|
// temp's size is set to a 256 aligned value to facilitate SIMD
|
||||||
// MAX_SB_SIZE
|
// implementation. The value is greater than (maximum possible intermediate
|
||||||
uint8_t temp[((((MAX_SB_SIZE - 1) * MAX_STEP + 15) >> SUBPEL_BITS) +
|
// height or width) * MAX_SB_SIZE
|
||||||
MAX_FILTER_TAP) *
|
DECLARE_ALIGNED(16, uint8_t,
|
||||||
MAX_SB_SIZE];
|
temp[((MAX_SB_SIZE * 2 + 16) + 16) * MAX_SB_SIZE]);
|
||||||
|
int max_intermediate_size = ((MAX_SB_SIZE * 2 + 16) + 16);
|
||||||
int filter_size;
|
int filter_size;
|
||||||
InterpFilterParams filter_params;
|
InterpFilterParams filter_params;
|
||||||
#if CONFIG_DUAL_FILTER
|
#if CONFIG_DUAL_FILTER
|
||||||
|
@ -171,7 +212,7 @@ void av1_convolve(const uint8_t *src, int src_stride, uint8_t *dst,
|
||||||
// complexity
|
// complexity
|
||||||
if (filter_params_y.taps < filter_params_x.taps) {
|
if (filter_params_y.taps < filter_params_x.taps) {
|
||||||
int intermediate_width;
|
int intermediate_width;
|
||||||
int temp_stride;
|
int temp_stride = max_intermediate_size;
|
||||||
#if CONFIG_DUAL_FILTER
|
#if CONFIG_DUAL_FILTER
|
||||||
filter_params = filter_params_y;
|
filter_params = filter_params_y;
|
||||||
filter_size = filter_params_x.taps;
|
filter_size = filter_params_x.taps;
|
||||||
|
@ -181,13 +222,13 @@ void av1_convolve(const uint8_t *src, int src_stride, uint8_t *dst,
|
||||||
#endif
|
#endif
|
||||||
intermediate_width =
|
intermediate_width =
|
||||||
(((w - 1) * x_step_q4 + subpel_x_q4) >> SUBPEL_BITS) + filter_size;
|
(((w - 1) * x_step_q4 + subpel_x_q4) >> SUBPEL_BITS) + filter_size;
|
||||||
temp_stride = intermediate_width;
|
assert(intermediate_width <= max_intermediate_size);
|
||||||
|
|
||||||
assert(filter_params.taps <= MAX_FILTER_TAP);
|
assert(filter_params.taps <= MAX_FILTER_TAP);
|
||||||
|
|
||||||
av1_convolve_vert(src - (filter_size / 2 - 1), src_stride, temp,
|
av1_convolve_vert_facade(src - (filter_size / 2 - 1), src_stride, temp,
|
||||||
temp_stride, intermediate_width, h, filter_params,
|
temp_stride, intermediate_width, h,
|
||||||
subpel_y_q4, y_step_q4, 0);
|
filter_params, subpel_y_q4, y_step_q4, 0);
|
||||||
|
|
||||||
#if CONFIG_DUAL_FILTER
|
#if CONFIG_DUAL_FILTER
|
||||||
filter_params = filter_params_x;
|
filter_params = filter_params_x;
|
||||||
|
@ -196,14 +237,14 @@ void av1_convolve(const uint8_t *src, int src_stride, uint8_t *dst,
|
||||||
#endif
|
#endif
|
||||||
assert(filter_params.taps <= MAX_FILTER_TAP);
|
assert(filter_params.taps <= MAX_FILTER_TAP);
|
||||||
|
|
||||||
av1_convolve_horiz(temp + (filter_size / 2 - 1), temp_stride, dst,
|
av1_convolve_horiz_facade(temp + (filter_size / 2 - 1), temp_stride, dst,
|
||||||
dst_stride, w, h, filter_params, subpel_x_q4,
|
dst_stride, w, h, filter_params, subpel_x_q4,
|
||||||
x_step_q4, ref_idx);
|
x_step_q4, ref_idx);
|
||||||
} else
|
} else
|
||||||
#endif
|
#endif
|
||||||
{
|
{
|
||||||
int intermediate_height;
|
int intermediate_height;
|
||||||
int temp_stride = w;
|
int temp_stride = MAX_SB_SIZE;
|
||||||
#if CONFIG_DUAL_FILTER
|
#if CONFIG_DUAL_FILTER
|
||||||
filter_params = filter_params_x;
|
filter_params = filter_params_x;
|
||||||
filter_size = filter_params_y.taps;
|
filter_size = filter_params_y.taps;
|
||||||
|
@ -213,12 +254,15 @@ void av1_convolve(const uint8_t *src, int src_stride, uint8_t *dst,
|
||||||
#endif
|
#endif
|
||||||
intermediate_height =
|
intermediate_height =
|
||||||
(((h - 1) * y_step_q4 + subpel_y_q4) >> SUBPEL_BITS) + filter_size;
|
(((h - 1) * y_step_q4 + subpel_y_q4) >> SUBPEL_BITS) + filter_size;
|
||||||
|
assert(intermediate_height <= max_intermediate_size);
|
||||||
|
(void)max_intermediate_size;
|
||||||
|
|
||||||
assert(filter_params.taps <= MAX_FILTER_TAP);
|
assert(filter_params.taps <= MAX_FILTER_TAP);
|
||||||
|
|
||||||
av1_convolve_horiz(src - src_stride * (filter_size / 2 - 1), src_stride,
|
av1_convolve_horiz_facade(src - src_stride * (filter_size / 2 - 1),
|
||||||
temp, temp_stride, w, intermediate_height,
|
src_stride, temp, temp_stride, w,
|
||||||
filter_params, subpel_x_q4, x_step_q4, 0);
|
intermediate_height, filter_params, subpel_x_q4,
|
||||||
|
x_step_q4, 0);
|
||||||
|
|
||||||
#if CONFIG_DUAL_FILTER
|
#if CONFIG_DUAL_FILTER
|
||||||
filter_params = filter_params_y;
|
filter_params = filter_params_y;
|
||||||
|
@ -227,9 +271,9 @@ void av1_convolve(const uint8_t *src, int src_stride, uint8_t *dst,
|
||||||
#endif
|
#endif
|
||||||
assert(filter_params.taps <= MAX_FILTER_TAP);
|
assert(filter_params.taps <= MAX_FILTER_TAP);
|
||||||
|
|
||||||
av1_convolve_vert(temp + temp_stride * (filter_size / 2 - 1), temp_stride,
|
av1_convolve_vert_facade(temp + temp_stride * (filter_size / 2 - 1),
|
||||||
dst, dst_stride, w, h, filter_params, subpel_y_q4,
|
temp_stride, dst, dst_stride, w, h,
|
||||||
y_step_q4, ref_idx);
|
filter_params, subpel_y_q4, y_step_q4, ref_idx);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
Загрузка…
Ссылка в новой задаче