Mirror of https://github.com/mozilla/gecko-dev.git

Bug 1899864 - Update libaom to a7ef80c44bfb34b08254194b1ab72d4e93ff4b07 r=media-playback-reviewers,alwu

This patch simply runs the command below to update the libaom source:

```
./mach vendor media/libaom/moz.yaml --patch-mode=none
```

Differential Revision: https://phabricator.services.mozilla.com/D212162

Parent: 26cb5d7483
Commit: c3dcb83cf6
@ -532,6 +532,12 @@ void av1_quantize_lp_c(const int16_t *coeff_ptr, intptr_t n_coeffs, const int16_
|
|||
void av1_resize_and_extend_frame_c(const YV12_BUFFER_CONFIG *src, YV12_BUFFER_CONFIG *dst, const InterpFilter filter, const int phase, const int num_planes);
|
||||
#define av1_resize_and_extend_frame av1_resize_and_extend_frame_c
|
||||
|
||||
void av1_resize_horz_dir_c(const uint8_t *const input, int in_stride, uint8_t *intbuf, int height, int filteredlength, int width2);
|
||||
#define av1_resize_horz_dir av1_resize_horz_dir_c
|
||||
|
||||
bool av1_resize_vert_dir_c(uint8_t *intbuf, uint8_t *output, int out_stride, int height, int height2, int width2, int start_col);
|
||||
#define av1_resize_vert_dir av1_resize_vert_dir_c
|
||||
|
||||
void av1_round_shift_array_c(int32_t *arr, int size, int bit);
|
||||
#define av1_round_shift_array av1_round_shift_array_c
|
||||
|
||||
|
@ -624,9 +630,6 @@ cfl_predict_lbd_fn cfl_get_predict_lbd_fn_c(TX_SIZE tx_size);
|
|||
cfl_subtract_average_fn cfl_get_subtract_average_fn_c(TX_SIZE tx_size);
|
||||
#define cfl_get_subtract_average_fn cfl_get_subtract_average_fn_c
|
||||
|
||||
bool resize_vert_dir_c(uint8_t *intbuf, uint8_t *output, int out_stride, int height, int height2, int width2, int start_col);
|
||||
#define resize_vert_dir resize_vert_dir_c
|
||||
|
||||
void av1_rtcd(void);
|
||||
|
||||
#include "config/aom_config.h"
|
||||
|
|
|
@ -221,7 +221,8 @@ void av1_compute_stats_highbd_neon(int wiener_win, const uint8_t *dgd8, const ui
|
|||
RTCD_EXTERN void (*av1_compute_stats_highbd)(int wiener_win, const uint8_t *dgd8, const uint8_t *src8, int16_t *dgd_avg, int16_t *src_avg, int h_start, int h_end, int v_start, int v_end, int dgd_stride, int src_stride, int64_t *M, int64_t *H, aom_bit_depth_t bit_depth);
|
||||
|
||||
void av1_convolve_2d_scale_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int x_step_qn, const int subpel_y_qn, const int y_step_qn, ConvolveParams *conv_params);
|
||||
#define av1_convolve_2d_scale av1_convolve_2d_scale_c
|
||||
void av1_convolve_2d_scale_neon(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int x_step_qn, const int subpel_y_qn, const int y_step_qn, ConvolveParams *conv_params);
|
||||
RTCD_EXTERN void (*av1_convolve_2d_scale)(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int x_step_qn, const int subpel_y_qn, const int y_step_qn, ConvolveParams *conv_params);
|
||||
|
||||
void av1_convolve_2d_sr_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int subpel_y_qn, ConvolveParams *conv_params);
|
||||
void av1_convolve_2d_sr_neon(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int subpel_y_qn, ConvolveParams *conv_params);
|
||||
|
@ -687,6 +688,12 @@ void av1_resize_and_extend_frame_c(const YV12_BUFFER_CONFIG *src, YV12_BUFFER_CO
|
|||
void av1_resize_and_extend_frame_neon(const YV12_BUFFER_CONFIG *src, YV12_BUFFER_CONFIG *dst, const InterpFilter filter, const int phase, const int num_planes);
|
||||
RTCD_EXTERN void (*av1_resize_and_extend_frame)(const YV12_BUFFER_CONFIG *src, YV12_BUFFER_CONFIG *dst, const InterpFilter filter, const int phase, const int num_planes);
|
||||
|
||||
void av1_resize_horz_dir_c(const uint8_t *const input, int in_stride, uint8_t *intbuf, int height, int filteredlength, int width2);
|
||||
#define av1_resize_horz_dir av1_resize_horz_dir_c
|
||||
|
||||
bool av1_resize_vert_dir_c(uint8_t *intbuf, uint8_t *output, int out_stride, int height, int height2, int width2, int start_col);
|
||||
#define av1_resize_vert_dir av1_resize_vert_dir_c
|
||||
|
||||
void av1_round_shift_array_c(int32_t *arr, int size, int bit);
|
||||
void av1_round_shift_array_neon(int32_t *arr, int size, int bit);
|
||||
RTCD_EXTERN void (*av1_round_shift_array)(int32_t *arr, int size, int bit);
|
||||
|
@ -813,9 +820,6 @@ cfl_subtract_average_fn cfl_get_subtract_average_fn_c(TX_SIZE tx_size);
|
|||
cfl_subtract_average_fn cfl_get_subtract_average_fn_neon(TX_SIZE tx_size);
|
||||
RTCD_EXTERN cfl_subtract_average_fn (*cfl_get_subtract_average_fn)(TX_SIZE tx_size);
|
||||
|
||||
bool resize_vert_dir_c(uint8_t *intbuf, uint8_t *output, int out_stride, int height, int height2, int width2, int start_col);
|
||||
#define resize_vert_dir resize_vert_dir_c
|
||||
|
||||
void av1_rtcd(void);
|
||||
|
||||
#include "config/aom_config.h"
|
||||
|
@ -870,6 +874,8 @@ static void setup_rtcd_internal(void)
|
|||
if (flags & HAS_NEON) av1_compute_stats = av1_compute_stats_neon;
|
||||
av1_compute_stats_highbd = av1_compute_stats_highbd_c;
|
||||
if (flags & HAS_NEON) av1_compute_stats_highbd = av1_compute_stats_highbd_neon;
|
||||
av1_convolve_2d_scale = av1_convolve_2d_scale_c;
|
||||
if (flags & HAS_NEON) av1_convolve_2d_scale = av1_convolve_2d_scale_neon;
|
||||
av1_convolve_2d_sr = av1_convolve_2d_sr_c;
|
||||
if (flags & HAS_NEON) av1_convolve_2d_sr = av1_convolve_2d_sr_neon;
|
||||
av1_convolve_2d_sr_intrabc = av1_convolve_2d_sr_intrabc_c;
|
||||
|
|
|
@ -711,6 +711,15 @@ void av1_resize_and_extend_frame_c(const YV12_BUFFER_CONFIG *src, YV12_BUFFER_CO
|
|||
void av1_resize_and_extend_frame_ssse3(const YV12_BUFFER_CONFIG *src, YV12_BUFFER_CONFIG *dst, const InterpFilter filter, const int phase, const int num_planes);
|
||||
RTCD_EXTERN void (*av1_resize_and_extend_frame)(const YV12_BUFFER_CONFIG *src, YV12_BUFFER_CONFIG *dst, const InterpFilter filter, const int phase, const int num_planes);
|
||||
|
||||
void av1_resize_horz_dir_c(const uint8_t *const input, int in_stride, uint8_t *intbuf, int height, int filteredlength, int width2);
|
||||
void av1_resize_horz_dir_avx2(const uint8_t *const input, int in_stride, uint8_t *intbuf, int height, int filteredlength, int width2);
|
||||
RTCD_EXTERN void (*av1_resize_horz_dir)(const uint8_t *const input, int in_stride, uint8_t *intbuf, int height, int filteredlength, int width2);
|
||||
|
||||
bool av1_resize_vert_dir_c(uint8_t *intbuf, uint8_t *output, int out_stride, int height, int height2, int width2, int start_col);
|
||||
bool av1_resize_vert_dir_sse2(uint8_t *intbuf, uint8_t *output, int out_stride, int height, int height2, int width2, int start_col);
|
||||
bool av1_resize_vert_dir_avx2(uint8_t *intbuf, uint8_t *output, int out_stride, int height, int height2, int width2, int start_col);
|
||||
RTCD_EXTERN bool (*av1_resize_vert_dir)(uint8_t *intbuf, uint8_t *output, int out_stride, int height, int height2, int width2, int start_col);
|
||||
|
||||
void av1_round_shift_array_c(int32_t *arr, int size, int bit);
|
||||
void av1_round_shift_array_sse4_1(int32_t *arr, int size, int bit);
|
||||
RTCD_EXTERN void (*av1_round_shift_array)(int32_t *arr, int size, int bit);
|
||||
|
@ -879,10 +888,6 @@ cfl_subtract_average_fn cfl_get_subtract_average_fn_sse2(TX_SIZE tx_size);
|
|||
cfl_subtract_average_fn cfl_get_subtract_average_fn_avx2(TX_SIZE tx_size);
|
||||
RTCD_EXTERN cfl_subtract_average_fn (*cfl_get_subtract_average_fn)(TX_SIZE tx_size);
|
||||
|
||||
bool resize_vert_dir_c(uint8_t *intbuf, uint8_t *output, int out_stride, int height, int height2, int width2, int start_col);
|
||||
bool resize_vert_dir_avx2(uint8_t *intbuf, uint8_t *output, int out_stride, int height, int height2, int width2, int start_col);
|
||||
RTCD_EXTERN bool (*resize_vert_dir)(uint8_t *intbuf, uint8_t *output, int out_stride, int height, int height2, int width2, int start_col);
|
||||
|
||||
void av1_rtcd(void);
|
||||
|
||||
#ifdef RTCD_C
|
||||
|
@ -1140,6 +1145,11 @@ static void setup_rtcd_internal(void)
|
|||
if (flags & HAS_AVX2) av1_quantize_lp = av1_quantize_lp_avx2;
|
||||
av1_resize_and_extend_frame = av1_resize_and_extend_frame_c;
|
||||
if (flags & HAS_SSSE3) av1_resize_and_extend_frame = av1_resize_and_extend_frame_ssse3;
|
||||
av1_resize_horz_dir = av1_resize_horz_dir_c;
|
||||
if (flags & HAS_AVX2) av1_resize_horz_dir = av1_resize_horz_dir_avx2;
|
||||
av1_resize_vert_dir = av1_resize_vert_dir_c;
|
||||
if (flags & HAS_SSE2) av1_resize_vert_dir = av1_resize_vert_dir_sse2;
|
||||
if (flags & HAS_AVX2) av1_resize_vert_dir = av1_resize_vert_dir_avx2;
|
||||
av1_round_shift_array = av1_round_shift_array_c;
|
||||
if (flags & HAS_SSE4_1) av1_round_shift_array = av1_round_shift_array_sse4_1;
|
||||
av1_selfguided_restoration = av1_selfguided_restoration_c;
|
||||
|
@ -1240,8 +1250,6 @@ static void setup_rtcd_internal(void)
|
|||
cfl_get_subtract_average_fn = cfl_get_subtract_average_fn_c;
|
||||
if (flags & HAS_SSE2) cfl_get_subtract_average_fn = cfl_get_subtract_average_fn_sse2;
|
||||
if (flags & HAS_AVX2) cfl_get_subtract_average_fn = cfl_get_subtract_average_fn_avx2;
|
||||
resize_vert_dir = resize_vert_dir_c;
|
||||
if (flags & HAS_AVX2) resize_vert_dir = resize_vert_dir_avx2;
|
||||
}
|
||||
#endif
|
||||
|
||||
|
|
|
@ -701,6 +701,15 @@ void av1_resize_and_extend_frame_c(const YV12_BUFFER_CONFIG *src, YV12_BUFFER_CO
|
|||
void av1_resize_and_extend_frame_ssse3(const YV12_BUFFER_CONFIG *src, YV12_BUFFER_CONFIG *dst, const InterpFilter filter, const int phase, const int num_planes);
|
||||
RTCD_EXTERN void (*av1_resize_and_extend_frame)(const YV12_BUFFER_CONFIG *src, YV12_BUFFER_CONFIG *dst, const InterpFilter filter, const int phase, const int num_planes);
|
||||
|
||||
void av1_resize_horz_dir_c(const uint8_t *const input, int in_stride, uint8_t *intbuf, int height, int filteredlength, int width2);
|
||||
void av1_resize_horz_dir_avx2(const uint8_t *const input, int in_stride, uint8_t *intbuf, int height, int filteredlength, int width2);
|
||||
RTCD_EXTERN void (*av1_resize_horz_dir)(const uint8_t *const input, int in_stride, uint8_t *intbuf, int height, int filteredlength, int width2);
|
||||
|
||||
bool av1_resize_vert_dir_c(uint8_t *intbuf, uint8_t *output, int out_stride, int height, int height2, int width2, int start_col);
|
||||
bool av1_resize_vert_dir_sse2(uint8_t *intbuf, uint8_t *output, int out_stride, int height, int height2, int width2, int start_col);
|
||||
bool av1_resize_vert_dir_avx2(uint8_t *intbuf, uint8_t *output, int out_stride, int height, int height2, int width2, int start_col);
|
||||
RTCD_EXTERN bool (*av1_resize_vert_dir)(uint8_t *intbuf, uint8_t *output, int out_stride, int height, int height2, int width2, int start_col);
|
||||
|
||||
void av1_round_shift_array_c(int32_t *arr, int size, int bit);
|
||||
void av1_round_shift_array_sse4_1(int32_t *arr, int size, int bit);
|
||||
RTCD_EXTERN void (*av1_round_shift_array)(int32_t *arr, int size, int bit);
|
||||
|
@ -857,10 +866,6 @@ cfl_subtract_average_fn cfl_get_subtract_average_fn_sse2(TX_SIZE tx_size);
|
|||
cfl_subtract_average_fn cfl_get_subtract_average_fn_avx2(TX_SIZE tx_size);
|
||||
RTCD_EXTERN cfl_subtract_average_fn (*cfl_get_subtract_average_fn)(TX_SIZE tx_size);
|
||||
|
||||
bool resize_vert_dir_c(uint8_t *intbuf, uint8_t *output, int out_stride, int height, int height2, int width2, int start_col);
|
||||
bool resize_vert_dir_avx2(uint8_t *intbuf, uint8_t *output, int out_stride, int height, int height2, int width2, int start_col);
|
||||
RTCD_EXTERN bool (*resize_vert_dir)(uint8_t *intbuf, uint8_t *output, int out_stride, int height, int height2, int width2, int start_col);
|
||||
|
||||
void av1_rtcd(void);
|
||||
|
||||
#ifdef RTCD_C
|
||||
|
@ -1090,6 +1095,10 @@ static void setup_rtcd_internal(void)
|
|||
if (flags & HAS_AVX2) av1_quantize_lp = av1_quantize_lp_avx2;
|
||||
av1_resize_and_extend_frame = av1_resize_and_extend_frame_c;
|
||||
if (flags & HAS_SSSE3) av1_resize_and_extend_frame = av1_resize_and_extend_frame_ssse3;
|
||||
av1_resize_horz_dir = av1_resize_horz_dir_c;
|
||||
if (flags & HAS_AVX2) av1_resize_horz_dir = av1_resize_horz_dir_avx2;
|
||||
av1_resize_vert_dir = av1_resize_vert_dir_sse2;
|
||||
if (flags & HAS_AVX2) av1_resize_vert_dir = av1_resize_vert_dir_avx2;
|
||||
av1_round_shift_array = av1_round_shift_array_c;
|
||||
if (flags & HAS_SSE4_1) av1_round_shift_array = av1_round_shift_array_sse4_1;
|
||||
av1_selfguided_restoration = av1_selfguided_restoration_c;
|
||||
|
@ -1173,8 +1182,6 @@ static void setup_rtcd_internal(void)
|
|||
if (flags & HAS_AVX2) cfl_get_predict_lbd_fn = cfl_get_predict_lbd_fn_avx2;
|
||||
cfl_get_subtract_average_fn = cfl_get_subtract_average_fn_sse2;
|
||||
if (flags & HAS_AVX2) cfl_get_subtract_average_fn = cfl_get_subtract_average_fn_avx2;
|
||||
resize_vert_dir = resize_vert_dir_c;
|
||||
if (flags & HAS_AVX2) resize_vert_dir = resize_vert_dir_avx2;
|
||||
}
|
||||
#endif
|
||||
|
||||
|
|
|
@ -207,7 +207,8 @@ void av1_compute_stats_highbd_neon(int wiener_win, const uint8_t *dgd8, const ui
|
|||
#define av1_compute_stats_highbd av1_compute_stats_highbd_neon
|
||||
|
||||
void av1_convolve_2d_scale_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int x_step_qn, const int subpel_y_qn, const int y_step_qn, ConvolveParams *conv_params);
|
||||
#define av1_convolve_2d_scale av1_convolve_2d_scale_c
|
||||
void av1_convolve_2d_scale_neon(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int x_step_qn, const int subpel_y_qn, const int y_step_qn, ConvolveParams *conv_params);
|
||||
#define av1_convolve_2d_scale av1_convolve_2d_scale_neon
|
||||
|
||||
void av1_convolve_2d_sr_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int subpel_y_qn, ConvolveParams *conv_params);
|
||||
void av1_convolve_2d_sr_neon(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int subpel_y_qn, ConvolveParams *conv_params);
|
||||
|
@ -234,7 +235,9 @@ void av1_convolve_x_sr_intrabc_neon(const uint8_t *src, int src_stride, uint8_t
|
|||
|
||||
void av1_convolve_y_sr_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_y, const int subpel_y_qn);
|
||||
void av1_convolve_y_sr_neon(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_y, const int subpel_y_qn);
|
||||
#define av1_convolve_y_sr av1_convolve_y_sr_neon
|
||||
void av1_convolve_y_sr_neon_dotprod(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_y, const int subpel_y_qn);
|
||||
void av1_convolve_y_sr_neon_i8mm(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_y, const int subpel_y_qn);
|
||||
RTCD_EXTERN void (*av1_convolve_y_sr)(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_y, const int subpel_y_qn);
|
||||
|
||||
void av1_convolve_y_sr_intrabc_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_y, const int subpel_y_qn);
|
||||
void av1_convolve_y_sr_intrabc_neon(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_y, const int subpel_y_qn);
|
||||
|
@ -682,6 +685,12 @@ void av1_resize_and_extend_frame_c(const YV12_BUFFER_CONFIG *src, YV12_BUFFER_CO
|
|||
void av1_resize_and_extend_frame_neon(const YV12_BUFFER_CONFIG *src, YV12_BUFFER_CONFIG *dst, const InterpFilter filter, const int phase, const int num_planes);
|
||||
#define av1_resize_and_extend_frame av1_resize_and_extend_frame_neon
|
||||
|
||||
void av1_resize_horz_dir_c(const uint8_t *const input, int in_stride, uint8_t *intbuf, int height, int filteredlength, int width2);
|
||||
#define av1_resize_horz_dir av1_resize_horz_dir_c
|
||||
|
||||
bool av1_resize_vert_dir_c(uint8_t *intbuf, uint8_t *output, int out_stride, int height, int height2, int width2, int start_col);
|
||||
#define av1_resize_vert_dir av1_resize_vert_dir_c
|
||||
|
||||
void av1_round_shift_array_c(int32_t *arr, int size, int bit);
|
||||
void av1_round_shift_array_neon(int32_t *arr, int size, int bit);
|
||||
#define av1_round_shift_array av1_round_shift_array_neon
|
||||
|
@ -807,9 +816,6 @@ cfl_subtract_average_fn cfl_get_subtract_average_fn_c(TX_SIZE tx_size);
|
|||
cfl_subtract_average_fn cfl_get_subtract_average_fn_neon(TX_SIZE tx_size);
|
||||
#define cfl_get_subtract_average_fn cfl_get_subtract_average_fn_neon
|
||||
|
||||
bool resize_vert_dir_c(uint8_t *intbuf, uint8_t *output, int out_stride, int height, int height2, int width2, int start_col);
|
||||
#define resize_vert_dir resize_vert_dir_c
|
||||
|
||||
void av1_rtcd(void);
|
||||
|
||||
#include "config/aom_config.h"
|
||||
|
@ -830,6 +836,9 @@ static void setup_rtcd_internal(void)
|
|||
av1_convolve_x_sr = av1_convolve_x_sr_neon;
|
||||
if (flags & HAS_NEON_DOTPROD) av1_convolve_x_sr = av1_convolve_x_sr_neon_dotprod;
|
||||
if (flags & HAS_NEON_I8MM) av1_convolve_x_sr = av1_convolve_x_sr_neon_i8mm;
|
||||
av1_convolve_y_sr = av1_convolve_y_sr_neon;
|
||||
if (flags & HAS_NEON_DOTPROD) av1_convolve_y_sr = av1_convolve_y_sr_neon_dotprod;
|
||||
if (flags & HAS_NEON_I8MM) av1_convolve_y_sr = av1_convolve_y_sr_neon_i8mm;
|
||||
av1_dist_wtd_convolve_2d = av1_dist_wtd_convolve_2d_neon;
|
||||
if (flags & HAS_NEON_DOTPROD) av1_dist_wtd_convolve_2d = av1_dist_wtd_convolve_2d_neon_dotprod;
|
||||
if (flags & HAS_NEON_I8MM) av1_dist_wtd_convolve_2d = av1_dist_wtd_convolve_2d_neon_i8mm;
|
||||
|
|
|
@ -701,6 +701,15 @@ void av1_resize_and_extend_frame_c(const YV12_BUFFER_CONFIG *src, YV12_BUFFER_CO
|
|||
void av1_resize_and_extend_frame_ssse3(const YV12_BUFFER_CONFIG *src, YV12_BUFFER_CONFIG *dst, const InterpFilter filter, const int phase, const int num_planes);
|
||||
RTCD_EXTERN void (*av1_resize_and_extend_frame)(const YV12_BUFFER_CONFIG *src, YV12_BUFFER_CONFIG *dst, const InterpFilter filter, const int phase, const int num_planes);
|
||||
|
||||
void av1_resize_horz_dir_c(const uint8_t *const input, int in_stride, uint8_t *intbuf, int height, int filteredlength, int width2);
|
||||
void av1_resize_horz_dir_avx2(const uint8_t *const input, int in_stride, uint8_t *intbuf, int height, int filteredlength, int width2);
|
||||
RTCD_EXTERN void (*av1_resize_horz_dir)(const uint8_t *const input, int in_stride, uint8_t *intbuf, int height, int filteredlength, int width2);
|
||||
|
||||
bool av1_resize_vert_dir_c(uint8_t *intbuf, uint8_t *output, int out_stride, int height, int height2, int width2, int start_col);
|
||||
bool av1_resize_vert_dir_sse2(uint8_t *intbuf, uint8_t *output, int out_stride, int height, int height2, int width2, int start_col);
|
||||
bool av1_resize_vert_dir_avx2(uint8_t *intbuf, uint8_t *output, int out_stride, int height, int height2, int width2, int start_col);
|
||||
RTCD_EXTERN bool (*av1_resize_vert_dir)(uint8_t *intbuf, uint8_t *output, int out_stride, int height, int height2, int width2, int start_col);
|
||||
|
||||
void av1_round_shift_array_c(int32_t *arr, int size, int bit);
|
||||
void av1_round_shift_array_sse4_1(int32_t *arr, int size, int bit);
|
||||
RTCD_EXTERN void (*av1_round_shift_array)(int32_t *arr, int size, int bit);
|
||||
|
@ -857,10 +866,6 @@ cfl_subtract_average_fn cfl_get_subtract_average_fn_sse2(TX_SIZE tx_size);
|
|||
cfl_subtract_average_fn cfl_get_subtract_average_fn_avx2(TX_SIZE tx_size);
|
||||
RTCD_EXTERN cfl_subtract_average_fn (*cfl_get_subtract_average_fn)(TX_SIZE tx_size);
|
||||
|
||||
bool resize_vert_dir_c(uint8_t *intbuf, uint8_t *output, int out_stride, int height, int height2, int width2, int start_col);
|
||||
bool resize_vert_dir_avx2(uint8_t *intbuf, uint8_t *output, int out_stride, int height, int height2, int width2, int start_col);
|
||||
RTCD_EXTERN bool (*resize_vert_dir)(uint8_t *intbuf, uint8_t *output, int out_stride, int height, int height2, int width2, int start_col);
|
||||
|
||||
void av1_rtcd(void);
|
||||
|
||||
#ifdef RTCD_C
|
||||
|
@ -1090,6 +1095,10 @@ static void setup_rtcd_internal(void)
|
|||
if (flags & HAS_AVX2) av1_quantize_lp = av1_quantize_lp_avx2;
|
||||
av1_resize_and_extend_frame = av1_resize_and_extend_frame_c;
|
||||
if (flags & HAS_SSSE3) av1_resize_and_extend_frame = av1_resize_and_extend_frame_ssse3;
|
||||
av1_resize_horz_dir = av1_resize_horz_dir_c;
|
||||
if (flags & HAS_AVX2) av1_resize_horz_dir = av1_resize_horz_dir_avx2;
|
||||
av1_resize_vert_dir = av1_resize_vert_dir_sse2;
|
||||
if (flags & HAS_AVX2) av1_resize_vert_dir = av1_resize_vert_dir_avx2;
|
||||
av1_round_shift_array = av1_round_shift_array_c;
|
||||
if (flags & HAS_SSE4_1) av1_round_shift_array = av1_round_shift_array_sse4_1;
|
||||
av1_selfguided_restoration = av1_selfguided_restoration_c;
|
||||
|
@ -1173,8 +1182,6 @@ static void setup_rtcd_internal(void)
|
|||
if (flags & HAS_AVX2) cfl_get_predict_lbd_fn = cfl_get_predict_lbd_fn_avx2;
|
||||
cfl_get_subtract_average_fn = cfl_get_subtract_average_fn_sse2;
|
||||
if (flags & HAS_AVX2) cfl_get_subtract_average_fn = cfl_get_subtract_average_fn_avx2;
|
||||
resize_vert_dir = resize_vert_dir_c;
|
||||
if (flags & HAS_AVX2) resize_vert_dir = resize_vert_dir_avx2;
|
||||
}
|
||||
#endif
|
||||
|
||||
|
|
|
@ -711,6 +711,15 @@ void av1_resize_and_extend_frame_c(const YV12_BUFFER_CONFIG *src, YV12_BUFFER_CO
|
|||
void av1_resize_and_extend_frame_ssse3(const YV12_BUFFER_CONFIG *src, YV12_BUFFER_CONFIG *dst, const InterpFilter filter, const int phase, const int num_planes);
|
||||
RTCD_EXTERN void (*av1_resize_and_extend_frame)(const YV12_BUFFER_CONFIG *src, YV12_BUFFER_CONFIG *dst, const InterpFilter filter, const int phase, const int num_planes);
|
||||
|
||||
void av1_resize_horz_dir_c(const uint8_t *const input, int in_stride, uint8_t *intbuf, int height, int filteredlength, int width2);
|
||||
void av1_resize_horz_dir_avx2(const uint8_t *const input, int in_stride, uint8_t *intbuf, int height, int filteredlength, int width2);
|
||||
RTCD_EXTERN void (*av1_resize_horz_dir)(const uint8_t *const input, int in_stride, uint8_t *intbuf, int height, int filteredlength, int width2);
|
||||
|
||||
bool av1_resize_vert_dir_c(uint8_t *intbuf, uint8_t *output, int out_stride, int height, int height2, int width2, int start_col);
|
||||
bool av1_resize_vert_dir_sse2(uint8_t *intbuf, uint8_t *output, int out_stride, int height, int height2, int width2, int start_col);
|
||||
bool av1_resize_vert_dir_avx2(uint8_t *intbuf, uint8_t *output, int out_stride, int height, int height2, int width2, int start_col);
|
||||
RTCD_EXTERN bool (*av1_resize_vert_dir)(uint8_t *intbuf, uint8_t *output, int out_stride, int height, int height2, int width2, int start_col);
|
||||
|
||||
void av1_round_shift_array_c(int32_t *arr, int size, int bit);
|
||||
void av1_round_shift_array_sse4_1(int32_t *arr, int size, int bit);
|
||||
RTCD_EXTERN void (*av1_round_shift_array)(int32_t *arr, int size, int bit);
|
||||
|
@ -879,10 +888,6 @@ cfl_subtract_average_fn cfl_get_subtract_average_fn_sse2(TX_SIZE tx_size);
|
|||
cfl_subtract_average_fn cfl_get_subtract_average_fn_avx2(TX_SIZE tx_size);
|
||||
RTCD_EXTERN cfl_subtract_average_fn (*cfl_get_subtract_average_fn)(TX_SIZE tx_size);
|
||||
|
||||
bool resize_vert_dir_c(uint8_t *intbuf, uint8_t *output, int out_stride, int height, int height2, int width2, int start_col);
|
||||
bool resize_vert_dir_avx2(uint8_t *intbuf, uint8_t *output, int out_stride, int height, int height2, int width2, int start_col);
|
||||
RTCD_EXTERN bool (*resize_vert_dir)(uint8_t *intbuf, uint8_t *output, int out_stride, int height, int height2, int width2, int start_col);
|
||||
|
||||
void av1_rtcd(void);
|
||||
|
||||
#ifdef RTCD_C
|
||||
|
@ -1140,6 +1145,11 @@ static void setup_rtcd_internal(void)
|
|||
if (flags & HAS_AVX2) av1_quantize_lp = av1_quantize_lp_avx2;
|
||||
av1_resize_and_extend_frame = av1_resize_and_extend_frame_c;
|
||||
if (flags & HAS_SSSE3) av1_resize_and_extend_frame = av1_resize_and_extend_frame_ssse3;
|
||||
av1_resize_horz_dir = av1_resize_horz_dir_c;
|
||||
if (flags & HAS_AVX2) av1_resize_horz_dir = av1_resize_horz_dir_avx2;
|
||||
av1_resize_vert_dir = av1_resize_vert_dir_c;
|
||||
if (flags & HAS_SSE2) av1_resize_vert_dir = av1_resize_vert_dir_sse2;
|
||||
if (flags & HAS_AVX2) av1_resize_vert_dir = av1_resize_vert_dir_avx2;
|
||||
av1_round_shift_array = av1_round_shift_array_c;
|
||||
if (flags & HAS_SSE4_1) av1_round_shift_array = av1_round_shift_array_sse4_1;
|
||||
av1_selfguided_restoration = av1_selfguided_restoration_c;
|
||||
|
@ -1240,8 +1250,6 @@ static void setup_rtcd_internal(void)
|
|||
cfl_get_subtract_average_fn = cfl_get_subtract_average_fn_c;
|
||||
if (flags & HAS_SSE2) cfl_get_subtract_average_fn = cfl_get_subtract_average_fn_sse2;
|
||||
if (flags & HAS_AVX2) cfl_get_subtract_average_fn = cfl_get_subtract_average_fn_avx2;
|
||||
resize_vert_dir = resize_vert_dir_c;
|
||||
if (flags & HAS_AVX2) resize_vert_dir = resize_vert_dir_avx2;
|
||||
}
|
||||
#endif
|
||||
|
||||
|
|
|
@ -701,6 +701,15 @@ void av1_resize_and_extend_frame_c(const YV12_BUFFER_CONFIG *src, YV12_BUFFER_CO
|
|||
void av1_resize_and_extend_frame_ssse3(const YV12_BUFFER_CONFIG *src, YV12_BUFFER_CONFIG *dst, const InterpFilter filter, const int phase, const int num_planes);
|
||||
RTCD_EXTERN void (*av1_resize_and_extend_frame)(const YV12_BUFFER_CONFIG *src, YV12_BUFFER_CONFIG *dst, const InterpFilter filter, const int phase, const int num_planes);
|
||||
|
||||
void av1_resize_horz_dir_c(const uint8_t *const input, int in_stride, uint8_t *intbuf, int height, int filteredlength, int width2);
|
||||
void av1_resize_horz_dir_avx2(const uint8_t *const input, int in_stride, uint8_t *intbuf, int height, int filteredlength, int width2);
|
||||
RTCD_EXTERN void (*av1_resize_horz_dir)(const uint8_t *const input, int in_stride, uint8_t *intbuf, int height, int filteredlength, int width2);
|
||||
|
||||
bool av1_resize_vert_dir_c(uint8_t *intbuf, uint8_t *output, int out_stride, int height, int height2, int width2, int start_col);
|
||||
bool av1_resize_vert_dir_sse2(uint8_t *intbuf, uint8_t *output, int out_stride, int height, int height2, int width2, int start_col);
|
||||
bool av1_resize_vert_dir_avx2(uint8_t *intbuf, uint8_t *output, int out_stride, int height, int height2, int width2, int start_col);
|
||||
RTCD_EXTERN bool (*av1_resize_vert_dir)(uint8_t *intbuf, uint8_t *output, int out_stride, int height, int height2, int width2, int start_col);
|
||||
|
||||
void av1_round_shift_array_c(int32_t *arr, int size, int bit);
|
||||
void av1_round_shift_array_sse4_1(int32_t *arr, int size, int bit);
|
||||
RTCD_EXTERN void (*av1_round_shift_array)(int32_t *arr, int size, int bit);
|
||||
|
@ -857,10 +866,6 @@ cfl_subtract_average_fn cfl_get_subtract_average_fn_sse2(TX_SIZE tx_size);
|
|||
cfl_subtract_average_fn cfl_get_subtract_average_fn_avx2(TX_SIZE tx_size);
|
||||
RTCD_EXTERN cfl_subtract_average_fn (*cfl_get_subtract_average_fn)(TX_SIZE tx_size);
|
||||
|
||||
bool resize_vert_dir_c(uint8_t *intbuf, uint8_t *output, int out_stride, int height, int height2, int width2, int start_col);
|
||||
bool resize_vert_dir_avx2(uint8_t *intbuf, uint8_t *output, int out_stride, int height, int height2, int width2, int start_col);
|
||||
RTCD_EXTERN bool (*resize_vert_dir)(uint8_t *intbuf, uint8_t *output, int out_stride, int height, int height2, int width2, int start_col);
|
||||
|
||||
void av1_rtcd(void);
|
||||
|
||||
#ifdef RTCD_C
|
||||
|
@ -1090,6 +1095,10 @@ static void setup_rtcd_internal(void)
|
|||
if (flags & HAS_AVX2) av1_quantize_lp = av1_quantize_lp_avx2;
|
||||
av1_resize_and_extend_frame = av1_resize_and_extend_frame_c;
|
||||
if (flags & HAS_SSSE3) av1_resize_and_extend_frame = av1_resize_and_extend_frame_ssse3;
|
||||
av1_resize_horz_dir = av1_resize_horz_dir_c;
|
||||
if (flags & HAS_AVX2) av1_resize_horz_dir = av1_resize_horz_dir_avx2;
|
||||
av1_resize_vert_dir = av1_resize_vert_dir_sse2;
|
||||
if (flags & HAS_AVX2) av1_resize_vert_dir = av1_resize_vert_dir_avx2;
|
||||
av1_round_shift_array = av1_round_shift_array_c;
|
||||
if (flags & HAS_SSE4_1) av1_round_shift_array = av1_round_shift_array_sse4_1;
|
||||
av1_selfguided_restoration = av1_selfguided_restoration_c;
|
||||
|
@ -1173,8 +1182,6 @@ static void setup_rtcd_internal(void)
|
|||
if (flags & HAS_AVX2) cfl_get_predict_lbd_fn = cfl_get_predict_lbd_fn_avx2;
|
||||
cfl_get_subtract_average_fn = cfl_get_subtract_average_fn_sse2;
|
||||
if (flags & HAS_AVX2) cfl_get_subtract_average_fn = cfl_get_subtract_average_fn_avx2;
|
||||
resize_vert_dir = resize_vert_dir_c;
|
||||
if (flags & HAS_AVX2) resize_vert_dir = resize_vert_dir_avx2;
|
||||
}
|
||||
#endif
|
||||
|
||||
|
|
|
@ -20,11 +20,11 @@ origin:

# Human-readable identifier for this version/release
# Generally "version NNN", "tag SSS", "bookmark SSS"
release: 23c94347d84241c322f3b40daf120047ff4f8d56 (Wed Apr 17 11:05:14 2024 +0000).
release: a7ef80c44bfb34b08254194b1ab72d4e93ff4b07 (Wed May 29 23:21:38 2024 +0000).

# Revision to pull in
# Must be a long or short commit SHA (long preferred)
revision: 23c94347d84241c322f3b40daf120047ff4f8d56
revision: a7ef80c44bfb34b08254194b1ab72d4e93ff4b07

# The package's license, where possible using the mnemonic from
# https://spdx.org/licenses/

@ -117,6 +117,7 @@ files = {
|
|||
'../../third_party/aom/av1/av1_cx_iface.c',
|
||||
'../../third_party/aom/av1/av1_dx_iface.c',
|
||||
'../../third_party/aom/av1/common/alloccommon.c',
|
||||
'../../third_party/aom/av1/common/arm/av1_convolve_scale_neon.c',
|
||||
'../../third_party/aom/av1/common/arm/av1_inv_txfm_neon.c',
|
||||
'../../third_party/aom/av1/common/arm/av1_txfm_neon.c',
|
||||
'../../third_party/aom/av1/common/arm/blend_a64_hmask_neon.c',
|
||||
|
@ -184,24 +185,24 @@ files = {
|
|||
'../../third_party/aom/av1/encoder/aq_complexity.c',
|
||||
'../../third_party/aom/av1/encoder/aq_cyclicrefresh.c',
|
||||
'../../third_party/aom/av1/encoder/aq_variance.c',
|
||||
'../../third_party/aom/av1/encoder/arm/neon/av1_error_neon.c',
|
||||
'../../third_party/aom/av1/encoder/arm/neon/av1_fwd_txfm2d_neon.c',
|
||||
'../../third_party/aom/av1/encoder/arm/neon/av1_highbd_quantize_neon.c',
|
||||
'../../third_party/aom/av1/encoder/arm/neon/av1_k_means_neon.c',
|
||||
'../../third_party/aom/av1/encoder/arm/neon/cnn_neon.c',
|
||||
'../../third_party/aom/av1/encoder/arm/neon/encodetxb_neon.c',
|
||||
'../../third_party/aom/av1/encoder/arm/neon/highbd_fwd_txfm_neon.c',
|
||||
'../../third_party/aom/av1/encoder/arm/neon/highbd_pickrst_neon.c',
|
||||
'../../third_party/aom/av1/encoder/arm/neon/highbd_rdopt_neon.c',
|
||||
'../../third_party/aom/av1/encoder/arm/neon/highbd_temporal_filter_neon.c',
|
||||
'../../third_party/aom/av1/encoder/arm/neon/hybrid_fwd_txfm_neon.c',
|
||||
'../../third_party/aom/av1/encoder/arm/neon/ml_neon.c',
|
||||
'../../third_party/aom/av1/encoder/arm/neon/pickrst_neon.c',
|
||||
'../../third_party/aom/av1/encoder/arm/neon/quantize_neon.c',
|
||||
'../../third_party/aom/av1/encoder/arm/neon/rdopt_neon.c',
|
||||
'../../third_party/aom/av1/encoder/arm/neon/reconinter_enc_neon.c',
|
||||
'../../third_party/aom/av1/encoder/arm/neon/temporal_filter_neon.c',
|
||||
'../../third_party/aom/av1/encoder/arm/neon/wedge_utils_neon.c',
|
||||
'../../third_party/aom/av1/encoder/arm/av1_error_neon.c',
|
||||
'../../third_party/aom/av1/encoder/arm/av1_fwd_txfm2d_neon.c',
|
||||
'../../third_party/aom/av1/encoder/arm/av1_highbd_quantize_neon.c',
|
||||
'../../third_party/aom/av1/encoder/arm/av1_k_means_neon.c',
|
||||
'../../third_party/aom/av1/encoder/arm/cnn_neon.c',
|
||||
'../../third_party/aom/av1/encoder/arm/encodetxb_neon.c',
|
||||
'../../third_party/aom/av1/encoder/arm/highbd_fwd_txfm_neon.c',
|
||||
'../../third_party/aom/av1/encoder/arm/highbd_pickrst_neon.c',
|
||||
'../../third_party/aom/av1/encoder/arm/highbd_rdopt_neon.c',
|
||||
'../../third_party/aom/av1/encoder/arm/highbd_temporal_filter_neon.c',
|
||||
'../../third_party/aom/av1/encoder/arm/hybrid_fwd_txfm_neon.c',
|
||||
'../../third_party/aom/av1/encoder/arm/ml_neon.c',
|
||||
'../../third_party/aom/av1/encoder/arm/pickrst_neon.c',
|
||||
'../../third_party/aom/av1/encoder/arm/quantize_neon.c',
|
||||
'../../third_party/aom/av1/encoder/arm/rdopt_neon.c',
|
||||
'../../third_party/aom/av1/encoder/arm/reconinter_enc_neon.c',
|
||||
'../../third_party/aom/av1/encoder/arm/temporal_filter_neon.c',
|
||||
'../../third_party/aom/av1/encoder/arm/wedge_utils_neon.c',
|
||||
'../../third_party/aom/av1/encoder/av1_fwd_txfm1d.c',
|
||||
'../../third_party/aom/av1/encoder/av1_fwd_txfm2d.c',
|
||||
'../../third_party/aom/av1/encoder/av1_noise_estimate.c',
|
||||
|
@ -394,6 +395,7 @@ files = {
|
|||
'../../third_party/aom/av1/av1_cx_iface.c',
|
||||
'../../third_party/aom/av1/av1_dx_iface.c',
|
||||
'../../third_party/aom/av1/common/alloccommon.c',
|
||||
'../../third_party/aom/av1/common/arm/av1_convolve_scale_neon.c',
|
||||
'../../third_party/aom/av1/common/arm/av1_inv_txfm_neon.c',
|
||||
'../../third_party/aom/av1/common/arm/av1_txfm_neon.c',
|
||||
'../../third_party/aom/av1/common/arm/blend_a64_hmask_neon.c',
|
||||
|
@ -466,26 +468,26 @@ files = {
|
|||
'../../third_party/aom/av1/encoder/aq_complexity.c',
|
||||
'../../third_party/aom/av1/encoder/aq_cyclicrefresh.c',
|
||||
'../../third_party/aom/av1/encoder/aq_variance.c',
|
||||
'../../third_party/aom/av1/encoder/arm/crc32/hash_arm_crc32.c',
|
||||
'../../third_party/aom/av1/encoder/arm/neon/av1_error_neon.c',
|
||||
'../../third_party/aom/av1/encoder/arm/neon/av1_fwd_txfm2d_neon.c',
|
||||
'../../third_party/aom/av1/encoder/arm/neon/av1_highbd_quantize_neon.c',
|
||||
'../../third_party/aom/av1/encoder/arm/neon/av1_k_means_neon.c',
|
||||
'../../third_party/aom/av1/encoder/arm/neon/cnn_neon.c',
|
||||
'../../third_party/aom/av1/encoder/arm/neon/encodetxb_neon.c',
|
||||
'../../third_party/aom/av1/encoder/arm/neon/highbd_fwd_txfm_neon.c',
|
||||
'../../third_party/aom/av1/encoder/arm/neon/highbd_pickrst_neon.c',
|
||||
'../../third_party/aom/av1/encoder/arm/neon/highbd_rdopt_neon.c',
|
||||
'../../third_party/aom/av1/encoder/arm/neon/highbd_temporal_filter_neon.c',
|
||||
'../../third_party/aom/av1/encoder/arm/neon/hybrid_fwd_txfm_neon.c',
|
||||
'../../third_party/aom/av1/encoder/arm/neon/ml_neon.c',
|
||||
'../../third_party/aom/av1/encoder/arm/neon/pickrst_neon.c',
|
||||
'../../third_party/aom/av1/encoder/arm/neon/quantize_neon.c',
|
||||
'../../third_party/aom/av1/encoder/arm/neon/rdopt_neon.c',
|
||||
'../../third_party/aom/av1/encoder/arm/neon/reconinter_enc_neon.c',
|
||||
'../../third_party/aom/av1/encoder/arm/neon/temporal_filter_neon.c',
|
||||
'../../third_party/aom/av1/encoder/arm/neon/temporal_filter_neon_dotprod.c',
|
||||
'../../third_party/aom/av1/encoder/arm/neon/wedge_utils_neon.c',
|
||||
'../../third_party/aom/av1/encoder/arm/av1_error_neon.c',
|
||||
'../../third_party/aom/av1/encoder/arm/av1_fwd_txfm2d_neon.c',
|
||||
'../../third_party/aom/av1/encoder/arm/av1_highbd_quantize_neon.c',
|
||||
'../../third_party/aom/av1/encoder/arm/av1_k_means_neon.c',
|
||||
'../../third_party/aom/av1/encoder/arm/cnn_neon.c',
|
||||
'../../third_party/aom/av1/encoder/arm/encodetxb_neon.c',
|
||||
'../../third_party/aom/av1/encoder/arm/hash_arm_crc32.c',
|
||||
'../../third_party/aom/av1/encoder/arm/highbd_fwd_txfm_neon.c',
|
||||
'../../third_party/aom/av1/encoder/arm/highbd_pickrst_neon.c',
|
||||
'../../third_party/aom/av1/encoder/arm/highbd_rdopt_neon.c',
|
||||
'../../third_party/aom/av1/encoder/arm/highbd_temporal_filter_neon.c',
|
||||
'../../third_party/aom/av1/encoder/arm/hybrid_fwd_txfm_neon.c',
|
||||
'../../third_party/aom/av1/encoder/arm/ml_neon.c',
|
||||
'../../third_party/aom/av1/encoder/arm/pickrst_neon.c',
|
||||
'../../third_party/aom/av1/encoder/arm/quantize_neon.c',
|
||||
'../../third_party/aom/av1/encoder/arm/rdopt_neon.c',
|
||||
'../../third_party/aom/av1/encoder/arm/reconinter_enc_neon.c',
|
||||
'../../third_party/aom/av1/encoder/arm/temporal_filter_neon.c',
|
||||
'../../third_party/aom/av1/encoder/arm/temporal_filter_neon_dotprod.c',
|
||||
'../../third_party/aom/av1/encoder/arm/wedge_utils_neon.c',
|
||||
'../../third_party/aom/av1/encoder/av1_fwd_txfm1d.c',
|
||||
'../../third_party/aom/av1/encoder/av1_fwd_txfm2d.c',
|
||||
'../../third_party/aom/av1/encoder/av1_noise_estimate.c',
|
||||
|
@ -811,7 +813,6 @@ files = {
|
|||
'../../third_party/aom/aom_dsp/variance.c',
|
||||
'../../third_party/aom/aom_dsp/x86/adaptive_quantize_avx2.c',
|
||||
'../../third_party/aom/aom_dsp/x86/adaptive_quantize_sse2.c',
|
||||
'../../third_party/aom/aom_dsp/x86/aom_asm_stubs.c',
|
||||
'../../third_party/aom/aom_dsp/x86/aom_convolve_copy_avx2.c',
|
||||
'../../third_party/aom/aom_dsp/x86/aom_convolve_copy_sse2.c',
|
||||
'../../third_party/aom/aom_dsp/x86/aom_high_subpixel_8t_sse2.asm',
|
||||
|
@ -969,6 +970,7 @@ files = {
|
|||
'../../third_party/aom/av1/common/x86/reconinter_sse4.c',
|
||||
'../../third_party/aom/av1/common/x86/reconinter_ssse3.c',
|
||||
'../../third_party/aom/av1/common/x86/resize_avx2.c',
|
||||
'../../third_party/aom/av1/common/x86/resize_sse2.c',
|
||||
'../../third_party/aom/av1/common/x86/resize_ssse3.c',
|
||||
'../../third_party/aom/av1/common/x86/selfguided_avx2.c',
|
||||
'../../third_party/aom/av1/common/x86/selfguided_sse4.c',
|
||||
|
@ -1162,7 +1164,6 @@ files = {
|
|||
'../../third_party/aom/aom_dsp/variance.c',
|
||||
'../../third_party/aom/aom_dsp/x86/adaptive_quantize_avx2.c',
|
||||
'../../third_party/aom/aom_dsp/x86/adaptive_quantize_sse2.c',
|
||||
'../../third_party/aom/aom_dsp/x86/aom_asm_stubs.c',
|
||||
'../../third_party/aom/aom_dsp/x86/aom_convolve_copy_avx2.c',
|
||||
'../../third_party/aom/aom_dsp/x86/aom_convolve_copy_sse2.c',
|
||||
'../../third_party/aom/aom_dsp/x86/aom_high_subpixel_8t_sse2.asm',
|
||||
|
@ -1322,6 +1323,7 @@ files = {
|
|||
'../../third_party/aom/av1/common/x86/reconinter_sse4.c',
|
||||
'../../third_party/aom/av1/common/x86/reconinter_ssse3.c',
|
||||
'../../third_party/aom/av1/common/x86/resize_avx2.c',
|
||||
'../../third_party/aom/av1/common/x86/resize_sse2.c',
|
||||
'../../third_party/aom/av1/common/x86/resize_ssse3.c',
|
||||
'../../third_party/aom/av1/common/x86/selfguided_avx2.c',
|
||||
'../../third_party/aom/av1/common/x86/selfguided_sse4.c',
|
||||
|
|
|
@ -40,6 +40,7 @@ Iole Moccagatta <iole.moccagatta@gmail.com>
Jacky Chen <jackychen@google.com>
James Zern <jzern@google.com> <jzern@google.cOm>
Jean-Marc Valin <jmvalin@jmvalin.ca> <jmvalin@mozilla.com>
Jian Zhou <zhoujian@fb.com> <zhoujian@google.com>
Jim Bankoski <jimbankoski@google.com>
Johann Koenig <johannkoenig@google.com>
Johann Koenig <johannkoenig@google.com> <johann.koenig@duck.com>

@ -51,6 +51,7 @@ Cyril Concolato <cconcolato@netflix.com>
Dake He <dkhe@google.com>
Damon Shen <yjshen@google.com>
Dandan Ding <vickyddding@gmail.com>
Daniel Cheng <dcheng@chromium.org>
Daniele Castagna <dcastagna@chromium.org>
Daniel Kang <ddkang@google.com>
Daniel Max Valenzuela <daniel.vt@samsung.com>
@ -94,6 +95,7 @@ Guillermo Ballester Valor <gbvalor@gmail.com>
Hamsalekha S <hamsalekha.s@ittiam.com>
Hangyu Kuang <hkuang@google.com>
Hanno Böck <hanno@hboeck.de>
Hari Limaye <hari.limaye@arm.com>
Harish Mahendrakar <harish.mahendrakar@ittiam.com>
Henrik Lundin <hlundin@google.com>
Hien Ho <hienho@google.com>
@ -124,7 +126,7 @@ Jeff Muizelaar <jmuizelaar@mozilla.com>
Jeff Petkau <jpet@chromium.org>
Jerome Jiang <jianj@google.com>
Jia Jia <jia.jia@linaro.org>
Jian Zhou <zhoujian@google.com>
Jian Zhou <zhoujian@fb.com>
Jim Bankoski <jimbankoski@google.com>
Jingning Han <jingning@google.com>
Joe Young <joeyoung@google.com>
@ -216,6 +218,7 @@ Peter Boström <pbos@google.com>
Peter de Rivaz <peter.derivaz@gmail.com>
Peter Kasting <pkasting@chromium.org>
Philip Jägenstedt <philipj@opera.com>
Philippe Antoine <p.antoine@catenacyber.fr>
Priit Laes <plaes@plaes.org>
Qiu Jianlin <jianlin.qiu@intel.com>
Rachel Barker <rachelbarker@google.com>

@ -1,3 +1,91 @@
2024-04-09 v3.9.0
This release includes new codec interfaces, compression efficiency and
perceptual improvements, speedup for RTC for both video and screen content,
and many bug fixes. This release is ABI compatible with the previous release.

- New Features
* New codec control
* AV1E_SET_SVC_FRAME_DROP_MODE is added to configure the SVC encoder to
only drop spatial layers or the whole superframe.
* Active Map is fixed and tested for RTC.
* CONFIG_QUANT_MATRIX is added to disable quantization matrices when aom
decoder is disabled with CONFIG_AV1_DECODER. Reduces ~10% binary size when
both are disabled.
* libwebm is updated to libwebm-1.0.0.31-1-gaffd7f4.

- Compression Efficiency Improvements
* RTC encoding improvements
* 1-2% BD-rate gain for screen content with temporal layers; 5% BD-rate
gain on scrolling content.

- Perceptual Quality Improvements
* For RTC screen content
* Reduced color artifacts for RTC screen content
* Visual quality improved for scene changes for SVC with quality layers.
* Removed visual artifacts for speed 11

- Speedups:
* RTC Speed 11: aggressive speedup setting added for video mode,
resolutions <= VGA: ~30% faster than speed 10.
* 5-9% speed up for high bit-depth encoding with good mode on Arm, half of
which comes from SVE/SVE2 optimizations.

- Other improvements
* Further improvements to global motion estimation.
* Documented minimum required SIMD support: SSE4.1 on x86, Neon on Arm.
* Remove unneeded SIMD functions, saving >100 KiB from binary size.
* Cleaned up and improved pattern_search.
* Added end-to-end c vs SIMD bit-exactness test.
* Added config flag to calc psnr using libvmaf peak: use a slightly
different peak value for PSNR (1020 and 2040 for 10- and 12-bit)

- Bug Fixes
* Fuzzing bug fixes
* b/329485898 Null-dereference WRITE in av1_cdef_frame_mt
* b/329810149 Null-dereference WRITE in av1_cdef_copy_sb8_16
* b/329813868 Ill in av1_cdef_frame_mt
* chromium:327882824 Null-dereference WRITE in av1_cdef_init_fb_row
* b/330014723 Null-dereference WRITE in
cdef_copy_rect8_16bit_to_16bit_avx2
* b/310455204 Null-dereference WRITE in prepare_enc_workers
* b/314858909 Heap-buffer-overflow in aom_variance64x64_avx2
* oss-fuzz:67132 av1_dec_fuzzer: ASSERT: (pbi->tile_count_minus_1 + 1) <=
(pbi->output_frame_width_in_tiles_minus_1 + 1)
* oss-fuzz:67058 av1_dec_fuzzer: ASSERT: i == 0 || tile_w == *w
* oss-fuzz:67161 av1_dec_fuzzer: ASSERT: i == 0 || tile_h == *h
* oss-fuzz:67059 av1_dec_fuzzer: Crash in mem_get_varsize
* oss-fuzz:67162 av1_dec_fuzzer: Use-of-uninitialized-value in
od_ec_decode_bool_q15
* oss-fuzz:67184 av1_dec_fuzzer: Heap-buffer-overflow in od_ec_dec_init
* oss-fuzz:67216 av1_dec_fuzzer: Heap-buffer-overflow in
od_ec_dec_normalize
* oss-fuzz:67055 av1_dec_fuzzer: Heap-buffer-overflow in
get_ls_tile_buffers
* libaom library
* aomedia:3510 Large value of duration could cause encoder overflow
* chromium:328105513 Fix build conflicts between Abseil and libaom/libvpx
in Win ARM64 builds
* aomedia:3544 AV1/SharpnessTestLarge.SharpnessPSNRTest failures after
59c592bb8
* aomedia:3531 Exception encountered with PSNR calculation
* aomedia:3541 Can not compile correctly by CYGWIN
* chromium:41482688 heap-buffer-overflow write in vpx_img_read()
(tools_common.c) with VPX_IMG_FMT_NV12
* aomedia:3521 Assertion failures on Arm in CNNTest.* in
av1_cnn_convolve_no_maxpool_padding_valid_2x2_neon and
av1_cnn_convolve_no_maxpool_padding_valid_5x5_neon
* aomedia:3486 C vs NEON mismatch in AV1 encoder
* aomedia:3536 Over write in highbd_dr_prediction_z3_upsample1_neon()
* aomedia:3276 Significant progress on ensuring all allocations are
checked
* aomedia:3491 heap-buffer-overflow encoding frames of size 256x256,
512x512 in good quality usage mode using 4 threads
* aomedia:3322 PSNR number discrepancy
* aomedia:3493 Cmake generates garbage symbols for libaom_srcs.gni
* aomedia:3478 GCC 12.2.0 emits a -Wstringop-overflow warning on
aom/av1/encoder/motion_search_facade.c
* aomedia:3484 C vs NEON mismatch in AV1 encoder for high-bitdepth case

2024-03-08 v3.8.2
This release includes several bug fixes. This release is ABI
compatible with the last release. See

@ -58,9 +58,9 @@ endif()
# passed to libtool.
#
# We set SO_FILE_VERSION = [c-a].a.r
set(LT_CURRENT 11)
set(LT_REVISION 2)
set(LT_AGE 8)
set(LT_CURRENT 12)
set(LT_REVISION 0)
set(LT_AGE 9)
math(EXPR SO_VERSION "${LT_CURRENT} - ${LT_AGE}")
set(SO_FILE_VERSION "${SO_VERSION}.${LT_AGE}.${LT_REVISION}")
unset(LT_CURRENT)

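For reference, working through the arithmetic implied by the new values above (not part of the diff itself): SO_VERSION = LT_CURRENT - LT_AGE = 12 - 9 = 3, so SO_FILE_VERSION becomes 3.9.0, matching the v3.9.0 release noted in the CHANGELOG.
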
@ -637,6 +637,7 @@ typedef struct aom_codec_enc_cfg {
/*!\brief Target data rate
 *
 * Target bitrate to use for this stream, in kilobits per second.
 * Max allowed value is 2000000
 */
unsigned int rc_target_bitrate;

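The field documented above is set through the standard libaom encode API. A minimal usage sketch for illustration only (not part of this patch; it assumes the stock aom_codec_enc_config_default / aom_codec_enc_init entry points):

```c
#include <aom/aom_encoder.h>
#include <aom/aomcx.h>

/* Sketch: create an encoder config and set the target bitrate documented
 * above. rc_target_bitrate is in kilobits per second, capped at 2000000. */
static int init_av1_encoder(aom_codec_ctx_t *codec, int width, int height) {
  aom_codec_enc_cfg_t cfg;
  if (aom_codec_enc_config_default(aom_codec_av1_cx(), &cfg,
                                   AOM_USAGE_GOOD_QUALITY) != AOM_CODEC_OK)
    return -1;
  cfg.g_w = width;
  cfg.g_h = height;
  cfg.rc_target_bitrate = 1000; /* 1000 kbps, well below the 2000000 cap */
  if (aom_codec_enc_init(codec, aom_codec_av1_cx(), &cfg, 0) != AOM_CODEC_OK)
    return -1;
  return 0;
}
```
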
@ -182,7 +182,9 @@ static aom_image_t *img_alloc_helper(

/* Default viewport to entire image. (This aom_img_set_rect call always
 * succeeds.) */
aom_img_set_rect(img, 0, 0, d_w, d_h, border);
int ret = aom_img_set_rect(img, 0, 0, d_w, d_h, border);
assert(ret == 0);
(void)ret;
return img;

fail:

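The new lines follow a common pattern for calls documented to always succeed in this context: capture the return value and assert on it so debug builds catch a regression, then cast it to (void) so release builds, where assert() compiles away, do not warn about an unused variable.
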
@ -58,7 +58,6 @@ list(APPEND AOM_DSP_COMMON_ASM_SSE2

list(APPEND AOM_DSP_COMMON_INTRIN_SSE2
"${AOM_ROOT}/aom_dsp/x86/aom_convolve_copy_sse2.c"
"${AOM_ROOT}/aom_dsp/x86/aom_asm_stubs.c"
"${AOM_ROOT}/aom_dsp/x86/convolve.h"
"${AOM_ROOT}/aom_dsp/x86/convolve_sse2.h"
"${AOM_ROOT}/aom_dsp/x86/fft_sse2.c"

@ -20,6 +20,7 @@
|
|||
#include "aom/aom_integer.h"
|
||||
#include "aom_dsp/aom_dsp_common.h"
|
||||
#include "aom_dsp/aom_filter.h"
|
||||
#include "aom_dsp/arm/aom_convolve8_neon.h"
|
||||
#include "aom_dsp/arm/aom_filter.h"
|
||||
#include "aom_dsp/arm/mem_neon.h"
|
||||
#include "aom_dsp/arm/transpose_neon.h"
|
||||
|
@ -231,29 +232,6 @@ static INLINE void convolve8_horiz_8tap_neon(const uint8_t *src,
|
|||
}
|
||||
}
|
||||
|
||||
static INLINE int16x4_t convolve4_4(const int16x4_t s0, const int16x4_t s1,
|
||||
const int16x4_t s2, const int16x4_t s3,
|
||||
const int16x4_t filter) {
|
||||
int16x4_t sum = vmul_lane_s16(s0, filter, 0);
|
||||
sum = vmla_lane_s16(sum, s1, filter, 1);
|
||||
sum = vmla_lane_s16(sum, s2, filter, 2);
|
||||
sum = vmla_lane_s16(sum, s3, filter, 3);
|
||||
|
||||
return sum;
|
||||
}
|
||||
|
||||
static INLINE uint8x8_t convolve4_8(const int16x8_t s0, const int16x8_t s1,
|
||||
const int16x8_t s2, const int16x8_t s3,
|
||||
const int16x4_t filter) {
|
||||
int16x8_t sum = vmulq_lane_s16(s0, filter, 0);
|
||||
sum = vmlaq_lane_s16(sum, s1, filter, 1);
|
||||
sum = vmlaq_lane_s16(sum, s2, filter, 2);
|
||||
sum = vmlaq_lane_s16(sum, s3, filter, 3);
|
||||
|
||||
// We halved the filter values so -1 from right shift.
|
||||
return vqrshrun_n_s16(sum, FILTER_BITS - 1);
|
||||
}
|
||||
|
||||
static INLINE void convolve8_horiz_4tap_neon(const uint8_t *src,
|
||||
ptrdiff_t src_stride, uint8_t *dst,
|
||||
ptrdiff_t dst_stride,
|
||||
|
@ -265,26 +243,20 @@ static INLINE void convolve8_horiz_4tap_neon(const uint8_t *src,
|
|||
|
||||
if (w == 4) {
|
||||
do {
|
||||
int16x8_t t0 =
|
||||
vreinterpretq_s16_u16(vmovl_u8(vld1_u8(src + 0 * src_stride)));
|
||||
int16x8_t t1 =
|
||||
vreinterpretq_s16_u16(vmovl_u8(vld1_u8(src + 1 * src_stride)));
|
||||
uint8x8_t t01[4];
|
||||
|
||||
int16x4_t s0[4], s1[4];
|
||||
s0[0] = vget_low_s16(t0);
|
||||
s0[1] = vget_low_s16(vextq_s16(t0, t0, 1));
|
||||
s0[2] = vget_low_s16(vextq_s16(t0, t0, 2));
|
||||
s0[3] = vget_low_s16(vextq_s16(t0, t0, 3));
|
||||
t01[0] = load_unaligned_u8(src + 0, (int)src_stride);
|
||||
t01[1] = load_unaligned_u8(src + 1, (int)src_stride);
|
||||
t01[2] = load_unaligned_u8(src + 2, (int)src_stride);
|
||||
t01[3] = load_unaligned_u8(src + 3, (int)src_stride);
|
||||
|
||||
s1[0] = vget_low_s16(t1);
|
||||
s1[1] = vget_low_s16(vextq_s16(t1, t1, 1));
|
||||
s1[2] = vget_low_s16(vextq_s16(t1, t1, 2));
|
||||
s1[3] = vget_low_s16(vextq_s16(t1, t1, 3));
|
||||
int16x8_t s01[4];
|
||||
s01[0] = vreinterpretq_s16_u16(vmovl_u8(t01[0]));
|
||||
s01[1] = vreinterpretq_s16_u16(vmovl_u8(t01[1]));
|
||||
s01[2] = vreinterpretq_s16_u16(vmovl_u8(t01[2]));
|
||||
s01[3] = vreinterpretq_s16_u16(vmovl_u8(t01[3]));
|
||||
|
||||
int16x4_t d0 = convolve4_4(s0[0], s0[1], s0[2], s0[3], filter);
|
||||
int16x4_t d1 = convolve4_4(s1[0], s1[1], s1[2], s1[3], filter);
|
||||
// We halved the filter values so -1 from right shift.
|
||||
uint8x8_t d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS - 1);
|
||||
uint8x8_t d01 = convolve4_8(s01[0], s01[1], s01[2], s01[3], filter);
|
||||
|
||||
store_u8x4_strided_x2(dst + 0 * dst_stride, dst_stride, d01);
|
||||
|
||||
|
@ -298,37 +270,27 @@ static INLINE void convolve8_horiz_4tap_neon(const uint8_t *src,
|
|||
const uint8_t *s = src;
|
||||
uint8_t *d = dst;
|
||||
|
||||
int16x8_t t0 =
|
||||
vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s + 0 * src_stride)));
|
||||
int16x8_t t1 =
|
||||
vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s + 1 * src_stride)));
|
||||
|
||||
s += 8;
|
||||
do {
|
||||
int16x8_t t2 =
|
||||
vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s + 0 * src_stride)));
|
||||
int16x8_t t3 =
|
||||
vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s + 1 * src_stride)));
|
||||
uint8x8_t t0[4], t1[4];
|
||||
load_u8_8x4(s + 0 * src_stride, 1, &t0[0], &t0[1], &t0[2], &t0[3]);
|
||||
load_u8_8x4(s + 1 * src_stride, 1, &t1[0], &t1[1], &t1[2], &t1[3]);
|
||||
|
||||
int16x8_t s0[4], s1[4];
|
||||
s0[0] = t0;
|
||||
s0[1] = vextq_s16(t0, t2, 1);
|
||||
s0[2] = vextq_s16(t0, t2, 2);
|
||||
s0[3] = vextq_s16(t0, t2, 3);
|
||||
s0[0] = vreinterpretq_s16_u16(vmovl_u8(t0[0]));
|
||||
s0[1] = vreinterpretq_s16_u16(vmovl_u8(t0[1]));
|
||||
s0[2] = vreinterpretq_s16_u16(vmovl_u8(t0[2]));
|
||||
s0[3] = vreinterpretq_s16_u16(vmovl_u8(t0[3]));
|
||||
|
||||
s1[0] = t1;
|
||||
s1[1] = vextq_s16(t1, t3, 1);
|
||||
s1[2] = vextq_s16(t1, t3, 2);
|
||||
s1[3] = vextq_s16(t1, t3, 3);
|
||||
s1[0] = vreinterpretq_s16_u16(vmovl_u8(t1[0]));
|
||||
s1[1] = vreinterpretq_s16_u16(vmovl_u8(t1[1]));
|
||||
s1[2] = vreinterpretq_s16_u16(vmovl_u8(t1[2]));
|
||||
s1[3] = vreinterpretq_s16_u16(vmovl_u8(t1[3]));
|
||||
|
||||
uint8x8_t d0 = convolve4_8(s0[0], s0[1], s0[2], s0[3], filter);
|
||||
uint8x8_t d1 = convolve4_8(s1[0], s1[1], s1[2], s1[3], filter);
|
||||
|
||||
store_u8_8x2(d, dst_stride, d0, d1);
|
||||
|
||||
t0 = t2;
|
||||
t1 = t3;
|
||||
|
||||
s += 8;
|
||||
d += 8;
|
||||
width -= 8;
|
||||
|
@ -354,7 +316,12 @@ void aom_convolve8_horiz_neon(const uint8_t *src, ptrdiff_t src_stride,
|
|||
|
||||
src -= ((SUBPEL_TAPS / 2) - 1);
|
||||
|
||||
if (get_filter_taps_convolve8(filter_x) <= 4) {
|
||||
int filter_taps = get_filter_taps_convolve8(filter_x);
|
||||
|
||||
if (filter_taps == 2) {
|
||||
convolve8_horiz_2tap_neon(src + 3, src_stride, dst, dst_stride, filter_x, w,
|
||||
h);
|
||||
} else if (filter_taps == 4) {
|
||||
convolve8_horiz_4tap_neon(src + 2, src_stride, dst, dst_stride, filter_x, w,
|
||||
h);
|
||||
} else {
|
||||
|
@ -362,22 +329,13 @@ void aom_convolve8_horiz_neon(const uint8_t *src, ptrdiff_t src_stride,
|
|||
}
|
||||
}
|
||||
|
||||
void aom_convolve8_vert_neon(const uint8_t *src, ptrdiff_t src_stride,
|
||||
uint8_t *dst, ptrdiff_t dst_stride,
|
||||
const int16_t *filter_x, int x_step_q4,
|
||||
const int16_t *filter_y, int y_step_q4, int w,
|
||||
int h) {
|
||||
static INLINE void convolve8_vert_8tap_neon(const uint8_t *src,
|
||||
ptrdiff_t src_stride, uint8_t *dst,
|
||||
ptrdiff_t dst_stride,
|
||||
const int16_t *filter_y, int w,
|
||||
int h) {
|
||||
const int16x8_t filter = vld1q_s16(filter_y);
|
||||
|
||||
assert((intptr_t)dst % 4 == 0);
|
||||
assert(dst_stride % 4 == 0);
|
||||
|
||||
(void)filter_x;
|
||||
(void)x_step_q4;
|
||||
(void)y_step_q4;
|
||||
|
||||
src -= ((SUBPEL_TAPS / 2) - 1) * src_stride;
|
||||
|
||||
if (w == 4) {
|
||||
uint8x8_t t0, t1, t2, t3, t4, t5, t6;
|
||||
load_u8_8x7(src, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6);
|
||||
|
@ -472,3 +430,30 @@ void aom_convolve8_vert_neon(const uint8_t *src, ptrdiff_t src_stride,
|
|||
} while (w != 0);
|
||||
}
|
||||
}
|
||||
|
||||
void aom_convolve8_vert_neon(const uint8_t *src, ptrdiff_t src_stride,
|
||||
uint8_t *dst, ptrdiff_t dst_stride,
|
||||
const int16_t *filter_x, int x_step_q4,
|
||||
const int16_t *filter_y, int y_step_q4, int w,
|
||||
int h) {
|
||||
assert((intptr_t)dst % 4 == 0);
|
||||
assert(dst_stride % 4 == 0);
|
||||
|
||||
(void)filter_x;
|
||||
(void)x_step_q4;
|
||||
(void)y_step_q4;
|
||||
|
||||
src -= ((SUBPEL_TAPS / 2) - 1) * src_stride;
|
||||
|
||||
int filter_taps = get_filter_taps_convolve8(filter_y);
|
||||
|
||||
if (filter_taps == 2) {
|
||||
convolve8_vert_2tap_neon(src + 3 * src_stride, src_stride, dst, dst_stride,
|
||||
filter_y, w, h);
|
||||
} else if (filter_taps == 4) {
|
||||
convolve8_vert_4tap_neon(src + 2 * src_stride, src_stride, dst, dst_stride,
|
||||
filter_y, w, h);
|
||||
} else {
|
||||
convolve8_vert_8tap_neon(src, src_stride, dst, dst_stride, filter_y, w, h);
|
||||
}
|
||||
}
|
||||
|
|
|
@ -0,0 +1,285 @@
|
|||
/*
|
||||
* Copyright (c) 2024, Alliance for Open Media. All Rights Reserved.
|
||||
*
|
||||
* Use of this source code is governed by a BSD-style license
|
||||
* that can be found in the LICENSE file in the root of the source
|
||||
* tree. An additional intellectual property rights grant can be found
|
||||
* in the file PATENTS. All contributing project authors may
|
||||
* be found in the AUTHORS file in the root of the source tree.
|
||||
*/
|
||||
|
||||
#ifndef AOM_AOM_DSP_ARM_AOM_CONVOLVE8_NEON_H_
|
||||
#define AOM_AOM_DSP_ARM_AOM_CONVOLVE8_NEON_H_
|
||||
|
||||
#include <arm_neon.h>
|
||||
|
||||
#include "config/aom_config.h"
|
||||
#include "aom_dsp/arm/mem_neon.h"
|
||||
|
||||
static INLINE void convolve8_horiz_2tap_neon(const uint8_t *src,
|
||||
ptrdiff_t src_stride, uint8_t *dst,
|
||||
ptrdiff_t dst_stride,
|
||||
const int16_t *filter_x, int w,
|
||||
int h) {
|
||||
// Bilinear filter values are all positive.
|
||||
const uint8x8_t f0 = vdup_n_u8((uint8_t)filter_x[3]);
|
||||
const uint8x8_t f1 = vdup_n_u8((uint8_t)filter_x[4]);
|
||||
|
||||
if (w == 4) {
|
||||
do {
|
||||
uint8x8_t s0 =
|
||||
load_unaligned_u8(src + 0 * src_stride + 0, (int)src_stride);
|
||||
uint8x8_t s1 =
|
||||
load_unaligned_u8(src + 0 * src_stride + 1, (int)src_stride);
|
||||
uint8x8_t s2 =
|
||||
load_unaligned_u8(src + 2 * src_stride + 0, (int)src_stride);
|
||||
uint8x8_t s3 =
|
||||
load_unaligned_u8(src + 2 * src_stride + 1, (int)src_stride);
|
||||
|
||||
uint16x8_t sum0 = vmull_u8(s0, f0);
|
||||
sum0 = vmlal_u8(sum0, s1, f1);
|
||||
uint16x8_t sum1 = vmull_u8(s2, f0);
|
||||
sum1 = vmlal_u8(sum1, s3, f1);
|
||||
|
||||
uint8x8_t d0 = vqrshrn_n_u16(sum0, FILTER_BITS);
|
||||
uint8x8_t d1 = vqrshrn_n_u16(sum1, FILTER_BITS);
|
||||
|
||||
store_u8x4_strided_x2(dst + 0 * dst_stride, dst_stride, d0);
|
||||
store_u8x4_strided_x2(dst + 2 * dst_stride, dst_stride, d1);
|
||||
|
||||
src += 4 * src_stride;
|
||||
dst += 4 * dst_stride;
|
||||
h -= 4;
|
||||
} while (h > 0);
|
||||
} else if (w == 8) {
|
||||
do {
|
||||
uint8x8_t s0 = vld1_u8(src + 0 * src_stride + 0);
|
||||
uint8x8_t s1 = vld1_u8(src + 0 * src_stride + 1);
|
||||
uint8x8_t s2 = vld1_u8(src + 1 * src_stride + 0);
|
||||
uint8x8_t s3 = vld1_u8(src + 1 * src_stride + 1);
|
||||
|
||||
uint16x8_t sum0 = vmull_u8(s0, f0);
|
||||
sum0 = vmlal_u8(sum0, s1, f1);
|
||||
uint16x8_t sum1 = vmull_u8(s2, f0);
|
||||
sum1 = vmlal_u8(sum1, s3, f1);
|
||||
|
||||
uint8x8_t d0 = vqrshrn_n_u16(sum0, FILTER_BITS);
|
||||
uint8x8_t d1 = vqrshrn_n_u16(sum1, FILTER_BITS);
|
||||
|
||||
vst1_u8(dst + 0 * dst_stride, d0);
|
||||
vst1_u8(dst + 1 * dst_stride, d1);
|
||||
|
||||
src += 2 * src_stride;
|
||||
dst += 2 * dst_stride;
|
||||
h -= 2;
|
||||
} while (h > 0);
|
||||
} else {
|
||||
do {
|
||||
int width = w;
|
||||
const uint8_t *s = src;
|
||||
uint8_t *d = dst;
|
||||
|
||||
do {
|
||||
uint8x16_t s0 = vld1q_u8(s + 0);
|
||||
uint8x16_t s1 = vld1q_u8(s + 1);
|
||||
|
||||
uint16x8_t sum0 = vmull_u8(vget_low_u8(s0), f0);
|
||||
sum0 = vmlal_u8(sum0, vget_low_u8(s1), f1);
|
||||
uint16x8_t sum1 = vmull_u8(vget_high_u8(s0), f0);
|
||||
sum1 = vmlal_u8(sum1, vget_high_u8(s1), f1);
|
||||
|
||||
uint8x8_t d0 = vqrshrn_n_u16(sum0, FILTER_BITS);
|
||||
uint8x8_t d1 = vqrshrn_n_u16(sum1, FILTER_BITS);
|
||||
|
||||
vst1q_u8(d, vcombine_u8(d0, d1));
|
||||
|
||||
s += 16;
|
||||
d += 16;
|
||||
width -= 16;
|
||||
} while (width != 0);
|
||||
src += src_stride;
|
||||
dst += dst_stride;
|
||||
} while (--h > 0);
|
||||
}
|
||||
}
|
||||
|
||||
static INLINE uint8x8_t convolve4_8(const int16x8_t s0, const int16x8_t s1,
|
||||
const int16x8_t s2, const int16x8_t s3,
|
||||
const int16x4_t filter) {
|
||||
int16x8_t sum = vmulq_lane_s16(s0, filter, 0);
|
||||
sum = vmlaq_lane_s16(sum, s1, filter, 1);
|
||||
sum = vmlaq_lane_s16(sum, s2, filter, 2);
|
||||
sum = vmlaq_lane_s16(sum, s3, filter, 3);
|
||||
|
||||
// We halved the filter values so -1 from right shift.
|
||||
return vqrshrun_n_s16(sum, FILTER_BITS - 1);
|
||||
}
|
||||
|
||||
static INLINE void convolve8_vert_4tap_neon(const uint8_t *src,
|
||||
ptrdiff_t src_stride, uint8_t *dst,
|
||||
ptrdiff_t dst_stride,
|
||||
const int16_t *filter_y, int w,
|
||||
int h) {
|
||||
// All filter values are even, halve to reduce intermediate precision
|
||||
// requirements.
|
||||
const int16x4_t filter = vshr_n_s16(vld1_s16(filter_y + 2), 1);
|
||||
|
||||
if (w == 4) {
|
||||
uint8x8_t t01 = load_unaligned_u8(src + 0 * src_stride, (int)src_stride);
|
||||
uint8x8_t t12 = load_unaligned_u8(src + 1 * src_stride, (int)src_stride);
|
||||
|
||||
int16x8_t s01 = vreinterpretq_s16_u16(vmovl_u8(t01));
|
||||
int16x8_t s12 = vreinterpretq_s16_u16(vmovl_u8(t12));
|
||||
|
||||
src += 2 * src_stride;
|
||||
|
||||
do {
|
||||
uint8x8_t t23 = load_unaligned_u8(src + 0 * src_stride, (int)src_stride);
|
||||
uint8x8_t t34 = load_unaligned_u8(src + 1 * src_stride, (int)src_stride);
|
||||
uint8x8_t t45 = load_unaligned_u8(src + 2 * src_stride, (int)src_stride);
|
||||
uint8x8_t t56 = load_unaligned_u8(src + 3 * src_stride, (int)src_stride);
|
||||
|
||||
int16x8_t s23 = vreinterpretq_s16_u16(vmovl_u8(t23));
|
||||
int16x8_t s34 = vreinterpretq_s16_u16(vmovl_u8(t34));
|
||||
int16x8_t s45 = vreinterpretq_s16_u16(vmovl_u8(t45));
|
||||
int16x8_t s56 = vreinterpretq_s16_u16(vmovl_u8(t56));
|
||||
|
||||
uint8x8_t d01 = convolve4_8(s01, s12, s23, s34, filter);
|
||||
uint8x8_t d23 = convolve4_8(s23, s34, s45, s56, filter);
|
||||
|
||||
store_u8x4_strided_x2(dst + 0 * dst_stride, dst_stride, d01);
|
||||
store_u8x4_strided_x2(dst + 2 * dst_stride, dst_stride, d23);
|
||||
|
||||
s01 = s45;
|
||||
s12 = s56;
|
||||
|
||||
src += 4 * src_stride;
|
||||
dst += 4 * dst_stride;
|
||||
h -= 4;
|
||||
} while (h != 0);
|
||||
} else {
|
||||
do {
|
||||
uint8x8_t t0, t1, t2;
|
||||
load_u8_8x3(src, src_stride, &t0, &t1, &t2);
|
||||
|
||||
int16x8_t s0 = vreinterpretq_s16_u16(vmovl_u8(t0));
|
||||
int16x8_t s1 = vreinterpretq_s16_u16(vmovl_u8(t1));
|
||||
int16x8_t s2 = vreinterpretq_s16_u16(vmovl_u8(t2));
|
||||
|
||||
int height = h;
|
||||
const uint8_t *s = src + 3 * src_stride;
|
||||
uint8_t *d = dst;
|
||||
|
||||
do {
|
||||
uint8x8_t t3;
|
||||
load_u8_8x4(s, src_stride, &t0, &t1, &t2, &t3);
|
||||
|
||||
int16x8_t s3 = vreinterpretq_s16_u16(vmovl_u8(t0));
|
||||
int16x8_t s4 = vreinterpretq_s16_u16(vmovl_u8(t1));
|
||||
int16x8_t s5 = vreinterpretq_s16_u16(vmovl_u8(t2));
|
||||
int16x8_t s6 = vreinterpretq_s16_u16(vmovl_u8(t3));
|
||||
|
||||
uint8x8_t d0 = convolve4_8(s0, s1, s2, s3, filter);
|
||||
uint8x8_t d1 = convolve4_8(s1, s2, s3, s4, filter);
|
||||
uint8x8_t d2 = convolve4_8(s2, s3, s4, s5, filter);
|
||||
uint8x8_t d3 = convolve4_8(s3, s4, s5, s6, filter);
|
||||
|
||||
store_u8_8x4(d, dst_stride, d0, d1, d2, d3);
|
||||
|
||||
s0 = s4;
|
||||
s1 = s5;
|
||||
s2 = s6;
|
||||
|
||||
s += 4 * src_stride;
|
||||
d += 4 * dst_stride;
|
||||
height -= 4;
|
||||
} while (height != 0);
|
||||
src += 8;
|
||||
dst += 8;
|
||||
w -= 8;
|
||||
} while (w != 0);
|
||||
}
|
||||
}
|
||||
|
||||
static INLINE void convolve8_vert_2tap_neon(const uint8_t *src,
|
||||
ptrdiff_t src_stride, uint8_t *dst,
|
||||
ptrdiff_t dst_stride,
|
||||
const int16_t *filter_y, int w,
|
||||
int h) {
|
||||
// Bilinear filter values are all positive.
|
||||
uint8x8_t f0 = vdup_n_u8((uint8_t)filter_y[3]);
|
||||
uint8x8_t f1 = vdup_n_u8((uint8_t)filter_y[4]);
|
||||
|
||||
if (w == 4) {
|
||||
do {
|
||||
uint8x8_t s0 = load_unaligned_u8(src + 0 * src_stride, (int)src_stride);
|
||||
uint8x8_t s1 = load_unaligned_u8(src + 1 * src_stride, (int)src_stride);
|
||||
uint8x8_t s2 = load_unaligned_u8(src + 2 * src_stride, (int)src_stride);
|
||||
uint8x8_t s3 = load_unaligned_u8(src + 3 * src_stride, (int)src_stride);
|
||||
|
||||
uint16x8_t sum0 = vmull_u8(s0, f0);
|
||||
sum0 = vmlal_u8(sum0, s1, f1);
|
||||
uint16x8_t sum1 = vmull_u8(s2, f0);
|
||||
sum1 = vmlal_u8(sum1, s3, f1);
|
||||
|
||||
uint8x8_t d0 = vqrshrn_n_u16(sum0, FILTER_BITS);
|
||||
uint8x8_t d1 = vqrshrn_n_u16(sum1, FILTER_BITS);
|
||||
|
||||
store_u8x4_strided_x2(dst + 0 * dst_stride, dst_stride, d0);
|
||||
store_u8x4_strided_x2(dst + 2 * dst_stride, dst_stride, d1);
|
||||
|
||||
src += 4 * src_stride;
|
||||
dst += 4 * dst_stride;
|
||||
h -= 4;
|
||||
} while (h > 0);
|
||||
} else if (w == 8) {
|
||||
do {
|
||||
uint8x8_t s0, s1, s2;
|
||||
load_u8_8x3(src, src_stride, &s0, &s1, &s2);
|
||||
|
||||
uint16x8_t sum0 = vmull_u8(s0, f0);
|
||||
sum0 = vmlal_u8(sum0, s1, f1);
|
||||
uint16x8_t sum1 = vmull_u8(s1, f0);
|
||||
sum1 = vmlal_u8(sum1, s2, f1);
|
||||
|
||||
uint8x8_t d0 = vqrshrn_n_u16(sum0, FILTER_BITS);
|
||||
uint8x8_t d1 = vqrshrn_n_u16(sum1, FILTER_BITS);
|
||||
|
||||
vst1_u8(dst + 0 * dst_stride, d0);
|
||||
vst1_u8(dst + 1 * dst_stride, d1);
|
||||
|
||||
src += 2 * src_stride;
|
||||
dst += 2 * dst_stride;
|
||||
h -= 2;
|
||||
} while (h > 0);
|
||||
} else {
|
||||
do {
|
||||
int width = w;
|
||||
const uint8_t *s = src;
|
||||
uint8_t *d = dst;
|
||||
|
||||
do {
|
||||
uint8x16_t s0 = vld1q_u8(s + 0 * src_stride);
|
||||
uint8x16_t s1 = vld1q_u8(s + 1 * src_stride);
|
||||
|
||||
uint16x8_t sum0 = vmull_u8(vget_low_u8(s0), f0);
|
||||
sum0 = vmlal_u8(sum0, vget_low_u8(s1), f1);
|
||||
uint16x8_t sum1 = vmull_u8(vget_high_u8(s0), f0);
|
||||
sum1 = vmlal_u8(sum1, vget_high_u8(s1), f1);
|
||||
|
||||
uint8x8_t d0 = vqrshrn_n_u16(sum0, FILTER_BITS);
|
||||
uint8x8_t d1 = vqrshrn_n_u16(sum1, FILTER_BITS);
|
||||
|
||||
vst1q_u8(d, vcombine_u8(d0, d1));
|
||||
|
||||
s += 16;
|
||||
d += 16;
|
||||
width -= 16;
|
||||
} while (width != 0);
|
||||
src += src_stride;
|
||||
dst += dst_stride;
|
||||
} while (--h > 0);
|
||||
}
|
||||
}
|
||||
|
||||
#endif // AOM_AOM_DSP_ARM_AOM_CONVOLVE8_NEON_H_
|
|
@@ -20,6 +20,8 @@
#include "aom/aom_integer.h"
#include "aom_dsp/aom_dsp_common.h"
#include "aom_dsp/aom_filter.h"
#include "aom_dsp/arm/aom_convolve8_neon.h"
#include "aom_dsp/arm/aom_filter.h"
#include "aom_dsp/arm/mem_neon.h"
#include "aom_dsp/arm/transpose_neon.h"
#include "aom_ports/mem.h"

@@ -93,22 +95,11 @@ static INLINE uint8x8_t convolve8_8_h(const uint8x16_t samples,
  return vqrshrun_n_s16(sum, FILTER_BITS);
}

void aom_convolve8_horiz_neon_dotprod(const uint8_t *src, ptrdiff_t src_stride,
                                      uint8_t *dst, ptrdiff_t dst_stride,
                                      const int16_t *filter_x, int x_step_q4,
                                      const int16_t *filter_y, int y_step_q4,
                                      int w, int h) {
static INLINE void convolve8_horiz_8tap_neon_dotprod(
    const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
    ptrdiff_t dst_stride, const int16_t *filter_x, int w, int h) {
  const int8x8_t filter = vmovn_s16(vld1q_s16(filter_x));

  assert((intptr_t)dst % 4 == 0);
  assert(dst_stride % 4 == 0);

  (void)x_step_q4;
  (void)filter_y;
  (void)y_step_q4;

  src -= ((SUBPEL_TAPS / 2) - 1);

  if (w == 4) {
    const uint8x16x2_t perm_tbl = vld1q_u8_x2(kDotProdPermuteTbl);
    do {

@@ -158,6 +149,141 @@ void aom_convolve8_horiz_neon_dotprod(const uint8_t *src, ptrdiff_t src_stride,
  }
}

static INLINE int16x4_t convolve4_4_h(const uint8x16_t samples,
|
||||
const int8x8_t filters,
|
||||
const uint8x16_t permute_tbl) {
|
||||
// Transform sample range to [-128, 127] for 8-bit signed dot product.
|
||||
int8x16_t samples_128 =
|
||||
vreinterpretq_s8_u8(vsubq_u8(samples, vdupq_n_u8(128)));
|
||||
|
||||
// Permute samples ready for dot product.
|
||||
// { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 }
|
||||
int8x16_t perm_samples = vqtbl1q_s8(samples_128, permute_tbl);
|
||||
|
||||
// Accumulate into 128 * FILTER_WEIGHT to account for range transform.
|
||||
// (Divide by 2 since we halved the filter values.)
|
||||
int32x4_t acc = vdupq_n_s32(128 * FILTER_WEIGHT / 2);
|
||||
int32x4_t sum = vdotq_lane_s32(acc, perm_samples, filters, 0);
|
||||
|
||||
// Further narrowing and packing is performed by the caller.
|
||||
return vmovn_s32(sum);
|
||||
}
|
||||
|
||||
static INLINE uint8x8_t convolve4_8_h(const uint8x16_t samples,
|
||||
const int8x8_t filters,
|
||||
const uint8x16x2_t permute_tbl) {
|
||||
// Transform sample range to [-128, 127] for 8-bit signed dot product.
|
||||
int8x16_t samples_128 =
|
||||
vreinterpretq_s8_u8(vsubq_u8(samples, vdupq_n_u8(128)));
|
||||
|
||||
// Permute samples ready for dot product.
|
||||
// { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 }
|
||||
// { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 }
|
||||
int8x16_t perm_samples[2] = { vqtbl1q_s8(samples_128, permute_tbl.val[0]),
|
||||
vqtbl1q_s8(samples_128, permute_tbl.val[1]) };
|
||||
|
||||
// Accumulate into 128 * FILTER_WEIGHT to account for range transform.
|
||||
// (Divide by 2 since we halved the filter values.)
|
||||
int32x4_t acc = vdupq_n_s32(128 * FILTER_WEIGHT / 2);
|
||||
// First 4 output values.
|
||||
int32x4_t sum0 = vdotq_lane_s32(acc, perm_samples[0], filters, 0);
|
||||
// Second 4 output values.
|
||||
int32x4_t sum1 = vdotq_lane_s32(acc, perm_samples[1], filters, 0);
|
||||
|
||||
// Narrow and re-pack.
|
||||
int16x8_t sum = vcombine_s16(vmovn_s32(sum0), vmovn_s32(sum1));
|
||||
// We halved the filter values so -1 from right shift.
|
||||
return vqrshrun_n_s16(sum, FILTER_BITS - 1);
|
||||
}
|
||||
|
||||
static INLINE void convolve8_horiz_4tap_neon_dotprod(
|
||||
const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
|
||||
ptrdiff_t dst_stride, const int16_t *filter_x, int width, int height) {
|
||||
const int16x4_t x_filter = vld1_s16(filter_x + 2);
|
||||
// All 4-tap and bilinear filter values are even, so halve them to reduce
|
||||
// intermediate precision requirements.
|
||||
const int8x8_t filter = vshrn_n_s16(vcombine_s16(x_filter, vdup_n_s16(0)), 1);
|
||||
|
||||
if (width == 4) {
|
||||
const uint8x16_t permute_tbl = vld1q_u8(kDotProdPermuteTbl);
|
||||
|
||||
do {
|
||||
uint8x16_t s0, s1, s2, s3;
|
||||
load_u8_16x4(src, src_stride, &s0, &s1, &s2, &s3);
|
||||
|
||||
int16x4_t t0 = convolve4_4_h(s0, filter, permute_tbl);
|
||||
int16x4_t t1 = convolve4_4_h(s1, filter, permute_tbl);
|
||||
int16x4_t t2 = convolve4_4_h(s2, filter, permute_tbl);
|
||||
int16x4_t t3 = convolve4_4_h(s3, filter, permute_tbl);
|
||||
// We halved the filter values so -1 from right shift.
|
||||
uint8x8_t d01 = vqrshrun_n_s16(vcombine_s16(t0, t1), FILTER_BITS - 1);
|
||||
uint8x8_t d23 = vqrshrun_n_s16(vcombine_s16(t2, t3), FILTER_BITS - 1);
|
||||
|
||||
store_u8x4_strided_x2(dst + 0 * dst_stride, dst_stride, d01);
|
||||
store_u8x4_strided_x2(dst + 2 * dst_stride, dst_stride, d23);
|
||||
|
||||
src += 4 * src_stride;
|
||||
dst += 4 * dst_stride;
|
||||
height -= 4;
|
||||
} while (height > 0);
|
||||
} else {
|
||||
const uint8x16x2_t permute_tbl = vld1q_u8_x2(kDotProdPermuteTbl);
|
||||
|
||||
do {
|
||||
const uint8_t *s = src;
|
||||
uint8_t *d = dst;
|
||||
int w = width;
|
||||
|
||||
do {
|
||||
uint8x16_t s0, s1, s2, s3;
|
||||
load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3);
|
||||
|
||||
uint8x8_t d0 = convolve4_8_h(s0, filter, permute_tbl);
|
||||
uint8x8_t d1 = convolve4_8_h(s1, filter, permute_tbl);
|
||||
uint8x8_t d2 = convolve4_8_h(s2, filter, permute_tbl);
|
||||
uint8x8_t d3 = convolve4_8_h(s3, filter, permute_tbl);
|
||||
|
||||
store_u8_8x4(d, dst_stride, d0, d1, d2, d3);
|
||||
|
||||
s += 8;
|
||||
d += 8;
|
||||
w -= 8;
|
||||
} while (w != 0);
|
||||
src += 4 * src_stride;
|
||||
dst += 4 * dst_stride;
|
||||
height -= 4;
|
||||
} while (height > 0);
|
||||
}
|
||||
}
|
||||
|
||||
void aom_convolve8_horiz_neon_dotprod(const uint8_t *src, ptrdiff_t src_stride,
                                      uint8_t *dst, ptrdiff_t dst_stride,
                                      const int16_t *filter_x, int x_step_q4,
                                      const int16_t *filter_y, int y_step_q4,
                                      int w, int h) {
  assert((intptr_t)dst % 4 == 0);
  assert(dst_stride % 4 == 0);

  (void)x_step_q4;
  (void)filter_y;
  (void)y_step_q4;

  src -= ((SUBPEL_TAPS / 2) - 1);

  int filter_taps = get_filter_taps_convolve8(filter_x);

  if (filter_taps == 2) {
    convolve8_horiz_2tap_neon(src + 3, src_stride, dst, dst_stride, filter_x, w,
                              h);
  } else if (filter_taps == 4) {
    convolve8_horiz_4tap_neon_dotprod(src + 2, src_stride, dst, dst_stride,
                                      filter_x, w, h);
  } else {
    convolve8_horiz_8tap_neon_dotprod(src, src_stride, dst, dst_stride,
                                      filter_x, w, h);
  }
}

static INLINE void transpose_concat_4x4(int8x8_t a0, int8x8_t a1, int8x8_t a2,
                                        int8x8_t a3, int8x16_t *b) {
  // Transpose 8-bit elements and concatenate result rows as follows:

@@ -244,24 +370,13 @@ static INLINE uint8x8_t convolve8_8_v(const int8x16_t samples0_lo,
  return vqrshrun_n_s16(sum, FILTER_BITS);
}

void aom_convolve8_vert_neon_dotprod(const uint8_t *src, ptrdiff_t src_stride,
                                     uint8_t *dst, ptrdiff_t dst_stride,
                                     const int16_t *filter_x, int x_step_q4,
                                     const int16_t *filter_y, int y_step_q4,
                                     int w, int h) {
static INLINE void convolve8_vert_8tap_neon_dotprod(
    const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
    ptrdiff_t dst_stride, const int16_t *filter_y, int w, int h) {
  const int8x8_t filter = vmovn_s16(vld1q_s16(filter_y));
  const uint8x16x3_t merge_block_tbl = vld1q_u8_x3(kDotProdMergeBlockTbl);
  int8x16x2_t samples_LUT;

  assert((intptr_t)dst % 4 == 0);
  assert(dst_stride % 4 == 0);

  (void)filter_x;
  (void)x_step_q4;
  (void)y_step_q4;

  src -= ((SUBPEL_TAPS / 2) - 1) * src_stride;

  if (w == 4) {
    uint8x8_t t0, t1, t2, t3, t4, t5, t6;
    load_u8_8x7(src, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6);

@@ -410,3 +525,31 @@ void aom_convolve8_vert_neon_dotprod(const uint8_t *src, ptrdiff_t src_stride,
    } while (w != 0);
  }
}

void aom_convolve8_vert_neon_dotprod(const uint8_t *src, ptrdiff_t src_stride,
                                     uint8_t *dst, ptrdiff_t dst_stride,
                                     const int16_t *filter_x, int x_step_q4,
                                     const int16_t *filter_y, int y_step_q4,
                                     int w, int h) {
  assert((intptr_t)dst % 4 == 0);
  assert(dst_stride % 4 == 0);

  (void)filter_x;
  (void)x_step_q4;
  (void)y_step_q4;

  src -= ((SUBPEL_TAPS / 2) - 1) * src_stride;

  int filter_taps = get_filter_taps_convolve8(filter_y);

  if (filter_taps == 2) {
    convolve8_vert_2tap_neon(src + 3 * src_stride, src_stride, dst, dst_stride,
                             filter_y, w, h);
  } else if (filter_taps == 4) {
    convolve8_vert_4tap_neon(src + 2 * src_stride, src_stride, dst, dst_stride,
                             filter_y, w, h);
  } else {
    convolve8_vert_8tap_neon_dotprod(src, src_stride, dst, dst_stride, filter_y,
                                     w, h);
  }
}

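Aside (not part of the patch): the dotprod 4-tap helpers in this file seed their accumulator with 128 * FILTER_WEIGHT / 2 because the unsigned samples are re-centred to [-128, 127] before the signed dot product and the filter taps were halved. A small check of that identity, under the assumption that libaom's sub-pel kernels sum to FILTER_WEIGHT = 128 (so the halved taps sum to 64):

#include <stdint.h>

// Illustrative only: with halved taps f_half[] that sum to 64,
//   sum(f_half[k] * s[k]) == 128 * 64 + sum(f_half[k] * (s[k] - 128))
// which is the constant the NEON code folds into vdupq_n_s32().
static int recentring_identity_holds(const int16_t f_half[4],
                                     const uint8_t s[4]) {
  int lhs = 0, rhs = 128 * 64;
  for (int k = 0; k < 4; ++k) {
    lhs += f_half[k] * s[k];
    rhs += f_half[k] * (s[k] - 128);
  }
  return lhs == rhs;
}
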
@@ -19,6 +19,8 @@
#include "aom/aom_integer.h"
#include "aom_dsp/aom_dsp_common.h"
#include "aom_dsp/aom_filter.h"
#include "aom_dsp/arm/aom_convolve8_neon.h"
#include "aom_dsp/arm/aom_filter.h"
#include "aom_dsp/arm/mem_neon.h"
#include "aom_dsp/arm/transpose_neon.h"
#include "aom_ports/mem.h"

@@ -80,22 +82,11 @@ static INLINE uint8x8_t convolve8_8_h(const uint8x16_t samples,
  return vqrshrun_n_s16(sum, FILTER_BITS);
}

void aom_convolve8_horiz_neon_i8mm(const uint8_t *src, ptrdiff_t src_stride,
                                   uint8_t *dst, ptrdiff_t dst_stride,
                                   const int16_t *filter_x, int x_step_q4,
                                   const int16_t *filter_y, int y_step_q4,
                                   int w, int h) {
static INLINE void convolve8_horiz_8tap_neon_i8mm(
    const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
    ptrdiff_t dst_stride, const int16_t *filter_x, int w, int h) {
  const int8x8_t filter = vmovn_s16(vld1q_s16(filter_x));

  assert((intptr_t)dst % 4 == 0);
  assert(dst_stride % 4 == 0);

  (void)x_step_q4;
  (void)filter_y;
  (void)y_step_q4;

  src -= ((SUBPEL_TAPS / 2) - 1);

  if (w == 4) {
    const uint8x16x2_t perm_tbl = vld1q_u8_x2(kDotProdPermuteTbl);
    do {

@@ -145,6 +136,128 @@ void aom_convolve8_horiz_neon_i8mm(const uint8_t *src, ptrdiff_t src_stride,
  }
}

static INLINE int16x4_t convolve4_4_h(const uint8x16_t samples,
|
||||
const int8x8_t filters,
|
||||
const uint8x16_t permute_tbl) {
|
||||
// Permute samples ready for dot product.
|
||||
// { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 }
|
||||
uint8x16_t permuted_samples = vqtbl1q_u8(samples, permute_tbl);
|
||||
|
||||
int32x4_t sum =
|
||||
vusdotq_lane_s32(vdupq_n_s32(0), permuted_samples, filters, 0);
|
||||
|
||||
// Further narrowing and packing is performed by the caller.
|
||||
return vmovn_s32(sum);
|
||||
}
|
||||
|
||||
static INLINE uint8x8_t convolve4_8_h(const uint8x16_t samples,
|
||||
const int8x8_t filters,
|
||||
const uint8x16x2_t permute_tbl) {
|
||||
// Permute samples ready for dot product.
|
||||
// { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 }
|
||||
// { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 }
|
||||
uint8x16_t permuted_samples[2] = { vqtbl1q_u8(samples, permute_tbl.val[0]),
|
||||
vqtbl1q_u8(samples, permute_tbl.val[1]) };
|
||||
|
||||
// First 4 output values.
|
||||
int32x4_t sum0 =
|
||||
vusdotq_lane_s32(vdupq_n_s32(0), permuted_samples[0], filters, 0);
|
||||
// Second 4 output values.
|
||||
int32x4_t sum1 =
|
||||
vusdotq_lane_s32(vdupq_n_s32(0), permuted_samples[1], filters, 0);
|
||||
|
||||
// Narrow and re-pack.
|
||||
int16x8_t sum = vcombine_s16(vmovn_s32(sum0), vmovn_s32(sum1));
|
||||
// We halved the filter values so -1 from right shift.
|
||||
return vqrshrun_n_s16(sum, FILTER_BITS - 1);
|
||||
}
|
||||
|
||||
static INLINE void convolve8_horiz_4tap_neon_i8mm(
|
||||
const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
|
||||
ptrdiff_t dst_stride, const int16_t *filter_x, int width, int height) {
|
||||
const int16x4_t x_filter = vld1_s16(filter_x + 2);
|
||||
// All 4-tap and bilinear filter values are even, so halve them to reduce
|
||||
// intermediate precision requirements.
|
||||
const int8x8_t filter = vshrn_n_s16(vcombine_s16(x_filter, vdup_n_s16(0)), 1);
|
||||
|
||||
if (width == 4) {
|
||||
const uint8x16_t perm_tbl = vld1q_u8(kDotProdPermuteTbl);
|
||||
do {
|
||||
uint8x16_t s0, s1, s2, s3;
|
||||
load_u8_16x4(src, src_stride, &s0, &s1, &s2, &s3);
|
||||
|
||||
int16x4_t t0 = convolve4_4_h(s0, filter, perm_tbl);
|
||||
int16x4_t t1 = convolve4_4_h(s1, filter, perm_tbl);
|
||||
int16x4_t t2 = convolve4_4_h(s2, filter, perm_tbl);
|
||||
int16x4_t t3 = convolve4_4_h(s3, filter, perm_tbl);
|
||||
// We halved the filter values so -1 from right shift.
|
||||
uint8x8_t d01 = vqrshrun_n_s16(vcombine_s16(t0, t1), FILTER_BITS - 1);
|
||||
uint8x8_t d23 = vqrshrun_n_s16(vcombine_s16(t2, t3), FILTER_BITS - 1);
|
||||
|
||||
store_u8x4_strided_x2(dst + 0 * dst_stride, dst_stride, d01);
|
||||
store_u8x4_strided_x2(dst + 2 * dst_stride, dst_stride, d23);
|
||||
|
||||
src += 4 * src_stride;
|
||||
dst += 4 * dst_stride;
|
||||
height -= 4;
|
||||
} while (height > 0);
|
||||
} else {
|
||||
const uint8x16x2_t perm_tbl = vld1q_u8_x2(kDotProdPermuteTbl);
|
||||
|
||||
do {
|
||||
int w = width;
|
||||
const uint8_t *s = src;
|
||||
uint8_t *d = dst;
|
||||
do {
|
||||
uint8x16_t s0, s1, s2, s3;
|
||||
load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3);
|
||||
|
||||
uint8x8_t d0 = convolve4_8_h(s0, filter, perm_tbl);
|
||||
uint8x8_t d1 = convolve4_8_h(s1, filter, perm_tbl);
|
||||
uint8x8_t d2 = convolve4_8_h(s2, filter, perm_tbl);
|
||||
uint8x8_t d3 = convolve4_8_h(s3, filter, perm_tbl);
|
||||
|
||||
store_u8_8x4(d, dst_stride, d0, d1, d2, d3);
|
||||
|
||||
s += 8;
|
||||
d += 8;
|
||||
w -= 8;
|
||||
} while (w != 0);
|
||||
src += 4 * src_stride;
|
||||
dst += 4 * dst_stride;
|
||||
height -= 4;
|
||||
} while (height > 0);
|
||||
}
|
||||
}
|
||||
|
||||
void aom_convolve8_horiz_neon_i8mm(const uint8_t *src, ptrdiff_t src_stride,
                                   uint8_t *dst, ptrdiff_t dst_stride,
                                   const int16_t *filter_x, int x_step_q4,
                                   const int16_t *filter_y, int y_step_q4,
                                   int w, int h) {
  assert((intptr_t)dst % 4 == 0);
  assert(dst_stride % 4 == 0);

  (void)x_step_q4;
  (void)filter_y;
  (void)y_step_q4;

  src -= ((SUBPEL_TAPS / 2) - 1);

  int filter_taps = get_filter_taps_convolve8(filter_x);

  if (filter_taps == 2) {
    convolve8_horiz_2tap_neon(src + 3, src_stride, dst, dst_stride, filter_x, w,
                              h);
  } else if (filter_taps == 4) {
    convolve8_horiz_4tap_neon_i8mm(src + 2, src_stride, dst, dst_stride,
                                   filter_x, w, h);
  } else {
    convolve8_horiz_8tap_neon_i8mm(src, src_stride, dst, dst_stride, filter_x,
                                   w, h);
  }
}

static INLINE void transpose_concat_4x4(uint8x8_t a0, uint8x8_t a1,
                                        uint8x8_t a2, uint8x8_t a3,
                                        uint8x16_t *b) {

@@ -227,24 +340,13 @@ static INLINE uint8x8_t convolve8_8_v(const uint8x16_t samples0_lo,
  return vqrshrun_n_s16(sum, FILTER_BITS);
}

void aom_convolve8_vert_neon_i8mm(const uint8_t *src, ptrdiff_t src_stride,
                                  uint8_t *dst, ptrdiff_t dst_stride,
                                  const int16_t *filter_x, int x_step_q4,
                                  const int16_t *filter_y, int y_step_q4, int w,
                                  int h) {
static INLINE void convolve8_vert_8tap_neon_i8mm(
    const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
    ptrdiff_t dst_stride, const int16_t *filter_y, int w, int h) {
  const int8x8_t filter = vmovn_s16(vld1q_s16(filter_y));
  const uint8x16x3_t merge_block_tbl = vld1q_u8_x3(kDotProdMergeBlockTbl);
  uint8x16x2_t samples_LUT;

  assert((intptr_t)dst % 4 == 0);
  assert(dst_stride % 4 == 0);

  (void)filter_x;
  (void)x_step_q4;
  (void)y_step_q4;

  src -= ((SUBPEL_TAPS / 2) - 1) * src_stride;

  if (w == 4) {
    uint8x8_t s0, s1, s2, s3, s4, s5, s6;
    load_u8_8x7(src, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6);

@@ -365,3 +467,31 @@ void aom_convolve8_vert_neon_i8mm(const uint8_t *src, ptrdiff_t src_stride,
    } while (w != 0);
  }
}

void aom_convolve8_vert_neon_i8mm(const uint8_t *src, ptrdiff_t src_stride,
                                  uint8_t *dst, ptrdiff_t dst_stride,
                                  const int16_t *filter_x, int x_step_q4,
                                  const int16_t *filter_y, int y_step_q4, int w,
                                  int h) {
  assert((intptr_t)dst % 4 == 0);
  assert(dst_stride % 4 == 0);

  (void)filter_x;
  (void)x_step_q4;
  (void)y_step_q4;

  src -= ((SUBPEL_TAPS / 2) - 1) * src_stride;

  int filter_taps = get_filter_taps_convolve8(filter_y);

  if (filter_taps == 2) {
    convolve8_vert_2tap_neon(src + 3 * src_stride, src_stride, dst, dst_stride,
                             filter_y, w, h);
  } else if (filter_taps == 4) {
    convolve8_vert_4tap_neon(src + 2 * src_stride, src_stride, dst, dst_stride,
                             filter_y, w, h);
  } else {
    convolve8_vert_8tap_neon_i8mm(src, src_stride, dst, dst_stride, filter_y, w,
                                  h);
  }
}

@@ -20,8 +20,9 @@
#include "aom_dsp/arm/mem_neon.h"
#include "aom_dsp/blend.h"

uint8x8_t alpha_blend_a64_d16_u16x8(uint16x8_t m, uint16x8_t a, uint16x8_t b,
                                    uint16x8_t round_offset) {
static uint8x8_t alpha_blend_a64_d16_u16x8(uint16x8_t m, uint16x8_t a,
                                           uint16x8_t b,
                                           uint16x8_t round_offset) {
  const uint16x8_t m_inv = vsubq_u16(vdupq_n_u16(AOM_BLEND_A64_MAX_ALPHA), m);

  uint32x4_t blend_u32_lo = vmull_u16(vget_low_u16(m), vget_low_u16(a));

@@ -12,6 +12,7 @@
#include <arm_neon.h>

#include "config/aom_config.h"
#include "config/aom_dsp_rtcd.h"

#include "aom_dsp/txfm_common.h"
#include "aom_dsp/arm/mem_neon.h"

@@ -115,6 +116,7 @@ void aom_fdct4x4_lp_neon(const int16_t *input, int16_t *final_output,
  vst1q_s16(final_output + 1 * 8, out_23);
}

#if CONFIG_INTERNAL_STATS
void aom_fdct8x8_neon(const int16_t *input, int16_t *final_output, int stride) {
  // stage 1
  int16x8_t input_0 = vshlq_n_s16(vld1q_s16(&input[0 * stride]), 2);

@@ -302,3 +304,4 @@ void aom_fdct8x8_neon(const int16_t *input, int16_t *final_output, int stride) {
    vst1q_s16(&final_output[7 * 8], input_7);
  }
}
#endif  // CONFIG_INTERNAL_STATS

@@ -19,199 +19,208 @@
#include "aom/aom_integer.h"
#include "aom_dsp/aom_dsp_common.h"
#include "aom_dsp/aom_filter.h"
#include "aom_dsp/arm/aom_filter.h"
#include "aom_dsp/arm/highbd_convolve8_neon.h"
#include "aom_dsp/arm/mem_neon.h"
#include "aom_dsp/arm/transpose_neon.h"
#include "aom_ports/mem.h"

static INLINE int32x4_t highbd_convolve8_4_s32(
    const int16x4_t s0, const int16x4_t s1, const int16x4_t s2,
    const int16x4_t s3, const int16x4_t s4, const int16x4_t s5,
    const int16x4_t s6, const int16x4_t s7, const int16x8_t y_filter) {
  const int16x4_t y_filter_lo = vget_low_s16(y_filter);
  const int16x4_t y_filter_hi = vget_high_s16(y_filter);
static INLINE uint16x4_t
highbd_convolve8_4(const int16x4_t s0, const int16x4_t s1, const int16x4_t s2,
                   const int16x4_t s3, const int16x4_t s4, const int16x4_t s5,
                   const int16x4_t s6, const int16x4_t s7,
                   const int16x8_t filter, const uint16x4_t max) {
  const int16x4_t filter_lo = vget_low_s16(filter);
  const int16x4_t filter_hi = vget_high_s16(filter);

  int32x4_t sum = vmull_lane_s16(s0, y_filter_lo, 0);
  sum = vmlal_lane_s16(sum, s1, y_filter_lo, 1);
  sum = vmlal_lane_s16(sum, s2, y_filter_lo, 2);
  sum = vmlal_lane_s16(sum, s3, y_filter_lo, 3);
  sum = vmlal_lane_s16(sum, s4, y_filter_hi, 0);
  sum = vmlal_lane_s16(sum, s5, y_filter_hi, 1);
  sum = vmlal_lane_s16(sum, s6, y_filter_hi, 2);
  sum = vmlal_lane_s16(sum, s7, y_filter_hi, 3);
  int32x4_t sum = vmull_lane_s16(s0, filter_lo, 0);
  sum = vmlal_lane_s16(sum, s1, filter_lo, 1);
  sum = vmlal_lane_s16(sum, s2, filter_lo, 2);
  sum = vmlal_lane_s16(sum, s3, filter_lo, 3);
  sum = vmlal_lane_s16(sum, s4, filter_hi, 0);
  sum = vmlal_lane_s16(sum, s5, filter_hi, 1);
  sum = vmlal_lane_s16(sum, s6, filter_hi, 2);
  sum = vmlal_lane_s16(sum, s7, filter_hi, 3);

  return sum;
  uint16x4_t res = vqrshrun_n_s32(sum, FILTER_BITS);

  return vmin_u16(res, max);
}

static INLINE uint16x4_t highbd_convolve8_4_s32_s16(
|
||||
const int16x4_t s0, const int16x4_t s1, const int16x4_t s2,
|
||||
const int16x4_t s3, const int16x4_t s4, const int16x4_t s5,
|
||||
const int16x4_t s6, const int16x4_t s7, const int16x8_t y_filter) {
|
||||
int32x4_t sum =
|
||||
highbd_convolve8_4_s32(s0, s1, s2, s3, s4, s5, s6, s7, y_filter);
|
||||
static INLINE uint16x8_t
|
||||
highbd_convolve8_8(const int16x8_t s0, const int16x8_t s1, const int16x8_t s2,
|
||||
const int16x8_t s3, const int16x8_t s4, const int16x8_t s5,
|
||||
const int16x8_t s6, const int16x8_t s7,
|
||||
const int16x8_t filter, const uint16x8_t max) {
|
||||
const int16x4_t filter_lo = vget_low_s16(filter);
|
||||
const int16x4_t filter_hi = vget_high_s16(filter);
|
||||
|
||||
return vqrshrun_n_s32(sum, FILTER_BITS);
|
||||
int32x4_t sum0 = vmull_lane_s16(vget_low_s16(s0), filter_lo, 0);
|
||||
sum0 = vmlal_lane_s16(sum0, vget_low_s16(s1), filter_lo, 1);
|
||||
sum0 = vmlal_lane_s16(sum0, vget_low_s16(s2), filter_lo, 2);
|
||||
sum0 = vmlal_lane_s16(sum0, vget_low_s16(s3), filter_lo, 3);
|
||||
sum0 = vmlal_lane_s16(sum0, vget_low_s16(s4), filter_hi, 0);
|
||||
sum0 = vmlal_lane_s16(sum0, vget_low_s16(s5), filter_hi, 1);
|
||||
sum0 = vmlal_lane_s16(sum0, vget_low_s16(s6), filter_hi, 2);
|
||||
sum0 = vmlal_lane_s16(sum0, vget_low_s16(s7), filter_hi, 3);
|
||||
|
||||
int32x4_t sum1 = vmull_lane_s16(vget_high_s16(s0), filter_lo, 0);
|
||||
sum1 = vmlal_lane_s16(sum1, vget_high_s16(s1), filter_lo, 1);
|
||||
sum1 = vmlal_lane_s16(sum1, vget_high_s16(s2), filter_lo, 2);
|
||||
sum1 = vmlal_lane_s16(sum1, vget_high_s16(s3), filter_lo, 3);
|
||||
sum1 = vmlal_lane_s16(sum1, vget_high_s16(s4), filter_hi, 0);
|
||||
sum1 = vmlal_lane_s16(sum1, vget_high_s16(s5), filter_hi, 1);
|
||||
sum1 = vmlal_lane_s16(sum1, vget_high_s16(s6), filter_hi, 2);
|
||||
sum1 = vmlal_lane_s16(sum1, vget_high_s16(s7), filter_hi, 3);
|
||||
|
||||
uint16x8_t res = vcombine_u16(vqrshrun_n_s32(sum0, FILTER_BITS),
|
||||
vqrshrun_n_s32(sum1, FILTER_BITS));
|
||||
|
||||
return vminq_u16(res, max);
|
||||
}
|
||||
|
||||
static INLINE int32x4_t highbd_convolve8_horiz4_s32(
|
||||
const int16x8_t s0, const int16x8_t s1, const int16x8_t x_filter_0_7) {
|
||||
const int16x8_t s2 = vextq_s16(s0, s1, 1);
|
||||
const int16x8_t s3 = vextq_s16(s0, s1, 2);
|
||||
const int16x8_t s4 = vextq_s16(s0, s1, 3);
|
||||
const int16x4_t s0_lo = vget_low_s16(s0);
|
||||
const int16x4_t s1_lo = vget_low_s16(s2);
|
||||
const int16x4_t s2_lo = vget_low_s16(s3);
|
||||
const int16x4_t s3_lo = vget_low_s16(s4);
|
||||
const int16x4_t s4_lo = vget_high_s16(s0);
|
||||
const int16x4_t s5_lo = vget_high_s16(s2);
|
||||
const int16x4_t s6_lo = vget_high_s16(s3);
|
||||
const int16x4_t s7_lo = vget_high_s16(s4);
|
||||
|
||||
return highbd_convolve8_4_s32(s0_lo, s1_lo, s2_lo, s3_lo, s4_lo, s5_lo, s6_lo,
|
||||
s7_lo, x_filter_0_7);
|
||||
}
|
||||
|
||||
static INLINE uint16x4_t highbd_convolve8_horiz4_s32_s16(
|
||||
const int16x8_t s0, const int16x8_t s1, const int16x8_t x_filter_0_7) {
|
||||
int32x4_t sum = highbd_convolve8_horiz4_s32(s0, s1, x_filter_0_7);
|
||||
|
||||
return vqrshrun_n_s32(sum, FILTER_BITS);
|
||||
}
|
||||
|
||||
static INLINE void highbd_convolve8_8_s32(
|
||||
const int16x8_t s0, const int16x8_t s1, const int16x8_t s2,
|
||||
const int16x8_t s3, const int16x8_t s4, const int16x8_t s5,
|
||||
const int16x8_t s6, const int16x8_t s7, const int16x8_t y_filter,
|
||||
int32x4_t *sum0, int32x4_t *sum1) {
|
||||
const int16x4_t y_filter_lo = vget_low_s16(y_filter);
|
||||
const int16x4_t y_filter_hi = vget_high_s16(y_filter);
|
||||
|
||||
*sum0 = vmull_lane_s16(vget_low_s16(s0), y_filter_lo, 0);
|
||||
*sum0 = vmlal_lane_s16(*sum0, vget_low_s16(s1), y_filter_lo, 1);
|
||||
*sum0 = vmlal_lane_s16(*sum0, vget_low_s16(s2), y_filter_lo, 2);
|
||||
*sum0 = vmlal_lane_s16(*sum0, vget_low_s16(s3), y_filter_lo, 3);
|
||||
*sum0 = vmlal_lane_s16(*sum0, vget_low_s16(s4), y_filter_hi, 0);
|
||||
*sum0 = vmlal_lane_s16(*sum0, vget_low_s16(s5), y_filter_hi, 1);
|
||||
*sum0 = vmlal_lane_s16(*sum0, vget_low_s16(s6), y_filter_hi, 2);
|
||||
*sum0 = vmlal_lane_s16(*sum0, vget_low_s16(s7), y_filter_hi, 3);
|
||||
|
||||
*sum1 = vmull_lane_s16(vget_high_s16(s0), y_filter_lo, 0);
|
||||
*sum1 = vmlal_lane_s16(*sum1, vget_high_s16(s1), y_filter_lo, 1);
|
||||
*sum1 = vmlal_lane_s16(*sum1, vget_high_s16(s2), y_filter_lo, 2);
|
||||
*sum1 = vmlal_lane_s16(*sum1, vget_high_s16(s3), y_filter_lo, 3);
|
||||
*sum1 = vmlal_lane_s16(*sum1, vget_high_s16(s4), y_filter_hi, 0);
|
||||
*sum1 = vmlal_lane_s16(*sum1, vget_high_s16(s5), y_filter_hi, 1);
|
||||
*sum1 = vmlal_lane_s16(*sum1, vget_high_s16(s6), y_filter_hi, 2);
|
||||
*sum1 = vmlal_lane_s16(*sum1, vget_high_s16(s7), y_filter_hi, 3);
|
||||
}
|
||||
|
||||
static INLINE void highbd_convolve8_horiz8_s32(const int16x8_t s0,
|
||||
const int16x8_t s0_hi,
|
||||
const int16x8_t x_filter_0_7,
|
||||
int32x4_t *sum0,
|
||||
int32x4_t *sum1) {
|
||||
const int16x8_t s1 = vextq_s16(s0, s0_hi, 1);
|
||||
const int16x8_t s2 = vextq_s16(s0, s0_hi, 2);
|
||||
const int16x8_t s3 = vextq_s16(s0, s0_hi, 3);
|
||||
const int16x8_t s4 = vextq_s16(s0, s0_hi, 4);
|
||||
const int16x8_t s5 = vextq_s16(s0, s0_hi, 5);
|
||||
const int16x8_t s6 = vextq_s16(s0, s0_hi, 6);
|
||||
const int16x8_t s7 = vextq_s16(s0, s0_hi, 7);
|
||||
|
||||
highbd_convolve8_8_s32(s0, s1, s2, s3, s4, s5, s6, s7, x_filter_0_7, sum0,
|
||||
sum1);
|
||||
}
|
||||
|
||||
static INLINE uint16x8_t highbd_convolve8_horiz8_s32_s16(
|
||||
const int16x8_t s0, const int16x8_t s1, const int16x8_t x_filter_0_7) {
|
||||
int32x4_t sum0, sum1;
|
||||
highbd_convolve8_horiz8_s32(s0, s1, x_filter_0_7, &sum0, &sum1);
|
||||
|
||||
return vcombine_u16(vqrshrun_n_s32(sum0, FILTER_BITS),
|
||||
vqrshrun_n_s32(sum1, FILTER_BITS));
|
||||
}
|
||||
|
||||
static INLINE uint16x8_t highbd_convolve8_8_s32_s16(
|
||||
const int16x8_t s0, const int16x8_t s1, const int16x8_t s2,
|
||||
const int16x8_t s3, const int16x8_t s4, const int16x8_t s5,
|
||||
const int16x8_t s6, const int16x8_t s7, const int16x8_t y_filter) {
|
||||
int32x4_t sum0;
|
||||
int32x4_t sum1;
|
||||
highbd_convolve8_8_s32(s0, s1, s2, s3, s4, s5, s6, s7, y_filter, &sum0,
|
||||
&sum1);
|
||||
|
||||
return vcombine_u16(vqrshrun_n_s32(sum0, FILTER_BITS),
|
||||
vqrshrun_n_s32(sum1, FILTER_BITS));
|
||||
}
|
||||
|
||||
static void highbd_convolve_horiz_neon(const uint16_t *src_ptr,
|
||||
ptrdiff_t src_stride, uint16_t *dst_ptr,
|
||||
ptrdiff_t dst_stride,
|
||||
const int16_t *x_filter_ptr,
|
||||
int x_step_q4, int w, int h, int bd) {
|
||||
static void highbd_convolve_horiz_8tap_neon(
|
||||
const uint16_t *src_ptr, ptrdiff_t src_stride, uint16_t *dst_ptr,
|
||||
ptrdiff_t dst_stride, const int16_t *x_filter_ptr, int w, int h, int bd) {
|
||||
assert(w >= 4 && h >= 4);
|
||||
const uint16x8_t max = vdupq_n_u16((1 << bd) - 1);
|
||||
const int16x8_t x_filter = vld1q_s16(x_filter_ptr);
|
||||
|
||||
if (w == 4) {
|
||||
const uint16x4_t max = vdup_n_u16((1 << bd) - 1);
|
||||
const int16_t *s = (const int16_t *)src_ptr;
|
||||
uint16_t *d = dst_ptr;
|
||||
|
||||
do {
|
||||
int16x8_t s0, s1, s2, s3;
|
||||
load_s16_8x2(s, src_stride, &s0, &s2);
|
||||
load_s16_8x2(s + 8, src_stride, &s1, &s3);
|
||||
int16x4_t s0[8], s1[8], s2[8], s3[8];
|
||||
load_s16_4x8(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3],
|
||||
&s0[4], &s0[5], &s0[6], &s0[7]);
|
||||
load_s16_4x8(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3],
|
||||
&s1[4], &s1[5], &s1[6], &s1[7]);
|
||||
load_s16_4x8(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3],
|
||||
&s2[4], &s2[5], &s2[6], &s2[7]);
|
||||
load_s16_4x8(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3],
|
||||
&s3[4], &s3[5], &s3[6], &s3[7]);
|
||||
|
||||
uint16x4_t d0 = highbd_convolve8_horiz4_s32_s16(s0, s1, x_filter);
|
||||
uint16x4_t d1 = highbd_convolve8_horiz4_s32_s16(s2, s3, x_filter);
|
||||
uint16x4_t d0 = highbd_convolve8_4(s0[0], s0[1], s0[2], s0[3], s0[4],
|
||||
s0[5], s0[6], s0[7], x_filter, max);
|
||||
uint16x4_t d1 = highbd_convolve8_4(s1[0], s1[1], s1[2], s1[3], s1[4],
|
||||
s1[5], s1[6], s1[7], x_filter, max);
|
||||
uint16x4_t d2 = highbd_convolve8_4(s2[0], s2[1], s2[2], s2[3], s2[4],
|
||||
s2[5], s2[6], s2[7], x_filter, max);
|
||||
uint16x4_t d3 = highbd_convolve8_4(s3[0], s3[1], s3[2], s3[3], s3[4],
|
||||
s3[5], s3[6], s3[7], x_filter, max);
|
||||
|
||||
uint16x8_t d01 = vcombine_u16(d0, d1);
|
||||
d01 = vminq_u16(d01, max);
|
||||
store_u16_4x4(d, dst_stride, d0, d1, d2, d3);
|
||||
|
||||
vst1_u16(d + 0 * dst_stride, vget_low_u16(d01));
|
||||
vst1_u16(d + 1 * dst_stride, vget_high_u16(d01));
|
||||
|
||||
s += 2 * src_stride;
|
||||
d += 2 * dst_stride;
|
||||
h -= 2;
|
||||
s += 4 * src_stride;
|
||||
d += 4 * dst_stride;
|
||||
h -= 4;
|
||||
} while (h > 0);
|
||||
} else {
|
||||
const uint16x8_t max = vdupq_n_u16((1 << bd) - 1);
|
||||
int height = h;
|
||||
|
||||
do {
|
||||
int width = w;
|
||||
const int16_t *s = (const int16_t *)src_ptr;
|
||||
uint16_t *d = dst_ptr;
|
||||
int x_q4 = 0;
|
||||
|
||||
const int16_t *src_x = &s[x_q4 >> SUBPEL_BITS];
|
||||
int16x8_t s0, s2, s4, s6;
|
||||
load_s16_8x4(src_x, src_stride, &s0, &s2, &s4, &s6);
|
||||
src_x += 8;
|
||||
|
||||
do {
|
||||
int16x8_t s1, s3, s5, s7;
|
||||
load_s16_8x4(src_x, src_stride, &s1, &s3, &s5, &s7);
|
||||
int16x8_t s0[8], s1[8], s2[8], s3[8];
|
||||
load_s16_8x8(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3],
|
||||
&s0[4], &s0[5], &s0[6], &s0[7]);
|
||||
load_s16_8x8(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3],
|
||||
&s1[4], &s1[5], &s1[6], &s1[7]);
|
||||
load_s16_8x8(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3],
|
||||
&s2[4], &s2[5], &s2[6], &s2[7]);
|
||||
load_s16_8x8(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3],
|
||||
&s3[4], &s3[5], &s3[6], &s3[7]);
|
||||
|
||||
uint16x8_t d0 = highbd_convolve8_horiz8_s32_s16(s0, s1, x_filter);
|
||||
uint16x8_t d1 = highbd_convolve8_horiz8_s32_s16(s2, s3, x_filter);
|
||||
uint16x8_t d2 = highbd_convolve8_horiz8_s32_s16(s4, s5, x_filter);
|
||||
uint16x8_t d3 = highbd_convolve8_horiz8_s32_s16(s6, s7, x_filter);
|
||||
|
||||
d0 = vminq_u16(d0, max);
|
||||
d1 = vminq_u16(d1, max);
|
||||
d2 = vminq_u16(d2, max);
|
||||
d3 = vminq_u16(d3, max);
|
||||
uint16x8_t d0 = highbd_convolve8_8(s0[0], s0[1], s0[2], s0[3], s0[4],
|
||||
s0[5], s0[6], s0[7], x_filter, max);
|
||||
uint16x8_t d1 = highbd_convolve8_8(s1[0], s1[1], s1[2], s1[3], s1[4],
|
||||
s1[5], s1[6], s1[7], x_filter, max);
|
||||
uint16x8_t d2 = highbd_convolve8_8(s2[0], s2[1], s2[2], s2[3], s2[4],
|
||||
s2[5], s2[6], s2[7], x_filter, max);
|
||||
uint16x8_t d3 = highbd_convolve8_8(s3[0], s3[1], s3[2], s3[3], s3[4],
|
||||
s3[5], s3[6], s3[7], x_filter, max);
|
||||
|
||||
store_u16_8x4(d, dst_stride, d0, d1, d2, d3);
|
||||
|
||||
s0 = s1;
|
||||
s2 = s3;
|
||||
s4 = s5;
|
||||
s6 = s7;
|
||||
src_x += 8;
|
||||
s += 8;
|
||||
d += 8;
|
||||
width -= 8;
|
||||
} while (width > 0);
|
||||
src_ptr += 4 * src_stride;
|
||||
dst_ptr += 4 * dst_stride;
|
||||
height -= 4;
|
||||
} while (height > 0);
|
||||
}
|
||||
}
|
||||
|
||||
static void highbd_convolve_horiz_4tap_neon(
|
||||
const uint16_t *src_ptr, ptrdiff_t src_stride, uint16_t *dst_ptr,
|
||||
ptrdiff_t dst_stride, const int16_t *x_filter_ptr, int w, int h, int bd) {
|
||||
assert(w >= 4 && h >= 4);
|
||||
const int16x4_t x_filter = vld1_s16(x_filter_ptr + 2);
|
||||
|
||||
if (w == 4) {
|
||||
const uint16x4_t max = vdup_n_u16((1 << bd) - 1);
|
||||
const int16_t *s = (const int16_t *)src_ptr;
|
||||
uint16_t *d = dst_ptr;
|
||||
|
||||
do {
|
||||
int16x4_t s0[4], s1[4], s2[4], s3[4];
|
||||
load_s16_4x4(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3]);
|
||||
load_s16_4x4(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3]);
|
||||
load_s16_4x4(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3]);
|
||||
load_s16_4x4(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3]);
|
||||
|
||||
uint16x4_t d0 =
|
||||
highbd_convolve4_4(s0[0], s0[1], s0[2], s0[3], x_filter, max);
|
||||
uint16x4_t d1 =
|
||||
highbd_convolve4_4(s1[0], s1[1], s1[2], s1[3], x_filter, max);
|
||||
uint16x4_t d2 =
|
||||
highbd_convolve4_4(s2[0], s2[1], s2[2], s2[3], x_filter, max);
|
||||
uint16x4_t d3 =
|
||||
highbd_convolve4_4(s3[0], s3[1], s3[2], s3[3], x_filter, max);
|
||||
|
||||
store_u16_4x4(d, dst_stride, d0, d1, d2, d3);
|
||||
|
||||
s += 4 * src_stride;
|
||||
d += 4 * dst_stride;
|
||||
h -= 4;
|
||||
} while (h > 0);
|
||||
} else {
|
||||
const uint16x8_t max = vdupq_n_u16((1 << bd) - 1);
|
||||
int height = h;
|
||||
|
||||
do {
|
||||
int width = w;
|
||||
const int16_t *s = (const int16_t *)src_ptr;
|
||||
uint16_t *d = dst_ptr;
|
||||
|
||||
do {
|
||||
int16x8_t s0[4], s1[4], s2[4], s3[4];
|
||||
load_s16_8x4(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3]);
|
||||
load_s16_8x4(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3]);
|
||||
load_s16_8x4(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3]);
|
||||
load_s16_8x4(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3]);
|
||||
|
||||
uint16x8_t d0 =
|
||||
highbd_convolve4_8(s0[0], s0[1], s0[2], s0[3], x_filter, max);
|
||||
uint16x8_t d1 =
|
||||
highbd_convolve4_8(s1[0], s1[1], s1[2], s1[3], x_filter, max);
|
||||
uint16x8_t d2 =
|
||||
highbd_convolve4_8(s2[0], s2[1], s2[2], s2[3], x_filter, max);
|
||||
uint16x8_t d3 =
|
||||
highbd_convolve4_8(s3[0], s3[1], s3[2], s3[3], x_filter, max);
|
||||
|
||||
store_u16_8x4(d, dst_stride, d0, d1, d2, d3);
|
||||
|
||||
s += 8;
|
||||
d += 8;
|
||||
width -= 8;
|
||||
x_q4 += 8 * x_step_q4;
|
||||
} while (width > 0);
|
||||
src_ptr += 4 * src_stride;
|
||||
dst_ptr += 4 * dst_stride;
|
||||
|
@@ -236,21 +245,30 @@ void aom_highbd_convolve8_horiz_neon(const uint8_t *src8, ptrdiff_t src_stride,
  uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);

  src -= SUBPEL_TAPS / 2 - 1;
  highbd_convolve_horiz_neon(src, src_stride, dst, dst_stride, filter_x,
                             x_step_q4, w, h, bd);

  const int filter_taps = get_filter_taps_convolve8(filter_x);

  if (filter_taps == 2) {
    highbd_convolve8_horiz_2tap_neon(src + 3, src_stride, dst, dst_stride,
                                     filter_x, w, h, bd);
  } else if (filter_taps == 4) {
    highbd_convolve_horiz_4tap_neon(src + 2, src_stride, dst, dst_stride,
                                    filter_x, w, h, bd);
  } else {
    highbd_convolve_horiz_8tap_neon(src, src_stride, dst, dst_stride,
                                    filter_x, w, h, bd);
  }
}
}

static void highbd_convolve_vert_neon(const uint16_t *src_ptr,
|
||||
ptrdiff_t src_stride, uint16_t *dst_ptr,
|
||||
ptrdiff_t dst_stride,
|
||||
const int16_t *y_filter_ptr, int w, int h,
|
||||
int bd) {
|
||||
static void highbd_convolve_vert_8tap_neon(
|
||||
const uint16_t *src_ptr, ptrdiff_t src_stride, uint16_t *dst_ptr,
|
||||
ptrdiff_t dst_stride, const int16_t *y_filter_ptr, int w, int h, int bd) {
|
||||
assert(w >= 4 && h >= 4);
|
||||
const int16x8_t y_filter = vld1q_s16(y_filter_ptr);
|
||||
const uint16x8_t max = vdupq_n_u16((1 << bd) - 1);
|
||||
|
||||
if (w == 4) {
|
||||
const uint16x4_t max = vdup_n_u16((1 << bd) - 1);
|
||||
const int16_t *s = (const int16_t *)src_ptr;
|
||||
uint16_t *d = dst_ptr;
|
||||
|
||||
|
@ -263,24 +281,15 @@ static void highbd_convolve_vert_neon(const uint16_t *src_ptr,
|
|||
load_s16_4x4(s, src_stride, &s7, &s8, &s9, &s10);
|
||||
|
||||
uint16x4_t d0 =
|
||||
highbd_convolve8_4_s32_s16(s0, s1, s2, s3, s4, s5, s6, s7, y_filter);
|
||||
highbd_convolve8_4(s0, s1, s2, s3, s4, s5, s6, s7, y_filter, max);
|
||||
uint16x4_t d1 =
|
||||
highbd_convolve8_4_s32_s16(s1, s2, s3, s4, s5, s6, s7, s8, y_filter);
|
||||
highbd_convolve8_4(s1, s2, s3, s4, s5, s6, s7, s8, y_filter, max);
|
||||
uint16x4_t d2 =
|
||||
highbd_convolve8_4_s32_s16(s2, s3, s4, s5, s6, s7, s8, s9, y_filter);
|
||||
highbd_convolve8_4(s2, s3, s4, s5, s6, s7, s8, s9, y_filter, max);
|
||||
uint16x4_t d3 =
|
||||
highbd_convolve8_4_s32_s16(s3, s4, s5, s6, s7, s8, s9, s10, y_filter);
|
||||
highbd_convolve8_4(s3, s4, s5, s6, s7, s8, s9, s10, y_filter, max);
|
||||
|
||||
uint16x8_t d01 = vcombine_u16(d0, d1);
|
||||
uint16x8_t d23 = vcombine_u16(d2, d3);
|
||||
|
||||
d01 = vminq_u16(d01, max);
|
||||
d23 = vminq_u16(d23, max);
|
||||
|
||||
vst1_u16(d + 0 * dst_stride, vget_low_u16(d01));
|
||||
vst1_u16(d + 1 * dst_stride, vget_high_u16(d01));
|
||||
vst1_u16(d + 2 * dst_stride, vget_low_u16(d23));
|
||||
vst1_u16(d + 3 * dst_stride, vget_high_u16(d23));
|
||||
store_u16_4x4(d, dst_stride, d0, d1, d2, d3);
|
||||
|
||||
s0 = s4;
|
||||
s1 = s5;
|
||||
|
@ -289,11 +298,14 @@ static void highbd_convolve_vert_neon(const uint16_t *src_ptr,
|
|||
s4 = s8;
|
||||
s5 = s9;
|
||||
s6 = s10;
|
||||
|
||||
s += 4 * src_stride;
|
||||
d += 4 * dst_stride;
|
||||
h -= 4;
|
||||
} while (h > 0);
|
||||
} else {
|
||||
const uint16x8_t max = vdupq_n_u16((1 << bd) - 1);
|
||||
|
||||
do {
|
||||
int height = h;
|
||||
const int16_t *s = (const int16_t *)src_ptr;
|
||||
|
@ -307,19 +319,14 @@ static void highbd_convolve_vert_neon(const uint16_t *src_ptr,
|
|||
int16x8_t s7, s8, s9, s10;
|
||||
load_s16_8x4(s, src_stride, &s7, &s8, &s9, &s10);
|
||||
|
||||
uint16x8_t d0 = highbd_convolve8_8_s32_s16(s0, s1, s2, s3, s4, s5, s6,
|
||||
s7, y_filter);
|
||||
uint16x8_t d1 = highbd_convolve8_8_s32_s16(s1, s2, s3, s4, s5, s6, s7,
|
||||
s8, y_filter);
|
||||
uint16x8_t d2 = highbd_convolve8_8_s32_s16(s2, s3, s4, s5, s6, s7, s8,
|
||||
s9, y_filter);
|
||||
uint16x8_t d3 = highbd_convolve8_8_s32_s16(s3, s4, s5, s6, s7, s8, s9,
|
||||
s10, y_filter);
|
||||
|
||||
d0 = vminq_u16(d0, max);
|
||||
d1 = vminq_u16(d1, max);
|
||||
d2 = vminq_u16(d2, max);
|
||||
d3 = vminq_u16(d3, max);
|
||||
uint16x8_t d0 =
|
||||
highbd_convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, y_filter, max);
|
||||
uint16x8_t d1 =
|
||||
highbd_convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, y_filter, max);
|
||||
uint16x8_t d2 =
|
||||
highbd_convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, y_filter, max);
|
||||
uint16x8_t d3 =
|
||||
highbd_convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, y_filter, max);
|
||||
|
||||
store_u16_8x4(d, dst_stride, d0, d1, d2, d3);
|
||||
|
||||
|
@ -330,6 +337,7 @@ static void highbd_convolve_vert_neon(const uint16_t *src_ptr,
|
|||
s4 = s8;
|
||||
s5 = s9;
|
||||
s6 = s10;
|
||||
|
||||
s += 4 * src_stride;
|
||||
d += 4 * dst_stride;
|
||||
height -= 4;
|
||||
|
@@ -357,7 +365,18 @@ void aom_highbd_convolve8_vert_neon(const uint8_t *src8, ptrdiff_t src_stride,
  uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);

  src -= (SUBPEL_TAPS / 2 - 1) * src_stride;
  highbd_convolve_vert_neon(src, src_stride, dst, dst_stride, filter_y, w, h,
                            bd);

  const int filter_taps = get_filter_taps_convolve8(filter_y);

  if (filter_taps == 2) {
    highbd_convolve8_vert_2tap_neon(src + 3 * src_stride, src_stride, dst,
                                    dst_stride, filter_y, w, h, bd);
  } else if (filter_taps == 4) {
    highbd_convolve8_vert_4tap_neon(src + 2 * src_stride, src_stride, dst,
                                    dst_stride, filter_y, w, h, bd);
  } else {
    highbd_convolve_vert_8tap_neon(src, src_stride, dst, dst_stride, filter_y,
                                   w, h, bd);
  }
}
}

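Aside (illustrative, not part of the patch): the rewritten high-bit-depth helpers fold the bit-depth clamp into the convolution itself via the max argument. A scalar reference for a single output sample, assuming an 8-tap kernel and FILTER_BITS == 7 rounding, would look roughly like this:

#include <stdint.h>

// Scalar model of highbd_convolve8_4/_8 for one pixel (illustrative only).
// vqrshrun_n_s32 rounds, shifts by FILTER_BITS and saturates negatives to 0;
// vmin(q)_u16 then clamps to the (1 << bd) - 1 ceiling.
static uint16_t highbd_convolve8_pixel_ref(const int16_t s[8],
                                           const int16_t filter[8], int bd) {
  int32_t sum = 0;
  for (int k = 0; k < 8; ++k) sum += s[k] * filter[k];
  sum = (sum + 64) >> 7;  // round-shift by FILTER_BITS (== 7)
  if (sum < 0) sum = 0;
  const int32_t max = (1 << bd) - 1;
  return (uint16_t)(sum > max ? max : sum);
}
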
@ -0,0 +1,279 @@
|
|||
/*
|
||||
* Copyright (c) 2024, Alliance for Open Media. All Rights Reserved.
|
||||
*
|
||||
* Use of this source code is governed by a BSD-style license
|
||||
* that can be found in the LICENSE file in the root of the source
|
||||
* tree. An additional intellectual property rights grant can be found
|
||||
* in the file PATENTS. All contributing project authors may
|
||||
* be found in the AUTHORS file in the root of the source tree.
|
||||
*/
|
||||
|
||||
#ifndef AOM_AOM_DSP_ARM_HIGHBD_CONVOLVE8_NEON_H_
|
||||
#define AOM_AOM_DSP_ARM_HIGHBD_CONVOLVE8_NEON_H_
|
||||
|
||||
#include <arm_neon.h>
|
||||
|
||||
#include "config/aom_config.h"
|
||||
#include "aom_dsp/arm/mem_neon.h"
|
||||
|
||||
static INLINE void highbd_convolve8_horiz_2tap_neon(
|
||||
const uint16_t *src_ptr, ptrdiff_t src_stride, uint16_t *dst_ptr,
|
||||
ptrdiff_t dst_stride, const int16_t *x_filter_ptr, int w, int h, int bd) {
|
||||
// Bilinear filter values are all positive and multiples of 8. Divide by 8 to
|
||||
// reduce intermediate precision requirements and allow the use of non
|
||||
// widening multiply.
|
||||
const uint16x8_t f0 = vdupq_n_u16((uint16_t)x_filter_ptr[3] / 8);
|
||||
const uint16x8_t f1 = vdupq_n_u16((uint16_t)x_filter_ptr[4] / 8);
|
||||
|
||||
const uint16x8_t max = vdupq_n_u16((1 << bd) - 1);
|
||||
|
||||
if (w == 4) {
|
||||
do {
|
||||
uint16x8_t s0 =
|
||||
load_unaligned_u16_4x2(src_ptr + 0 * src_stride + 0, (int)src_stride);
|
||||
uint16x8_t s1 =
|
||||
load_unaligned_u16_4x2(src_ptr + 0 * src_stride + 1, (int)src_stride);
|
||||
uint16x8_t s2 =
|
||||
load_unaligned_u16_4x2(src_ptr + 2 * src_stride + 0, (int)src_stride);
|
||||
uint16x8_t s3 =
|
||||
load_unaligned_u16_4x2(src_ptr + 2 * src_stride + 1, (int)src_stride);
|
||||
|
||||
uint16x8_t sum01 = vmulq_u16(s0, f0);
|
||||
sum01 = vmlaq_u16(sum01, s1, f1);
|
||||
uint16x8_t sum23 = vmulq_u16(s2, f0);
|
||||
sum23 = vmlaq_u16(sum23, s3, f1);
|
||||
|
||||
// We divided filter taps by 8 so subtract 3 from right shift.
|
||||
sum01 = vrshrq_n_u16(sum01, FILTER_BITS - 3);
|
||||
sum23 = vrshrq_n_u16(sum23, FILTER_BITS - 3);
|
||||
|
||||
sum01 = vminq_u16(sum01, max);
|
||||
sum23 = vminq_u16(sum23, max);
|
||||
|
||||
store_u16x4_strided_x2(dst_ptr + 0 * dst_stride, (int)dst_stride, sum01);
|
||||
store_u16x4_strided_x2(dst_ptr + 2 * dst_stride, (int)dst_stride, sum23);
|
||||
|
||||
src_ptr += 4 * src_stride;
|
||||
dst_ptr += 4 * dst_stride;
|
||||
h -= 4;
|
||||
} while (h > 0);
|
||||
} else {
|
||||
do {
|
||||
int width = w;
|
||||
const uint16_t *s = src_ptr;
|
||||
uint16_t *d = dst_ptr;
|
||||
|
||||
do {
|
||||
uint16x8_t s0 = vld1q_u16(s + 0 * src_stride + 0);
|
||||
uint16x8_t s1 = vld1q_u16(s + 0 * src_stride + 1);
|
||||
uint16x8_t s2 = vld1q_u16(s + 1 * src_stride + 0);
|
||||
uint16x8_t s3 = vld1q_u16(s + 1 * src_stride + 1);
|
||||
|
||||
uint16x8_t sum01 = vmulq_u16(s0, f0);
|
||||
sum01 = vmlaq_u16(sum01, s1, f1);
|
||||
uint16x8_t sum23 = vmulq_u16(s2, f0);
|
||||
sum23 = vmlaq_u16(sum23, s3, f1);
|
||||
|
||||
// We divided filter taps by 8 so subtract 3 from right shift.
|
||||
sum01 = vrshrq_n_u16(sum01, FILTER_BITS - 3);
|
||||
sum23 = vrshrq_n_u16(sum23, FILTER_BITS - 3);
|
||||
|
||||
sum01 = vminq_u16(sum01, max);
|
||||
sum23 = vminq_u16(sum23, max);
|
||||
|
||||
vst1q_u16(d + 0 * dst_stride, sum01);
|
||||
vst1q_u16(d + 1 * dst_stride, sum23);
|
||||
|
||||
s += 8;
|
||||
d += 8;
|
||||
width -= 8;
|
||||
} while (width != 0);
|
||||
src_ptr += 2 * src_stride;
|
||||
dst_ptr += 2 * dst_stride;
|
||||
h -= 2;
|
||||
} while (h > 0);
|
||||
}
|
||||
}
|
||||
|
||||
static INLINE uint16x4_t highbd_convolve4_4(
|
||||
const int16x4_t s0, const int16x4_t s1, const int16x4_t s2,
|
||||
const int16x4_t s3, const int16x4_t filter, const uint16x4_t max) {
|
||||
int32x4_t sum = vmull_lane_s16(s0, filter, 0);
|
||||
sum = vmlal_lane_s16(sum, s1, filter, 1);
|
||||
sum = vmlal_lane_s16(sum, s2, filter, 2);
|
||||
sum = vmlal_lane_s16(sum, s3, filter, 3);
|
||||
|
||||
uint16x4_t res = vqrshrun_n_s32(sum, FILTER_BITS);
|
||||
|
||||
return vmin_u16(res, max);
|
||||
}
|
||||
|
||||
static INLINE uint16x8_t highbd_convolve4_8(
|
||||
const int16x8_t s0, const int16x8_t s1, const int16x8_t s2,
|
||||
const int16x8_t s3, const int16x4_t filter, const uint16x8_t max) {
|
||||
int32x4_t sum0 = vmull_lane_s16(vget_low_s16(s0), filter, 0);
|
||||
sum0 = vmlal_lane_s16(sum0, vget_low_s16(s1), filter, 1);
|
||||
sum0 = vmlal_lane_s16(sum0, vget_low_s16(s2), filter, 2);
|
||||
sum0 = vmlal_lane_s16(sum0, vget_low_s16(s3), filter, 3);
|
||||
|
||||
int32x4_t sum1 = vmull_lane_s16(vget_high_s16(s0), filter, 0);
|
||||
sum1 = vmlal_lane_s16(sum1, vget_high_s16(s1), filter, 1);
|
||||
sum1 = vmlal_lane_s16(sum1, vget_high_s16(s2), filter, 2);
|
||||
sum1 = vmlal_lane_s16(sum1, vget_high_s16(s3), filter, 3);
|
||||
|
||||
uint16x8_t res = vcombine_u16(vqrshrun_n_s32(sum0, FILTER_BITS),
|
||||
vqrshrun_n_s32(sum1, FILTER_BITS));
|
||||
|
||||
return vminq_u16(res, max);
|
||||
}
|
||||
|
||||
static INLINE void highbd_convolve8_vert_4tap_neon(
|
||||
const uint16_t *src_ptr, ptrdiff_t src_stride, uint16_t *dst_ptr,
|
||||
ptrdiff_t dst_stride, const int16_t *y_filter_ptr, int w, int h, int bd) {
|
||||
assert(w >= 4 && h >= 4);
|
||||
const int16x4_t y_filter = vld1_s16(y_filter_ptr + 2);
|
||||
|
||||
if (w == 4) {
|
||||
const uint16x4_t max = vdup_n_u16((1 << bd) - 1);
|
||||
const int16_t *s = (const int16_t *)src_ptr;
|
||||
uint16_t *d = dst_ptr;
|
||||
|
||||
int16x4_t s0, s1, s2;
|
||||
load_s16_4x3(s, src_stride, &s0, &s1, &s2);
|
||||
s += 3 * src_stride;
|
||||
|
||||
do {
|
||||
int16x4_t s3, s4, s5, s6;
|
||||
load_s16_4x4(s, src_stride, &s3, &s4, &s5, &s6);
|
||||
|
||||
uint16x4_t d0 = highbd_convolve4_4(s0, s1, s2, s3, y_filter, max);
|
||||
uint16x4_t d1 = highbd_convolve4_4(s1, s2, s3, s4, y_filter, max);
|
||||
uint16x4_t d2 = highbd_convolve4_4(s2, s3, s4, s5, y_filter, max);
|
||||
uint16x4_t d3 = highbd_convolve4_4(s3, s4, s5, s6, y_filter, max);
|
||||
|
||||
store_u16_4x4(d, dst_stride, d0, d1, d2, d3);
|
||||
|
||||
s0 = s4;
|
||||
s1 = s5;
|
||||
s2 = s6;
|
||||
|
||||
s += 4 * src_stride;
|
||||
d += 4 * dst_stride;
|
||||
h -= 4;
|
||||
} while (h > 0);
|
||||
} else {
|
||||
const uint16x8_t max = vdupq_n_u16((1 << bd) - 1);
|
||||
|
||||
do {
|
||||
int height = h;
|
||||
const int16_t *s = (const int16_t *)src_ptr;
|
||||
uint16_t *d = dst_ptr;
|
||||
|
||||
int16x8_t s0, s1, s2;
|
||||
load_s16_8x3(s, src_stride, &s0, &s1, &s2);
|
||||
s += 3 * src_stride;
|
||||
|
||||
do {
|
||||
int16x8_t s3, s4, s5, s6;
|
||||
load_s16_8x4(s, src_stride, &s3, &s4, &s5, &s6);
|
||||
|
||||
uint16x8_t d0 = highbd_convolve4_8(s0, s1, s2, s3, y_filter, max);
|
||||
uint16x8_t d1 = highbd_convolve4_8(s1, s2, s3, s4, y_filter, max);
|
||||
uint16x8_t d2 = highbd_convolve4_8(s2, s3, s4, s5, y_filter, max);
|
||||
uint16x8_t d3 = highbd_convolve4_8(s3, s4, s5, s6, y_filter, max);
|
||||
|
||||
store_u16_8x4(d, dst_stride, d0, d1, d2, d3);
|
||||
|
||||
s0 = s4;
|
||||
s1 = s5;
|
||||
s2 = s6;
|
||||
|
||||
s += 4 * src_stride;
|
||||
d += 4 * dst_stride;
|
||||
height -= 4;
|
||||
} while (height > 0);
|
||||
src_ptr += 8;
|
||||
dst_ptr += 8;
|
||||
w -= 8;
|
||||
} while (w > 0);
|
||||
}
|
||||
}
|
||||
|
||||
static INLINE void highbd_convolve8_vert_2tap_neon(
|
||||
const uint16_t *src_ptr, ptrdiff_t src_stride, uint16_t *dst_ptr,
|
||||
ptrdiff_t dst_stride, const int16_t *x_filter_ptr, int w, int h, int bd) {
|
||||
// Bilinear filter values are all positive and multiples of 8. Divide by 8 to
|
||||
// reduce intermediate precision requirements and allow the use of non
|
||||
// widening multiply.
|
||||
const uint16x8_t f0 = vdupq_n_u16((uint16_t)x_filter_ptr[3] / 8);
|
||||
const uint16x8_t f1 = vdupq_n_u16((uint16_t)x_filter_ptr[4] / 8);
|
||||
|
||||
const uint16x8_t max = vdupq_n_u16((1 << bd) - 1);
|
||||
|
||||
if (w == 4) {
|
||||
do {
|
||||
uint16x8_t s0 =
|
||||
load_unaligned_u16_4x2(src_ptr + 0 * src_stride, (int)src_stride);
|
||||
uint16x8_t s1 =
|
||||
load_unaligned_u16_4x2(src_ptr + 1 * src_stride, (int)src_stride);
|
||||
uint16x8_t s2 =
|
||||
load_unaligned_u16_4x2(src_ptr + 2 * src_stride, (int)src_stride);
|
||||
uint16x8_t s3 =
|
||||
load_unaligned_u16_4x2(src_ptr + 3 * src_stride, (int)src_stride);
|
||||
|
||||
uint16x8_t sum01 = vmulq_u16(s0, f0);
|
||||
sum01 = vmlaq_u16(sum01, s1, f1);
|
||||
uint16x8_t sum23 = vmulq_u16(s2, f0);
|
||||
sum23 = vmlaq_u16(sum23, s3, f1);
|
||||
|
||||
// We divided filter taps by 8 so subtract 3 from right shift.
|
||||
sum01 = vrshrq_n_u16(sum01, FILTER_BITS - 3);
|
||||
sum23 = vrshrq_n_u16(sum23, FILTER_BITS - 3);
|
||||
|
||||
sum01 = vminq_u16(sum01, max);
|
||||
sum23 = vminq_u16(sum23, max);
|
||||
|
||||
store_u16x4_strided_x2(dst_ptr + 0 * dst_stride, (int)dst_stride, sum01);
|
||||
store_u16x4_strided_x2(dst_ptr + 2 * dst_stride, (int)dst_stride, sum23);
|
||||
|
||||
src_ptr += 4 * src_stride;
|
||||
dst_ptr += 4 * dst_stride;
|
||||
h -= 4;
|
||||
} while (h > 0);
|
||||
} else {
|
||||
do {
|
||||
int width = w;
|
||||
const uint16_t *s = src_ptr;
|
||||
uint16_t *d = dst_ptr;
|
||||
|
||||
do {
|
||||
uint16x8_t s0, s1, s2;
|
||||
load_u16_8x3(s, src_stride, &s0, &s1, &s2);
|
||||
|
||||
uint16x8_t sum01 = vmulq_u16(s0, f0);
|
||||
sum01 = vmlaq_u16(sum01, s1, f1);
|
||||
uint16x8_t sum23 = vmulq_u16(s1, f0);
|
||||
sum23 = vmlaq_u16(sum23, s2, f1);
|
||||
|
||||
// We divided filter taps by 8 so subtract 3 from right shift.
|
||||
sum01 = vrshrq_n_u16(sum01, FILTER_BITS - 3);
|
||||
sum23 = vrshrq_n_u16(sum23, FILTER_BITS - 3);
|
||||
|
||||
sum01 = vminq_u16(sum01, max);
|
||||
sum23 = vminq_u16(sum23, max);
|
||||
|
||||
vst1q_u16(d + 0 * dst_stride, sum01);
|
||||
vst1q_u16(d + 1 * dst_stride, sum23);
|
||||
|
||||
s += 8;
|
||||
d += 8;
|
||||
width -= 8;
|
||||
} while (width != 0);
|
||||
src_ptr += 2 * src_stride;
|
||||
dst_ptr += 2 * dst_stride;
|
||||
h -= 2;
|
||||
} while (h > 0);
|
||||
}
|
||||
}
|
||||
|
||||
#endif // AOM_AOM_DSP_ARM_HIGHBD_CONVOLVE8_NEON_H_
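The 2-tap path above relies on the bilinear taps being non-negative multiples of 8; dividing them by 8 keeps every product within 16 bits so the non-widening vmulq_u16/vmlaq_u16 can be used, and the final shift drops from FILTER_BITS to FILTER_BITS - 3. A scalar sketch of the identity being exploited, assuming FILTER_BITS == 7 and samples of at most 12 bits (illustrative, not part of the patch):

```c
#include <assert.h>
#include <stdint.h>

#define FILTER_BITS 7

// Bilinear taps satisfy f0 + f1 == 128 and both are multiples of 8, so
// (s0 * f0 + s1 * f1 + 64) >> 7 == (s0 * (f0 / 8) + s1 * (f1 / 8) + 8) >> 4.
// With 12-bit samples and taps of at most 16 after the division, the sum
// stays below 2^16, which is what allows the non-widening 16-bit multiply.
static uint16_t bilinear_2tap_scalar(uint16_t s0, uint16_t s1, int f0, int f1) {
  assert(f0 % 8 == 0 && f1 % 8 == 0 && f0 + f1 == (1 << FILTER_BITS));
  const unsigned sum = (unsigned)s0 * (f0 / 8) + (unsigned)s1 * (f1 / 8);
  // "Subtract 3 from right shift" because the taps were divided by 8.
  return (uint16_t)((sum + (1 << (FILTER_BITS - 3 - 1))) >> (FILTER_BITS - 3));
}
```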
@ -18,6 +18,7 @@

#include "aom_dsp/arm/aom_neon_sve_bridge.h"
#include "aom_dsp/arm/aom_filter.h"
#include "aom_dsp/arm/highbd_convolve8_neon.h"
#include "aom_dsp/arm/mem_neon.h"

static INLINE uint16x4_t highbd_convolve8_4_h(int16x8_t s[4], int16x8_t filter,

@ -252,7 +253,12 @@ void aom_highbd_convolve8_horiz_sve(const uint8_t *src8, ptrdiff_t src_stride,

src -= SUBPEL_TAPS / 2 - 1;

if (get_filter_taps_convolve8(filter_x) <= 4) {
const int filter_taps = get_filter_taps_convolve8(filter_x);

if (filter_taps == 2) {
highbd_convolve8_horiz_2tap_neon(src + 3, src_stride, dst, dst_stride,
filter_x, width, height, bd);
} else if (filter_taps == 4) {
highbd_convolve8_horiz_4tap_sve(src + 2, src_stride, dst, dst_stride,
filter_x, width, height, bd);
} else {
@ -534,134 +540,13 @@ static INLINE void highbd_convolve8_vert_8tap_sve(
|
|||
}
|
||||
}
|
||||
|
||||
static INLINE uint16x4_t highbd_convolve4_4_v(int16x8_t s[2], int16x8_t filter,
|
||||
uint16x4_t max) {
|
||||
int64x2_t sum01 = aom_svdot_lane_s16(vdupq_n_s64(0), s[0], filter, 0);
|
||||
int64x2_t sum23 = aom_svdot_lane_s16(vdupq_n_s64(0), s[1], filter, 0);
|
||||
|
||||
int32x4_t sum0123 = vcombine_s32(vmovn_s64(sum01), vmovn_s64(sum23));
|
||||
uint16x4_t res = vqrshrun_n_s32(sum0123, FILTER_BITS);
|
||||
|
||||
return vmin_u16(res, max);
|
||||
}
|
||||
|
||||
static INLINE uint16x8_t highbd_convolve4_8_v(int16x8_t s[4], int16x8_t filter,
|
||||
uint16x8_t max) {
|
||||
int64x2_t sum01 = aom_svdot_lane_s16(vdupq_n_s64(0), s[0], filter, 0);
|
||||
int64x2_t sum23 = aom_svdot_lane_s16(vdupq_n_s64(0), s[1], filter, 0);
|
||||
int64x2_t sum45 = aom_svdot_lane_s16(vdupq_n_s64(0), s[2], filter, 0);
|
||||
int64x2_t sum67 = aom_svdot_lane_s16(vdupq_n_s64(0), s[3], filter, 0);
|
||||
|
||||
int32x4_t s0123 = vcombine_s32(vmovn_s64(sum01), vmovn_s64(sum23));
|
||||
int32x4_t s4567 = vcombine_s32(vmovn_s64(sum45), vmovn_s64(sum67));
|
||||
|
||||
uint16x8_t res = vcombine_u16(vqrshrun_n_s32(s0123, FILTER_BITS),
|
||||
vqrshrun_n_s32(s4567, FILTER_BITS));
|
||||
|
||||
return vminq_u16(res, max);
|
||||
}
|
||||
|
||||
static INLINE void highbd_convolve8_vert_4tap_sve(
|
||||
const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst,
|
||||
ptrdiff_t dst_stride, const int16_t *filter_y, int width, int height,
|
||||
int bd) {
|
||||
const int16x8_t y_filter =
|
||||
vcombine_s16(vld1_s16(filter_y + 2), vdup_n_s16(0));
|
||||
|
||||
uint8x16_t merge_block_tbl[3];
|
||||
merge_block_tbl[0] = vld1q_u8(kDotProdMergeBlockTbl);
|
||||
merge_block_tbl[1] = vld1q_u8(kDotProdMergeBlockTbl + 16);
|
||||
merge_block_tbl[2] = vld1q_u8(kDotProdMergeBlockTbl + 32);
|
||||
|
||||
if (width == 4) {
|
||||
const uint16x4_t max = vdup_n_u16((1 << bd) - 1);
|
||||
int16_t *s = (int16_t *)src;
|
||||
|
||||
int16x4_t s0, s1, s2;
|
||||
load_s16_4x3(s, src_stride, &s0, &s1, &s2);
|
||||
s += 3 * src_stride;
|
||||
|
||||
do {
|
||||
int16x4_t s3, s4, s5, s6;
|
||||
load_s16_4x4(s, src_stride, &s3, &s4, &s5, &s6);
|
||||
|
||||
// This operation combines a conventional transpose and the sample permute
|
||||
// required before computing the dot product.
|
||||
int16x8_t s0123[2], s1234[2], s2345[2], s3456[2];
|
||||
transpose_concat_4x4(s0, s1, s2, s3, s0123);
|
||||
transpose_concat_4x4(s1, s2, s3, s4, s1234);
|
||||
transpose_concat_4x4(s2, s3, s4, s5, s2345);
|
||||
transpose_concat_4x4(s3, s4, s5, s6, s3456);
|
||||
|
||||
uint16x4_t d0 = highbd_convolve4_4_v(s0123, y_filter, max);
|
||||
uint16x4_t d1 = highbd_convolve4_4_v(s1234, y_filter, max);
|
||||
uint16x4_t d2 = highbd_convolve4_4_v(s2345, y_filter, max);
|
||||
uint16x4_t d3 = highbd_convolve4_4_v(s3456, y_filter, max);
|
||||
|
||||
store_u16_4x4(dst, dst_stride, d0, d1, d2, d3);
|
||||
|
||||
// Shuffle everything up four rows.
|
||||
s0 = s4;
|
||||
s1 = s5;
|
||||
s2 = s6;
|
||||
|
||||
s += 4 * src_stride;
|
||||
dst += 4 * dst_stride;
|
||||
height -= 4;
|
||||
} while (height != 0);
|
||||
} else {
|
||||
const uint16x8_t max = vdupq_n_u16((1 << bd) - 1);
|
||||
do {
|
||||
int h = height;
|
||||
int16_t *s = (int16_t *)src;
|
||||
uint16_t *d = dst;
|
||||
|
||||
int16x8_t s0, s1, s2;
|
||||
load_s16_8x3(s, src_stride, &s0, &s1, &s2);
|
||||
s += 3 * src_stride;
|
||||
|
||||
do {
|
||||
int16x8_t s3, s4, s5, s6;
|
||||
load_s16_8x4(s, src_stride, &s3, &s4, &s5, &s6);
|
||||
|
||||
// This operation combines a conventional transpose and the sample
|
||||
// permute required before computing the dot product.
|
||||
int16x8_t s0123[4], s1234[4], s2345[4], s3456[4];
|
||||
transpose_concat_8x4(s0, s1, s2, s3, s0123);
|
||||
transpose_concat_8x4(s1, s2, s3, s4, s1234);
|
||||
transpose_concat_8x4(s2, s3, s4, s5, s2345);
|
||||
transpose_concat_8x4(s3, s4, s5, s6, s3456);
|
||||
|
||||
uint16x8_t d0 = highbd_convolve4_8_v(s0123, y_filter, max);
|
||||
uint16x8_t d1 = highbd_convolve4_8_v(s1234, y_filter, max);
|
||||
uint16x8_t d2 = highbd_convolve4_8_v(s2345, y_filter, max);
|
||||
uint16x8_t d3 = highbd_convolve4_8_v(s3456, y_filter, max);
|
||||
|
||||
store_u16_8x4(d, dst_stride, d0, d1, d2, d3);
|
||||
|
||||
// Shuffle everything up four rows.
|
||||
s0 = s4;
|
||||
s1 = s5;
|
||||
s2 = s6;
|
||||
|
||||
s += 4 * src_stride;
|
||||
d += 4 * dst_stride;
|
||||
h -= 4;
|
||||
} while (h != 0);
|
||||
src += 8;
|
||||
dst += 8;
|
||||
width -= 8;
|
||||
} while (width != 0);
|
||||
}
|
||||
}
|
||||
|
||||
void aom_highbd_convolve8_vert_sve(const uint8_t *src8, ptrdiff_t src_stride,
|
||||
uint8_t *dst8, ptrdiff_t dst_stride,
|
||||
const int16_t *filter_x, int x_step_q4,
|
||||
const int16_t *filter_y, int y_step_q4,
|
||||
int width, int height, int bd) {
|
||||
assert(y_step_q4 == 16);
|
||||
assert(w >= 4 && h >= 4);
|
||||
assert(width >= 4 && height >= 4);
|
||||
(void)filter_x;
|
||||
(void)y_step_q4;
|
||||
(void)x_step_q4;
|
||||
|
@ -671,9 +556,14 @@ void aom_highbd_convolve8_vert_sve(const uint8_t *src8, ptrdiff_t src_stride,

src -= (SUBPEL_TAPS / 2 - 1) * src_stride;

if (get_filter_taps_convolve8(filter_y) <= 4) {
highbd_convolve8_vert_4tap_sve(src + 2 * src_stride, src_stride, dst,
dst_stride, filter_y, width, height, bd);
const int filter_taps = get_filter_taps_convolve8(filter_y);

if (filter_taps == 2) {
highbd_convolve8_vert_2tap_neon(src + 3 * src_stride, src_stride, dst,
dst_stride, filter_y, width, height, bd);
} else if (filter_taps == 4) {
highbd_convolve8_vert_4tap_neon(src + 2 * src_stride, src_stride, dst,
dst_stride, filter_y, width, height, bd);
} else {
highbd_convolve8_vert_8tap_sve(src, src_stride, dst, dst_stride, filter_y,
width, height, bd);
@ -1201,7 +1201,7 @@ HIGHBD_SMOOTH_H_NXM(8, 32)
|
|||
|
||||
// For width 16 and above.
|
||||
#define HIGHBD_SMOOTH_H_PREDICTOR(W) \
|
||||
void highbd_smooth_h_##W##xh_neon( \
|
||||
static void highbd_smooth_h_##W##xh_neon( \
|
||||
uint16_t *dst, ptrdiff_t stride, const uint16_t *const top_row, \
|
||||
const uint16_t *const left_column, const int height) { \
|
||||
const uint16_t top_right = top_row[(W)-1]; \
|
||||
|
@ -1293,6 +1293,33 @@ static AOM_FORCE_INLINE uint16x8_t highbd_dr_z1_apply_shift_x8(uint16x8_t a0,
|
|||
highbd_dr_z1_apply_shift_x4(vget_high_u16(a0), vget_high_u16(a1), shift));
|
||||
}
|
||||
|
||||
// clang-format off
|
||||
static const uint8_t kLoadMaxShuffles[] = {
|
||||
14, 15, 14, 15, 14, 15, 14, 15, 14, 15, 14, 15, 14, 15, 14, 15,
|
||||
12, 13, 14, 15, 14, 15, 14, 15, 14, 15, 14, 15, 14, 15, 14, 15,
|
||||
10, 11, 12, 13, 14, 15, 14, 15, 14, 15, 14, 15, 14, 15, 14, 15,
|
||||
8, 9, 10, 11, 12, 13, 14, 15, 14, 15, 14, 15, 14, 15, 14, 15,
|
||||
6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 14, 15, 14, 15, 14, 15,
|
||||
4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 14, 15, 14, 15,
|
||||
2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 14, 15,
|
||||
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
|
||||
};
|
||||
// clang-format on
|
||||
|
||||
static INLINE uint16x8_t zn_load_masked_neon(const uint16_t *ptr,
|
||||
int shuffle_idx) {
|
||||
uint8x16_t shuffle = vld1q_u8(&kLoadMaxShuffles[16 * shuffle_idx]);
|
||||
uint8x16_t src = vreinterpretq_u8_u16(vld1q_u16(ptr));
|
||||
#if AOM_ARCH_AARCH64
|
||||
return vreinterpretq_u16_u8(vqtbl1q_u8(src, shuffle));
|
||||
#else
|
||||
uint8x8x2_t src2 = { { vget_low_u8(src), vget_high_u8(src) } };
|
||||
uint8x8_t lo = vtbl2_u8(src2, vget_low_u8(shuffle));
|
||||
uint8x8_t hi = vtbl2_u8(src2, vget_high_u8(shuffle));
|
||||
return vreinterpretq_u16_u8(vcombine_u8(lo, hi));
|
||||
#endif
|
||||
}
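A scalar model of what the shuffle-table load above computes may help: the vector load is rebased so that it ends exactly at the last valid sample, and the table lookup replicates that sample into every out-of-range lane, which removes the need for a per-iteration blend against above_max. Sketch under the assumption of 8 lanes (the uint16x8_t case); names are illustrative, not part of the patch:

```c
#include <stdint.h>

// Scalar equivalent of zn_load_masked_neon(): produce 8 samples starting at
// logical position `pos`, where everything at or beyond `max_pos` must read
// as the value stored at `max_pos`. The NEON version achieves this with one
// table lookup (vqtbl1q_u8 / vtbl2_u8) instead of a blend in the inner loop.
static void load_masked_scalar(const uint16_t *buf, int pos, int max_pos,
                               uint16_t out[8]) {
  for (int i = 0; i < 8; ++i) {
    const int p = pos + i;
    out[i] = buf[p < max_pos ? p : max_pos];
  }
}
```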
static void highbd_dr_prediction_z1_upsample0_neon(uint16_t *dst,
|
||||
ptrdiff_t stride, int bw,
|
||||
int bh,
|
||||
|
@ -1336,13 +1363,26 @@ static void highbd_dr_prediction_z1_upsample0_neon(uint16_t *dst,
|
|||
} else {
|
||||
int c = 0;
|
||||
do {
|
||||
const uint16x8_t a0 = vld1q_u16(&above[base + c]);
|
||||
const uint16x8_t a1 = vld1q_u16(&above[base + c + 1]);
|
||||
const uint16x8_t val = highbd_dr_z1_apply_shift_x8(a0, a1, shift);
|
||||
const uint16x8_t cmp =
|
||||
vcgtq_s16(vdupq_n_s16(max_base_x - base - c), iota1x8);
|
||||
const uint16x8_t res = vbslq_u16(cmp, val, vdupq_n_u16(above_max));
|
||||
vst1q_u16(dst + c, res);
|
||||
uint16x8_t a0;
|
||||
uint16x8_t a1;
|
||||
if (base + c >= max_base_x) {
|
||||
a0 = a1 = vdupq_n_u16(above_max);
|
||||
} else {
|
||||
if (base + c + 7 >= max_base_x) {
|
||||
int shuffle_idx = max_base_x - base - c;
|
||||
a0 = zn_load_masked_neon(above + (max_base_x - 7), shuffle_idx);
|
||||
} else {
|
||||
a0 = vld1q_u16(above + base + c);
|
||||
}
|
||||
if (base + c + 8 >= max_base_x) {
|
||||
int shuffle_idx = max_base_x - base - c - 1;
|
||||
a1 = zn_load_masked_neon(above + (max_base_x - 7), shuffle_idx);
|
||||
} else {
|
||||
a1 = vld1q_u16(above + base + c + 1);
|
||||
}
|
||||
}
|
||||
|
||||
vst1q_u16(dst + c, highbd_dr_z1_apply_shift_x8(a0, a1, shift));
|
||||
c += 8;
|
||||
} while (c < bw);
|
||||
}
|
||||
|
@ -2456,13 +2496,29 @@ void av1_highbd_dr_prediction_z2_neon(uint16_t *dst, ptrdiff_t stride, int bw,
|
|||
val_lo = vmlal_lane_u16(val_lo, vget_low_u16(in1), (s1), (lane)); \
|
||||
uint32x4_t val_hi = vmull_lane_u16(vget_high_u16(in0), (s0), (lane)); \
|
||||
val_hi = vmlal_lane_u16(val_hi, vget_high_u16(in1), (s1), (lane)); \
|
||||
const uint16x8_t cmp = vaddq_u16((iota), vdupq_n_u16(base)); \
|
||||
const uint16x8_t res = vcombine_u16(vrshrn_n_u32(val_lo, (shift)), \
|
||||
vrshrn_n_u32(val_hi, (shift))); \
|
||||
*(out) = vbslq_u16(vcltq_u16(cmp, vdupq_n_u16(max_base_y)), res, \
|
||||
vdupq_n_u16(left_max)); \
|
||||
*(out) = vcombine_u16(vrshrn_n_u32(val_lo, (shift)), \
|
||||
vrshrn_n_u32(val_hi, (shift))); \
|
||||
} while (0)
|
||||
|
||||
static INLINE uint16x8x2_t z3_load_left_neon(const uint16_t *left0, int ofs,
|
||||
int max_ofs) {
|
||||
uint16x8_t r0;
|
||||
uint16x8_t r1;
|
||||
if (ofs + 7 >= max_ofs) {
|
||||
int shuffle_idx = max_ofs - ofs;
|
||||
r0 = zn_load_masked_neon(left0 + (max_ofs - 7), shuffle_idx);
|
||||
} else {
|
||||
r0 = vld1q_u16(left0 + ofs);
|
||||
}
|
||||
if (ofs + 8 >= max_ofs) {
|
||||
int shuffle_idx = max_ofs - ofs - 1;
|
||||
r1 = zn_load_masked_neon(left0 + (max_ofs - 7), shuffle_idx);
|
||||
} else {
|
||||
r1 = vld1q_u16(left0 + ofs + 1);
|
||||
}
|
||||
return (uint16x8x2_t){ { r0, r1 } };
|
||||
}
|
||||
|
||||
static void highbd_dr_prediction_z3_upsample0_neon(uint16_t *dst,
|
||||
ptrdiff_t stride, int bw,
|
||||
int bh, const uint16_t *left,
|
||||
|
@ -2561,34 +2617,30 @@ static void highbd_dr_prediction_z3_upsample0_neon(uint16_t *dst,
|
|||
if (base0 >= max_base_y) {
|
||||
out[0] = vdupq_n_u16(left_max);
|
||||
} else {
|
||||
const uint16x8_t l00 = vld1q_u16(left + base0);
|
||||
const uint16x8_t l01 = vld1q_u16(left1 + base0);
|
||||
HIGHBD_DR_PREDICTOR_Z3_STEP_X8(&out[0], iota1x8, base0, l00, l01,
|
||||
shifts0, shifts1, 0, 6);
|
||||
const uint16x8x2_t l0 = z3_load_left_neon(left, base0, max_base_y);
|
||||
HIGHBD_DR_PREDICTOR_Z3_STEP_X8(&out[0], iota1x8, base0, l0.val[0],
|
||||
l0.val[1], shifts0, shifts1, 0, 6);
|
||||
}
|
||||
if (base1 >= max_base_y) {
|
||||
out[1] = vdupq_n_u16(left_max);
|
||||
} else {
|
||||
const uint16x8_t l10 = vld1q_u16(left + base1);
|
||||
const uint16x8_t l11 = vld1q_u16(left1 + base1);
|
||||
HIGHBD_DR_PREDICTOR_Z3_STEP_X8(&out[1], iota1x8, base1, l10, l11,
|
||||
shifts0, shifts1, 1, 6);
|
||||
const uint16x8x2_t l1 = z3_load_left_neon(left, base1, max_base_y);
|
||||
HIGHBD_DR_PREDICTOR_Z3_STEP_X8(&out[1], iota1x8, base1, l1.val[0],
|
||||
l1.val[1], shifts0, shifts1, 1, 6);
|
||||
}
|
||||
if (base2 >= max_base_y) {
|
||||
out[2] = vdupq_n_u16(left_max);
|
||||
} else {
|
||||
const uint16x8_t l20 = vld1q_u16(left + base2);
|
||||
const uint16x8_t l21 = vld1q_u16(left1 + base2);
|
||||
HIGHBD_DR_PREDICTOR_Z3_STEP_X8(&out[2], iota1x8, base2, l20, l21,
|
||||
shifts0, shifts1, 2, 6);
|
||||
const uint16x8x2_t l2 = z3_load_left_neon(left, base2, max_base_y);
|
||||
HIGHBD_DR_PREDICTOR_Z3_STEP_X8(&out[2], iota1x8, base2, l2.val[0],
|
||||
l2.val[1], shifts0, shifts1, 2, 6);
|
||||
}
|
||||
if (base3 >= max_base_y) {
|
||||
out[3] = vdupq_n_u16(left_max);
|
||||
} else {
|
||||
const uint16x8_t l30 = vld1q_u16(left + base3);
|
||||
const uint16x8_t l31 = vld1q_u16(left1 + base3);
|
||||
HIGHBD_DR_PREDICTOR_Z3_STEP_X8(&out[3], iota1x8, base3, l30, l31,
|
||||
shifts0, shifts1, 3, 6);
|
||||
const uint16x8x2_t l3 = z3_load_left_neon(left, base3, max_base_y);
|
||||
HIGHBD_DR_PREDICTOR_Z3_STEP_X8(&out[3], iota1x8, base3, l3.val[0],
|
||||
l3.val[1], shifts0, shifts1, 3, 6);
|
||||
}
|
||||
transpose_array_inplace_u16_4x8(out);
|
||||
for (int r2 = 0; r2 < 4; ++r2) {
|
||||
|
|
|
@ -14,6 +14,7 @@
|
|||
#include <string.h>
|
||||
|
||||
#include "config/aom_config.h"
|
||||
#include "config/aom_dsp_rtcd.h"
|
||||
|
||||
#include "aom_dsp/quantize.h"
|
||||
|
||||
|
|
|
@ -15,6 +15,7 @@
|
|||
|
||||
#include "config/aom_config.h"
|
||||
#include "config/aom_dsp_rtcd.h"
|
||||
#include "config/av1_rtcd.h"
|
||||
|
||||
#include "aom/aom_integer.h"
|
||||
#include "aom_dsp/arm/mem_neon.h"
|
||||
|
@ -1356,6 +1357,41 @@ static void dr_prediction_z1_32xN_neon(int N, uint8_t *dst, ptrdiff_t stride,
|
|||
}
|
||||
}
|
||||
|
||||
// clang-format off
|
||||
static const uint8_t kLoadMaxShuffles[] = {
|
||||
15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
|
||||
14, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
|
||||
13, 14, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
|
||||
12, 13, 14, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
|
||||
11, 12, 13, 14, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
|
||||
10, 11, 12, 13, 14, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
|
||||
9, 10, 11, 12, 13, 14, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
|
||||
8, 9, 10, 11, 12, 13, 14, 15, 15, 15, 15, 15, 15, 15, 15, 15,
|
||||
7, 8, 9, 10, 11, 12, 13, 14, 15, 15, 15, 15, 15, 15, 15, 15,
|
||||
6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 15, 15, 15, 15, 15, 15,
|
||||
5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 15, 15, 15, 15, 15,
|
||||
4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 15, 15, 15, 15,
|
||||
3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 15, 15, 15,
|
||||
2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 15, 15,
|
||||
1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 15,
|
||||
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
|
||||
};
|
||||
// clang-format on
|
||||
|
||||
static INLINE uint8x16_t z1_load_masked_neon(const uint8_t *ptr,
|
||||
int shuffle_idx) {
|
||||
uint8x16_t shuffle = vld1q_u8(&kLoadMaxShuffles[16 * shuffle_idx]);
|
||||
uint8x16_t src = vld1q_u8(ptr);
|
||||
#if AOM_ARCH_AARCH64
|
||||
return vqtbl1q_u8(src, shuffle);
|
||||
#else
|
||||
uint8x8x2_t src2 = { { vget_low_u8(src), vget_high_u8(src) } };
|
||||
uint8x8_t lo = vtbl2_u8(src2, vget_low_u8(shuffle));
|
||||
uint8x8_t hi = vtbl2_u8(src2, vget_high_u8(shuffle));
|
||||
return vcombine_u8(lo, hi);
|
||||
#endif
|
||||
}
|
||||
|
||||
static void dr_prediction_z1_64xN_neon(int N, uint8_t *dst, ptrdiff_t stride,
|
||||
const uint8_t *above, int dx) {
|
||||
const int frac_bits = 6;
|
||||
|
@ -1369,7 +1405,6 @@ static void dr_prediction_z1_64xN_neon(int N, uint8_t *dst, ptrdiff_t stride,
|
|||
// (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
|
||||
|
||||
const uint8x16_t a_mbase_x = vdupq_n_u8(above[max_base_x]);
|
||||
const uint8x16_t max_base_x128 = vdupq_n_u8(max_base_x);
|
||||
|
||||
int x = dx;
|
||||
for (int r = 0; r < N; r++, dst += stride) {
|
||||
|
@ -1391,12 +1426,24 @@ static void dr_prediction_z1_64xN_neon(int N, uint8_t *dst, ptrdiff_t stride,
|
|||
vcreate_u8(0x0F0E0D0C0B0A0908)));
|
||||
|
||||
for (int j = 0; j < 64; j += 16) {
|
||||
int mdif = max_base_x - (base + j);
|
||||
if (mdif <= 0) {
|
||||
if (base + j >= max_base_x) {
|
||||
vst1q_u8(dst + j, a_mbase_x);
|
||||
} else {
|
||||
uint8x16_t a0_128 = vld1q_u8(above + base + j);
|
||||
uint8x16_t a1_128 = vld1q_u8(above + base + 1 + j);
|
||||
uint8x16_t a0_128;
|
||||
uint8x16_t a1_128;
|
||||
if (base + j + 15 >= max_base_x) {
|
||||
int shuffle_idx = max_base_x - base - j;
|
||||
a0_128 = z1_load_masked_neon(above + (max_base_x - 15), shuffle_idx);
|
||||
} else {
|
||||
a0_128 = vld1q_u8(above + base + j);
|
||||
}
|
||||
if (base + j + 16 >= max_base_x) {
|
||||
int shuffle_idx = max_base_x - base - j - 1;
|
||||
a1_128 = z1_load_masked_neon(above + (max_base_x - 15), shuffle_idx);
|
||||
} else {
|
||||
a1_128 = vld1q_u8(above + base + j + 1);
|
||||
}
|
||||
|
||||
uint16x8_t diff_lo = vsubl_u8(vget_low_u8(a1_128), vget_low_u8(a0_128));
|
||||
uint16x8_t diff_hi =
|
||||
vsubl_u8(vget_high_u8(a1_128), vget_high_u8(a0_128));
|
||||
|
@ -1406,13 +1453,8 @@ static void dr_prediction_z1_64xN_neon(int N, uint8_t *dst, ptrdiff_t stride,
|
|||
vmlal_u8(vdupq_n_u16(16), vget_high_u8(a0_128), vdup_n_u8(32));
|
||||
uint16x8_t res_lo = vmlaq_u16(a32_lo, diff_lo, shift);
|
||||
uint16x8_t res_hi = vmlaq_u16(a32_hi, diff_hi, shift);
|
||||
uint8x16_t v_temp =
|
||||
vcombine_u8(vshrn_n_u16(res_lo, 5), vshrn_n_u16(res_hi, 5));
|
||||
|
||||
uint8x16_t mask128 =
|
||||
vcgtq_u8(vqsubq_u8(max_base_x128, base_inc128), vdupq_n_u8(0));
|
||||
uint8x16_t res128 = vbslq_u8(mask128, v_temp, a_mbase_x);
|
||||
vst1q_u8(dst + j, res128);
|
||||
vst1q_u8(dst + j,
|
||||
vcombine_u8(vshrn_n_u16(res_lo, 5), vshrn_n_u16(res_hi, 5)));
|
||||
|
||||
base_inc128 = vaddq_u8(base_inc128, vdupq_n_u8(16));
|
||||
}
|
||||
|
|
|
@ -174,6 +174,16 @@ static INLINE void load_u8_8x4(const uint8_t *s, const ptrdiff_t p,
|
|||
*s3 = vld1_u8(s);
|
||||
}
|
||||
|
||||
static INLINE void load_u8_8x3(const uint8_t *s, const ptrdiff_t p,
|
||||
uint8x8_t *const s0, uint8x8_t *const s1,
|
||||
uint8x8_t *const s2) {
|
||||
*s0 = vld1_u8(s);
|
||||
s += p;
|
||||
*s1 = vld1_u8(s);
|
||||
s += p;
|
||||
*s2 = vld1_u8(s);
|
||||
}
|
||||
|
||||
static INLINE void load_u16_4x4(const uint16_t *s, const ptrdiff_t p,
|
||||
uint16x4_t *const s0, uint16x4_t *const s1,
|
||||
uint16x4_t *const s2, uint16x4_t *const s3) {
|
||||
|
@ -221,6 +231,16 @@ static INLINE void load_u16_8x2(const uint16_t *s, const ptrdiff_t p,
|
|||
*s1 = vld1q_u16(s);
|
||||
}
|
||||
|
||||
static INLINE void load_u16_8x3(const uint16_t *s, const ptrdiff_t p,
|
||||
uint16x8_t *const s0, uint16x8_t *const s1,
|
||||
uint16x8_t *const s2) {
|
||||
*s0 = vld1q_u16(s);
|
||||
s += p;
|
||||
*s1 = vld1q_u16(s);
|
||||
s += p;
|
||||
*s2 = vld1q_u16(s);
|
||||
}
|
||||
|
||||
static INLINE void load_u16_8x4(const uint16_t *s, const ptrdiff_t p,
|
||||
uint16x8_t *const s0, uint16x8_t *const s1,
|
||||
uint16x8_t *const s2, uint16x8_t *const s3) {
|
||||
|
@ -634,6 +654,13 @@ static INLINE void store_s16_8x4(int16_t *s, ptrdiff_t dst_stride,
|
|||
vst1q_s16(s, s3);
|
||||
}
|
||||
|
||||
static INLINE void store_s16_8x2(int16_t *s, ptrdiff_t dst_stride,
|
||||
const int16x8_t s0, const int16x8_t s1) {
|
||||
vst1q_s16(s, s0);
|
||||
s += dst_stride;
|
||||
vst1q_s16(s, s1);
|
||||
}
|
||||
|
||||
static INLINE void load_u8_8x11(const uint8_t *s, ptrdiff_t p,
|
||||
uint8x8_t *const s0, uint8x8_t *const s1,
|
||||
uint8x8_t *const s2, uint8x8_t *const s3,
|
||||
|
@ -1026,6 +1053,21 @@ static INLINE void load_u8_16x8(const uint8_t *s, ptrdiff_t p,
|
|||
*s7 = vld1q_u8(s);
|
||||
}
|
||||
|
||||
static INLINE void load_u8_16x5(const uint8_t *s, ptrdiff_t p,
|
||||
uint8x16_t *const s0, uint8x16_t *const s1,
|
||||
uint8x16_t *const s2, uint8x16_t *const s3,
|
||||
uint8x16_t *const s4) {
|
||||
*s0 = vld1q_u8(s);
|
||||
s += p;
|
||||
*s1 = vld1q_u8(s);
|
||||
s += p;
|
||||
*s2 = vld1q_u8(s);
|
||||
s += p;
|
||||
*s3 = vld1q_u8(s);
|
||||
s += p;
|
||||
*s4 = vld1q_u8(s);
|
||||
}
|
||||
|
||||
static INLINE void load_u8_16x4(const uint8_t *s, ptrdiff_t p,
|
||||
uint8x16_t *const s0, uint8x16_t *const s1,
|
||||
uint8x16_t *const s2, uint8x16_t *const s3) {
|
||||
|
@ -1038,6 +1080,16 @@ static INLINE void load_u8_16x4(const uint8_t *s, ptrdiff_t p,
|
|||
*s3 = vld1q_u8(s);
|
||||
}
|
||||
|
||||
static INLINE void load_u8_16x3(const uint8_t *s, ptrdiff_t p,
|
||||
uint8x16_t *const s0, uint8x16_t *const s1,
|
||||
uint8x16_t *const s2) {
|
||||
*s0 = vld1q_u8(s);
|
||||
s += p;
|
||||
*s1 = vld1q_u8(s);
|
||||
s += p;
|
||||
*s2 = vld1q_u8(s);
|
||||
}
|
||||
|
||||
static INLINE void load_u16_8x8(const uint16_t *s, const ptrdiff_t p,
|
||||
uint16x8_t *s0, uint16x8_t *s1, uint16x8_t *s2,
|
||||
uint16x8_t *s3, uint16x8_t *s4, uint16x8_t *s5,
|
||||
|
@ -1228,6 +1280,12 @@ static INLINE uint8x8_t load_u8_gather_s16_x8(const uint8_t *src,
|
|||
memcpy(dst, &a, 8); \
|
||||
} while (0)
|
||||
|
||||
#define store_s16_4x1_lane(dst, src, lane) \
|
||||
do { \
|
||||
int64_t a = vgetq_lane_s64(vreinterpretq_s64_s16(src), lane); \
|
||||
memcpy(dst, &a, 8); \
|
||||
} while (0)
|
||||
|
||||
// Store the low 16-bits from a single vector.
|
||||
static INLINE void store_u8_2x1(uint8_t *dst, const uint8x8_t src) {
|
||||
store_u8_2x1_lane(dst, src, 0);
|
||||
|
@ -1287,9 +1345,18 @@ static INLINE void store_u16x4_strided_x2(uint16_t *dst, uint32_t dst_stride,
|
|||
store_u16_4x1_lane(dst, src, 1);
|
||||
}
|
||||
|
||||
// Store two blocks of 64-bits from a single vector.
|
||||
static INLINE void store_s16x4_strided_x2(int16_t *dst, int32_t dst_stride,
|
||||
int16x8_t src) {
|
||||
store_s16_4x1_lane(dst, src, 0);
|
||||
dst += dst_stride;
|
||||
store_s16_4x1_lane(dst, src, 1);
|
||||
}
|
||||
|
||||
#undef store_u8_2x1_lane
|
||||
#undef store_u8_4x1_lane
|
||||
#undef store_u16_2x1_lane
|
||||
#undef store_u16_4x1_lane
|
||||
#undef store_s16_4x1_lane
|
||||
|
||||
#endif // AOM_AOM_DSP_ARM_MEM_NEON_H_
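The new store_s16x4_strided_x2() follows the same pattern as the existing u16 helper: one 128-bit register is split into two 64-bit lanes and each lane is written to its own row through memcpy, which keeps the store safe for unaligned destinations. A rough intrinsic-free sketch of that pattern (illustrative only, not part of the patch):

```c
#include <stdint.h>
#include <string.h>

// Model of the store_*_4x1_lane macros: copy one 64-bit half of a vector to a
// possibly misaligned destination. memcpy instead of a pointer cast keeps the
// store alignment- and aliasing-safe; compilers lower it to one 8-byte store.
static void store_4x1_lane_scalar(int16_t *dst, const int16_t src8[8],
                                  int lane) {
  int64_t a;
  memcpy(&a, src8 + 4 * lane, 8);  // vgetq_lane_s64(vreinterpretq_..., lane)
  memcpy(dst, &a, 8);
}

// store_s16x4_strided_x2: two rows of four int16 samples from one register.
static void store_s16x4_strided_x2_scalar(int16_t *dst, int stride,
                                          const int16_t src8[8]) {
  store_4x1_lane_scalar(dst, src8, 0);
  store_4x1_lane_scalar(dst + stride, src8, 1);
}
```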
@ -12,6 +12,7 @@
|
|||
#include <arm_neon.h>
|
||||
|
||||
#include "config/aom_config.h"
|
||||
#include "config/aom_dsp_rtcd.h"
|
||||
|
||||
#include "aom/aom_integer.h"
|
||||
#include "aom_ports/mem.h"
|
||||
|
|
|
@ -1,61 +0,0 @@
|
|||
/*
|
||||
* Copyright (c) 2016, Alliance for Open Media. All rights reserved
|
||||
*
|
||||
* This source code is subject to the terms of the BSD 2 Clause License and
|
||||
* the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
|
||||
* was not distributed with this source code in the LICENSE file, you can
|
||||
* obtain it at www.aomedia.org/license/software. If the Alliance for Open
|
||||
* Media Patent License 1.0 was not distributed with this source code in the
|
||||
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
|
||||
*/
|
||||
|
||||
#include "config/aom_config.h"
|
||||
#include "config/aom_dsp_rtcd.h"
|
||||
|
||||
#include "aom_dsp/x86/convolve.h"
|
||||
|
||||
#if HAVE_SSE2
|
||||
#if CONFIG_AV1_HIGHBITDEPTH
|
||||
highbd_filter8_1dfunction aom_highbd_filter_block1d16_v8_sse2;
|
||||
highbd_filter8_1dfunction aom_highbd_filter_block1d16_h8_sse2;
|
||||
highbd_filter8_1dfunction aom_highbd_filter_block1d8_v8_sse2;
|
||||
highbd_filter8_1dfunction aom_highbd_filter_block1d8_h8_sse2;
|
||||
highbd_filter8_1dfunction aom_highbd_filter_block1d4_v8_sse2;
|
||||
highbd_filter8_1dfunction aom_highbd_filter_block1d4_h8_sse2;
|
||||
|
||||
highbd_filter8_1dfunction aom_highbd_filter_block1d16_v4_sse2;
|
||||
highbd_filter8_1dfunction aom_highbd_filter_block1d16_h4_sse2;
|
||||
highbd_filter8_1dfunction aom_highbd_filter_block1d8_v4_sse2;
|
||||
highbd_filter8_1dfunction aom_highbd_filter_block1d8_h4_sse2;
|
||||
highbd_filter8_1dfunction aom_highbd_filter_block1d4_v4_sse2;
|
||||
highbd_filter8_1dfunction aom_highbd_filter_block1d4_h4_sse2;
|
||||
|
||||
highbd_filter8_1dfunction aom_highbd_filter_block1d16_v2_sse2;
|
||||
highbd_filter8_1dfunction aom_highbd_filter_block1d16_h2_sse2;
|
||||
highbd_filter8_1dfunction aom_highbd_filter_block1d8_v2_sse2;
|
||||
highbd_filter8_1dfunction aom_highbd_filter_block1d8_h2_sse2;
|
||||
highbd_filter8_1dfunction aom_highbd_filter_block1d4_v2_sse2;
|
||||
highbd_filter8_1dfunction aom_highbd_filter_block1d4_h2_sse2;
|
||||
|
||||
// void aom_highbd_convolve8_horiz_sse2(const uint8_t *src,
|
||||
// ptrdiff_t src_stride,
|
||||
// uint8_t *dst,
|
||||
// ptrdiff_t dst_stride,
|
||||
// const int16_t *filter_x,
|
||||
// int x_step_q4,
|
||||
// const int16_t *filter_y,
|
||||
// int y_step_q4,
|
||||
// int w, int h, int bd);
|
||||
// void aom_highbd_convolve8_vert_sse2(const uint8_t *src,
|
||||
// ptrdiff_t src_stride,
|
||||
// uint8_t *dst,
|
||||
// ptrdiff_t dst_stride,
|
||||
// const int16_t *filter_x,
|
||||
// int x_step_q4,
|
||||
// const int16_t *filter_y,
|
||||
// int y_step_q4,
|
||||
// int w, int h, int bd);
|
||||
HIGH_FUN_CONV_1D(horiz, x_step_q4, filter_x, h, src, , sse2)
|
||||
HIGH_FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , sse2)
|
||||
#endif
|
||||
#endif // HAVE_SSE2
|
|
@ -202,14 +202,15 @@
|
|||
|
||||
SECTION .text
|
||||
|
||||
;void aom_filter_block1d4_v8_sse2
|
||||
;void aom_highbd_filter_block1d4_v8_sse2
|
||||
;(
|
||||
; unsigned char *src_ptr,
|
||||
; unsigned int src_pitch,
|
||||
; unsigned char *output_ptr,
|
||||
; unsigned int out_pitch,
|
||||
; unsigned int output_height,
|
||||
; short *filter
|
||||
; const uint16_t *src_ptr,
|
||||
; const ptrdiff_t src_pitch,
|
||||
; uint16_t *output_ptr,
|
||||
; ptrdiff_t out_pitch,
|
||||
; unsigned int output_height,
|
||||
; const int16_t *filter,
|
||||
; int bd
|
||||
;)
|
||||
globalsym(aom_highbd_filter_block1d4_v8_sse2)
|
||||
sym(aom_highbd_filter_block1d4_v8_sse2):
|
||||
|
@ -272,14 +273,15 @@ sym(aom_highbd_filter_block1d4_v8_sse2):
|
|||
pop rbp
|
||||
ret
|
||||
|
||||
;void aom_filter_block1d8_v8_sse2
|
||||
;void aom_highbd_filter_block1d8_v8_sse2
|
||||
;(
|
||||
; unsigned char *src_ptr,
|
||||
; unsigned int src_pitch,
|
||||
; unsigned char *output_ptr,
|
||||
; unsigned int out_pitch,
|
||||
; unsigned int output_height,
|
||||
; short *filter
|
||||
; const uint16_t *src_ptr,
|
||||
; const ptrdiff_t src_pitch,
|
||||
; uint16_t *output_ptr,
|
||||
; ptrdiff_t out_pitch,
|
||||
; unsigned int output_height,
|
||||
; const int16_t *filter,
|
||||
; int bd
|
||||
;)
|
||||
globalsym(aom_highbd_filter_block1d8_v8_sse2)
|
||||
sym(aom_highbd_filter_block1d8_v8_sse2):
|
||||
|
@ -331,14 +333,15 @@ sym(aom_highbd_filter_block1d8_v8_sse2):
|
|||
pop rbp
|
||||
ret
|
||||
|
||||
;void aom_filter_block1d16_v8_sse2
|
||||
;void aom_highbd_filter_block1d16_v8_sse2
|
||||
;(
|
||||
; unsigned char *src_ptr,
|
||||
; unsigned int src_pitch,
|
||||
; unsigned char *output_ptr,
|
||||
; unsigned int out_pitch,
|
||||
; unsigned int output_height,
|
||||
; short *filter
|
||||
; const uint16_t *src_ptr,
|
||||
; const ptrdiff_t src_pitch,
|
||||
; uint16_t *output_ptr,
|
||||
; ptrdiff_t out_pitch,
|
||||
; unsigned int output_height,
|
||||
; const int16_t *filter,
|
||||
; int bd
|
||||
;)
|
||||
globalsym(aom_highbd_filter_block1d16_v8_sse2)
|
||||
sym(aom_highbd_filter_block1d16_v8_sse2):
|
||||
|
@ -394,14 +397,15 @@ sym(aom_highbd_filter_block1d16_v8_sse2):
|
|||
pop rbp
|
||||
ret
|
||||
|
||||
;void aom_filter_block1d4_h8_sse2
|
||||
;void aom_highbd_filter_block1d4_h8_sse2
|
||||
;(
|
||||
; unsigned char *src_ptr,
|
||||
; unsigned int src_pixels_per_line,
|
||||
; unsigned char *output_ptr,
|
||||
; unsigned int output_pitch,
|
||||
; unsigned int output_height,
|
||||
; short *filter
|
||||
; const uint16_t *src_ptr,
|
||||
; const ptrdiff_t src_pitch,
|
||||
; uint16_t *output_ptr,
|
||||
; ptrdiff_t out_pitch,
|
||||
; unsigned int output_height,
|
||||
; const int16_t *filter,
|
||||
; int bd
|
||||
;)
|
||||
globalsym(aom_highbd_filter_block1d4_h8_sse2)
|
||||
sym(aom_highbd_filter_block1d4_h8_sse2):
|
||||
|
@ -469,14 +473,15 @@ sym(aom_highbd_filter_block1d4_h8_sse2):
|
|||
pop rbp
|
||||
ret
|
||||
|
||||
;void aom_filter_block1d8_h8_sse2
|
||||
;void aom_highbd_filter_block1d8_h8_sse2
|
||||
;(
|
||||
; unsigned char *src_ptr,
|
||||
; unsigned int src_pixels_per_line,
|
||||
; unsigned char *output_ptr,
|
||||
; unsigned int output_pitch,
|
||||
; unsigned int output_height,
|
||||
; short *filter
|
||||
; const uint16_t *src_ptr,
|
||||
; const ptrdiff_t src_pitch,
|
||||
; uint16_t *output_ptr,
|
||||
; ptrdiff_t out_pitch,
|
||||
; unsigned int output_height,
|
||||
; const int16_t *filter,
|
||||
; int bd
|
||||
;)
|
||||
globalsym(aom_highbd_filter_block1d8_h8_sse2)
|
||||
sym(aom_highbd_filter_block1d8_h8_sse2):
|
||||
|
@ -535,14 +540,15 @@ sym(aom_highbd_filter_block1d8_h8_sse2):
|
|||
pop rbp
|
||||
ret
|
||||
|
||||
;void aom_filter_block1d16_h8_sse2
|
||||
;void aom_highbd_filter_block1d16_h8_sse2
|
||||
;(
|
||||
; unsigned char *src_ptr,
|
||||
; unsigned int src_pixels_per_line,
|
||||
; unsigned char *output_ptr,
|
||||
; unsigned int output_pitch,
|
||||
; unsigned int output_height,
|
||||
; short *filter
|
||||
; const uint16_t *src_ptr,
|
||||
; const ptrdiff_t src_pitch,
|
||||
; uint16_t *output_ptr,
|
||||
; ptrdiff_t out_pitch,
|
||||
; unsigned int output_height,
|
||||
; const int16_t *filter,
|
||||
; int bd
|
||||
;)
|
||||
globalsym(aom_highbd_filter_block1d16_h8_sse2)
|
||||
sym(aom_highbd_filter_block1d16_h8_sse2):
|
||||
|
|
|
@ -15,6 +15,7 @@
|
|||
#include "aom/aom_integer.h"
|
||||
#include "aom_dsp/x86/bitdepth_conversion_sse2.h"
|
||||
#include "aom_dsp/x86/mem_sse2.h"
|
||||
#include "aom_dsp/x86/synonyms.h"
|
||||
#include "aom_ports/mem.h"
|
||||
|
||||
static INLINE void sign_extend_16bit_to_32bit_sse2(__m128i in, __m128i zero,
|
||||
|
@ -171,10 +172,8 @@ unsigned int aom_avg_4x4_sse2(const uint8_t *s, int p) {
|
|||
__m128i s0, s1, u0;
|
||||
unsigned int avg = 0;
|
||||
u0 = _mm_setzero_si128();
|
||||
s0 = _mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)(s)),
|
||||
_mm_cvtsi32_si128(*(const int *)(s + p)));
|
||||
s1 = _mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)(s + p * 2)),
|
||||
_mm_cvtsi32_si128(*(const int *)(s + p * 3)));
|
||||
s0 = _mm_unpacklo_epi32(xx_loadl_32(s), xx_loadl_32(s + p));
|
||||
s1 = _mm_unpacklo_epi32(xx_loadl_32(s + p * 2), xx_loadl_32(s + p * 3));
|
||||
s0 = _mm_sad_epu8(s0, u0);
|
||||
s1 = _mm_sad_epu8(s1, u0);
|
||||
s0 = _mm_add_epi16(s0, s1);
|
||||
|
|
|
@ -15,10 +15,9 @@
|
|||
|
||||
// -----------------------------------------------------------------------------
|
||||
|
||||
void aom_highbd_filter_block1d4_v4_sse2(const uint16_t *src_ptr,
|
||||
ptrdiff_t src_pitch, uint16_t *dst_ptr,
|
||||
ptrdiff_t dst_pitch, uint32_t height,
|
||||
const int16_t *filter, int bd) {
|
||||
static void aom_highbd_filter_block1d4_v4_sse2(
|
||||
const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
|
||||
ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
|
||||
__m128i filtersReg;
|
||||
__m128i srcReg2, srcReg3, srcReg4, srcReg5, srcReg6;
|
||||
__m128i srcReg23_lo, srcReg34_lo;
|
||||
|
@ -101,10 +100,9 @@ void aom_highbd_filter_block1d4_v4_sse2(const uint16_t *src_ptr,
|
|||
}
|
||||
}
|
||||
|
||||
void aom_highbd_filter_block1d4_h4_sse2(const uint16_t *src_ptr,
|
||||
ptrdiff_t src_pitch, uint16_t *dst_ptr,
|
||||
ptrdiff_t dst_pitch, uint32_t height,
|
||||
const int16_t *filter, int bd) {
|
||||
static void aom_highbd_filter_block1d4_h4_sse2(
|
||||
const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
|
||||
ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
|
||||
__m128i filtersReg;
|
||||
__m128i addFilterReg64;
|
||||
__m128i secondFilters, thirdFilters;
|
||||
|
@ -153,10 +151,9 @@ void aom_highbd_filter_block1d4_h4_sse2(const uint16_t *src_ptr,
|
|||
}
|
||||
}
|
||||
|
||||
void aom_highbd_filter_block1d8_v4_sse2(const uint16_t *src_ptr,
|
||||
ptrdiff_t src_pitch, uint16_t *dst_ptr,
|
||||
ptrdiff_t dst_pitch, uint32_t height,
|
||||
const int16_t *filter, int bd) {
|
||||
static void aom_highbd_filter_block1d8_v4_sse2(
|
||||
const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
|
||||
ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
|
||||
__m128i filtersReg;
|
||||
__m128i srcReg2, srcReg3, srcReg4, srcReg5, srcReg6;
|
||||
__m128i srcReg23_lo, srcReg23_hi, srcReg34_lo, srcReg34_hi;
|
||||
|
@ -262,10 +259,9 @@ void aom_highbd_filter_block1d8_v4_sse2(const uint16_t *src_ptr,
|
|||
}
|
||||
}
|
||||
|
||||
void aom_highbd_filter_block1d8_h4_sse2(const uint16_t *src_ptr,
|
||||
ptrdiff_t src_pitch, uint16_t *dst_ptr,
|
||||
ptrdiff_t dst_pitch, uint32_t height,
|
||||
const int16_t *filter, int bd) {
|
||||
static void aom_highbd_filter_block1d8_h4_sse2(
|
||||
const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
|
||||
ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
|
||||
__m128i filtersReg;
|
||||
__m128i addFilterReg64;
|
||||
__m128i secondFilters, thirdFilters;
|
||||
|
@ -330,22 +326,57 @@ void aom_highbd_filter_block1d8_h4_sse2(const uint16_t *src_ptr,
|
|||
}
|
||||
}
|
||||
|
||||
void aom_highbd_filter_block1d16_v4_sse2(const uint16_t *src_ptr,
|
||||
ptrdiff_t src_pitch, uint16_t *dst_ptr,
|
||||
ptrdiff_t dst_pitch, uint32_t height,
|
||||
const int16_t *filter, int bd) {
|
||||
static void aom_highbd_filter_block1d16_v4_sse2(
|
||||
const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
|
||||
ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
|
||||
aom_highbd_filter_block1d8_v4_sse2(src_ptr, src_pitch, dst_ptr, dst_pitch,
|
||||
height, filter, bd);
|
||||
aom_highbd_filter_block1d8_v4_sse2((src_ptr + 8), src_pitch, (dst_ptr + 8),
|
||||
dst_pitch, height, filter, bd);
|
||||
}
|
||||
|
||||
void aom_highbd_filter_block1d16_h4_sse2(const uint16_t *src_ptr,
|
||||
ptrdiff_t src_pitch, uint16_t *dst_ptr,
|
||||
ptrdiff_t dst_pitch, uint32_t height,
|
||||
const int16_t *filter, int bd) {
|
||||
static void aom_highbd_filter_block1d16_h4_sse2(
|
||||
const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
|
||||
ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
|
||||
aom_highbd_filter_block1d8_h4_sse2(src_ptr, src_pitch, dst_ptr, dst_pitch,
|
||||
height, filter, bd);
|
||||
aom_highbd_filter_block1d8_h4_sse2((src_ptr + 8), src_pitch, (dst_ptr + 8),
|
||||
dst_pitch, height, filter, bd);
|
||||
}
|
||||
|
||||
// From aom_dsp/x86/aom_high_subpixel_8t_sse2.asm
|
||||
highbd_filter8_1dfunction aom_highbd_filter_block1d16_v8_sse2;
|
||||
highbd_filter8_1dfunction aom_highbd_filter_block1d16_h8_sse2;
|
||||
highbd_filter8_1dfunction aom_highbd_filter_block1d8_v8_sse2;
|
||||
highbd_filter8_1dfunction aom_highbd_filter_block1d8_h8_sse2;
|
||||
highbd_filter8_1dfunction aom_highbd_filter_block1d4_v8_sse2;
|
||||
highbd_filter8_1dfunction aom_highbd_filter_block1d4_h8_sse2;
|
||||
|
||||
// From aom_dsp/x86/aom_high_subpixel_bilinear_sse2.asm
|
||||
highbd_filter8_1dfunction aom_highbd_filter_block1d16_v2_sse2;
|
||||
highbd_filter8_1dfunction aom_highbd_filter_block1d16_h2_sse2;
|
||||
highbd_filter8_1dfunction aom_highbd_filter_block1d8_v2_sse2;
|
||||
highbd_filter8_1dfunction aom_highbd_filter_block1d8_h2_sse2;
|
||||
highbd_filter8_1dfunction aom_highbd_filter_block1d4_v2_sse2;
|
||||
highbd_filter8_1dfunction aom_highbd_filter_block1d4_h2_sse2;
|
||||
|
||||
// void aom_highbd_convolve8_horiz_sse2(const uint8_t *src,
|
||||
// ptrdiff_t src_stride,
|
||||
// uint8_t *dst,
|
||||
// ptrdiff_t dst_stride,
|
||||
// const int16_t *filter_x,
|
||||
// int x_step_q4,
|
||||
// const int16_t *filter_y,
|
||||
// int y_step_q4,
|
||||
// int w, int h, int bd);
|
||||
// void aom_highbd_convolve8_vert_sse2(const uint8_t *src,
|
||||
// ptrdiff_t src_stride,
|
||||
// uint8_t *dst,
|
||||
// ptrdiff_t dst_stride,
|
||||
// const int16_t *filter_x,
|
||||
// int x_step_q4,
|
||||
// const int16_t *filter_y,
|
||||
// int y_step_q4,
|
||||
// int w, int h, int bd);
|
||||
HIGH_FUN_CONV_1D(horiz, x_step_q4, filter_x, h, src, , sse2)
|
||||
HIGH_FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , sse2)
|
||||
|
|
|
@ -551,7 +551,7 @@ unsigned int aom_highbd_sad128x128_avg_avx2(const uint8_t *src, int src_stride,
|
|||
static INLINE void get_4d_sad_from_mm256_epi32(const __m256i *v,
|
||||
uint32_t *res) {
|
||||
__m256i u0, u1, u2, u3;
|
||||
const __m256i mask = yy_set1_64_from_32i(~0);
|
||||
const __m256i mask = _mm256_set1_epi64x(~0u);
|
||||
__m128i sad;
|
||||
|
||||
// 8 32-bit summation
|
||||
|
|
|
@ -17,16 +17,7 @@
|
|||
#include "config/aom_dsp_rtcd.h"
|
||||
|
||||
#include "aom_dsp/x86/synonyms.h"
|
||||
|
||||
void aom_var_filter_block2d_bil_first_pass_ssse3(
|
||||
const uint8_t *a, uint16_t *b, unsigned int src_pixels_per_line,
|
||||
unsigned int pixel_step, unsigned int output_height,
|
||||
unsigned int output_width, const uint8_t *filter);
|
||||
|
||||
void aom_var_filter_block2d_bil_second_pass_ssse3(
|
||||
const uint16_t *a, uint8_t *b, unsigned int src_pixels_per_line,
|
||||
unsigned int pixel_step, unsigned int output_height,
|
||||
unsigned int output_width, const uint8_t *filter);
|
||||
#include "aom_dsp/x86/variance_impl_ssse3.h"
|
||||
|
||||
static INLINE void compute_dist_wtd_avg(__m128i *p0, __m128i *p1,
|
||||
const __m128i *w, const __m128i *r,
|
||||
|
|
|
@ -9,7 +9,7 @@
|
|||
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
|
||||
*/
|
||||
|
||||
#include <tmmintrin.h>
|
||||
#include <immintrin.h>
|
||||
|
||||
#include "config/aom_config.h"
|
||||
#include "config/aom_dsp_rtcd.h"
|
||||
|
|
|
@ -15,6 +15,7 @@
|
|||
#include <smmintrin.h>
|
||||
|
||||
#include "aom_dsp/x86/obmc_intrinsic_ssse3.h"
|
||||
#include "aom_dsp/x86/synonyms.h"
|
||||
|
||||
static INLINE void obmc_variance_w4(const uint8_t *pre, const int pre_stride,
|
||||
const int32_t *wsrc, const int32_t *mask,
|
||||
|
@ -28,7 +29,7 @@ static INLINE void obmc_variance_w4(const uint8_t *pre, const int pre_stride,
|
|||
assert(IS_POWER_OF_TWO(h));
|
||||
|
||||
do {
|
||||
const __m128i v_p_b = _mm_cvtsi32_si128(*(const int *)(pre + n));
|
||||
const __m128i v_p_b = xx_loadl_32(pre + n);
|
||||
const __m128i v_m_d = _mm_load_si128((const __m128i *)(mask + n));
|
||||
const __m128i v_w_d = _mm_load_si128((const __m128i *)(wsrc + n));
|
||||
|
||||
|
|
|
@ -22,21 +22,12 @@
|
|||
#include "aom_dsp/aom_filter.h"
|
||||
#include "aom_dsp/x86/obmc_intrinsic_sse4.h"
|
||||
#include "aom_dsp/x86/synonyms.h"
|
||||
#include "aom_dsp/x86/variance_impl_ssse3.h"
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
// 8 bit
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
void aom_var_filter_block2d_bil_first_pass_ssse3(
|
||||
const uint8_t *a, uint16_t *b, unsigned int src_pixels_per_line,
|
||||
unsigned int pixel_step, unsigned int output_height,
|
||||
unsigned int output_width, const uint8_t *filter);
|
||||
|
||||
void aom_var_filter_block2d_bil_second_pass_ssse3(
|
||||
const uint16_t *a, uint8_t *b, unsigned int src_pixels_per_line,
|
||||
unsigned int pixel_step, unsigned int output_height,
|
||||
unsigned int output_width, const uint8_t *filter);
|
||||
|
||||
static INLINE void obmc_variance_w8n(const uint8_t *pre, const int pre_stride,
|
||||
const int32_t *wsrc, const int32_t *mask,
|
||||
unsigned int *const sse, int *const sum,
|
||||
|
|
|
@ -21,7 +21,7 @@ static uint64_t aom_sum_squares_2d_i16_nxn_avx2(const int16_t *src, int stride,
|
|||
int width, int height) {
|
||||
uint64_t result;
|
||||
__m256i v_acc_q = _mm256_setzero_si256();
|
||||
const __m256i v_zext_mask_q = yy_set1_64_from_32i(~0);
|
||||
const __m256i v_zext_mask_q = _mm256_set1_epi64x(~0u);
|
||||
for (int col = 0; col < height; col += 4) {
|
||||
__m256i v_acc_d = _mm256_setzero_si256();
|
||||
for (int row = 0; row < width; row += 16) {
|
||||
|
|
|
@ -84,7 +84,7 @@ uint64_t aom_sum_squares_2d_i16_4xn_sse2(const int16_t *src, int stride,
|
|||
src += stride << 2;
|
||||
r += 4;
|
||||
} while (r < height);
|
||||
const __m128i v_zext_mask_q = xx_set1_64_from_32i(~0);
|
||||
const __m128i v_zext_mask_q = _mm_set1_epi64x(~0u);
|
||||
__m128i v_acc_64 = _mm_add_epi64(_mm_srli_epi64(v_acc_q, 32),
|
||||
_mm_and_si128(v_acc_q, v_zext_mask_q));
|
||||
v_acc_64 = _mm_add_epi64(v_acc_64, _mm_srli_si128(v_acc_64, 8));
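The xx_set1_64_from_32i(~0) to _mm_set1_epi64x(~0u) switch here (and the matching AVX2 change above) only drops the old MSVC workaround; ~0u still yields the 0x00000000FFFFFFFF per-lane mask. A scalar sketch of how that mask widens the packed 32-bit partial sums before they can wrap (illustrative, not part of the patch):

```c
#include <stdint.h>

// Each 64-bit accumulator lane holds two packed 32-bit partial sums. Before
// they can overflow, split them: AND with 0x00000000FFFFFFFF keeps the low
// sum, a logical shift right by 32 extracts the high sum, and the two are
// then added with full 64-bit precision.
static uint64_t widen_packed_sums(uint64_t acc_lane) {
  const uint64_t zext_mask = (uint64_t)(~0u);  // _mm_set1_epi64x(~0u)
  return (acc_lane & zext_mask) + (acc_lane >> 32);
}
```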
@ -116,7 +116,7 @@ aom_sum_squares_2d_i16_nxn_sse2(const int16_t *src, int stride, int width,
|
|||
int height) {
|
||||
int r = 0;
|
||||
|
||||
const __m128i v_zext_mask_q = xx_set1_64_from_32i(~0);
|
||||
const __m128i v_zext_mask_q = _mm_set1_epi64x(~0u);
|
||||
__m128i v_acc_q = _mm_setzero_si128();
|
||||
|
||||
do {
|
||||
|
@ -254,7 +254,7 @@ uint64_t aom_sum_sse_2d_i16_sse2(const int16_t *src, int src_stride, int width,
|
|||
//////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
static uint64_t aom_sum_squares_i16_64n_sse2(const int16_t *src, uint32_t n) {
|
||||
const __m128i v_zext_mask_q = xx_set1_64_from_32i(~0);
|
||||
const __m128i v_zext_mask_q = _mm_set1_epi64x(~0u);
|
||||
__m128i v_acc0_q = _mm_setzero_si128();
|
||||
__m128i v_acc1_q = _mm_setzero_si128();
|
||||
|
||||
|
|
|
@ -12,7 +12,7 @@
#ifndef AOM_AOM_DSP_X86_SYNONYMS_H_
#define AOM_AOM_DSP_X86_SYNONYMS_H_

#include <immintrin.h>
#include <emmintrin.h>
#include <string.h>

#include "config/aom_config.h"
@ -46,23 +46,13 @@ static INLINE __m128i xx_loadu_128(const void *a) {
|
|||
return _mm_loadu_si128((const __m128i *)a);
|
||||
}
|
||||
|
||||
// _mm_loadu_si64 has been introduced in GCC 9, reimplement the function
|
||||
// manually on older compilers.
|
||||
#if !defined(__clang__) && __GNUC_MAJOR__ < 9
|
||||
static INLINE __m128i xx_loadu_2x64(const void *hi, const void *lo) {
|
||||
__m64 hi_, lo_;
|
||||
memcpy(&hi_, hi, sizeof(hi_));
|
||||
memcpy(&lo_, lo, sizeof(lo_));
|
||||
return _mm_set_epi64(hi_, lo_);
|
||||
}
|
||||
#else
|
||||
// Load 64 bits from each of hi and low, and pack into an SSE register
|
||||
// Since directly loading as `int64_t`s and using _mm_set_epi64 may violate
|
||||
// the strict aliasing rule, this takes a different approach
|
||||
static INLINE __m128i xx_loadu_2x64(const void *hi, const void *lo) {
|
||||
return _mm_unpacklo_epi64(_mm_loadu_si64(lo), _mm_loadu_si64(hi));
|
||||
return _mm_unpacklo_epi64(_mm_loadl_epi64((const __m128i *)lo),
|
||||
_mm_loadl_epi64((const __m128i *)hi));
|
||||
}
|
||||
#endif
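The rewritten xx_loadu_2x64() drops the GCC-version special case: _mm_loadl_epi64 is available wherever SSE2 is, reads exactly 8 bytes, and avoids the strict-aliasing concern of dereferencing the input as int64_t. A minimal usage sketch with hypothetical data (not part of the patch):

```c
#include <emmintrin.h>
#include <stdint.h>

// Pack two unaligned 64-bit loads into one SSE register: lo goes to the low
// half, hi to the high half. Mirrors the header's xx_loadu_2x64().
static __m128i loadu_2x64(const void *hi, const void *lo) {
  return _mm_unpacklo_epi64(_mm_loadl_epi64((const __m128i *)lo),
                            _mm_loadl_epi64((const __m128i *)hi));
}

int main(void) {
  const uint16_t row0[4] = { 1, 2, 3, 4 }, row1[4] = { 5, 6, 7, 8 };
  __m128i v = loadu_2x64(row1, row0);  // u16 lanes: 1 2 3 4 5 6 7 8
  (void)v;
  return 0;
}
```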
static INLINE void xx_storel_32(void *const a, const __m128i v) {
|
||||
const int val = _mm_cvtsi128_si32(v);
|
||||
|
@ -81,28 +71,6 @@ static INLINE void xx_storeu_128(void *const a, const __m128i v) {
|
|||
_mm_storeu_si128((__m128i *)a, v);
|
||||
}
|
||||
|
||||
// The _mm_set_epi64x() intrinsic is undefined for some Visual Studio
|
||||
// compilers. The following function is equivalent to _mm_set_epi64x()
|
||||
// acting on 32-bit integers.
|
||||
static INLINE __m128i xx_set_64_from_32i(int32_t e1, int32_t e0) {
|
||||
#if defined(_MSC_VER) && _MSC_VER < 1900
|
||||
return _mm_set_epi32(0, e1, 0, e0);
|
||||
#else
|
||||
return _mm_set_epi64x((uint32_t)e1, (uint32_t)e0);
|
||||
#endif
|
||||
}
|
||||
|
||||
// The _mm_set1_epi64x() intrinsic is undefined for some Visual Studio
|
||||
// compilers. The following function is equivalent to _mm_set1_epi64x()
|
||||
// acting on a 32-bit integer.
|
||||
static INLINE __m128i xx_set1_64_from_32i(int32_t a) {
|
||||
#if defined(_MSC_VER) && _MSC_VER < 1900
|
||||
return _mm_set_epi32(0, a, 0, a);
|
||||
#else
|
||||
return _mm_set1_epi64x((uint32_t)a);
|
||||
#endif
|
||||
}
|
||||
|
||||
// Fill an SSE register using an interleaved pair of values, ie. set the
|
||||
// 8 channels to {a, b, a, b, a, b, a, b}, using the same channel ordering
|
||||
// as when a register is stored to / loaded from memory.
|
||||
|
|
|
@ -53,17 +53,6 @@ static INLINE __m256i yy_set2_epi16(int16_t a, int16_t b) {
|
|||
return _mm256_setr_epi16(a, b, a, b, a, b, a, b, a, b, a, b, a, b, a, b);
|
||||
}
|
||||
|
||||
// The _mm256_set1_epi64x() intrinsic is undefined for some Visual Studio
|
||||
// compilers. The following function is equivalent to _mm256_set1_epi64x()
|
||||
// acting on a 32-bit integer.
|
||||
static INLINE __m256i yy_set1_64_from_32i(int32_t a) {
|
||||
#if defined(_MSC_VER) && defined(_M_IX86) && _MSC_VER < 1900
|
||||
return _mm256_set_epi32(0, a, 0, a, 0, a, 0, a);
|
||||
#else
|
||||
return _mm256_set1_epi64x((uint32_t)a);
|
||||
#endif
|
||||
}
|
||||
|
||||
// Some compilers don't have _mm256_set_m128i defined in immintrin.h. We
|
||||
// therefore define an equivalent function using a different intrinsic.
|
||||
// ([ hi ], [ lo ]) -> [ hi ][ lo ]
|
||||
|
@ -71,26 +60,11 @@ static INLINE __m256i yy_set_m128i(__m128i hi, __m128i lo) {
|
|||
return _mm256_insertf128_si256(_mm256_castsi128_si256(lo), hi, 1);
|
||||
}
|
||||
|
||||
#define GCC_VERSION (__GNUC__ * 10000 \
|
||||
+ __GNUC_MINOR__ * 100 \
|
||||
+ __GNUC_PATCHLEVEL__)
|
||||
|
||||
// _mm256_loadu2_m128i has been introduced in GCC 10.1
|
||||
#if !defined(__clang__) && GCC_VERSION < 101000
|
||||
static INLINE __m256i yy_loadu2_128(const void *hi, const void *lo) {
|
||||
__m128i mhi = _mm_loadu_si128((const __m128i *)(hi));
|
||||
__m128i mlo = _mm_loadu_si128((const __m128i *)(lo));
|
||||
return _mm256_set_m128i(mhi, mlo);
|
||||
}
|
||||
#else
|
||||
static INLINE __m256i yy_loadu2_128(const void *hi, const void *lo) {
|
||||
__m128i mhi = _mm_loadu_si128((const __m128i *)(hi));
|
||||
__m128i mlo = _mm_loadu_si128((const __m128i *)(lo));
|
||||
return yy_set_m128i(mhi, mlo);
|
||||
}
|
||||
#endif
|
||||
|
||||
#undef GCC_VERSION
|
||||
|
||||
static INLINE void yy_storeu2_128(void *hi, void *lo, const __m256i a) {
|
||||
_mm_storeu_si128((__m128i *)hi, _mm256_extracti128_si256(a, 1));
|
||||
|
|
|
@ -15,6 +15,7 @@
|
|||
#include "config/aom_dsp_rtcd.h"
|
||||
|
||||
#include "aom_dsp/x86/synonyms.h"
|
||||
#include "aom_dsp/x86/variance_impl_ssse3.h"
|
||||
|
||||
void aom_var_filter_block2d_bil_first_pass_ssse3(
|
||||
const uint8_t *a, uint16_t *b, unsigned int src_pixels_per_line,
|
||||
|
|
|
@ -0,0 +1,27 @@
|
|||
/*
|
||||
* Copyright (c) 2024, Alliance for Open Media. All rights reserved
|
||||
*
|
||||
* This source code is subject to the terms of the BSD 2 Clause License and
|
||||
* the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
|
||||
* was not distributed with this source code in the LICENSE file, you can
|
||||
* obtain it at www.aomedia.org/license/software. If the Alliance for Open
|
||||
* Media Patent License 1.0 was not distributed with this source code in the
|
||||
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
|
||||
*/
|
||||
|
||||
#ifndef AOM_AOM_DSP_X86_VARIANCE_IMPL_SSSE3_H_
|
||||
#define AOM_AOM_DSP_X86_VARIANCE_IMPL_SSSE3_H_
|
||||
|
||||
#include <stdint.h>
|
||||
|
||||
void aom_var_filter_block2d_bil_first_pass_ssse3(
|
||||
const uint8_t *a, uint16_t *b, unsigned int src_pixels_per_line,
|
||||
unsigned int pixel_step, unsigned int output_height,
|
||||
unsigned int output_width, const uint8_t *filter);
|
||||
|
||||
void aom_var_filter_block2d_bil_second_pass_ssse3(
|
||||
const uint16_t *a, uint8_t *b, unsigned int src_pixels_per_line,
|
||||
unsigned int pixel_step, unsigned int output_height,
|
||||
unsigned int output_width, const uint8_t *filter);
|
||||
|
||||
#endif // AOM_AOM_DSP_X86_VARIANCE_IMPL_SSSE3_H_
|
|
@ -44,7 +44,7 @@ static int arm_get_cpu_caps(void) {
|
|||
return flags;
|
||||
}
|
||||
|
||||
#elif defined(ANDROID_USE_CPU_FEATURES_LIB)
|
||||
#elif defined(AOM_USE_ANDROID_CPU_FEATURES)
|
||||
|
||||
static int arm_get_cpu_caps(void) {
|
||||
int flags = 0;
|
||||
|
|
|
@ -89,7 +89,7 @@ static int arm_get_cpu_caps(void) {
|
|||
return flags;
|
||||
}
|
||||
|
||||
#elif defined(ANDROID_USE_CPU_FEATURES_LIB)
|
||||
#elif defined(AOM_USE_ANDROID_CPU_FEATURES)
|
||||
|
||||
static int arm_get_cpu_caps(void) {
|
||||
int flags = 0;
|
||||
|
|
|
@ -18,7 +18,7 @@ list(APPEND AOM_PORTS_INCLUDES "${AOM_ROOT}/aom_ports/aom_once.h"
|
|||
"${AOM_ROOT}/aom_ports/emmintrin_compat.h"
|
||||
"${AOM_ROOT}/aom_ports/mem.h" "${AOM_ROOT}/aom_ports/mem_ops.h"
|
||||
"${AOM_ROOT}/aom_ports/mem_ops_aligned.h"
|
||||
"${AOM_ROOT}/aom_ports/msvc.h" "${AOM_ROOT}/aom_ports/sanitizer.h")
|
||||
"${AOM_ROOT}/aom_ports/sanitizer.h")
|
||||
|
||||
list(APPEND AOM_PORTS_ASM_X86 "${AOM_ROOT}/aom_ports/float.asm")
|
||||
|
||||
|
|
|
@ -32,7 +32,7 @@
|
|||
#endif
|
||||
|
||||
#if defined(__ANDROID__) && (__ANDROID_API__ < 18)
|
||||
#define ANDROID_USE_CPU_FEATURES_LIB 1
|
||||
#define AOM_USE_ANDROID_CPU_FEATURES 1
|
||||
// Use getauxval() when targeting (64-bit) Android with API level >= 18.
|
||||
// getauxval() is supported since Android API level 18 (Android 4.3.)
|
||||
// First Android version with 64-bit support was Android 5.x (API level 21).
|
||||
|
|
|
@ -15,7 +15,6 @@
|
|||
#include <assert.h>
|
||||
#include <stdint.h>
|
||||
|
||||
#include "aom_ports/msvc.h"
|
||||
#include "config/aom_config.h"
|
||||
|
||||
#ifdef _MSC_VER
|
||||
|
|
|
@ -1,75 +0,0 @@
|
|||
/*
|
||||
* Copyright (c) 2016, Alliance for Open Media. All rights reserved
|
||||
*
|
||||
* This source code is subject to the terms of the BSD 2 Clause License and
|
||||
* the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
|
||||
* was not distributed with this source code in the LICENSE file, you can
|
||||
* obtain it at www.aomedia.org/license/software. If the Alliance for Open
|
||||
* Media Patent License 1.0 was not distributed with this source code in the
|
||||
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
|
||||
*/
|
||||
|
||||
#ifndef AOM_AOM_PORTS_MSVC_H_
|
||||
#define AOM_AOM_PORTS_MSVC_H_
|
||||
#ifdef _MSC_VER
|
||||
|
||||
#include "config/aom_config.h"
|
||||
|
||||
#if _MSC_VER < 1900 // VS2015 provides snprintf
|
||||
#define snprintf _snprintf
|
||||
#endif // _MSC_VER < 1900
|
||||
|
||||
#if _MSC_VER < 1800 // VS2013 provides round
|
||||
#include <math.h>
|
||||
static INLINE double round(double x) {
|
||||
if (x < 0)
|
||||
return ceil(x - 0.5);
|
||||
else
|
||||
return floor(x + 0.5);
|
||||
}
|
||||
|
||||
static INLINE float roundf(float x) {
|
||||
if (x < 0)
|
||||
return (float)ceil(x - 0.5f);
|
||||
else
|
||||
return (float)floor(x + 0.5f);
|
||||
}
|
||||
|
||||
static INLINE long lroundf(float x) {
|
||||
if (x < 0)
|
||||
return (long)(x - 0.5f);
|
||||
else
|
||||
return (long)(x + 0.5f);
|
||||
}
|
||||
#endif // _MSC_VER < 1800
|
||||
|
||||
#if HAVE_AVX
|
||||
#include <immintrin.h>
|
||||
// Note:
|
||||
// _mm256_insert_epi16 intrinsics is available from vs2017.
|
||||
// We define this macro for vs2015 and earlier. The
|
||||
// intrinsics used here are in vs2015 document:
|
||||
// https://msdn.microsoft.com/en-us/library/hh977022.aspx
|
||||
// Input parameters:
|
||||
// a: __m256i,
|
||||
// d: int16_t,
|
||||
// indx: imm8 (0 - 15)
|
||||
#if _MSC_VER <= 1900
|
||||
#define _mm256_insert_epi16(a, d, indx) \
|
||||
_mm256_insertf128_si256( \
|
||||
a, \
|
||||
_mm_insert_epi16(_mm256_extractf128_si256(a, indx >> 3), d, indx % 8), \
|
||||
indx >> 3)
|
||||
|
||||
static INLINE int _mm256_extract_epi32(__m256i a, const int i) {
|
||||
return a.m256i_i32[i & 7];
|
||||
}
|
||||
static INLINE __m256i _mm256_insert_epi32(__m256i a, int b, const int i) {
|
||||
__m256i c = a;
|
||||
c.m256i_i32[i & 7] = b;
|
||||
return c;
|
||||
}
|
||||
#endif // _MSC_VER <= 1900
|
||||
#endif // HAVE_AVX
|
||||
#endif // _MSC_VER
|
||||
#endif // AOM_AOM_PORTS_MSVC_H_
|
|
@ -36,8 +36,6 @@ typedef HANDLE pthread_t;
|
|||
typedef int pthread_attr_t;
|
||||
typedef CRITICAL_SECTION pthread_mutex_t;
|
||||
|
||||
#include <errno.h>
|
||||
|
||||
#if _WIN32_WINNT < 0x0600
|
||||
#error _WIN32_WINNT must target Windows Vista / Server 2008 or newer.
|
||||
#endif
|
||||
|
@ -74,6 +72,20 @@ static INLINE int pthread_attr_destroy(pthread_attr_t *attr) {
|
|||
return 0;
|
||||
}
|
||||
|
||||
static INLINE int pthread_attr_getstacksize(const pthread_attr_t *attr,
|
||||
size_t *stacksize) {
|
||||
(void)attr;
|
||||
(void)stacksize;
|
||||
return EINVAL;
|
||||
}
|
||||
|
||||
static INLINE int pthread_attr_setstacksize(pthread_attr_t *attr,
|
||||
size_t stacksize) {
|
||||
(void)attr;
|
||||
(void)stacksize;
|
||||
return EINVAL;
|
||||
}
|
||||
|
||||
static INLINE int pthread_create(pthread_t *const thread,
|
||||
const pthread_attr_t *attr,
|
||||
unsigned int(__stdcall *start)(void *),
|
||||
|
|
|
@ -156,16 +156,18 @@ static int reset(AVxWorker *const worker) {
    // See: https://crbug.com/aomedia/3379
#if defined(AOM_ADDRESS_SANITIZER) && defined(__APPLE__) && AOM_ARCH_ARM && \
    !defined(NDEBUG)
    const size_t kMinStackSize = 1024 * 1024;
#else
    const size_t kMinStackSize = 256 * 1024;
#endif
    size_t stacksize;
    if (!pthread_attr_getstacksize(&attr, &stacksize)) {
      const size_t kMinStackSize = 1 << 20;  // 1 MiB
      if (stacksize < kMinStackSize &&
          pthread_attr_setstacksize(&attr, kMinStackSize)) {
        pthread_attr_destroy(&attr);
        goto Error2;
      }
    }
#endif
    pthread_mutex_lock(&worker->impl_->mutex_);
    ok = !pthread_create(&worker->impl_->thread_, &attr, thread_loop, worker);
    if (ok) worker->status_ = AVX_WORKER_STATUS_OK;
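/*
 * Illustrative sketch (not part of the vendored diff): the reset() hunk above
 * only raises the worker's stack size when the platform default is below a
 * chosen minimum. Minimal standalone version of that pattern, assuming a
 * POSIX host; the minimum used here is illustrative:
 */
#include <pthread.h>
#include <stdio.h>

static void *worker_main(void *arg) { (void)arg; return NULL; }

int main(void) {
  pthread_attr_t attr;
  if (pthread_attr_init(&attr)) return 1;

  size_t stacksize;
  const size_t kMinStackSize = 256 * 1024;  /* assumed minimum */
  if (!pthread_attr_getstacksize(&attr, &stacksize) &&
      stacksize < kMinStackSize) {
    /* Only grow the stack, never shrink a larger platform default. */
    if (pthread_attr_setstacksize(&attr, kMinStackSize)) {
      pthread_attr_destroy(&attr);
      return 1;
    }
  }

  pthread_t tid;
  const int ok = !pthread_create(&tid, &attr, worker_main, NULL);
  pthread_attr_destroy(&attr);
  if (ok) pthread_join(tid, NULL);
  printf("worker %s\n", ok ? "ran" : "failed to start");
  return ok ? 0 : 1;
}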
@ -266,6 +266,7 @@ list(APPEND AOM_AV1_COMMON_INTRIN_SSE2
            "${AOM_ROOT}/av1/common/x86/convolve_2d_sse2.c"
            "${AOM_ROOT}/av1/common/x86/convolve_sse2.c"
            "${AOM_ROOT}/av1/common/x86/jnt_convolve_sse2.c"
            "${AOM_ROOT}/av1/common/x86/resize_sse2.c"
            "${AOM_ROOT}/av1/common/x86/wiener_convolve_sse2.c")

list(APPEND AOM_AV1_COMMON_INTRIN_SSSE3
@ -354,35 +355,36 @@ list(APPEND AOM_AV1_ENCODER_INTRIN_AVX2
            "${AOM_ROOT}/av1/encoder/x86/ml_avx2.c")

list(APPEND AOM_AV1_ENCODER_INTRIN_NEON
            "${AOM_ROOT}/av1/encoder/arm/neon/av1_error_neon.c"
            "${AOM_ROOT}/av1/encoder/arm/neon/av1_fwd_txfm2d_neon.c"
            "${AOM_ROOT}/av1/encoder/arm/neon/av1_highbd_quantize_neon.c"
            "${AOM_ROOT}/av1/encoder/arm/neon/av1_k_means_neon.c"
            "${AOM_ROOT}/av1/encoder/arm/neon/cnn_neon.c"
            "${AOM_ROOT}/av1/encoder/arm/neon/encodetxb_neon.c"
            "${AOM_ROOT}/av1/encoder/arm/neon/highbd_fwd_txfm_neon.c"
            "${AOM_ROOT}/av1/encoder/arm/neon/hybrid_fwd_txfm_neon.c"
            "${AOM_ROOT}/av1/encoder/arm/neon/ml_neon.c"
            "${AOM_ROOT}/av1/encoder/arm/neon/pickrst_neon.c"
            "${AOM_ROOT}/av1/encoder/arm/neon/pickrst_neon.h"
            "${AOM_ROOT}/av1/encoder/arm/neon/quantize_neon.c"
            "${AOM_ROOT}/av1/encoder/arm/neon/rdopt_neon.c"
            "${AOM_ROOT}/av1/encoder/arm/neon/reconinter_enc_neon.c"
            "${AOM_ROOT}/av1/encoder/arm/neon/temporal_filter_neon.c"
            "${AOM_ROOT}/av1/encoder/arm/neon/wedge_utils_neon.c")
            "${AOM_ROOT}/av1/encoder/arm/av1_error_neon.c"
            "${AOM_ROOT}/av1/encoder/arm/av1_fwd_txfm2d_neon.c"
            "${AOM_ROOT}/av1/encoder/arm/av1_highbd_quantize_neon.c"
            "${AOM_ROOT}/av1/encoder/arm/av1_k_means_neon.c"
            "${AOM_ROOT}/av1/encoder/arm/cnn_neon.c"
            "${AOM_ROOT}/av1/encoder/arm/encodetxb_neon.c"
            "${AOM_ROOT}/av1/encoder/arm/highbd_fwd_txfm_neon.c"
            "${AOM_ROOT}/av1/encoder/arm/hybrid_fwd_txfm_neon.c"
            "${AOM_ROOT}/av1/encoder/arm/ml_neon.c"
            "${AOM_ROOT}/av1/encoder/arm/pickrst_neon.c"
            "${AOM_ROOT}/av1/encoder/arm/pickrst_neon.h"
            "${AOM_ROOT}/av1/encoder/arm/quantize_neon.c"
            "${AOM_ROOT}/av1/encoder/arm/rdopt_neon.c"
            "${AOM_ROOT}/av1/encoder/arm/reconinter_enc_neon.c"
            "${AOM_ROOT}/av1/encoder/arm/temporal_filter_neon.c"
            "${AOM_ROOT}/av1/encoder/arm/wedge_utils_neon.c")

list(APPEND AOM_AV1_ENCODER_INTRIN_NEON_DOTPROD
            "${AOM_ROOT}/av1/encoder/arm/neon/temporal_filter_neon_dotprod.c")
            "${AOM_ROOT}/av1/encoder/arm/temporal_filter_neon_dotprod.c")

list(APPEND AOM_AV1_ENCODER_INTRIN_SVE
            "${AOM_ROOT}/av1/encoder/arm/neon/av1_error_sve.c"
            "${AOM_ROOT}/av1/encoder/arm/neon/pickrst_sve.c"
            "${AOM_ROOT}/av1/encoder/arm/neon/wedge_utils_sve.c")
            "${AOM_ROOT}/av1/encoder/arm/av1_error_sve.c"
            "${AOM_ROOT}/av1/encoder/arm/pickrst_sve.c"
            "${AOM_ROOT}/av1/encoder/arm/wedge_utils_sve.c")

list(APPEND AOM_AV1_ENCODER_INTRIN_ARM_CRC32
            "${AOM_ROOT}/av1/encoder/arm/crc32/hash_arm_crc32.c")
            "${AOM_ROOT}/av1/encoder/arm/hash_arm_crc32.c")

list(APPEND AOM_AV1_COMMON_INTRIN_NEON
            "${AOM_ROOT}/av1/common/arm/av1_convolve_scale_neon.c"
            "${AOM_ROOT}/av1/common/arm/av1_inv_txfm_neon.c"
            "${AOM_ROOT}/av1/common/arm/av1_inv_txfm_neon.h"
            "${AOM_ROOT}/av1/common/arm/av1_txfm_neon.c"
@ -414,6 +416,9 @@ list(APPEND AOM_AV1_COMMON_INTRIN_SVE
            "${AOM_ROOT}/av1/common/arm/highbd_warp_plane_sve.c"
            "${AOM_ROOT}/av1/common/arm/warp_plane_sve.c")

list(APPEND AOM_AV1_COMMON_INTRIN_SVE2
            "${AOM_ROOT}/av1/common/arm/convolve_sve2.c")

list(APPEND AOM_AV1_ENCODER_INTRIN_SSE4_2
            "${AOM_ROOT}/av1/encoder/x86/hash_sse42.c")
@ -452,7 +457,7 @@ if(CONFIG_AV1_TEMPORAL_DENOISING)
              "${AOM_ROOT}/av1/encoder/x86/av1_temporal_denoiser_sse2.c")

  list(APPEND AOM_AV1_ENCODER_INTRIN_NEON
              "${AOM_ROOT}/av1/encoder/arm/neon/av1_temporal_denoiser_neon.c")
              "${AOM_ROOT}/av1/encoder/arm/av1_temporal_denoiser_neon.c")
endif()

if(CONFIG_AV1_HIGHBITDEPTH)
@ -499,9 +504,12 @@ if(CONFIG_AV1_HIGHBITDEPTH)
              "${AOM_ROOT}/av1/encoder/x86/highbd_temporal_filter_avx2.c")

  list(APPEND AOM_AV1_ENCODER_INTRIN_NEON
              "${AOM_ROOT}/av1/encoder/arm/neon/highbd_pickrst_neon.c"
              "${AOM_ROOT}/av1/encoder/arm/neon/highbd_rdopt_neon.c"
              "${AOM_ROOT}/av1/encoder/arm/neon/highbd_temporal_filter_neon.c")
              "${AOM_ROOT}/av1/encoder/arm/highbd_pickrst_neon.c"
              "${AOM_ROOT}/av1/encoder/arm/highbd_rdopt_neon.c"
              "${AOM_ROOT}/av1/encoder/arm/highbd_temporal_filter_neon.c")

  list(APPEND AOM_AV1_ENCODER_INTRIN_SVE
              "${AOM_ROOT}/av1/encoder/arm/highbd_pickrst_sve.c")
endif()

if(CONFIG_ACCOUNTING)
@ -527,7 +535,7 @@ if(CONFIG_REALTIME_ONLY)
              "${AOM_ROOT}/av1/encoder/x86/cnn_avx2.c")

  list(REMOVE_ITEM AOM_AV1_ENCODER_INTRIN_NEON
                   "${AOM_ROOT}/av1/encoder/arm/neon/cnn_neon.c")
                   "${AOM_ROOT}/av1/encoder/arm/cnn_neon.c")

  list(REMOVE_ITEM AOM_AV1_ENCODER_SOURCES
                   "${AOM_ROOT}/av1/encoder/cnn.c"
@ -674,6 +674,7 @@ static aom_codec_err_t validate_config(aom_codec_alg_priv_t *ctx,
  RANGE_CHECK(cfg, g_timebase.num, 1, cfg->g_timebase.den);
  RANGE_CHECK_HI(cfg, g_profile, MAX_PROFILES - 1);

  RANGE_CHECK_HI(cfg, rc_target_bitrate, 2000000);
  RANGE_CHECK_HI(cfg, rc_max_quantizer, 63);
  RANGE_CHECK_HI(cfg, rc_min_quantizer, cfg->rc_max_quantizer);
  RANGE_CHECK_BOOL(extra_cfg, lossless);
@ -1034,39 +1035,22 @@ static void set_encoder_config(AV1EncoderConfig *oxcf,
  }

  TuneCfg *const tune_cfg = &oxcf->tune_cfg;

  FrameDimensionCfg *const frm_dim_cfg = &oxcf->frm_dim_cfg;

  TileConfig *const tile_cfg = &oxcf->tile_cfg;

  ResizeCfg *const resize_cfg = &oxcf->resize_cfg;

  GFConfig *const gf_cfg = &oxcf->gf_cfg;

  PartitionCfg *const part_cfg = &oxcf->part_cfg;

  IntraModeCfg *const intra_mode_cfg = &oxcf->intra_mode_cfg;

  TxfmSizeTypeCfg *const txfm_cfg = &oxcf->txfm_cfg;

  CompoundTypeCfg *const comp_type_cfg = &oxcf->comp_type_cfg;

  SuperResCfg *const superres_cfg = &oxcf->superres_cfg;

  KeyFrameCfg *const kf_cfg = &oxcf->kf_cfg;

  DecoderModelCfg *const dec_model_cfg = &oxcf->dec_model_cfg;

  RateControlCfg *const rc_cfg = &oxcf->rc_cfg;

  QuantizationCfg *const q_cfg = &oxcf->q_cfg;

  ColorCfg *const color_cfg = &oxcf->color_cfg;

  InputCfg *const input_cfg = &oxcf->input_cfg;

  AlgoCfg *const algo_cfg = &oxcf->algo_cfg;

  ToolCfg *const tool_cfg = &oxcf->tool_cfg;

  const int is_vbr = cfg->rc_end_usage == AOM_VBR;
@ -1610,37 +1594,42 @@ static aom_codec_err_t ctrl_get_baseline_gf_interval(aom_codec_alg_priv_t *ctx,
  return AOM_CODEC_OK;
}

static aom_codec_err_t update_encoder_cfg(aom_codec_alg_priv_t *ctx) {
  set_encoder_config(&ctx->oxcf, &ctx->cfg, &ctx->extra_cfg);
  av1_check_fpmt_config(ctx->ppi, &ctx->oxcf);
  bool is_sb_size_changed = false;
  av1_change_config_seq(ctx->ppi, &ctx->oxcf, &is_sb_size_changed);
  for (int i = 0; i < ctx->ppi->num_fp_contexts; i++) {
    AV1_COMP *const cpi = ctx->ppi->parallel_cpi[i];
    struct aom_internal_error_info *const error = cpi->common.error;
    if (setjmp(error->jmp)) {
      error->setjmp = 0;
      return error->error_code;
    }
    error->setjmp = 1;
    av1_change_config(cpi, &ctx->oxcf, is_sb_size_changed);
    error->setjmp = 0;
  }
  if (ctx->ppi->cpi_lap != NULL) {
    AV1_COMP *const cpi_lap = ctx->ppi->cpi_lap;
    struct aom_internal_error_info *const error = cpi_lap->common.error;
    if (setjmp(error->jmp)) {
      error->setjmp = 0;
      return error->error_code;
    }
    error->setjmp = 1;
    av1_change_config(cpi_lap, &ctx->oxcf, is_sb_size_changed);
    error->setjmp = 0;
  }
  return AOM_CODEC_OK;
}

static aom_codec_err_t update_extra_cfg(aom_codec_alg_priv_t *ctx,
                                        const struct av1_extracfg *extra_cfg) {
  const aom_codec_err_t res = validate_config(ctx, &ctx->cfg, extra_cfg);
  if (res == AOM_CODEC_OK) {
    ctx->extra_cfg = *extra_cfg;
    set_encoder_config(&ctx->oxcf, &ctx->cfg, &ctx->extra_cfg);
    av1_check_fpmt_config(ctx->ppi, &ctx->oxcf);
    bool is_sb_size_changed = false;
    av1_change_config_seq(ctx->ppi, &ctx->oxcf, &is_sb_size_changed);
    for (int i = 0; i < ctx->ppi->num_fp_contexts; i++) {
      AV1_COMP *const cpi = ctx->ppi->parallel_cpi[i];
      struct aom_internal_error_info *const error = cpi->common.error;
      if (setjmp(error->jmp)) {
        error->setjmp = 0;
        return error->error_code;
      }
      error->setjmp = 1;
      av1_change_config(cpi, &ctx->oxcf, is_sb_size_changed);
      error->setjmp = 0;
    }
    if (ctx->ppi->cpi_lap != NULL) {
      AV1_COMP *const cpi_lap = ctx->ppi->cpi_lap;
      struct aom_internal_error_info *const error = cpi_lap->common.error;
      if (setjmp(error->jmp)) {
        error->setjmp = 0;
        return error->error_code;
      }
      error->setjmp = 1;
      av1_change_config(cpi_lap, &ctx->oxcf, is_sb_size_changed);
      error->setjmp = 0;
    }
    return update_encoder_cfg(ctx);
  }
  return res;
}
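/*
 * Illustrative sketch (not part of the vendored diff): update_encoder_cfg()
 * above arms a setjmp() before each av1_change_config() call so that errors
 * raised deeper in the encoder via longjmp() unwind back here and are turned
 * into an aom_codec_err_t. Minimal standalone version of that pattern; the
 * names and the error code are hypothetical:
 */
#include <setjmp.h>
#include <stdio.h>

struct error_info {
  jmp_buf jmp;
  int setjmp_armed;
  int error_code;
};

static struct error_info g_err;  /* file scope: stays valid across longjmp() */

static void risky_config_step(int fail) {
  if (fail) {
    g_err.error_code = 7;  /* hypothetical error code */
    if (g_err.setjmp_armed) longjmp(g_err.jmp, 1);
  }
}

static int apply_config(int fail) {
  g_err.error_code = 0;
  if (setjmp(g_err.jmp)) {
    g_err.setjmp_armed = 0;
    return g_err.error_code;  /* unwound from risky_config_step() */
  }
  g_err.setjmp_armed = 1;
  risky_config_step(fail);
  g_err.setjmp_armed = 0;
  return 0;
}

int main(void) {
  printf("ok path: %d, error path: %d\n", apply_config(0), apply_config(1));
  return 0;
}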
@ -3343,7 +3332,7 @@ static aom_codec_err_t encoder_encode(aom_codec_alg_priv_t *ctx,
      if (ppi->cpi->oxcf.pass != 1) {
        ppi->total_time_compress_data += cpi->time_compress_data;
        ppi->total_recode_hits += cpi->frame_recode_hits;
        ppi->total_bytes += cpi->bytes;
        ppi->total_bytes += (uint64_t)cpi->bytes;
        for (int i = 0; i < MAX_MODES; i++) {
          ppi->total_mode_chosen_counts[i] += cpi->mode_chosen_counts[i];
        }
@ -3611,11 +3600,23 @@ static aom_codec_err_t ctrl_set_scale_mode(aom_codec_alg_priv_t *ctx,
  aom_scaling_mode_t *const mode = va_arg(args, aom_scaling_mode_t *);

  if (mode) {
    const int res = av1_set_internal_size(
        &ctx->ppi->cpi->oxcf, &ctx->ppi->cpi->resize_pending_params,
        mode->h_scaling_mode, mode->v_scaling_mode);
    av1_check_fpmt_config(ctx->ppi, &ctx->ppi->cpi->oxcf);
    return (res == 0) ? AOM_CODEC_OK : AOM_CODEC_INVALID_PARAM;
    AV1EncoderConfig *const oxcf =
        ctx->ppi->seq_params_locked ? &ctx->ppi->cpi->oxcf : &ctx->oxcf;
    const int res =
        av1_set_internal_size(oxcf, &ctx->ppi->cpi->resize_pending_params,
                              mode->h_scaling_mode, mode->v_scaling_mode);
    if (res == 0) {
      // update_encoder_cfg() is somewhat costly and this control may be called
      // multiple times, so update_encoder_cfg() is only called to ensure frame
      // and superblock sizes are updated before they're fixed by the first
      // encode call.
      if (ctx->ppi->seq_params_locked) {
        av1_check_fpmt_config(ctx->ppi, &ctx->ppi->cpi->oxcf);
        return AOM_CODEC_OK;
      }
      return update_encoder_cfg(ctx);
    }
    return AOM_CODEC_INVALID_PARAM;
  } else {
    return AOM_CODEC_INVALID_PARAM;
  }
@ -3636,6 +3637,13 @@ static aom_codec_err_t ctrl_set_number_spatial_layers(aom_codec_alg_priv_t *ctx,
  if (number_spatial_layers > MAX_NUM_SPATIAL_LAYERS)
    return AOM_CODEC_INVALID_PARAM;
  ctx->ppi->number_spatial_layers = number_spatial_layers;
  // update_encoder_cfg() is somewhat costly and this control may be called
  // multiple times, so update_encoder_cfg() is only called to ensure frame and
  // superblock sizes are updated before they're fixed by the first encode
  // call.
  if (!ctx->ppi->seq_params_locked) {
    return update_encoder_cfg(ctx);
  }
  return AOM_CODEC_OK;
}
@ -3653,8 +3661,6 @@ static aom_codec_err_t ctrl_set_svc_params(aom_codec_alg_priv_t *ctx,
                                           va_list args) {
  AV1_PRIMARY *const ppi = ctx->ppi;
  AV1_COMP *const cpi = ppi->cpi;
  AV1_COMMON *const cm = &cpi->common;
  AV1EncoderConfig *oxcf = &cpi->oxcf;
  aom_svc_params_t *const params = va_arg(args, aom_svc_params_t *);
  int64_t target_bandwidth = 0;
  ppi->number_spatial_layers = params->number_spatial_layers;
@ -3694,19 +3700,38 @@ static aom_codec_err_t ctrl_set_svc_params(aom_codec_alg_priv_t *ctx,
        target_bandwidth += lc->layer_target_bitrate;
      }
    }
  if (cm->current_frame.frame_number == 0) {
    if (!cpi->ppi->seq_params_locked) {
      SequenceHeader *const seq_params = &ppi->seq_params;
      seq_params->operating_points_cnt_minus_1 =
          ppi->number_spatial_layers * ppi->number_temporal_layers - 1;
      av1_init_seq_coding_tools(ppi, &cpi->oxcf, 1);
    }

    if (ppi->seq_params_locked) {
      AV1EncoderConfig *const oxcf = &cpi->oxcf;
      // Keep ctx->oxcf in sync in case further codec controls are made prior
      // to encoding.
      ctx->oxcf.rc_cfg.target_bandwidth = oxcf->rc_cfg.target_bandwidth =
          target_bandwidth;
      set_primary_rc_buffer_sizes(oxcf, ppi);
      av1_update_layer_context_change_config(cpi, target_bandwidth);
      check_reset_rc_flag(cpi);
    } else {
      // Note av1_init_layer_context() relies on cpi->oxcf. The order of that
      // call and the ones in the other half of this block (which
      // update_encoder_cfg() transitively makes) is important. So we keep
      // ctx->oxcf and cpi->oxcf in sync here as update_encoder_cfg() will
      // overwrite cpi->oxcf with ctx->oxcf.
      ctx->oxcf.rc_cfg.target_bandwidth = cpi->oxcf.rc_cfg.target_bandwidth =
          target_bandwidth;
      SequenceHeader *const seq_params = &ppi->seq_params;
      seq_params->operating_points_cnt_minus_1 =
          ppi->number_spatial_layers * ppi->number_temporal_layers - 1;

      av1_init_layer_context(cpi);
      // update_encoder_cfg() is somewhat costly and this control may be called
      // multiple times, so update_encoder_cfg() is only called to ensure frame
      // and superblock sizes are updated before they're fixed by the first
      // encode call.
      return update_encoder_cfg(ctx);
    }
    oxcf->rc_cfg.target_bandwidth = target_bandwidth;
    set_primary_rc_buffer_sizes(oxcf, cpi->ppi);
    av1_update_layer_context_change_config(cpi, target_bandwidth);
    check_reset_rc_flag(cpi);
  } else if (!ppi->seq_params_locked) {
    // Ensure frame and superblock sizes are updated.
    return update_encoder_cfg(ctx);
  }
  av1_check_fpmt_config(ctx->ppi, &ctx->ppi->cpi->oxcf);
  return AOM_CODEC_OK;
@ -0,0 +1,702 @@
|
|||
/*
|
||||
* Copyright (c) 2024, Alliance for Open Media. All rights reserved
|
||||
*
|
||||
* This source code is subject to the terms of the BSD 2 Clause License and
|
||||
* the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
|
||||
* was not distributed with this source code in the LICENSE file, you can
|
||||
* obtain it at www.aomedia.org/license/software. If the Alliance for Open
|
||||
* Media Patent License 1.0 was not distributed with this source code in the
|
||||
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
|
||||
*/
|
||||
|
||||
#include <arm_neon.h>
|
||||
#include <assert.h>
|
||||
|
||||
#include "config/aom_config.h"
|
||||
#include "config/av1_rtcd.h"
|
||||
|
||||
#include "aom_dsp/arm/mem_neon.h"
|
||||
#include "aom_dsp/arm/transpose_neon.h"
|
||||
|
||||
static INLINE int16x4_t compound_convolve8_4_v(
|
||||
const int16x4_t s0, const int16x4_t s1, const int16x4_t s2,
|
||||
const int16x4_t s3, const int16x4_t s4, const int16x4_t s5,
|
||||
const int16x4_t s6, const int16x4_t s7, const int16x8_t filter,
|
||||
const int32x4_t offset_const) {
|
||||
const int16x4_t filter_0_3 = vget_low_s16(filter);
|
||||
const int16x4_t filter_4_7 = vget_high_s16(filter);
|
||||
|
||||
int32x4_t sum = offset_const;
|
||||
sum = vmlal_lane_s16(sum, s0, filter_0_3, 0);
|
||||
sum = vmlal_lane_s16(sum, s1, filter_0_3, 1);
|
||||
sum = vmlal_lane_s16(sum, s2, filter_0_3, 2);
|
||||
sum = vmlal_lane_s16(sum, s3, filter_0_3, 3);
|
||||
sum = vmlal_lane_s16(sum, s4, filter_4_7, 0);
|
||||
sum = vmlal_lane_s16(sum, s5, filter_4_7, 1);
|
||||
sum = vmlal_lane_s16(sum, s6, filter_4_7, 2);
|
||||
sum = vmlal_lane_s16(sum, s7, filter_4_7, 3);
|
||||
|
||||
return vshrn_n_s32(sum, COMPOUND_ROUND1_BITS);
|
||||
}
|
||||
|
||||
static INLINE int16x8_t compound_convolve8_8_v(
|
||||
const int16x8_t s0, const int16x8_t s1, const int16x8_t s2,
|
||||
const int16x8_t s3, const int16x8_t s4, const int16x8_t s5,
|
||||
const int16x8_t s6, const int16x8_t s7, const int16x8_t filter,
|
||||
const int32x4_t offset_const) {
|
||||
const int16x4_t filter_0_3 = vget_low_s16(filter);
|
||||
const int16x4_t filter_4_7 = vget_high_s16(filter);
|
||||
|
||||
int32x4_t sum0 = offset_const;
|
||||
sum0 = vmlal_lane_s16(sum0, vget_low_s16(s0), filter_0_3, 0);
|
||||
sum0 = vmlal_lane_s16(sum0, vget_low_s16(s1), filter_0_3, 1);
|
||||
sum0 = vmlal_lane_s16(sum0, vget_low_s16(s2), filter_0_3, 2);
|
||||
sum0 = vmlal_lane_s16(sum0, vget_low_s16(s3), filter_0_3, 3);
|
||||
sum0 = vmlal_lane_s16(sum0, vget_low_s16(s4), filter_4_7, 0);
|
||||
sum0 = vmlal_lane_s16(sum0, vget_low_s16(s5), filter_4_7, 1);
|
||||
sum0 = vmlal_lane_s16(sum0, vget_low_s16(s6), filter_4_7, 2);
|
||||
sum0 = vmlal_lane_s16(sum0, vget_low_s16(s7), filter_4_7, 3);
|
||||
|
||||
int32x4_t sum1 = offset_const;
|
||||
sum1 = vmlal_lane_s16(sum1, vget_high_s16(s0), filter_0_3, 0);
|
||||
sum1 = vmlal_lane_s16(sum1, vget_high_s16(s1), filter_0_3, 1);
|
||||
sum1 = vmlal_lane_s16(sum1, vget_high_s16(s2), filter_0_3, 2);
|
||||
sum1 = vmlal_lane_s16(sum1, vget_high_s16(s3), filter_0_3, 3);
|
||||
sum1 = vmlal_lane_s16(sum1, vget_high_s16(s4), filter_4_7, 0);
|
||||
sum1 = vmlal_lane_s16(sum1, vget_high_s16(s5), filter_4_7, 1);
|
||||
sum1 = vmlal_lane_s16(sum1, vget_high_s16(s6), filter_4_7, 2);
|
||||
sum1 = vmlal_lane_s16(sum1, vget_high_s16(s7), filter_4_7, 3);
|
||||
|
||||
int16x4_t res0 = vshrn_n_s32(sum0, COMPOUND_ROUND1_BITS);
|
||||
int16x4_t res1 = vshrn_n_s32(sum1, COMPOUND_ROUND1_BITS);
|
||||
|
||||
return vcombine_s16(res0, res1);
|
||||
}
|
||||
|
||||
static INLINE void compound_convolve_vert_scale_neon(
|
||||
const int16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w,
|
||||
int h, const int16_t *y_filter, int subpel_y_qn, int y_step_qn) {
|
||||
const int bd = 8;
|
||||
const int offset_bits = bd + 2 * FILTER_BITS - ROUND0_BITS;
|
||||
// A shim of 1 << (COMPOUND_ROUND1_BITS - 1) enables us to use
|
||||
// non-rounding shifts - which are generally faster than rounding shifts on
|
||||
// modern CPUs.
|
||||
const int32x4_t vert_offset =
|
||||
vdupq_n_s32((1 << offset_bits) + (1 << (COMPOUND_ROUND1_BITS - 1)));
|
||||
|
||||
int y_qn = subpel_y_qn;
|
||||
|
||||
if (w == 4) {
|
||||
do {
|
||||
const int16_t *s = &src[(y_qn >> SCALE_SUBPEL_BITS) * src_stride];
|
||||
|
||||
const ptrdiff_t filter_offset =
|
||||
SUBPEL_TAPS * ((y_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS);
|
||||
const int16x8_t filter = vld1q_s16(y_filter + filter_offset);
|
||||
|
||||
int16x4_t s0, s1, s2, s3, s4, s5, s6, s7;
|
||||
load_s16_4x8(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7);
|
||||
|
||||
int16x4_t d0 = compound_convolve8_4_v(s0, s1, s2, s3, s4, s5, s6, s7,
|
||||
filter, vert_offset);
|
||||
|
||||
vst1_u16(dst, vreinterpret_u16_s16(d0));
|
||||
|
||||
dst += dst_stride;
|
||||
y_qn += y_step_qn;
|
||||
} while (--h != 0);
|
||||
} else {
|
||||
do {
|
||||
const int16_t *s = &src[(y_qn >> SCALE_SUBPEL_BITS) * src_stride];
|
||||
|
||||
const ptrdiff_t filter_offset =
|
||||
SUBPEL_TAPS * ((y_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS);
|
||||
const int16x8_t filter = vld1q_s16(y_filter + filter_offset);
|
||||
|
||||
int width = w;
|
||||
uint16_t *d = dst;
|
||||
|
||||
do {
|
||||
int16x8_t s0, s1, s2, s3, s4, s5, s6, s7;
|
||||
load_s16_8x8(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7);
|
||||
|
||||
int16x8_t d0 = compound_convolve8_8_v(s0, s1, s2, s3, s4, s5, s6, s7,
|
||||
filter, vert_offset);
|
||||
|
||||
vst1q_u16(d, vreinterpretq_u16_s16(d0));
|
||||
|
||||
s += 8;
|
||||
d += 8;
|
||||
width -= 8;
|
||||
} while (width != 0);
|
||||
|
||||
dst += dst_stride;
|
||||
y_qn += y_step_qn;
|
||||
} while (--h != 0);
|
||||
}
|
||||
}
|
||||
|
||||
static INLINE void compound_avg_convolve_vert_scale_neon(
|
||||
const int16_t *src, int src_stride, uint8_t *dst8, int dst8_stride,
|
||||
uint16_t *dst16, int dst16_stride, int w, int h, const int16_t *y_filter,
|
||||
int subpel_y_qn, int y_step_qn) {
|
||||
const int bd = 8;
|
||||
const int offset_bits = bd + 2 * FILTER_BITS - ROUND0_BITS;
|
||||
// A shim of 1 << (COMPOUND_ROUND1_BITS - 1) enables us to use
|
||||
// non-rounding shifts - which are generally faster than rounding shifts
|
||||
// on modern CPUs.
|
||||
const int32_t vert_offset_bits =
|
||||
(1 << offset_bits) + (1 << (COMPOUND_ROUND1_BITS - 1));
|
||||
// For the averaging code path substract round offset and convolve round.
|
||||
const int32_t avg_offset_bits = (1 << (offset_bits + 1)) + (1 << offset_bits);
|
||||
const int32x4_t vert_offset = vdupq_n_s32(vert_offset_bits - avg_offset_bits);
|
||||
|
||||
int y_qn = subpel_y_qn;
|
||||
|
||||
if (w == 4) {
|
||||
do {
|
||||
const int16_t *s = &src[(y_qn >> SCALE_SUBPEL_BITS) * src_stride];
|
||||
|
||||
const ptrdiff_t filter_offset =
|
||||
SUBPEL_TAPS * ((y_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS);
|
||||
const int16x8_t filter = vld1q_s16(y_filter + filter_offset);
|
||||
|
||||
int16x4_t s0, s1, s2, s3, s4, s5, s6, s7;
|
||||
load_s16_4x8(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7);
|
||||
|
||||
int16x4_t d0 = compound_convolve8_4_v(s0, s1, s2, s3, s4, s5, s6, s7,
|
||||
filter, vert_offset);
|
||||
|
||||
int16x4_t dd0 = vreinterpret_s16_u16(vld1_u16(dst16));
|
||||
|
||||
int16x4_t avg = vhadd_s16(dd0, d0);
|
||||
int16x8_t d0_s16 = vcombine_s16(avg, vdup_n_s16(0));
|
||||
|
||||
uint8x8_t d0_u8 = vqrshrun_n_s16(
|
||||
d0_s16, (2 * FILTER_BITS - ROUND0_BITS - COMPOUND_ROUND1_BITS));
|
||||
|
||||
store_u8_4x1(dst8, d0_u8);
|
||||
|
||||
dst16 += dst16_stride;
|
||||
dst8 += dst8_stride;
|
||||
y_qn += y_step_qn;
|
||||
} while (--h != 0);
|
||||
} else {
|
||||
do {
|
||||
const int16_t *s = &src[(y_qn >> SCALE_SUBPEL_BITS) * src_stride];
|
||||
|
||||
const ptrdiff_t filter_offset =
|
||||
SUBPEL_TAPS * ((y_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS);
|
||||
const int16x8_t filter = vld1q_s16(y_filter + filter_offset);
|
||||
|
||||
int width = w;
|
||||
uint8_t *dst8_ptr = dst8;
|
||||
uint16_t *dst16_ptr = dst16;
|
||||
|
||||
do {
|
||||
int16x8_t s0, s1, s2, s3, s4, s5, s6, s7;
|
||||
load_s16_8x8(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7);
|
||||
|
||||
int16x8_t d0 = compound_convolve8_8_v(s0, s1, s2, s3, s4, s5, s6, s7,
|
||||
filter, vert_offset);
|
||||
|
||||
int16x8_t dd0 = vreinterpretq_s16_u16(vld1q_u16(dst16_ptr));
|
||||
|
||||
int16x8_t avg = vhaddq_s16(dd0, d0);
|
||||
|
||||
uint8x8_t d0_u8 = vqrshrun_n_s16(
|
||||
avg, (2 * FILTER_BITS - ROUND0_BITS - COMPOUND_ROUND1_BITS));
|
||||
|
||||
vst1_u8(dst8_ptr, d0_u8);
|
||||
|
||||
s += 8;
|
||||
dst8_ptr += 8;
|
||||
dst16_ptr += 8;
|
||||
width -= 8;
|
||||
} while (width != 0);
|
||||
|
||||
dst16 += dst16_stride;
|
||||
dst8 += dst8_stride;
|
||||
y_qn += y_step_qn;
|
||||
} while (--h != 0);
|
||||
}
|
||||
}
|
||||
|
||||
static INLINE void compound_dist_wtd_convolve_vert_scale_neon(
|
||||
const int16_t *src, int src_stride, uint8_t *dst8, int dst8_stride,
|
||||
uint16_t *dst16, int dst16_stride, int w, int h, const int16_t *y_filter,
|
||||
ConvolveParams *conv_params, int subpel_y_qn, int y_step_qn) {
|
||||
const int bd = 8;
|
||||
const int offset_bits = bd + 2 * FILTER_BITS - ROUND0_BITS;
|
||||
int y_qn = subpel_y_qn;
|
||||
// A shim of 1 << (COMPOUND_ROUND1_BITS - 1) enables us to use
|
||||
// non-rounding shifts - which are generally faster than rounding shifts on
|
||||
// modern CPUs.
|
||||
const int32x4_t vert_offset =
|
||||
vdupq_n_s32((1 << offset_bits) + (1 << (COMPOUND_ROUND1_BITS - 1)));
|
||||
// For the weighted averaging code path we have to substract round offset and
|
||||
// convolve round. The shim of 1 << (2 * FILTER_BITS - ROUND0_BITS -
|
||||
// COMPOUND_ROUND1_BITS - 1) enables us to use non-rounding shifts. The
|
||||
// additional shift by DIST_PRECISION_BITS is needed in order to merge two
|
||||
// shift calculations into one.
|
||||
const int32x4_t dist_wtd_offset = vdupq_n_s32(
|
||||
(1 << (2 * FILTER_BITS - ROUND0_BITS - COMPOUND_ROUND1_BITS - 1 +
|
||||
DIST_PRECISION_BITS)) -
|
||||
(1 << (offset_bits - COMPOUND_ROUND1_BITS + DIST_PRECISION_BITS)) -
|
||||
(1 << (offset_bits - COMPOUND_ROUND1_BITS - 1 + DIST_PRECISION_BITS)));
|
||||
const int16x4_t bck_offset = vdup_n_s16(conv_params->bck_offset);
|
||||
const int16x4_t fwd_offset = vdup_n_s16(conv_params->fwd_offset);
|
||||
|
||||
if (w == 4) {
|
||||
do {
|
||||
const int16_t *s = &src[(y_qn >> SCALE_SUBPEL_BITS) * src_stride];
|
||||
|
||||
const ptrdiff_t filter_offset =
|
||||
SUBPEL_TAPS * ((y_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS);
|
||||
const int16x8_t filter = vld1q_s16(y_filter + filter_offset);
|
||||
|
||||
int16x4_t s0, s1, s2, s3, s4, s5, s6, s7;
|
||||
load_s16_4x8(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7);
|
||||
|
||||
int16x4_t d0 = compound_convolve8_4_v(s0, s1, s2, s3, s4, s5, s6, s7,
|
||||
filter, vert_offset);
|
||||
|
||||
int16x4_t dd0 = vreinterpret_s16_u16(vld1_u16(dst16));
|
||||
|
||||
int32x4_t dst_wtd_avg = vmlal_s16(dist_wtd_offset, bck_offset, d0);
|
||||
dst_wtd_avg = vmlal_s16(dst_wtd_avg, fwd_offset, dd0);
|
||||
|
||||
int16x4_t d0_s16 = vshrn_n_s32(
|
||||
dst_wtd_avg, 2 * FILTER_BITS - ROUND0_BITS - COMPOUND_ROUND1_BITS +
|
||||
DIST_PRECISION_BITS);
|
||||
|
||||
uint8x8_t d0_u8 = vqmovun_s16(vcombine_s16(d0_s16, vdup_n_s16(0)));
|
||||
|
||||
store_u8_4x1(dst8, d0_u8);
|
||||
|
||||
dst16 += dst16_stride;
|
||||
dst8 += dst8_stride;
|
||||
y_qn += y_step_qn;
|
||||
} while (--h != 0);
|
||||
} else {
|
||||
do {
|
||||
const int16_t *s = &src[(y_qn >> SCALE_SUBPEL_BITS) * src_stride];
|
||||
|
||||
const ptrdiff_t filter_offset =
|
||||
SUBPEL_TAPS * ((y_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS);
|
||||
const int16x8_t filter = vld1q_s16(y_filter + filter_offset);
|
||||
|
||||
int width = w;
|
||||
uint8_t *dst8_ptr = dst8;
|
||||
uint16_t *dst16_ptr = dst16;
|
||||
|
||||
do {
|
||||
int16x8_t s0, s1, s2, s3, s4, s5, s6, s7;
|
||||
load_s16_8x8(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7);
|
||||
|
||||
int16x8_t d0 = compound_convolve8_8_v(s0, s1, s2, s3, s4, s5, s6, s7,
|
||||
filter, vert_offset);
|
||||
|
||||
int16x8_t dd0 = vreinterpretq_s16_u16(vld1q_u16(dst16_ptr));
|
||||
|
||||
int32x4_t dst_wtd_avg0 =
|
||||
vmlal_s16(dist_wtd_offset, bck_offset, vget_low_s16(d0));
|
||||
int32x4_t dst_wtd_avg1 =
|
||||
vmlal_s16(dist_wtd_offset, bck_offset, vget_high_s16(d0));
|
||||
|
||||
dst_wtd_avg0 = vmlal_s16(dst_wtd_avg0, fwd_offset, vget_low_s16(dd0));
|
||||
dst_wtd_avg1 = vmlal_s16(dst_wtd_avg1, fwd_offset, vget_high_s16(dd0));
|
||||
|
||||
int16x4_t d0_s16_0 = vshrn_n_s32(
|
||||
dst_wtd_avg0, 2 * FILTER_BITS - ROUND0_BITS - COMPOUND_ROUND1_BITS +
|
||||
DIST_PRECISION_BITS);
|
||||
int16x4_t d0_s16_1 = vshrn_n_s32(
|
||||
dst_wtd_avg1, 2 * FILTER_BITS - ROUND0_BITS - COMPOUND_ROUND1_BITS +
|
||||
DIST_PRECISION_BITS);
|
||||
|
||||
uint8x8_t d0_u8 = vqmovun_s16(vcombine_s16(d0_s16_0, d0_s16_1));
|
||||
|
||||
vst1_u8(dst8_ptr, d0_u8);
|
||||
|
||||
s += 8;
|
||||
dst8_ptr += 8;
|
||||
dst16_ptr += 8;
|
||||
width -= 8;
|
||||
} while (width != 0);
|
||||
|
||||
dst16 += dst16_stride;
|
||||
dst8 += dst8_stride;
|
||||
y_qn += y_step_qn;
|
||||
} while (--h != 0);
|
||||
}
|
||||
}
|
||||
|
||||
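/*
 * Illustrative sketch (not part of the vendored diff): the dist_wtd path
 * above blends the two compound predictions with integer weights that sum to
 * a power of two, so the blend reduces to a multiply-accumulate plus a shift.
 * Scalar version, with an assumed 4-bit weight precision standing in for
 * DIST_PRECISION_BITS:
 */
#include <stdio.h>

int main(void) {
  const int kPrecBits = 4;                        /* assumed weight precision */
  const int fwd = 9, bck = (1 << kPrecBits) - 9;  /* weights sum to 16 */
  const int p0 = 137, p1 = 94;                    /* two intermediate predictions */

  /* Weighted average with rounding, as a plain integer expression. */
  const int blend =
      (fwd * p0 + bck * p1 + (1 << (kPrecBits - 1))) >> kPrecBits;
  printf("blend = %d\n", blend);  /* lies between p1 and p0, biased toward p0 */
  return 0;
}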
static INLINE uint8x8_t convolve8_4_v(const int16x4_t s0, const int16x4_t s1,
|
||||
const int16x4_t s2, const int16x4_t s3,
|
||||
const int16x4_t s4, const int16x4_t s5,
|
||||
const int16x4_t s6, const int16x4_t s7,
|
||||
const int16x8_t filter,
|
||||
const int32x4_t offset_const) {
|
||||
const int16x4_t filter_0_3 = vget_low_s16(filter);
|
||||
const int16x4_t filter_4_7 = vget_high_s16(filter);
|
||||
|
||||
int32x4_t sum = offset_const;
|
||||
sum = vmlal_lane_s16(sum, s0, filter_0_3, 0);
|
||||
sum = vmlal_lane_s16(sum, s1, filter_0_3, 1);
|
||||
sum = vmlal_lane_s16(sum, s2, filter_0_3, 2);
|
||||
sum = vmlal_lane_s16(sum, s3, filter_0_3, 3);
|
||||
sum = vmlal_lane_s16(sum, s4, filter_4_7, 0);
|
||||
sum = vmlal_lane_s16(sum, s5, filter_4_7, 1);
|
||||
sum = vmlal_lane_s16(sum, s6, filter_4_7, 2);
|
||||
sum = vmlal_lane_s16(sum, s7, filter_4_7, 3);
|
||||
|
||||
int16x4_t res = vshrn_n_s32(sum, 2 * FILTER_BITS - ROUND0_BITS);
|
||||
|
||||
return vqmovun_s16(vcombine_s16(res, vdup_n_s16(0)));
|
||||
}
|
||||
|
||||
static INLINE uint8x8_t convolve8_8_v(const int16x8_t s0, const int16x8_t s1,
|
||||
const int16x8_t s2, const int16x8_t s3,
|
||||
const int16x8_t s4, const int16x8_t s5,
|
||||
const int16x8_t s6, const int16x8_t s7,
|
||||
const int16x8_t filter,
|
||||
const int32x4_t offset_const) {
|
||||
const int16x4_t filter_0_3 = vget_low_s16(filter);
|
||||
const int16x4_t filter_4_7 = vget_high_s16(filter);
|
||||
|
||||
int32x4_t sum0 = offset_const;
|
||||
sum0 = vmlal_lane_s16(sum0, vget_low_s16(s0), filter_0_3, 0);
|
||||
sum0 = vmlal_lane_s16(sum0, vget_low_s16(s1), filter_0_3, 1);
|
||||
sum0 = vmlal_lane_s16(sum0, vget_low_s16(s2), filter_0_3, 2);
|
||||
sum0 = vmlal_lane_s16(sum0, vget_low_s16(s3), filter_0_3, 3);
|
||||
sum0 = vmlal_lane_s16(sum0, vget_low_s16(s4), filter_4_7, 0);
|
||||
sum0 = vmlal_lane_s16(sum0, vget_low_s16(s5), filter_4_7, 1);
|
||||
sum0 = vmlal_lane_s16(sum0, vget_low_s16(s6), filter_4_7, 2);
|
||||
sum0 = vmlal_lane_s16(sum0, vget_low_s16(s7), filter_4_7, 3);
|
||||
|
||||
int32x4_t sum1 = offset_const;
|
||||
sum1 = vmlal_lane_s16(sum1, vget_high_s16(s0), filter_0_3, 0);
|
||||
sum1 = vmlal_lane_s16(sum1, vget_high_s16(s1), filter_0_3, 1);
|
||||
sum1 = vmlal_lane_s16(sum1, vget_high_s16(s2), filter_0_3, 2);
|
||||
sum1 = vmlal_lane_s16(sum1, vget_high_s16(s3), filter_0_3, 3);
|
||||
sum1 = vmlal_lane_s16(sum1, vget_high_s16(s4), filter_4_7, 0);
|
||||
sum1 = vmlal_lane_s16(sum1, vget_high_s16(s5), filter_4_7, 1);
|
||||
sum1 = vmlal_lane_s16(sum1, vget_high_s16(s6), filter_4_7, 2);
|
||||
sum1 = vmlal_lane_s16(sum1, vget_high_s16(s7), filter_4_7, 3);
|
||||
|
||||
int16x4_t res0 = vshrn_n_s32(sum0, 2 * FILTER_BITS - ROUND0_BITS);
|
||||
int16x4_t res1 = vshrn_n_s32(sum1, 2 * FILTER_BITS - ROUND0_BITS);
|
||||
|
||||
return vqmovun_s16(vcombine_s16(res0, res1));
|
||||
}
|
||||
|
||||
static INLINE void convolve_vert_scale_neon(const int16_t *src, int src_stride,
|
||||
uint8_t *dst, int dst_stride, int w,
|
||||
int h, const int16_t *y_filter,
|
||||
int subpel_y_qn, int y_step_qn) {
|
||||
const int bd = 8;
|
||||
const int offset_bits = bd + 2 * FILTER_BITS - ROUND0_BITS;
|
||||
const int round_1 = 2 * FILTER_BITS - ROUND0_BITS;
|
||||
// The shim of 1 << (round_1 - 1) enables us to use non-rounding shifts.
|
||||
int32x4_t vert_offset =
|
||||
vdupq_n_s32((1 << (round_1 - 1)) - (1 << (offset_bits - 1)));
|
||||
|
||||
int y_qn = subpel_y_qn;
|
||||
if (w == 4) {
|
||||
do {
|
||||
const int16_t *s = &src[(y_qn >> SCALE_SUBPEL_BITS) * src_stride];
|
||||
|
||||
const ptrdiff_t filter_offset =
|
||||
SUBPEL_TAPS * ((y_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS);
|
||||
const int16x8_t filter = vld1q_s16(y_filter + filter_offset);
|
||||
|
||||
int16x4_t s0, s1, s2, s3, s4, s5, s6, s7;
|
||||
load_s16_4x8(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7);
|
||||
|
||||
uint8x8_t d =
|
||||
convolve8_4_v(s0, s1, s2, s3, s4, s5, s6, s7, filter, vert_offset);
|
||||
|
||||
store_u8_4x1(dst, d);
|
||||
|
||||
dst += dst_stride;
|
||||
y_qn += y_step_qn;
|
||||
} while (--h != 0);
|
||||
} else if (w == 8) {
|
||||
do {
|
||||
const int16_t *s = &src[(y_qn >> SCALE_SUBPEL_BITS) * src_stride];
|
||||
|
||||
const ptrdiff_t filter_offset =
|
||||
SUBPEL_TAPS * ((y_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS);
|
||||
const int16x8_t filter = vld1q_s16(y_filter + filter_offset);
|
||||
|
||||
int16x8_t s0, s1, s2, s3, s4, s5, s6, s7;
|
||||
load_s16_8x8(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7);
|
||||
|
||||
uint8x8_t d =
|
||||
convolve8_8_v(s0, s1, s2, s3, s4, s5, s6, s7, filter, vert_offset);
|
||||
|
||||
vst1_u8(dst, d);
|
||||
|
||||
dst += dst_stride;
|
||||
y_qn += y_step_qn;
|
||||
} while (--h != 0);
|
||||
} else {
|
||||
do {
|
||||
const int16_t *s = &src[(y_qn >> SCALE_SUBPEL_BITS) * src_stride];
|
||||
uint8_t *d = dst;
|
||||
int width = w;
|
||||
|
||||
const ptrdiff_t filter_offset =
|
||||
SUBPEL_TAPS * ((y_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS);
|
||||
const int16x8_t filter = vld1q_s16(y_filter + filter_offset);
|
||||
|
||||
do {
|
||||
int16x8_t s0[2], s1[2], s2[2], s3[2], s4[2], s5[2], s6[2], s7[2];
|
||||
load_s16_8x8(s, src_stride, &s0[0], &s1[0], &s2[0], &s3[0], &s4[0],
|
||||
&s5[0], &s6[0], &s7[0]);
|
||||
load_s16_8x8(s + 8, src_stride, &s0[1], &s1[1], &s2[1], &s3[1], &s4[1],
|
||||
&s5[1], &s6[1], &s7[1]);
|
||||
|
||||
uint8x8_t d0 = convolve8_8_v(s0[0], s1[0], s2[0], s3[0], s4[0], s5[0],
|
||||
s6[0], s7[0], filter, vert_offset);
|
||||
uint8x8_t d1 = convolve8_8_v(s0[1], s1[1], s2[1], s3[1], s4[1], s5[1],
|
||||
s6[1], s7[1], filter, vert_offset);
|
||||
|
||||
vst1q_u8(d, vcombine_u8(d0, d1));
|
||||
|
||||
s += 16;
|
||||
d += 16;
|
||||
width -= 16;
|
||||
} while (width != 0);
|
||||
|
||||
dst += dst_stride;
|
||||
y_qn += y_step_qn;
|
||||
} while (--h != 0);
|
||||
}
|
||||
}
|
||||
|
||||
static INLINE int16x4_t convolve8_4_h(const int16x4_t s0, const int16x4_t s1,
|
||||
const int16x4_t s2, const int16x4_t s3,
|
||||
const int16x4_t s4, const int16x4_t s5,
|
||||
const int16x4_t s6, const int16x4_t s7,
|
||||
const int16x8_t filter,
|
||||
const int32x4_t horiz_const) {
|
||||
int16x4_t filter_lo = vget_low_s16(filter);
|
||||
int16x4_t filter_hi = vget_high_s16(filter);
|
||||
|
||||
int32x4_t sum = horiz_const;
|
||||
sum = vmlal_lane_s16(sum, s0, filter_lo, 0);
|
||||
sum = vmlal_lane_s16(sum, s1, filter_lo, 1);
|
||||
sum = vmlal_lane_s16(sum, s2, filter_lo, 2);
|
||||
sum = vmlal_lane_s16(sum, s3, filter_lo, 3);
|
||||
sum = vmlal_lane_s16(sum, s4, filter_hi, 0);
|
||||
sum = vmlal_lane_s16(sum, s5, filter_hi, 1);
|
||||
sum = vmlal_lane_s16(sum, s6, filter_hi, 2);
|
||||
sum = vmlal_lane_s16(sum, s7, filter_hi, 3);
|
||||
|
||||
return vshrn_n_s32(sum, ROUND0_BITS);
|
||||
}
|
||||
|
||||
static INLINE int16x8_t convolve8_8_h(const int16x8_t s0, const int16x8_t s1,
|
||||
const int16x8_t s2, const int16x8_t s3,
|
||||
const int16x8_t s4, const int16x8_t s5,
|
||||
const int16x8_t s6, const int16x8_t s7,
|
||||
const int16x8_t filter,
|
||||
const int16x8_t horiz_const) {
|
||||
int16x4_t filter_lo = vget_low_s16(filter);
|
||||
int16x4_t filter_hi = vget_high_s16(filter);
|
||||
|
||||
int16x8_t sum = horiz_const;
|
||||
sum = vmlaq_lane_s16(sum, s0, filter_lo, 0);
|
||||
sum = vmlaq_lane_s16(sum, s1, filter_lo, 1);
|
||||
sum = vmlaq_lane_s16(sum, s2, filter_lo, 2);
|
||||
sum = vmlaq_lane_s16(sum, s3, filter_lo, 3);
|
||||
sum = vmlaq_lane_s16(sum, s4, filter_hi, 0);
|
||||
sum = vmlaq_lane_s16(sum, s5, filter_hi, 1);
|
||||
sum = vmlaq_lane_s16(sum, s6, filter_hi, 2);
|
||||
sum = vmlaq_lane_s16(sum, s7, filter_hi, 3);
|
||||
|
||||
return vshrq_n_s16(sum, ROUND0_BITS - 1);
|
||||
}
|
||||
|
||||
static INLINE void convolve_horiz_scale_neon(const uint8_t *src, int src_stride,
|
||||
int16_t *dst, int dst_stride,
|
||||
int w, int h,
|
||||
const int16_t *x_filter,
|
||||
const int subpel_x_qn,
|
||||
const int x_step_qn) {
|
||||
DECLARE_ALIGNED(16, int16_t, temp[8 * 8]);
|
||||
const int bd = 8;
|
||||
|
||||
if (w == 4) {
|
||||
// The shim of 1 << (ROUND0_BITS - 1) enables us to use non-rounding shifts.
|
||||
const int32x4_t horiz_offset =
|
||||
vdupq_n_s32((1 << (bd + FILTER_BITS - 1)) + (1 << (ROUND0_BITS - 1)));
|
||||
|
||||
do {
|
||||
int x_qn = subpel_x_qn;
|
||||
|
||||
// Process a 4x4 tile.
|
||||
for (int r = 0; r < 4; ++r) {
|
||||
const uint8_t *const s = &src[x_qn >> SCALE_SUBPEL_BITS];
|
||||
|
||||
const ptrdiff_t filter_offset =
|
||||
SUBPEL_TAPS * ((x_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS);
|
||||
const int16x8_t filter = vld1q_s16(x_filter + filter_offset);
|
||||
|
||||
uint8x8_t t0, t1, t2, t3;
|
||||
load_u8_8x4(s, src_stride, &t0, &t1, &t2, &t3);
|
||||
|
||||
transpose_elems_inplace_u8_8x4(&t0, &t1, &t2, &t3);
|
||||
|
||||
int16x4_t s0 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
|
||||
int16x4_t s1 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
|
||||
int16x4_t s2 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));
|
||||
int16x4_t s3 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t3)));
|
||||
int16x4_t s4 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
|
||||
int16x4_t s5 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
|
||||
int16x4_t s6 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));
|
||||
int16x4_t s7 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t3)));
|
||||
|
||||
int16x4_t d0 =
|
||||
convolve8_4_h(s0, s1, s2, s3, s4, s5, s6, s7, filter, horiz_offset);
|
||||
|
||||
vst1_s16(&temp[r * 4], d0);
|
||||
x_qn += x_step_qn;
|
||||
}
|
||||
|
||||
// Transpose the 4x4 result tile and store.
|
||||
int16x4_t d0, d1, d2, d3;
|
||||
load_s16_4x4(temp, 4, &d0, &d1, &d2, &d3);
|
||||
|
||||
transpose_elems_inplace_s16_4x4(&d0, &d1, &d2, &d3);
|
||||
|
||||
store_s16_4x4(dst, dst_stride, d0, d1, d2, d3);
|
||||
|
||||
dst += 4 * dst_stride;
|
||||
src += 4 * src_stride;
|
||||
h -= 4;
|
||||
} while (h > 0);
|
||||
} else {
|
||||
// The shim of 1 << (ROUND0_BITS - 1) enables us to use non-rounding shifts.
|
||||
// The additional -1 is needed because we are halving the filter values.
|
||||
const int16x8_t horiz_offset =
|
||||
vdupq_n_s16((1 << (bd + FILTER_BITS - 2)) + (1 << (ROUND0_BITS - 2)));
|
||||
|
||||
do {
|
||||
int x_qn = subpel_x_qn;
|
||||
int16_t *d = dst;
|
||||
int width = w;
|
||||
|
||||
do {
|
||||
// Process an 8x8 tile.
|
||||
for (int r = 0; r < 8; ++r) {
|
||||
const uint8_t *const s = &src[(x_qn >> SCALE_SUBPEL_BITS)];
|
||||
|
||||
const ptrdiff_t filter_offset =
|
||||
SUBPEL_TAPS * ((x_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS);
|
||||
int16x8_t filter = vld1q_s16(x_filter + filter_offset);
|
||||
// Filter values are all even so halve them to allow convolution
|
||||
// kernel computations to stay in 16-bit element types.
|
||||
filter = vshrq_n_s16(filter, 1);
|
||||
|
||||
uint8x8_t t0, t1, t2, t3, t4, t5, t6, t7;
|
||||
load_u8_8x8(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
|
||||
|
||||
transpose_elems_u8_8x8(t0, t1, t2, t3, t4, t5, t6, t7, &t0, &t1, &t2,
|
||||
&t3, &t4, &t5, &t6, &t7);
|
||||
|
||||
int16x8_t s0 = vreinterpretq_s16_u16(vmovl_u8(t0));
|
||||
int16x8_t s1 = vreinterpretq_s16_u16(vmovl_u8(t1));
|
||||
int16x8_t s2 = vreinterpretq_s16_u16(vmovl_u8(t2));
|
||||
int16x8_t s3 = vreinterpretq_s16_u16(vmovl_u8(t3));
|
||||
int16x8_t s4 = vreinterpretq_s16_u16(vmovl_u8(t4));
|
||||
int16x8_t s5 = vreinterpretq_s16_u16(vmovl_u8(t5));
|
||||
int16x8_t s6 = vreinterpretq_s16_u16(vmovl_u8(t6));
|
||||
int16x8_t s7 = vreinterpretq_s16_u16(vmovl_u8(t7));
|
||||
|
||||
int16x8_t d0 = convolve8_8_h(s0, s1, s2, s3, s4, s5, s6, s7, filter,
|
||||
horiz_offset);
|
||||
|
||||
vst1q_s16(&temp[r * 8], d0);
|
||||
|
||||
x_qn += x_step_qn;
|
||||
}
|
||||
|
||||
// Transpose the 8x8 result tile and store.
|
||||
int16x8_t d0, d1, d2, d3, d4, d5, d6, d7;
|
||||
load_s16_8x8(temp, 8, &d0, &d1, &d2, &d3, &d4, &d5, &d6, &d7);
|
||||
|
||||
transpose_elems_inplace_s16_8x8(&d0, &d1, &d2, &d3, &d4, &d5, &d6, &d7);
|
||||
|
||||
store_s16_8x8(d, dst_stride, d0, d1, d2, d3, d4, d5, d6, d7);
|
||||
|
||||
d += 8;
|
||||
width -= 8;
|
||||
} while (width != 0);
|
||||
|
||||
dst += 8 * dst_stride;
|
||||
src += 8 * src_stride;
|
||||
h -= 8;
|
||||
} while (h > 0);
|
||||
}
|
||||
}
|
||||
|
||||
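/*
 * Illustrative sketch (not part of the vendored diff): the horizontal pass
 * above halves the (all-even) filter taps so the accumulation fits in 16-bit
 * lanes, then compensates by shifting right by one bit less. Scalar
 * equivalence check for a single output sample, with a hypothetical
 * even-valued 8-tap filter:
 */
#include <stdint.h>
#include <stdio.h>

int main(void) {
  const int16_t filter[8] = { -2, 6, -12, 124, 18, -8, 2, 0 };
  const uint8_t src[8] = { 10, 20, 30, 40, 50, 60, 70, 80 };
  const int round_bits = 3;  /* stand-in for ROUND0_BITS */

  int32_t full = 1 << (round_bits - 1);  /* rounding offset, full precision */
  int16_t half = 1 << (round_bits - 2);  /* offset pre-halved as well */
  for (int i = 0; i < 8; ++i) {
    full += src[i] * filter[i];
    half += (int16_t)(src[i] * (filter[i] >> 1));  /* taps are even: exact */
  }
  /* Halved taps plus a one-bit-smaller shift give the same result. */
  printf("%d %d\n", full >> round_bits, half >> (round_bits - 1));
  return (full >> round_bits) == (half >> (round_bits - 1)) ? 0 : 1;
}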
void av1_convolve_2d_scale_neon(const uint8_t *src, int src_stride,
|
||||
uint8_t *dst, int dst_stride, int w, int h,
|
||||
const InterpFilterParams *filter_params_x,
|
||||
const InterpFilterParams *filter_params_y,
|
||||
const int subpel_x_qn, const int x_step_qn,
|
||||
const int subpel_y_qn, const int y_step_qn,
|
||||
ConvolveParams *conv_params) {
|
||||
if (w < 4 || h < 4) {
|
||||
av1_convolve_2d_scale_c(src, src_stride, dst, dst_stride, w, h,
|
||||
filter_params_x, filter_params_y, subpel_x_qn,
|
||||
x_step_qn, subpel_y_qn, y_step_qn, conv_params);
|
||||
return;
|
||||
}
|
||||
|
||||
// For the interpolation 8-tap filters are used.
|
||||
assert(filter_params_y->taps <= 8 && filter_params_x->taps <= 8);
|
||||
|
||||
DECLARE_ALIGNED(32, int16_t,
|
||||
im_block[(2 * MAX_SB_SIZE + MAX_FILTER_TAP) * MAX_SB_SIZE]);
|
||||
int im_h = (((h - 1) * y_step_qn + subpel_y_qn) >> SCALE_SUBPEL_BITS) +
|
||||
filter_params_y->taps;
|
||||
int im_stride = MAX_SB_SIZE;
|
||||
CONV_BUF_TYPE *dst16 = conv_params->dst;
|
||||
const int dst16_stride = conv_params->dst_stride;
|
||||
|
||||
// Account for needing filter_taps / 2 - 1 lines prior and filter_taps / 2
|
||||
// lines post both horizontally and vertically.
|
||||
const ptrdiff_t horiz_offset = filter_params_x->taps / 2 - 1;
|
||||
const ptrdiff_t vert_offset = (filter_params_y->taps / 2 - 1) * src_stride;
|
||||
|
||||
// Horizontal filter
|
||||
convolve_horiz_scale_neon(
|
||||
src - horiz_offset - vert_offset, src_stride, im_block, im_stride, w,
|
||||
im_h, filter_params_x->filter_ptr, subpel_x_qn, x_step_qn);
|
||||
|
||||
// Vertical filter
|
||||
if (UNLIKELY(conv_params->is_compound)) {
|
||||
if (conv_params->do_average) {
|
||||
if (conv_params->use_dist_wtd_comp_avg) {
|
||||
compound_dist_wtd_convolve_vert_scale_neon(
|
||||
im_block, im_stride, dst, dst_stride, dst16, dst16_stride, w, h,
|
||||
filter_params_y->filter_ptr, conv_params, subpel_y_qn, y_step_qn);
|
||||
} else {
|
||||
compound_avg_convolve_vert_scale_neon(
|
||||
im_block, im_stride, dst, dst_stride, dst16, dst16_stride, w, h,
|
||||
filter_params_y->filter_ptr, subpel_y_qn, y_step_qn);
|
||||
}
|
||||
} else {
|
||||
compound_convolve_vert_scale_neon(
|
||||
im_block, im_stride, dst16, dst16_stride, w, h,
|
||||
filter_params_y->filter_ptr, subpel_y_qn, y_step_qn);
|
||||
}
|
||||
} else {
|
||||
convolve_vert_scale_neon(im_block, im_stride, dst, dst_stride, w, h,
|
||||
filter_params_y->filter_ptr, subpel_y_qn,
|
||||
y_step_qn);
|
||||
}
|
||||
}
|
|
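/*
 * Illustrative sketch (not part of the vendored diff): the scaled convolution
 * loops above walk the source in fixed-point steps; the high bits of the
 * accumulated position select the source sample and the low bits select the
 * sub-pixel filter phase. Standalone demonstration with hypothetical
 * constants standing in for SCALE_SUBPEL_BITS / SCALE_EXTRA_BITS:
 */
#include <stdio.h>

int main(void) {
  const int kSubpelBits = 10;  /* position precision (assumed) */
  const int kExtraBits = 6;    /* extra precision beyond 16 filter phases */
  const int kMask = (1 << kSubpelBits) - 1;

  /* Upscale by ~4/3: step of 0.75 source pixels per output pixel. */
  const int step_qn = (3 << kSubpelBits) / 4;

  int qn = 0;
  for (int out = 0; out < 8; ++out, qn += step_qn) {
    const int src_index = qn >> kSubpelBits;       /* integer source sample */
    const int phase = (qn & kMask) >> kExtraBits;  /* filter index 0..15 */
    printf("out %d: src %d, phase %2d\n", out, src_index, phase);
  }
  return 0;
}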
@ -447,7 +447,7 @@ static INLINE void idct8_low1_neon(int16x8_t *in, int16x8_t *out,
|
|||
out[7] = step1;
|
||||
}
|
||||
|
||||
void av1_round_shift_array_16_neon(int16x8_t *arr, int size, int bit) {
|
||||
static void round_shift_array_16_neon(int16x8_t *arr, int size, int bit) {
|
||||
assert(!(size % 4));
|
||||
if (!bit) return;
|
||||
const int16x8_t dup_bits_n_16x8 = vdupq_n_s16((int16_t)(-bit));
|
||||
|
@ -3661,7 +3661,7 @@ static INLINE void lowbd_inv_txfm2d_add_v_identity_neon(
|
|||
round_shift_for_rect(cur_a, cur_a, buf_size_nonzero_w);
|
||||
}
|
||||
row_txfm(cur_a, cur_a, INV_COS_BIT);
|
||||
av1_round_shift_array_16_neon(cur_a, txfm_size_col, -shift[0]);
|
||||
round_shift_array_16_neon(cur_a, txfm_size_col, -shift[0]);
|
||||
if (lr_flip == 1) {
|
||||
for (int j = 0; j < buf_size_w_div8; ++j) {
|
||||
flip_buf_ud_neon(&cur_a[j * 8], 8);
|
||||
|
@ -3736,8 +3736,7 @@ static INLINE void lowbd_inv_txfm2d_add_h_identity_neon(
|
|||
}
|
||||
for (int j = 0; j < buf_size_w_div8; ++j) {
|
||||
col_txfm(&b[j * txfm_size_row], &b[j * txfm_size_row], INV_COS_BIT);
|
||||
av1_round_shift_array_16_neon(&b[j * txfm_size_row], txfm_size_row,
|
||||
-shift[1]);
|
||||
round_shift_array_16_neon(&b[j * txfm_size_row], txfm_size_row, -shift[1]);
|
||||
}
|
||||
if (txfm_size_col >= 16) {
|
||||
for (int i = 0; i < (txfm_size_col >> 4); i++) {
|
||||
|
@ -3814,8 +3813,9 @@ static INLINE void lowbd_inv_txfm2d_add_4x4_neon(const int32_t *input,
|
|||
}
|
||||
}
|
||||
|
||||
void lowbd_inv_txfm2d_add_4x8_neon(const int32_t *input, uint8_t *output,
|
||||
int stride, TX_TYPE tx_type, int eob) {
|
||||
static void lowbd_inv_txfm2d_add_4x8_neon(const int32_t *input, uint8_t *output,
|
||||
int stride, TX_TYPE tx_type,
|
||||
int eob) {
|
||||
(void)eob;
|
||||
TX_SIZE tx_size = TX_4X8;
|
||||
DECLARE_ALIGNED(32, int, txfm_buf[4 * 8 + 8 + 8]);
|
||||
|
@ -3879,8 +3879,9 @@ void lowbd_inv_txfm2d_add_4x8_neon(const int32_t *input, uint8_t *output,
|
|||
}
|
||||
}
|
||||
|
||||
void lowbd_inv_txfm2d_add_8x4_neon(const int32_t *input, uint8_t *output,
|
||||
int stride, TX_TYPE tx_type, int eob) {
|
||||
static void lowbd_inv_txfm2d_add_8x4_neon(const int32_t *input, uint8_t *output,
|
||||
int stride, TX_TYPE tx_type,
|
||||
int eob) {
|
||||
(void)eob;
|
||||
TX_SIZE tx_size = TX_8X4;
|
||||
DECLARE_ALIGNED(32, int, txfm_buf[8 * 4 + 8 + 8]);
|
||||
|
@ -3944,8 +3945,9 @@ void lowbd_inv_txfm2d_add_8x4_neon(const int32_t *input, uint8_t *output,
|
|||
}
|
||||
}
|
||||
|
||||
void lowbd_inv_txfm2d_add_4x16_neon(const int32_t *input, uint8_t *output,
|
||||
int stride, TX_TYPE tx_type, int eob) {
|
||||
static void lowbd_inv_txfm2d_add_4x16_neon(const int32_t *input,
|
||||
uint8_t *output, int stride,
|
||||
TX_TYPE tx_type, int eob) {
|
||||
(void)eob;
|
||||
TX_SIZE tx_size = TX_4X16;
|
||||
DECLARE_ALIGNED(32, int, txfm_buf[4 * 16 + 16 + 16]);
|
||||
|
@ -4008,8 +4010,9 @@ void lowbd_inv_txfm2d_add_4x16_neon(const int32_t *input, uint8_t *output,
|
|||
}
|
||||
}
|
||||
|
||||
void lowbd_inv_txfm2d_add_16x4_neon(const int32_t *input, uint8_t *output,
|
||||
int stride, TX_TYPE tx_type, int eob) {
|
||||
static void lowbd_inv_txfm2d_add_16x4_neon(const int32_t *input,
|
||||
uint8_t *output, int stride,
|
||||
TX_TYPE tx_type, int eob) {
|
||||
(void)eob;
|
||||
TX_SIZE tx_size = TX_16X4;
|
||||
DECLARE_ALIGNED(32, int, txfm_buf[16 * 4 + 16 + 16]);
|
||||
|
@ -4112,7 +4115,7 @@ static INLINE void lowbd_inv_txfm2d_add_no_identity_neon(
|
|||
round_shift_for_rect(cur_a, cur_a, buf_size_nonzero_w);
|
||||
}
|
||||
row_txfm(cur_a, cur_a, INV_COS_BIT);
|
||||
av1_round_shift_array_16_neon(cur_a, txfm_size_col, -shift[0]);
|
||||
round_shift_array_16_neon(cur_a, txfm_size_col, -shift[0]);
|
||||
if (lr_flip == 1) {
|
||||
for (int j = 0; j < buf_size_w_div8; ++j) {
|
||||
flip_buf_ud_neon(&cur_a[j * 8], 8);
|
||||
|
@ -4130,8 +4133,7 @@ static INLINE void lowbd_inv_txfm2d_add_no_identity_neon(
|
|||
}
|
||||
for (int j = 0; j < buf_size_w_div8; ++j) {
|
||||
col_txfm(&b[j * txfm_size_row], &b[j * txfm_size_row], INV_COS_BIT);
|
||||
av1_round_shift_array_16_neon(&b[j * txfm_size_row], txfm_size_row,
|
||||
-shift[1]);
|
||||
round_shift_array_16_neon(&b[j * txfm_size_row], txfm_size_row, -shift[1]);
|
||||
}
|
||||
|
||||
if (txfm_size_col >= 16) {
|
||||
|
|
|
@ -188,18 +188,95 @@ static INLINE void convolve_x_sr_12tap_neon(const uint8_t *src_ptr,
|
|||
#endif // AOM_ARCH_AARCH64
|
||||
}
|
||||
|
||||
static INLINE uint8x8_t convolve4_4_x(const int16x4_t s0, const int16x4_t s1,
|
||||
const int16x4_t s2, const int16x4_t s3,
|
||||
static INLINE uint8x8_t convolve4_8_x(const int16x8_t s0, const int16x8_t s1,
|
||||
const int16x8_t s2, const int16x8_t s3,
|
||||
const int16x4_t filter,
|
||||
const int16x4_t horiz_const) {
|
||||
int16x4_t sum = horiz_const;
|
||||
sum = vmla_lane_s16(sum, s0, filter, 0);
|
||||
sum = vmla_lane_s16(sum, s1, filter, 1);
|
||||
sum = vmla_lane_s16(sum, s2, filter, 2);
|
||||
sum = vmla_lane_s16(sum, s3, filter, 3);
|
||||
int16x8_t horiz_const) {
|
||||
int16x8_t sum = horiz_const;
|
||||
sum = vmlaq_lane_s16(sum, s0, filter, 0);
|
||||
sum = vmlaq_lane_s16(sum, s1, filter, 1);
|
||||
sum = vmlaq_lane_s16(sum, s2, filter, 2);
|
||||
sum = vmlaq_lane_s16(sum, s3, filter, 3);
|
||||
// We halved the filter values so -1 from right shift.
|
||||
return vqrshrun_n_s16(sum, FILTER_BITS - 1);
|
||||
}
|
||||
|
||||
// We halved the convolution filter values so - 1 from the right shift.
|
||||
return vqrshrun_n_s16(vcombine_s16(sum, vdup_n_s16(0)), FILTER_BITS - 1);
|
||||
static INLINE void convolve_x_sr_4tap_neon(const uint8_t *src_ptr,
|
||||
int src_stride, uint8_t *dst_ptr,
|
||||
const int dst_stride, int w, int h,
|
||||
const int16_t *x_filter_ptr) {
|
||||
// All filter values are even, halve to reduce intermediate precision
|
||||
// requirements.
|
||||
const int16x4_t filter = vshr_n_s16(vld1_s16(x_filter_ptr + 2), 1);
|
||||
|
||||
// This shim of 1 << ((ROUND0_BITS - 1) - 1) enables us to use a single
|
||||
// rounding right shift by FILTER_BITS - instead of a first rounding right
|
||||
// shift by ROUND0_BITS, followed by second rounding right shift by
|
||||
// FILTER_BITS - ROUND0_BITS.
|
||||
// The outermost -1 is needed because we will halve the filter values.
|
||||
const int16x8_t horiz_const = vdupq_n_s16(1 << ((ROUND0_BITS - 1) - 1));
|
||||
|
||||
if (w == 4) {
|
||||
do {
|
||||
uint8x8_t t01[4];
|
||||
t01[0] = load_unaligned_u8(src_ptr + 0, src_stride);
|
||||
t01[1] = load_unaligned_u8(src_ptr + 1, src_stride);
|
||||
t01[2] = load_unaligned_u8(src_ptr + 2, src_stride);
|
||||
t01[3] = load_unaligned_u8(src_ptr + 3, src_stride);
|
||||
|
||||
int16x8_t s01[4];
|
||||
s01[0] = vreinterpretq_s16_u16(vmovl_u8(t01[0]));
|
||||
s01[1] = vreinterpretq_s16_u16(vmovl_u8(t01[1]));
|
||||
s01[2] = vreinterpretq_s16_u16(vmovl_u8(t01[2]));
|
||||
s01[3] = vreinterpretq_s16_u16(vmovl_u8(t01[3]));
|
||||
|
||||
uint8x8_t d01 =
|
||||
convolve4_8_x(s01[0], s01[1], s01[2], s01[3], filter, horiz_const);
|
||||
|
||||
store_u8x4_strided_x2(dst_ptr + 0 * dst_stride, dst_stride, d01);
|
||||
|
||||
src_ptr += 2 * src_stride;
|
||||
dst_ptr += 2 * dst_stride;
|
||||
h -= 2;
|
||||
} while (h != 0);
|
||||
} else {
|
||||
do {
|
||||
int width = w;
|
||||
const uint8_t *s = src_ptr;
|
||||
uint8_t *d = dst_ptr;
|
||||
|
||||
do {
|
||||
uint8x8_t t0[4], t1[4];
|
||||
load_u8_8x4(s + 0 * src_stride, 1, &t0[0], &t0[1], &t0[2], &t0[3]);
|
||||
load_u8_8x4(s + 1 * src_stride, 1, &t1[0], &t1[1], &t1[2], &t1[3]);
|
||||
|
||||
int16x8_t s0[4], s1[4];
|
||||
s0[0] = vreinterpretq_s16_u16(vmovl_u8(t0[0]));
|
||||
s0[1] = vreinterpretq_s16_u16(vmovl_u8(t0[1]));
|
||||
s0[2] = vreinterpretq_s16_u16(vmovl_u8(t0[2]));
|
||||
s0[3] = vreinterpretq_s16_u16(vmovl_u8(t0[3]));
|
||||
|
||||
s1[0] = vreinterpretq_s16_u16(vmovl_u8(t1[0]));
|
||||
s1[1] = vreinterpretq_s16_u16(vmovl_u8(t1[1]));
|
||||
s1[2] = vreinterpretq_s16_u16(vmovl_u8(t1[2]));
|
||||
s1[3] = vreinterpretq_s16_u16(vmovl_u8(t1[3]));
|
||||
|
||||
uint8x8_t d0 =
|
||||
convolve4_8_x(s0[0], s0[1], s0[2], s0[3], filter, horiz_const);
|
||||
uint8x8_t d1 =
|
||||
convolve4_8_x(s1[0], s1[1], s1[2], s1[3], filter, horiz_const);
|
||||
|
||||
store_u8_8x2(d, dst_stride, d0, d1);
|
||||
|
||||
s += 8;
|
||||
d += 8;
|
||||
width -= 8;
|
||||
} while (width != 0);
|
||||
src_ptr += 2 * src_stride;
|
||||
dst_ptr += 2 * dst_stride;
|
||||
h -= 2;
|
||||
} while (h != 0);
|
||||
}
|
||||
}
|
||||
|
||||
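/*
 * Illustrative sketch (not part of the vendored diff): the horiz_const shim
 * used by the x-convolution paths above folds the two-stage rounding
 * (>> ROUND0_BITS, then >> (FILTER_BITS - ROUND0_BITS)) into a single
 * rounding shift by FILTER_BITS. Scalar check of that equivalence, with
 * hypothetical bit counts in place of the libaom constants:
 */
#include <stdio.h>

int main(void) {
  const int round0 = 3, filter_bits = 7;
  for (int x = 0; x < (1 << 14); ++x) {
    const int two_stage =
        (((x + (1 << (round0 - 1))) >> round0) +
         (1 << (filter_bits - round0 - 1))) >> (filter_bits - round0);
    const int one_stage =
        (x + (1 << (round0 - 1)) + (1 << (filter_bits - 1))) >> filter_bits;
    if (two_stage != one_stage) {
      printf("mismatch at %d\n", x);
      return 1;
    }
  }
  printf("two-stage and single-shift rounding agree\n");
  return 0;
}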
static INLINE uint8x8_t convolve8_8_x(const int16x8_t s0, const int16x8_t s1,
|
||||
|
@ -242,12 +319,20 @@ void av1_convolve_x_sr_neon(const uint8_t *src, int src_stride, uint8_t *dst,
|
|||
const int16_t *x_filter_ptr = av1_get_interp_filter_subpel_kernel(
|
||||
filter_params_x, subpel_x_qn & SUBPEL_MASK);
|
||||
|
||||
if (filter_params_x->taps > 8) {
|
||||
int filter_taps = get_filter_tap(filter_params_x, subpel_x_qn & SUBPEL_MASK);
|
||||
|
||||
if (filter_taps > 8) {
|
||||
convolve_x_sr_12tap_neon(src, src_stride, dst, dst_stride, w, h,
|
||||
x_filter_ptr);
|
||||
return;
|
||||
}
|
||||
|
||||
if (filter_taps <= 4) {
|
||||
convolve_x_sr_4tap_neon(src + 2, src_stride, dst, dst_stride, w, h,
|
||||
x_filter_ptr);
|
||||
return;
|
||||
}
|
||||
|
||||
// This shim of 1 << ((ROUND0_BITS - 1) - 1) enables us to use a single
|
||||
// rounding right shift by FILTER_BITS - instead of a first rounding right
|
||||
// shift by ROUND0_BITS, followed by second rounding right shift by
|
||||
|
@ -255,149 +340,220 @@ void av1_convolve_x_sr_neon(const uint8_t *src, int src_stride, uint8_t *dst,
|
|||
// The outermost -1 is needed because we will halve the filter values.
|
||||
const int16x8_t horiz_const = vdupq_n_s16(1 << ((ROUND0_BITS - 1) - 1));
|
||||
|
||||
if (w <= 4) {
|
||||
// 4-tap filters are used for blocks having width <= 4.
|
||||
// Filter values are even, so halve to reduce intermediate precision reqs.
|
||||
const int16x4_t x_filter = vshr_n_s16(vld1_s16(x_filter_ptr + 2), 1);
|
||||
|
||||
src += 2;
|
||||
|
||||
do {
|
||||
uint8x8_t t0 = vld1_u8(src); // a0 a1 a2 a3 a4 a5 a6 a7
|
||||
int16x4_t s0 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
|
||||
int16x4_t s4 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
|
||||
|
||||
int16x4_t s1 = vext_s16(s0, s4, 1); // a1 a2 a3 a4
|
||||
int16x4_t s2 = vext_s16(s0, s4, 2); // a2 a3 a4 a5
|
||||
int16x4_t s3 = vext_s16(s0, s4, 3); // a3 a4 a5 a6
|
||||
|
||||
uint8x8_t d0 =
|
||||
convolve4_4_x(s0, s1, s2, s3, x_filter, vget_low_s16(horiz_const));
|
||||
|
||||
store_u8_4x1(dst, d0);
|
||||
|
||||
src += src_stride;
|
||||
dst += dst_stride;
|
||||
} while (--h != 0);
|
||||
} else {
|
||||
// Filter values are even so halve to reduce precision requirements.
|
||||
const int16x8_t x_filter = vshrq_n_s16(vld1q_s16(x_filter_ptr), 1);
|
||||
// Filter values are even so halve to reduce precision requirements.
|
||||
const int16x8_t x_filter = vshrq_n_s16(vld1q_s16(x_filter_ptr), 1);
|
||||
|
||||
#if AOM_ARCH_AARCH64
|
||||
while (h >= 8) {
|
||||
uint8x8_t t0, t1, t2, t3, t4, t5, t6, t7;
|
||||
load_u8_8x8(src, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
|
||||
while (h >= 8) {
|
||||
uint8x8_t t0, t1, t2, t3, t4, t5, t6, t7;
|
||||
load_u8_8x8(src, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
|
||||
|
||||
transpose_elems_inplace_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
|
||||
int16x8_t s0 = vreinterpretq_s16_u16(vmovl_u8(t0));
|
||||
int16x8_t s1 = vreinterpretq_s16_u16(vmovl_u8(t1));
|
||||
int16x8_t s2 = vreinterpretq_s16_u16(vmovl_u8(t2));
|
||||
int16x8_t s3 = vreinterpretq_s16_u16(vmovl_u8(t3));
|
||||
int16x8_t s4 = vreinterpretq_s16_u16(vmovl_u8(t4));
|
||||
int16x8_t s5 = vreinterpretq_s16_u16(vmovl_u8(t5));
|
||||
int16x8_t s6 = vreinterpretq_s16_u16(vmovl_u8(t6));
|
||||
|
||||
int width = w;
|
||||
const uint8_t *s = src + 7;
|
||||
uint8_t *d = dst;
|
||||
|
||||
__builtin_prefetch(d + 0 * dst_stride);
|
||||
__builtin_prefetch(d + 1 * dst_stride);
|
||||
__builtin_prefetch(d + 2 * dst_stride);
|
||||
__builtin_prefetch(d + 3 * dst_stride);
|
||||
__builtin_prefetch(d + 4 * dst_stride);
|
||||
__builtin_prefetch(d + 5 * dst_stride);
|
||||
__builtin_prefetch(d + 6 * dst_stride);
|
||||
__builtin_prefetch(d + 7 * dst_stride);
|
||||
|
||||
do {
|
||||
uint8x8_t t8, t9, t10, t11, t12, t13, t14;
|
||||
load_u8_8x8(s, src_stride, &t7, &t8, &t9, &t10, &t11, &t12, &t13, &t14);
|
||||
|
||||
transpose_elems_inplace_u8_8x8(&t7, &t8, &t9, &t10, &t11, &t12, &t13,
|
||||
&t14);
|
||||
int16x8_t s7 = vreinterpretq_s16_u16(vmovl_u8(t7));
|
||||
int16x8_t s8 = vreinterpretq_s16_u16(vmovl_u8(t8));
|
||||
int16x8_t s9 = vreinterpretq_s16_u16(vmovl_u8(t9));
|
||||
int16x8_t s10 = vreinterpretq_s16_u16(vmovl_u8(t10));
|
||||
int16x8_t s11 = vreinterpretq_s16_u16(vmovl_u8(t11));
|
||||
int16x8_t s12 = vreinterpretq_s16_u16(vmovl_u8(t12));
|
||||
int16x8_t s13 = vreinterpretq_s16_u16(vmovl_u8(t13));
|
||||
int16x8_t s14 = vreinterpretq_s16_u16(vmovl_u8(t14));
|
||||
|
||||
uint8x8_t d0 =
|
||||
convolve8_8_x(s0, s1, s2, s3, s4, s5, s6, s7, x_filter, horiz_const);
|
||||
uint8x8_t d1 =
|
||||
convolve8_8_x(s1, s2, s3, s4, s5, s6, s7, s8, x_filter, horiz_const);
|
||||
uint8x8_t d2 =
|
||||
convolve8_8_x(s2, s3, s4, s5, s6, s7, s8, s9, x_filter, horiz_const);
|
||||
uint8x8_t d3 =
|
||||
convolve8_8_x(s3, s4, s5, s6, s7, s8, s9, s10, x_filter, horiz_const);
|
||||
uint8x8_t d4 = convolve8_8_x(s4, s5, s6, s7, s8, s9, s10, s11, x_filter,
|
||||
horiz_const);
|
||||
uint8x8_t d5 = convolve8_8_x(s5, s6, s7, s8, s9, s10, s11, s12, x_filter,
|
||||
horiz_const);
|
||||
uint8x8_t d6 = convolve8_8_x(s6, s7, s8, s9, s10, s11, s12, s13, x_filter,
|
||||
horiz_const);
|
||||
uint8x8_t d7 = convolve8_8_x(s7, s8, s9, s10, s11, s12, s13, s14,
|
||||
x_filter, horiz_const);
|
||||
|
||||
transpose_elems_inplace_u8_8x8(&d0, &d1, &d2, &d3, &d4, &d5, &d6, &d7);
|
||||
|
||||
store_u8_8x8(d, dst_stride, d0, d1, d2, d3, d4, d5, d6, d7);
|
||||
|
||||
s0 = s8;
|
||||
s1 = s9;
|
||||
s2 = s10;
|
||||
s3 = s11;
|
||||
s4 = s12;
|
||||
s5 = s13;
|
||||
s6 = s14;
|
||||
s += 8;
|
||||
d += 8;
|
||||
width -= 8;
|
||||
} while (width != 0);
|
||||
src += 8 * src_stride;
|
||||
dst += 8 * dst_stride;
|
||||
h -= 8;
|
||||
}
|
||||
#endif // AOM_ARCH_AARCH64
|
||||
|
||||
while (h-- != 0) {
|
||||
uint8x8_t t0 = vld1_u8(src); // a0 a1 a2 a3 a4 a5 a6 a7
|
||||
int16x8_t s0 = vreinterpretq_s16_u16(vmovl_u8(t0));
|
||||
|
||||
int width = w;
|
||||
const uint8_t *s = src + 8;
|
||||
uint8_t *d = dst;
|
||||
|
||||
__builtin_prefetch(d);
|
||||
|
||||
do {
|
||||
uint8x8_t t8 = vld1_u8(s); // a8 a9 a10 a11 a12 a13 a14 a15
|
||||
int16x8_t s8 = vreinterpretq_s16_u16(vmovl_u8(t8));
|
||||
|
||||
int16x8_t s1 = vextq_s16(s0, s8, 1); // a1 a2 a3 a4 a5 a6 a7 a8
|
||||
int16x8_t s2 = vextq_s16(s0, s8, 2); // a2 a3 a4 a5 a6 a7 a8 a9
|
||||
int16x8_t s3 = vextq_s16(s0, s8, 3); // a3 a4 a5 a6 a7 a8 a9 a10
|
||||
int16x8_t s4 = vextq_s16(s0, s8, 4); // a4 a5 a6 a7 a8 a9 a10 a11
|
||||
int16x8_t s5 = vextq_s16(s0, s8, 5); // a5 a6 a7 a8 a9 a10 a11 a12
|
||||
int16x8_t s6 = vextq_s16(s0, s8, 6); // a6 a7 a8 a9 a10 a11 a12 a13
|
||||
int16x8_t s7 = vextq_s16(s0, s8, 7); // a7 a8 a9 a10 a11 a12 a13 a14
|
||||
|
||||
uint8x8_t d0 =
|
||||
convolve8_8_x(s0, s1, s2, s3, s4, s5, s6, s7, x_filter, horiz_const);
|
||||
|
||||
vst1_u8(d, d0);
|
||||
|
||||
s0 = s8;
|
||||
s += 8;
|
||||
d += 8;
|
||||
width -= 8;
|
||||
} while (width != 0);
|
||||
src += src_stride;
|
||||
dst += dst_stride;
|
||||
}
|
||||
}
|
||||
|
||||
static INLINE uint8x8_t convolve4_8_y(const int16x8_t s0, const int16x8_t s1,
|
||||
const int16x8_t s2, const int16x8_t s3,
|
||||
const int16x4_t filter) {
|
||||
int16x8_t sum = vmulq_lane_s16(s0, filter, 0);
|
||||
sum = vmlaq_lane_s16(sum, s1, filter, 1);
|
||||
sum = vmlaq_lane_s16(sum, s2, filter, 2);
|
||||
sum = vmlaq_lane_s16(sum, s3, filter, 3);
|
||||
|
||||
// We halved the filter values so -1 from right shift.
|
||||
return vqrshrun_n_s16(sum, FILTER_BITS - 1);
|
||||
}
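The FILTER_BITS - 1 shift above works because every tap of these sub-filters is even: halving the taps halves the accumulated sum exactly, so one fewer bit has to be shifted out at the end. A minimal standalone sketch of that arithmetic, using hypothetical even taps and arbitrary sample values (not taken from the libaom filter tables):
```
#include <stdio.h>

int main(void) {
  const int FILTER_BITS = 7;               // AV1 interpolation filters are Q7.
  const int taps[4] = { -4, 36, 100, -4 }; // Hypothetical even-valued taps.
  const int src[4] = { 23, 200, 180, 75 }; // Arbitrary 8-bit samples.

  int full = 0, halved = 0;
  for (int i = 0; i < 4; i++) {
    full += taps[i] * src[i];
    halved += (taps[i] / 2) * src[i];      // Exact: every tap is even.
  }

  // Rounding shift of the full sum by FILTER_BITS equals rounding shift of
  // the halved sum by FILTER_BITS - 1, since halved == full / 2 exactly.
  int a = (full + (1 << (FILTER_BITS - 1))) >> FILTER_BITS;
  int b = (halved + (1 << (FILTER_BITS - 2))) >> (FILTER_BITS - 1);
  printf("%d %d\n", a, b);                 // Prints the same value twice.
  return 0;
}
```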
|
||||
|
||||
static INLINE void convolve_y_sr_4tap_neon(const uint8_t *src,
|
||||
const int src_stride, uint8_t *dst,
|
||||
const int dst_stride, int w, int h,
|
||||
const int16_t *filter_y) {
|
||||
// All filter values are even, halve to reduce intermediate precision
|
||||
// requirements.
|
||||
const int16x4_t filter = vshr_n_s16(vld1_s16(filter_y + 2), 1);
|
||||
|
||||
if (w == 4) {
|
||||
uint8x8_t t01 = load_unaligned_u8(src + 0 * src_stride, src_stride);
|
||||
uint8x8_t t12 = load_unaligned_u8(src + 1 * src_stride, src_stride);
|
||||
|
||||
int16x8_t s01 = vreinterpretq_s16_u16(vmovl_u8(t01));
|
||||
int16x8_t s12 = vreinterpretq_s16_u16(vmovl_u8(t12));
|
||||
|
||||
src += 2 * src_stride;
|
||||
|
||||
do {
|
||||
uint8x8_t t23 = load_unaligned_u8(src + 0 * src_stride, src_stride);
|
||||
uint8x8_t t34 = load_unaligned_u8(src + 1 * src_stride, src_stride);
|
||||
uint8x8_t t45 = load_unaligned_u8(src + 2 * src_stride, src_stride);
|
||||
uint8x8_t t56 = load_unaligned_u8(src + 3 * src_stride, src_stride);
|
||||
|
||||
int16x8_t s23 = vreinterpretq_s16_u16(vmovl_u8(t23));
|
||||
int16x8_t s34 = vreinterpretq_s16_u16(vmovl_u8(t34));
|
||||
int16x8_t s45 = vreinterpretq_s16_u16(vmovl_u8(t45));
|
||||
int16x8_t s56 = vreinterpretq_s16_u16(vmovl_u8(t56));
|
||||
|
||||
uint8x8_t d01 = convolve4_8_y(s01, s12, s23, s34, filter);
|
||||
uint8x8_t d23 = convolve4_8_y(s23, s34, s45, s56, filter);
|
||||
|
||||
store_u8x4_strided_x2(dst + 0 * dst_stride, dst_stride, d01);
|
||||
store_u8x4_strided_x2(dst + 2 * dst_stride, dst_stride, d23);
|
||||
|
||||
s01 = s45;
|
||||
s12 = s56;
|
||||
|
||||
src += 4 * src_stride;
|
||||
dst += 4 * dst_stride;
|
||||
h -= 4;
|
||||
} while (h != 0);
|
||||
} else {
|
||||
do {
|
||||
uint8x8_t t0, t1, t2;
|
||||
load_u8_8x3(src, src_stride, &t0, &t1, &t2);
|
||||
|
||||
transpose_elems_inplace_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
|
||||
int16x8_t s0 = vreinterpretq_s16_u16(vmovl_u8(t0));
|
||||
int16x8_t s1 = vreinterpretq_s16_u16(vmovl_u8(t1));
|
||||
int16x8_t s2 = vreinterpretq_s16_u16(vmovl_u8(t2));
|
||||
int16x8_t s3 = vreinterpretq_s16_u16(vmovl_u8(t3));
|
||||
int16x8_t s4 = vreinterpretq_s16_u16(vmovl_u8(t4));
|
||||
int16x8_t s5 = vreinterpretq_s16_u16(vmovl_u8(t5));
|
||||
int16x8_t s6 = vreinterpretq_s16_u16(vmovl_u8(t6));
|
||||
|
||||
int width = w;
|
||||
const uint8_t *s = src + 7;
|
||||
int height = h;
|
||||
const uint8_t *s = src + 3 * src_stride;
|
||||
uint8_t *d = dst;
|
||||
|
||||
__builtin_prefetch(d + 0 * dst_stride);
|
||||
__builtin_prefetch(d + 1 * dst_stride);
|
||||
__builtin_prefetch(d + 2 * dst_stride);
|
||||
__builtin_prefetch(d + 3 * dst_stride);
|
||||
__builtin_prefetch(d + 4 * dst_stride);
|
||||
__builtin_prefetch(d + 5 * dst_stride);
|
||||
__builtin_prefetch(d + 6 * dst_stride);
|
||||
__builtin_prefetch(d + 7 * dst_stride);
|
||||
|
||||
do {
|
||||
uint8x8_t t8, t9, t10, t11, t12, t13, t14;
|
||||
load_u8_8x8(s, src_stride, &t7, &t8, &t9, &t10, &t11, &t12, &t13, &t14);
|
||||
uint8x8_t t3;
|
||||
load_u8_8x4(s, src_stride, &t0, &t1, &t2, &t3);
|
||||
|
||||
transpose_elems_inplace_u8_8x8(&t7, &t8, &t9, &t10, &t11, &t12, &t13,
|
||||
&t14);
|
||||
int16x8_t s7 = vreinterpretq_s16_u16(vmovl_u8(t7));
|
||||
int16x8_t s8 = vreinterpretq_s16_u16(vmovl_u8(t8));
|
||||
int16x8_t s9 = vreinterpretq_s16_u16(vmovl_u8(t9));
|
||||
int16x8_t s10 = vreinterpretq_s16_u16(vmovl_u8(t10));
|
||||
int16x8_t s11 = vreinterpretq_s16_u16(vmovl_u8(t11));
|
||||
int16x8_t s12 = vreinterpretq_s16_u16(vmovl_u8(t12));
|
||||
int16x8_t s13 = vreinterpretq_s16_u16(vmovl_u8(t13));
|
||||
int16x8_t s14 = vreinterpretq_s16_u16(vmovl_u8(t14));
|
||||
int16x8_t s3 = vreinterpretq_s16_u16(vmovl_u8(t0));
|
||||
int16x8_t s4 = vreinterpretq_s16_u16(vmovl_u8(t1));
|
||||
int16x8_t s5 = vreinterpretq_s16_u16(vmovl_u8(t2));
|
||||
int16x8_t s6 = vreinterpretq_s16_u16(vmovl_u8(t3));
|
||||
|
||||
uint8x8_t d0 = convolve8_8_x(s0, s1, s2, s3, s4, s5, s6, s7, x_filter,
|
||||
horiz_const);
|
||||
uint8x8_t d1 = convolve8_8_x(s1, s2, s3, s4, s5, s6, s7, s8, x_filter,
|
||||
horiz_const);
|
||||
uint8x8_t d2 = convolve8_8_x(s2, s3, s4, s5, s6, s7, s8, s9, x_filter,
|
||||
horiz_const);
|
||||
uint8x8_t d3 = convolve8_8_x(s3, s4, s5, s6, s7, s8, s9, s10, x_filter,
|
||||
horiz_const);
|
||||
uint8x8_t d4 = convolve8_8_x(s4, s5, s6, s7, s8, s9, s10, s11, x_filter,
|
||||
horiz_const);
|
||||
uint8x8_t d5 = convolve8_8_x(s5, s6, s7, s8, s9, s10, s11, s12,
|
||||
x_filter, horiz_const);
|
||||
uint8x8_t d6 = convolve8_8_x(s6, s7, s8, s9, s10, s11, s12, s13,
|
||||
x_filter, horiz_const);
|
||||
uint8x8_t d7 = convolve8_8_x(s7, s8, s9, s10, s11, s12, s13, s14,
|
||||
x_filter, horiz_const);
|
||||
uint8x8_t d0 = convolve4_8_y(s0, s1, s2, s3, filter);
|
||||
uint8x8_t d1 = convolve4_8_y(s1, s2, s3, s4, filter);
|
||||
uint8x8_t d2 = convolve4_8_y(s2, s3, s4, s5, filter);
|
||||
uint8x8_t d3 = convolve4_8_y(s3, s4, s5, s6, filter);
|
||||
|
||||
transpose_elems_inplace_u8_8x8(&d0, &d1, &d2, &d3, &d4, &d5, &d6, &d7);
|
||||
store_u8_8x4(d, dst_stride, d0, d1, d2, d3);
|
||||
|
||||
store_u8_8x8(d, dst_stride, d0, d1, d2, d3, d4, d5, d6, d7);
|
||||
s0 = s4;
|
||||
s1 = s5;
|
||||
s2 = s6;
|
||||
|
||||
s0 = s8;
|
||||
s1 = s9;
|
||||
s2 = s10;
|
||||
s3 = s11;
|
||||
s4 = s12;
|
||||
s5 = s13;
|
||||
s6 = s14;
|
||||
s += 8;
|
||||
d += 8;
|
||||
width -= 8;
|
||||
} while (width != 0);
|
||||
src += 8 * src_stride;
|
||||
dst += 8 * dst_stride;
|
||||
h -= 8;
|
||||
}
|
||||
#endif // AOM_ARCH_AARCH64
|
||||
|
||||
while (h-- != 0) {
|
||||
uint8x8_t t0 = vld1_u8(src); // a0 a1 a2 a3 a4 a5 a6 a7
|
||||
int16x8_t s0 = vreinterpretq_s16_u16(vmovl_u8(t0));
|
||||
|
||||
int width = w;
|
||||
const uint8_t *s = src + 8;
|
||||
uint8_t *d = dst;
|
||||
|
||||
__builtin_prefetch(d);
|
||||
|
||||
do {
|
||||
uint8x8_t t8 = vld1_u8(s); // a8 a9 a10 a11 a12 a13 a14 a15
|
||||
int16x8_t s8 = vreinterpretq_s16_u16(vmovl_u8(t8));
|
||||
|
||||
int16x8_t s1 = vextq_s16(s0, s8, 1); // a1 a2 a3 a4 a5 a6 a7 a8
|
||||
int16x8_t s2 = vextq_s16(s0, s8, 2); // a2 a3 a4 a5 a6 a7 a8 a9
|
||||
int16x8_t s3 = vextq_s16(s0, s8, 3); // a3 a4 a5 a6 a7 a8 a9 a10
|
||||
int16x8_t s4 = vextq_s16(s0, s8, 4); // a4 a5 a6 a7 a8 a9 a10 a11
|
||||
int16x8_t s5 = vextq_s16(s0, s8, 5); // a5 a6 a7 a8 a9 a10 a11 a12
|
||||
int16x8_t s6 = vextq_s16(s0, s8, 6); // a6 a7 a8 a9 a10 a11 a12 a13
|
||||
int16x8_t s7 = vextq_s16(s0, s8, 7); // a7 a8 a9 a10 a11 a12 a13 a14
|
||||
|
||||
uint8x8_t d0 = convolve8_8_x(s0, s1, s2, s3, s4, s5, s6, s7, x_filter,
|
||||
horiz_const);
|
||||
|
||||
vst1_u8(d, d0);
|
||||
|
||||
s0 = s8;
|
||||
s += 8;
|
||||
d += 8;
|
||||
width -= 8;
|
||||
} while (width != 0);
|
||||
src += src_stride;
|
||||
dst += dst_stride;
|
||||
}
|
||||
s += 4 * src_stride;
|
||||
d += 4 * dst_stride;
|
||||
height -= 4;
|
||||
} while (height != 0);
|
||||
src += 8;
|
||||
dst += 8;
|
||||
w -= 8;
|
||||
} while (w != 0);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -974,7 +1130,7 @@ void av1_convolve_y_sr_neon(const uint8_t *src, int src_stride, uint8_t *dst,
|
|||
}
|
||||
|
||||
const int y_filter_taps = get_filter_tap(filter_params_y, subpel_y_qn);
|
||||
const int clamped_y_taps = y_filter_taps < 6 ? 6 : y_filter_taps;
|
||||
const int clamped_y_taps = y_filter_taps < 4 ? 4 : y_filter_taps;
|
||||
const int vert_offset = clamped_y_taps / 2 - 1;
|
||||
|
||||
src -= vert_offset * src_stride;
|
||||
|
@ -991,7 +1147,10 @@ void av1_convolve_y_sr_neon(const uint8_t *src, int src_stride, uint8_t *dst,
|
|||
// Filter values are even so halve to reduce precision requirements.
|
||||
const int16x8_t y_filter = vshrq_n_s16(vld1q_s16(y_filter_ptr), 1);
|
||||
|
||||
if (y_filter_taps < 8) {
|
||||
if (y_filter_taps <= 4) {
|
||||
convolve_y_sr_4tap_neon(src, src_stride, dst, dst_stride, w, h,
|
||||
y_filter_ptr);
|
||||
} else if (y_filter_taps == 6) {
|
||||
convolve_y_sr_6tap_neon(src, src_stride, dst, dst_stride, w, h, y_filter);
|
||||
} else {
|
||||
convolve_y_sr_8tap_neon(src, src_stride, dst, dst_stride, w, h, y_filter);
|
||||
|
@ -1148,18 +1307,122 @@ static INLINE void convolve_2d_sr_horiz_12tap_neon(
|
|||
} while (--h != 0);
|
||||
}
|
||||
|
||||
static INLINE int16x4_t convolve4_4_2d_h(const int16x4_t s0, const int16x4_t s1,
|
||||
const int16x4_t s2, const int16x4_t s3,
|
||||
static INLINE int16x8_t convolve4_8_2d_h(const int16x8_t s0, const int16x8_t s1,
|
||||
const int16x8_t s2, const int16x8_t s3,
|
||||
const int16x4_t filter,
|
||||
const int16x4_t horiz_const) {
|
||||
int16x4_t sum = horiz_const;
|
||||
sum = vmla_lane_s16(sum, s0, filter, 0);
|
||||
sum = vmla_lane_s16(sum, s1, filter, 1);
|
||||
sum = vmla_lane_s16(sum, s2, filter, 2);
|
||||
sum = vmla_lane_s16(sum, s3, filter, 3);
|
||||
const int16x8_t horiz_const) {
|
||||
int16x8_t sum = vmlaq_lane_s16(horiz_const, s0, filter, 0);
|
||||
sum = vmlaq_lane_s16(sum, s1, filter, 1);
|
||||
sum = vmlaq_lane_s16(sum, s2, filter, 2);
|
||||
sum = vmlaq_lane_s16(sum, s3, filter, 3);
|
||||
// We halved the filter values so -1 from right shift.
|
||||
return vshrq_n_s16(sum, ROUND0_BITS - 1);
|
||||
}
|
||||
|
||||
// We halved the convolution filter values so -1 from the right shift.
|
||||
return vshr_n_s16(sum, ROUND0_BITS - 1);
|
||||
static INLINE void convolve_2d_sr_horiz_4tap_neon(
|
||||
const uint8_t *src, ptrdiff_t src_stride, int16_t *dst,
|
||||
ptrdiff_t dst_stride, int w, int h, const int16_t *filter_x) {
|
||||
const int bd = 8;
|
||||
// All filter values are even, halve to reduce intermediate precision
|
||||
// requirements.
|
||||
const int16x4_t filter = vshr_n_s16(vld1_s16(filter_x + 2), 1);
|
||||
|
||||
// A shim of 1 << ((ROUND0_BITS - 1) - 1) enables us to use non-rounding
|
||||
// shifts - which are generally faster than rounding shifts on modern CPUs.
|
||||
// (The extra -1 is needed because we halved the filter values.)
|
||||
const int16x8_t horiz_const = vdupq_n_s16((1 << (bd + FILTER_BITS - 2)) +
|
||||
(1 << ((ROUND0_BITS - 1) - 1)));
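// The (1 << ((ROUND0_BITS - 1) - 1)) term makes the truncating shift in
// convolve4_8_2d_h act as a rounding shift, since
// (x + (1 << (n - 1))) >> n == ROUND_POWER_OF_TWO(x, n), with n equal to
// ROUND0_BITS - 1 here because the filter taps were halved above.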
|
||||
|
||||
if (w == 4) {
|
||||
do {
|
||||
uint8x8_t t01[4];
|
||||
t01[0] = load_unaligned_u8(src + 0, (int)src_stride);
|
||||
t01[1] = load_unaligned_u8(src + 1, (int)src_stride);
|
||||
t01[2] = load_unaligned_u8(src + 2, (int)src_stride);
|
||||
t01[3] = load_unaligned_u8(src + 3, (int)src_stride);
|
||||
|
||||
int16x8_t s01[4];
|
||||
s01[0] = vreinterpretq_s16_u16(vmovl_u8(t01[0]));
|
||||
s01[1] = vreinterpretq_s16_u16(vmovl_u8(t01[1]));
|
||||
s01[2] = vreinterpretq_s16_u16(vmovl_u8(t01[2]));
|
||||
s01[3] = vreinterpretq_s16_u16(vmovl_u8(t01[3]));
|
||||
|
||||
int16x8_t d01 =
|
||||
convolve4_8_2d_h(s01[0], s01[1], s01[2], s01[3], filter, horiz_const);
|
||||
|
||||
store_s16x4_strided_x2(dst, (int)dst_stride, d01);
|
||||
|
||||
src += 2 * src_stride;
|
||||
dst += 2 * dst_stride;
|
||||
h -= 2;
|
||||
} while (h > 0);
|
||||
} else {
|
||||
do {
|
||||
int width = w;
|
||||
const uint8_t *s = src;
|
||||
int16_t *d = dst;
|
||||
|
||||
do {
|
||||
uint8x8_t t0[4], t1[4];
|
||||
load_u8_8x4(s + 0 * src_stride, 1, &t0[0], &t0[1], &t0[2], &t0[3]);
|
||||
load_u8_8x4(s + 1 * src_stride, 1, &t1[0], &t1[1], &t1[2], &t1[3]);
|
||||
|
||||
int16x8_t s0[4];
|
||||
s0[0] = vreinterpretq_s16_u16(vmovl_u8(t0[0]));
|
||||
s0[1] = vreinterpretq_s16_u16(vmovl_u8(t0[1]));
|
||||
s0[2] = vreinterpretq_s16_u16(vmovl_u8(t0[2]));
|
||||
s0[3] = vreinterpretq_s16_u16(vmovl_u8(t0[3]));
|
||||
|
||||
int16x8_t s1[4];
|
||||
s1[0] = vreinterpretq_s16_u16(vmovl_u8(t1[0]));
|
||||
s1[1] = vreinterpretq_s16_u16(vmovl_u8(t1[1]));
|
||||
s1[2] = vreinterpretq_s16_u16(vmovl_u8(t1[2]));
|
||||
s1[3] = vreinterpretq_s16_u16(vmovl_u8(t1[3]));
|
||||
|
||||
int16x8_t d0 =
|
||||
convolve4_8_2d_h(s0[0], s0[1], s0[2], s0[3], filter, horiz_const);
|
||||
int16x8_t d1 =
|
||||
convolve4_8_2d_h(s1[0], s1[1], s1[2], s1[3], filter, horiz_const);
|
||||
|
||||
store_s16_8x2(d, dst_stride, d0, d1);
|
||||
|
||||
s += 8;
|
||||
d += 8;
|
||||
width -= 8;
|
||||
} while (width != 0);
|
||||
src += 2 * src_stride;
|
||||
dst += 2 * dst_stride;
|
||||
h -= 2;
|
||||
} while (h > 2);
|
||||
|
||||
do {
|
||||
const uint8_t *s = src;
|
||||
int16_t *d = dst;
|
||||
int width = w;
|
||||
|
||||
do {
|
||||
uint8x8_t t0[4];
|
||||
load_u8_8x4(s, 1, &t0[0], &t0[1], &t0[2], &t0[3]);
|
||||
|
||||
int16x8_t s0[4];
|
||||
s0[0] = vreinterpretq_s16_u16(vmovl_u8(t0[0]));
|
||||
s0[1] = vreinterpretq_s16_u16(vmovl_u8(t0[1]));
|
||||
s0[2] = vreinterpretq_s16_u16(vmovl_u8(t0[2]));
|
||||
s0[3] = vreinterpretq_s16_u16(vmovl_u8(t0[3]));
|
||||
|
||||
int16x8_t d0 =
|
||||
convolve4_8_2d_h(s0[0], s0[1], s0[2], s0[3], filter, horiz_const);
|
||||
|
||||
vst1q_s16(d, d0);
|
||||
|
||||
s += 8;
|
||||
d += 8;
|
||||
width -= 8;
|
||||
} while (width != 0);
|
||||
src += src_stride;
|
||||
dst += dst_stride;
|
||||
} while (--h != 0);
|
||||
}
|
||||
}
|
||||
|
||||
static INLINE int16x8_t convolve8_8_2d_h(const int16x8_t s0, const int16x8_t s1,
|
||||
|
@ -1185,10 +1448,9 @@ static INLINE int16x8_t convolve8_8_2d_h(const int16x8_t s0, const int16x8_t s1,
|
|||
return vshrq_n_s16(sum, ROUND0_BITS - 1);
|
||||
}
|
||||
|
||||
static INLINE void convolve_2d_sr_horiz_neon(const uint8_t *src, int src_stride,
|
||||
int16_t *im_block, int im_stride,
|
||||
int w, int im_h,
|
||||
const int16_t *x_filter_ptr) {
|
||||
static INLINE void convolve_2d_sr_horiz_8tap_neon(
|
||||
const uint8_t *src, int src_stride, int16_t *im_block, int im_stride, int w,
|
||||
int im_h, const int16_t *x_filter_ptr) {
|
||||
const int bd = 8;
|
||||
|
||||
const uint8_t *src_ptr = src;
|
||||
|
@ -1196,149 +1458,119 @@ static INLINE void convolve_2d_sr_horiz_neon(const uint8_t *src, int src_stride,
|
|||
int dst_stride = im_stride;
|
||||
int height = im_h;
|
||||
|
||||
if (w <= 4) {
|
||||
// A shim of 1 << ((ROUND0_BITS - 1) - 1) enables us to use non-rounding
|
||||
// shifts - which are generally faster than rounding shifts on modern CPUs.
|
||||
// (The extra -1 is needed because we halved the filter values.)
|
||||
const int16x4_t horiz_const = vdup_n_s16((1 << (bd + FILTER_BITS - 2)) +
|
||||
(1 << ((ROUND0_BITS - 1) - 1)));
|
||||
// 4-tap filters are used for blocks having width <= 4.
|
||||
// Filter values are even, so halve to reduce intermediate precision reqs.
|
||||
const int16x4_t x_filter = vshr_n_s16(vld1_s16(x_filter_ptr + 2), 1);
|
||||
|
||||
src_ptr += 2;
|
||||
|
||||
do {
|
||||
uint8x8_t t0 = vld1_u8(src_ptr); // a0 a1 a2 a3 a4 a5 a6 a7
|
||||
int16x4_t s0 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
|
||||
int16x4_t s4 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
|
||||
|
||||
int16x4_t s1 = vext_s16(s0, s4, 1); // a1 a2 a3 a4
|
||||
int16x4_t s2 = vext_s16(s0, s4, 2); // a2 a3 a4 a5
|
||||
int16x4_t s3 = vext_s16(s0, s4, 3); // a3 a4 a5 a6
|
||||
|
||||
int16x4_t d0 = convolve4_4_2d_h(s0, s1, s2, s3, x_filter, horiz_const);
|
||||
|
||||
vst1_s16(dst_ptr, d0);
|
||||
|
||||
src_ptr += src_stride;
|
||||
dst_ptr += dst_stride;
|
||||
} while (--height != 0);
|
||||
} else {
|
||||
// A shim of 1 << ((ROUND0_BITS - 1) - 1) enables us to use non-rounding
// shifts - which are generally faster than rounding shifts on modern CPUs.
// (The extra -1 is needed because we halved the filter values.)
const int16x8_t horiz_const = vdupq_n_s16((1 << (bd + FILTER_BITS - 2)) +
(1 << ((ROUND0_BITS - 1) - 1)));
// Filter values are even, so halve to reduce intermediate precision reqs.
const int16x8_t x_filter = vshrq_n_s16(vld1q_s16(x_filter_ptr), 1);
|
||||
|
||||
#if AOM_ARCH_AARCH64
|
||||
while (height > 8) {
const uint8_t *s = src_ptr;
int16_t *d = dst_ptr;
int width = w;
|
||||
|
||||
uint8x8_t t0, t1, t2, t3, t4, t5, t6, t7;
load_u8_8x8(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
transpose_elems_inplace_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
|
||||
|
||||
int16x8_t s0 = vreinterpretq_s16_u16(vmovl_u8(t0));
int16x8_t s1 = vreinterpretq_s16_u16(vmovl_u8(t1));
int16x8_t s2 = vreinterpretq_s16_u16(vmovl_u8(t2));
int16x8_t s3 = vreinterpretq_s16_u16(vmovl_u8(t3));
int16x8_t s4 = vreinterpretq_s16_u16(vmovl_u8(t4));
int16x8_t s5 = vreinterpretq_s16_u16(vmovl_u8(t5));
int16x8_t s6 = vreinterpretq_s16_u16(vmovl_u8(t6));
|
||||
|
||||
s += 7;
|
||||
|
||||
do {
|
||||
load_u8_8x8(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
|
||||
|
||||
transpose_elems_inplace_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
|
||||
|
||||
int16x8_t s7 = vreinterpretq_s16_u16(vmovl_u8(t0));
|
||||
int16x8_t s8 = vreinterpretq_s16_u16(vmovl_u8(t1));
|
||||
int16x8_t s9 = vreinterpretq_s16_u16(vmovl_u8(t2));
|
||||
int16x8_t s10 = vreinterpretq_s16_u16(vmovl_u8(t3));
|
||||
int16x8_t s11 = vreinterpretq_s16_u16(vmovl_u8(t4));
|
||||
int16x8_t s12 = vreinterpretq_s16_u16(vmovl_u8(t5));
|
||||
int16x8_t s13 = vreinterpretq_s16_u16(vmovl_u8(t6));
|
||||
int16x8_t s14 = vreinterpretq_s16_u16(vmovl_u8(t7));
|
||||
|
||||
int16x8_t d0 = convolve8_8_2d_h(s0, s1, s2, s3, s4, s5, s6, s7,
|
||||
x_filter, horiz_const);
|
||||
int16x8_t d1 = convolve8_8_2d_h(s1, s2, s3, s4, s5, s6, s7, s8,
|
||||
x_filter, horiz_const);
|
||||
int16x8_t d2 = convolve8_8_2d_h(s2, s3, s4, s5, s6, s7, s8, s9,
|
||||
x_filter, horiz_const);
|
||||
int16x8_t d3 = convolve8_8_2d_h(s3, s4, s5, s6, s7, s8, s9, s10,
|
||||
x_filter, horiz_const);
|
||||
int16x8_t d4 = convolve8_8_2d_h(s4, s5, s6, s7, s8, s9, s10, s11,
|
||||
x_filter, horiz_const);
|
||||
int16x8_t d5 = convolve8_8_2d_h(s5, s6, s7, s8, s9, s10, s11, s12,
|
||||
x_filter, horiz_const);
|
||||
int16x8_t d6 = convolve8_8_2d_h(s6, s7, s8, s9, s10, s11, s12, s13,
|
||||
x_filter, horiz_const);
|
||||
int16x8_t d7 = convolve8_8_2d_h(s7, s8, s9, s10, s11, s12, s13, s14,
|
||||
x_filter, horiz_const);
|
||||
|
||||
transpose_elems_inplace_s16_8x8(&d0, &d1, &d2, &d3, &d4, &d5, &d6, &d7);
|
||||
|
||||
store_s16_8x8(d, dst_stride, d0, d1, d2, d3, d4, d5, d6, d7);
|
||||
|
||||
s0 = s8;
|
||||
s1 = s9;
|
||||
s2 = s10;
|
||||
s3 = s11;
|
||||
s4 = s12;
|
||||
s5 = s13;
|
||||
s6 = s14;
|
||||
s += 8;
|
||||
d += 8;
|
||||
width -= 8;
|
||||
} while (width != 0);
|
||||
src_ptr += 8 * src_stride;
|
||||
dst_ptr += 8 * dst_stride;
|
||||
height -= 8;
|
||||
}
|
||||
#endif // AOM_ARCH_AARCH64
|
||||
s += 7;
|
||||
|
||||
do {
|
||||
const uint8_t *s = src_ptr;
|
||||
int16_t *d = dst_ptr;
|
||||
int width = w;
|
||||
load_u8_8x8(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
|
||||
|
||||
uint8x8_t t0 = vld1_u8(s); // a0 a1 a2 a3 a4 a5 a6 a7
|
||||
int16x8_t s0 = vreinterpretq_s16_u16(vmovl_u8(t0));
|
||||
transpose_elems_inplace_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
|
||||
|
||||
do {
|
||||
uint8x8_t t1 = vld1_u8(s + 8); // a8 a9 a10 a11 a12 a13 a14 a15
|
||||
int16x8_t s8 = vreinterpretq_s16_u16(vmovl_u8(t1));
|
||||
int16x8_t s7 = vreinterpretq_s16_u16(vmovl_u8(t0));
|
||||
int16x8_t s8 = vreinterpretq_s16_u16(vmovl_u8(t1));
|
||||
int16x8_t s9 = vreinterpretq_s16_u16(vmovl_u8(t2));
|
||||
int16x8_t s10 = vreinterpretq_s16_u16(vmovl_u8(t3));
|
||||
int16x8_t s11 = vreinterpretq_s16_u16(vmovl_u8(t4));
|
||||
int16x8_t s12 = vreinterpretq_s16_u16(vmovl_u8(t5));
|
||||
int16x8_t s13 = vreinterpretq_s16_u16(vmovl_u8(t6));
|
||||
int16x8_t s14 = vreinterpretq_s16_u16(vmovl_u8(t7));
|
||||
|
||||
int16x8_t s1 = vextq_s16(s0, s8, 1); // a1 a2 a3 a4 a5 a6 a7 a8
|
||||
int16x8_t s2 = vextq_s16(s0, s8, 2); // a2 a3 a4 a5 a6 a7 a8 a9
|
||||
int16x8_t s3 = vextq_s16(s0, s8, 3); // a3 a4 a5 a6 a7 a8 a9 a10
|
||||
int16x8_t s4 = vextq_s16(s0, s8, 4); // a4 a5 a6 a7 a8 a9 a10 a11
|
||||
int16x8_t s5 = vextq_s16(s0, s8, 5); // a5 a6 a7 a8 a9 a10 a11 a12
|
||||
int16x8_t s6 = vextq_s16(s0, s8, 6); // a6 a7 a8 a9 a10 a11 a12 a13
|
||||
int16x8_t s7 = vextq_s16(s0, s8, 7); // a7 a8 a9 a10 a11 a12 a13 a14
|
||||
int16x8_t d0 = convolve8_8_2d_h(s0, s1, s2, s3, s4, s5, s6, s7, x_filter,
|
||||
horiz_const);
|
||||
int16x8_t d1 = convolve8_8_2d_h(s1, s2, s3, s4, s5, s6, s7, s8, x_filter,
|
||||
horiz_const);
|
||||
int16x8_t d2 = convolve8_8_2d_h(s2, s3, s4, s5, s6, s7, s8, s9, x_filter,
|
||||
horiz_const);
|
||||
int16x8_t d3 = convolve8_8_2d_h(s3, s4, s5, s6, s7, s8, s9, s10, x_filter,
|
||||
horiz_const);
|
||||
int16x8_t d4 = convolve8_8_2d_h(s4, s5, s6, s7, s8, s9, s10, s11,
|
||||
x_filter, horiz_const);
|
||||
int16x8_t d5 = convolve8_8_2d_h(s5, s6, s7, s8, s9, s10, s11, s12,
|
||||
x_filter, horiz_const);
|
||||
int16x8_t d6 = convolve8_8_2d_h(s6, s7, s8, s9, s10, s11, s12, s13,
|
||||
x_filter, horiz_const);
|
||||
int16x8_t d7 = convolve8_8_2d_h(s7, s8, s9, s10, s11, s12, s13, s14,
|
||||
x_filter, horiz_const);
|
||||
|
||||
int16x8_t d0 = convolve8_8_2d_h(s0, s1, s2, s3, s4, s5, s6, s7,
|
||||
x_filter, horiz_const);
|
||||
transpose_elems_inplace_s16_8x8(&d0, &d1, &d2, &d3, &d4, &d5, &d6, &d7);
|
||||
|
||||
vst1q_s16(d, d0);
|
||||
store_s16_8x8(d, dst_stride, d0, d1, d2, d3, d4, d5, d6, d7);
|
||||
|
||||
s0 = s8;
|
||||
s += 8;
|
||||
d += 8;
|
||||
width -= 8;
|
||||
} while (width != 0);
|
||||
src_ptr += src_stride;
|
||||
dst_ptr += dst_stride;
|
||||
} while (--height != 0);
|
||||
s0 = s8;
|
||||
s1 = s9;
|
||||
s2 = s10;
|
||||
s3 = s11;
|
||||
s4 = s12;
|
||||
s5 = s13;
|
||||
s6 = s14;
|
||||
s += 8;
|
||||
d += 8;
|
||||
width -= 8;
|
||||
} while (width != 0);
|
||||
src_ptr += 8 * src_stride;
|
||||
dst_ptr += 8 * dst_stride;
|
||||
height -= 8;
|
||||
}
|
||||
#endif // AOM_ARCH_AARCH64
|
||||
|
||||
do {
|
||||
const uint8_t *s = src_ptr;
|
||||
int16_t *d = dst_ptr;
|
||||
int width = w;
|
||||
|
||||
uint8x8_t t0 = vld1_u8(s); // a0 a1 a2 a3 a4 a5 a6 a7
|
||||
int16x8_t s0 = vreinterpretq_s16_u16(vmovl_u8(t0));
|
||||
|
||||
do {
|
||||
uint8x8_t t1 = vld1_u8(s + 8); // a8 a9 a10 a11 a12 a13 a14 a15
|
||||
int16x8_t s8 = vreinterpretq_s16_u16(vmovl_u8(t1));
|
||||
|
||||
int16x8_t s1 = vextq_s16(s0, s8, 1); // a1 a2 a3 a4 a5 a6 a7 a8
|
||||
int16x8_t s2 = vextq_s16(s0, s8, 2); // a2 a3 a4 a5 a6 a7 a8 a9
|
||||
int16x8_t s3 = vextq_s16(s0, s8, 3); // a3 a4 a5 a6 a7 a8 a9 a10
|
||||
int16x8_t s4 = vextq_s16(s0, s8, 4); // a4 a5 a6 a7 a8 a9 a10 a11
|
||||
int16x8_t s5 = vextq_s16(s0, s8, 5); // a5 a6 a7 a8 a9 a10 a11 a12
|
||||
int16x8_t s6 = vextq_s16(s0, s8, 6); // a6 a7 a8 a9 a10 a11 a12 a13
|
||||
int16x8_t s7 = vextq_s16(s0, s8, 7); // a7 a8 a9 a10 a11 a12 a13 a14
|
||||
|
||||
int16x8_t d0 = convolve8_8_2d_h(s0, s1, s2, s3, s4, s5, s6, s7, x_filter,
|
||||
horiz_const);
|
||||
|
||||
vst1q_s16(d, d0);
|
||||
|
||||
s0 = s8;
|
||||
s += 8;
|
||||
d += 8;
|
||||
width -= 8;
|
||||
} while (width != 0);
|
||||
src_ptr += src_stride;
|
||||
dst_ptr += dst_stride;
|
||||
} while (--height != 0);
|
||||
}
|
||||
|
||||
void av1_convolve_2d_sr_neon(const uint8_t *src, int src_stride, uint8_t *dst,
|
||||
|
@ -1355,7 +1587,8 @@ void av1_convolve_2d_sr_neon(const uint8_t *src, int src_stride, uint8_t *dst,
|
|||
}
|
||||
|
||||
const int y_filter_taps = get_filter_tap(filter_params_y, subpel_y_qn);
|
||||
const int clamped_y_taps = y_filter_taps < 6 ? 6 : y_filter_taps;
|
||||
const int x_filter_taps = get_filter_tap(filter_params_x, subpel_x_qn);
|
||||
const int clamped_y_taps = y_filter_taps < 4 ? 4 : y_filter_taps;
|
||||
const int im_h = h + clamped_y_taps - 1;
|
||||
const int im_stride = MAX_SB_SIZE;
|
||||
const int vert_offset = clamped_y_taps / 2 - 1;
|
||||
|
@ -1385,12 +1618,20 @@ void av1_convolve_2d_sr_neon(const uint8_t *src, int src_stride, uint8_t *dst,
|
|||
DECLARE_ALIGNED(16, int16_t,
|
||||
im_block[(MAX_SB_SIZE + SUBPEL_TAPS - 1) * MAX_SB_SIZE]);
|
||||
|
||||
convolve_2d_sr_horiz_neon(src_ptr, src_stride, im_block, im_stride, w, im_h,
|
||||
x_filter_ptr);
|
||||
if (x_filter_taps <= 4) {
|
||||
convolve_2d_sr_horiz_4tap_neon(src_ptr + 2, src_stride, im_block,
|
||||
im_stride, w, im_h, x_filter_ptr);
|
||||
} else {
|
||||
convolve_2d_sr_horiz_8tap_neon(src_ptr, src_stride, im_block, im_stride,
|
||||
w, im_h, x_filter_ptr);
|
||||
}
|
||||
|
||||
const int16x8_t y_filter = vld1q_s16(y_filter_ptr);
|
||||
|
||||
if (clamped_y_taps <= 6) {
|
||||
if (clamped_y_taps <= 4) {
|
||||
convolve_2d_sr_vert_4tap_neon(im_block, im_stride, dst, dst_stride, w, h,
|
||||
y_filter_ptr);
|
||||
} else if (clamped_y_taps == 6) {
|
||||
convolve_2d_sr_vert_6tap_neon(im_block, im_stride, dst, dst_stride, w, h,
|
||||
y_filter);
|
||||
} else {
|
||||
|
|
|
@ -535,4 +535,112 @@ static INLINE void convolve_2d_sr_vert_6tap_neon(int16_t *src_ptr,
|
|||
}
|
||||
}
|
||||
|
||||
static INLINE int16x4_t convolve4_4_2d_v(const int16x4_t s0, const int16x4_t s1,
|
||||
const int16x4_t s2, const int16x4_t s3,
|
||||
const int16x4_t y_filter) {
|
||||
int32x4_t sum = vmull_lane_s16(s0, y_filter, 0);
|
||||
sum = vmlal_lane_s16(sum, s1, y_filter, 1);
|
||||
sum = vmlal_lane_s16(sum, s2, y_filter, 2);
|
||||
sum = vmlal_lane_s16(sum, s3, y_filter, 3);
|
||||
|
||||
return vqrshrn_n_s32(sum, 2 * FILTER_BITS - ROUND0_BITS);
|
||||
}
|
||||
|
||||
static INLINE uint8x8_t convolve4_8_2d_v(const int16x8_t s0, const int16x8_t s1,
|
||||
const int16x8_t s2, const int16x8_t s3,
|
||||
const int16x4_t y_filter,
|
||||
const int16x8_t sub_const) {
|
||||
int32x4_t sum0 = vmull_lane_s16(vget_low_s16(s0), y_filter, 0);
|
||||
sum0 = vmlal_lane_s16(sum0, vget_low_s16(s1), y_filter, 1);
|
||||
sum0 = vmlal_lane_s16(sum0, vget_low_s16(s2), y_filter, 2);
|
||||
sum0 = vmlal_lane_s16(sum0, vget_low_s16(s3), y_filter, 3);
|
||||
|
||||
int32x4_t sum1 = vmull_lane_s16(vget_high_s16(s0), y_filter, 0);
|
||||
sum1 = vmlal_lane_s16(sum1, vget_high_s16(s1), y_filter, 1);
|
||||
sum1 = vmlal_lane_s16(sum1, vget_high_s16(s2), y_filter, 2);
|
||||
sum1 = vmlal_lane_s16(sum1, vget_high_s16(s3), y_filter, 3);
|
||||
|
||||
int16x8_t res =
|
||||
vcombine_s16(vqrshrn_n_s32(sum0, 2 * FILTER_BITS - ROUND0_BITS),
|
||||
vqrshrn_n_s32(sum1, 2 * FILTER_BITS - ROUND0_BITS));
|
||||
res = vsubq_s16(res, sub_const);
|
||||
|
||||
return vqmovun_s16(res);
|
||||
}
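// Shift-amount note: the two filter passes need 2 * FILTER_BITS of
// normalization in total; the horizontal pass already removed ROUND0_BITS when
// producing the int16 intermediate, so the vertical pass removes the remaining
// 2 * FILTER_BITS - ROUND0_BITS bits here.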
|
||||
|
||||
static INLINE void convolve_2d_sr_vert_4tap_neon(int16_t *src_ptr,
|
||||
int src_stride,
|
||||
uint8_t *dst_ptr,
|
||||
int dst_stride, int w, int h,
|
||||
const int16_t *y_filter) {
|
||||
const int bd = 8;
|
||||
const int16x8_t sub_const = vdupq_n_s16(1 << (bd - 1));
|
||||
|
||||
const int16x4_t filter = vld1_s16(y_filter + 2);
|
||||
|
||||
if (w == 4) {
|
||||
int16x4_t s0, s1, s2;
|
||||
load_s16_4x3(src_ptr, src_stride, &s0, &s1, &s2);
|
||||
src_ptr += 3 * src_stride;
|
||||
|
||||
do {
|
||||
int16x4_t s3, s4, s5, s6;
|
||||
load_s16_4x4(src_ptr, src_stride, &s3, &s4, &s5, &s6);
|
||||
|
||||
int16x4_t d0 = convolve4_4_2d_v(s0, s1, s2, s3, filter);
|
||||
int16x4_t d1 = convolve4_4_2d_v(s1, s2, s3, s4, filter);
|
||||
int16x4_t d2 = convolve4_4_2d_v(s2, s3, s4, s5, filter);
|
||||
int16x4_t d3 = convolve4_4_2d_v(s3, s4, s5, s6, filter);
|
||||
|
||||
uint8x8_t d01 = vqmovun_s16(vsubq_s16(vcombine_s16(d0, d1), sub_const));
|
||||
uint8x8_t d23 = vqmovun_s16(vsubq_s16(vcombine_s16(d2, d3), sub_const));
|
||||
|
||||
store_u8x4_strided_x2(dst_ptr + 0 * dst_stride, dst_stride, d01);
|
||||
store_u8x4_strided_x2(dst_ptr + 2 * dst_stride, dst_stride, d23);
|
||||
|
||||
s0 = s4;
|
||||
s1 = s5;
|
||||
s2 = s6;
|
||||
|
||||
src_ptr += 4 * src_stride;
|
||||
dst_ptr += 4 * dst_stride;
|
||||
h -= 4;
|
||||
} while (h != 0);
|
||||
} else {
|
||||
// Width is a multiple of 8 and height is a multiple of 4.
|
||||
do {
|
||||
int height = h;
|
||||
int16_t *s = src_ptr;
|
||||
uint8_t *d = dst_ptr;
|
||||
|
||||
int16x8_t s0, s1, s2;
|
||||
load_s16_8x3(s, src_stride, &s0, &s1, &s2);
|
||||
s += 3 * src_stride;
|
||||
|
||||
do {
|
||||
int16x8_t s3, s4, s5, s6;
|
||||
load_s16_8x4(s, src_stride, &s3, &s4, &s5, &s6);
|
||||
|
||||
uint8x8_t d0 = convolve4_8_2d_v(s0, s1, s2, s3, filter, sub_const);
|
||||
uint8x8_t d1 = convolve4_8_2d_v(s1, s2, s3, s4, filter, sub_const);
|
||||
uint8x8_t d2 = convolve4_8_2d_v(s2, s3, s4, s5, filter, sub_const);
|
||||
uint8x8_t d3 = convolve4_8_2d_v(s3, s4, s5, s6, filter, sub_const);
|
||||
|
||||
store_u8_8x4(d, dst_stride, d0, d1, d2, d3);
|
||||
|
||||
s0 = s4;
|
||||
s1 = s5;
|
||||
s2 = s6;
|
||||
|
||||
s += 4 * src_stride;
|
||||
d += 4 * dst_stride;
|
||||
height -= 4;
|
||||
} while (height != 0);
|
||||
src_ptr += 8;
|
||||
dst_ptr += 8;
|
||||
w -= 8;
|
||||
} while (w != 0);
|
||||
}
|
||||
}
|
||||
|
||||
#endif // AOM_AV1_COMMON_ARM_CONVOLVE_NEON_H_
|
||||
|
|
Diff not shown because it is too large
Diff not shown because it is too large
|
@ -0,0 +1,183 @@
|
|||
/*
|
||||
* Copyright (c) 2024, Alliance for Open Media. All rights reserved
|
||||
*
|
||||
* This source code is subject to the terms of the BSD 2 Clause License and
|
||||
* the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
|
||||
* was not distributed with this source code in the LICENSE file, you can
|
||||
* obtain it at www.aomedia.org/license/software. If the Alliance for Open
|
||||
* Media Patent License 1.0 was not distributed with this source code in the
|
||||
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
|
||||
*/
|
||||
|
||||
#ifndef AOM_AV1_COMMON_ARM_CONVOLVE_NEON_I8MM_H_
|
||||
#define AOM_AV1_COMMON_ARM_CONVOLVE_NEON_I8MM_H_
|
||||
|
||||
#include <arm_neon.h>
|
||||
#include <assert.h>
|
||||
|
||||
#include "config/aom_config.h"
|
||||
#include "config/av1_rtcd.h"
|
||||
|
||||
#include "aom/aom_integer.h"
|
||||
#include "aom_dsp/aom_dsp_common.h"
|
||||
#include "aom_dsp/arm/mem_neon.h"
|
||||
#include "aom_ports/mem.h"
|
||||
|
||||
DECLARE_ALIGNED(16, static const uint8_t, kDotProdPermuteTbl[48]) = {
|
||||
0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6,
|
||||
4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10,
|
||||
8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14
|
||||
};
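Each 16-entry row of this table, applied with vqtbl1q_u8, rearranges one 16-byte register into four overlapping 4-sample windows, which is the grouping that vusdotq_laneq_s32 dots against four filter taps at a time. A plain-C model of the first row (made-up sample values, no NEON required):
```
#include <stdint.h>
#include <stdio.h>

int main(void) {
  // First row of kDotProdPermuteTbl.
  const uint8_t idx[16] = { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 };
  uint8_t samples[16];
  for (int i = 0; i < 16; i++) samples[i] = (uint8_t)(10 * i);  // a0, a1, ...

  // Scalar equivalent of vqtbl1q_u8(samples, idx).
  uint8_t permuted[16];
  for (int i = 0; i < 16; i++) permuted[i] = samples[idx[i]];

  // permuted now holds { a0 a1 a2 a3 | a1 a2 a3 a4 | a2 a3 a4 a5 | a3 a4 a5 a6 }:
  // four sliding windows, one per output pixel of a 4-tap group.
  for (int i = 0; i < 16; i++) printf("%d ", permuted[i]);
  printf("\n");
  return 0;
}
```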
|
||||
|
||||
static INLINE int16x4_t convolve12_4_2d_h(uint8x16_t samples,
|
||||
const int8x16_t filters,
|
||||
const uint8x16x3_t permute_tbl,
|
||||
int32x4_t horiz_const) {
|
||||
// Permute samples ready for dot product.
|
||||
// { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 }
|
||||
// { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 }
|
||||
// { 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 }
|
||||
uint8x16_t perm_samples[3] = { vqtbl1q_u8(samples, permute_tbl.val[0]),
|
||||
vqtbl1q_u8(samples, permute_tbl.val[1]),
|
||||
vqtbl1q_u8(samples, permute_tbl.val[2]) };
|
||||
|
||||
int32x4_t sum = vusdotq_laneq_s32(horiz_const, perm_samples[0], filters, 0);
|
||||
sum = vusdotq_laneq_s32(sum, perm_samples[1], filters, 1);
|
||||
sum = vusdotq_laneq_s32(sum, perm_samples[2], filters, 2);
|
||||
|
||||
// Narrow and re-pack.
|
||||
return vshrn_n_s32(sum, ROUND0_BITS);
|
||||
}
|
||||
|
||||
static INLINE int16x8_t convolve12_8_2d_h(uint8x16_t samples[2],
|
||||
const int8x16_t filters,
|
||||
const uint8x16x3_t permute_tbl,
|
||||
const int32x4_t horiz_const) {
|
||||
// Permute samples ready for dot product.
|
||||
// { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 }
|
||||
// { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 }
|
||||
// { 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 }
|
||||
// {12, 13, 14, 15, 13, 14, 15, 16, 14, 15, 16, 17, 15, 16, 17, 18 }
|
||||
uint8x16_t perm_samples[4] = { vqtbl1q_u8(samples[0], permute_tbl.val[0]),
|
||||
vqtbl1q_u8(samples[0], permute_tbl.val[1]),
|
||||
vqtbl1q_u8(samples[0], permute_tbl.val[2]),
|
||||
vqtbl1q_u8(samples[1], permute_tbl.val[2]) };
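// samples[1] is the same row loaded again 4 bytes further on, so applying
// permute_tbl.val[2] to it produces the byte 12..18 windows listed above
// without needing a fourth permute-table row.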
|
||||
|
||||
int32x4_t sum0123 =
|
||||
vusdotq_laneq_s32(horiz_const, perm_samples[0], filters, 0);
|
||||
sum0123 = vusdotq_laneq_s32(sum0123, perm_samples[1], filters, 1);
|
||||
sum0123 = vusdotq_laneq_s32(sum0123, perm_samples[2], filters, 2);
|
||||
|
||||
int32x4_t sum4567 =
|
||||
vusdotq_laneq_s32(horiz_const, perm_samples[1], filters, 0);
|
||||
sum4567 = vusdotq_laneq_s32(sum4567, perm_samples[2], filters, 1);
|
||||
sum4567 = vusdotq_laneq_s32(sum4567, perm_samples[3], filters, 2);
|
||||
|
||||
// Narrow and re-pack.
|
||||
return vcombine_s16(vshrn_n_s32(sum0123, ROUND0_BITS),
|
||||
vshrn_n_s32(sum4567, ROUND0_BITS));
|
||||
}
|
||||
|
||||
static INLINE void convolve_2d_sr_horiz_12tap_neon_i8mm(
|
||||
const uint8_t *src_ptr, int src_stride, int16_t *dst_ptr,
|
||||
const int dst_stride, int w, int h, const int16x8_t x_filter_0_7,
|
||||
const int16x4_t x_filter_8_11) {
|
||||
// The no-op filter should never be used here.
|
||||
assert(vgetq_lane_s16(x_filter_0_7, 5) != 128);
|
||||
|
||||
const int bd = 8;
|
||||
|
||||
// Narrow filter values to 8-bit.
|
||||
const int16x8x2_t x_filter_s16 = {
|
||||
{ x_filter_0_7, vcombine_s16(x_filter_8_11, vdup_n_s16(0)) }
|
||||
};
|
||||
const int8x16_t x_filter = vcombine_s8(vmovn_s16(x_filter_s16.val[0]),
|
||||
vmovn_s16(x_filter_s16.val[1]));
|
||||
// This shim of 1 << (ROUND0_BITS - 1) enables us to use non-rounding shifts
|
||||
// - which are generally faster than rounding shifts on modern CPUs.
|
||||
const int32x4_t horiz_const =
|
||||
vdupq_n_s32((1 << (bd + FILTER_BITS - 1)) + (1 << (ROUND0_BITS - 1)));
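// Unlike the halved-tap NEON paths, the 12-tap coefficients are not halved
// here (they are narrowed to int8 for the dot product instead), so the bias
// uses the full 1 << (ROUND0_BITS - 1) and 1 << (bd + FILTER_BITS - 1) terms.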
|
||||
const uint8x16x3_t permute_tbl = vld1q_u8_x3(kDotProdPermuteTbl);
|
||||
|
||||
if (w <= 4) {
|
||||
do {
|
||||
uint8x16_t s0, s1, s2, s3;
|
||||
load_u8_16x4(src_ptr, src_stride, &s0, &s1, &s2, &s3);
|
||||
|
||||
int16x4_t d0 = convolve12_4_2d_h(s0, x_filter, permute_tbl, horiz_const);
|
||||
int16x4_t d1 = convolve12_4_2d_h(s1, x_filter, permute_tbl, horiz_const);
|
||||
int16x4_t d2 = convolve12_4_2d_h(s2, x_filter, permute_tbl, horiz_const);
|
||||
int16x4_t d3 = convolve12_4_2d_h(s3, x_filter, permute_tbl, horiz_const);
|
||||
|
||||
store_s16_4x4(dst_ptr, dst_stride, d0, d1, d2, d3);
|
||||
|
||||
src_ptr += 4 * src_stride;
|
||||
dst_ptr += 4 * dst_stride;
|
||||
h -= 4;
|
||||
} while (h > 4);
|
||||
|
||||
do {
|
||||
uint8x16_t s0 = vld1q_u8(src_ptr);
|
||||
int16x4_t d0 = convolve12_4_2d_h(s0, x_filter, permute_tbl, horiz_const);
|
||||
vst1_s16(dst_ptr, d0);
|
||||
|
||||
src_ptr += src_stride;
|
||||
dst_ptr += dst_stride;
|
||||
} while (--h != 0);
|
||||
|
||||
} else {
|
||||
do {
|
||||
const uint8_t *s = src_ptr;
|
||||
int16_t *d = dst_ptr;
|
||||
int width = w;
|
||||
|
||||
do {
|
||||
uint8x16_t s0[2], s1[2], s2[2], s3[2];
|
||||
load_u8_16x4(s, src_stride, &s0[0], &s1[0], &s2[0], &s3[0]);
|
||||
load_u8_16x4(s + 4, src_stride, &s0[1], &s1[1], &s2[1], &s3[1]);
|
||||
|
||||
int16x8_t d0 =
|
||||
convolve12_8_2d_h(s0, x_filter, permute_tbl, horiz_const);
|
||||
int16x8_t d1 =
|
||||
convolve12_8_2d_h(s1, x_filter, permute_tbl, horiz_const);
|
||||
int16x8_t d2 =
|
||||
convolve12_8_2d_h(s2, x_filter, permute_tbl, horiz_const);
|
||||
int16x8_t d3 =
|
||||
convolve12_8_2d_h(s3, x_filter, permute_tbl, horiz_const);
|
||||
|
||||
store_s16_8x4(d, dst_stride, d0, d1, d2, d3);
|
||||
|
||||
s += 8;
|
||||
d += 8;
|
||||
width -= 8;
|
||||
} while (width != 0);
|
||||
|
||||
src_ptr += 4 * src_stride;
|
||||
dst_ptr += 4 * dst_stride;
|
||||
h -= 4;
|
||||
} while (h > 4);
|
||||
|
||||
do {
|
||||
const uint8_t *s = src_ptr;
|
||||
int16_t *d = dst_ptr;
|
||||
int width = w;
|
||||
|
||||
do {
|
||||
uint8x16_t s0[2];
|
||||
s0[0] = vld1q_u8(s);
|
||||
s0[1] = vld1q_u8(s + 4);
|
||||
int16x8_t d0 =
|
||||
convolve12_8_2d_h(s0, x_filter, permute_tbl, horiz_const);
|
||||
vst1q_s16(d, d0);
|
||||
|
||||
s += 8;
|
||||
d += 8;
|
||||
width -= 8;
|
||||
} while (width != 0);
|
||||
src_ptr += src_stride;
|
||||
dst_ptr += dst_stride;
|
||||
} while (--h != 0);
|
||||
}
|
||||
}
|
||||
|
||||
#endif // AOM_AV1_COMMON_ARM_CONVOLVE_NEON_I8MM_H_
|
|
@ -0,0 +1,203 @@
|
|||
/*
|
||||
* Copyright (c) 2024, Alliance for Open Media. All rights reserved
|
||||
*
|
||||
* This source code is subject to the terms of the BSD 2 Clause License and
|
||||
* the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
|
||||
* was not distributed with this source code in the LICENSE file, you can
|
||||
* obtain it at www.aomedia.org/license/software. If the Alliance for Open
|
||||
* Media Patent License 1.0 was not distributed with this source code in the
|
||||
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
|
||||
*/
|
||||
|
||||
#include <arm_neon.h>
|
||||
#include <assert.h>
|
||||
|
||||
#include "config/aom_config.h"
|
||||
#include "config/av1_rtcd.h"
|
||||
|
||||
#include "aom/aom_integer.h"
|
||||
#include "aom_dsp/aom_dsp_common.h"
|
||||
#include "aom_dsp/aom_filter.h"
|
||||
#include "aom_dsp/arm/aom_filter.h"
|
||||
#include "aom_dsp/arm/aom_neon_sve_bridge.h"
|
||||
#include "aom_dsp/arm/mem_neon.h"
|
||||
#include "aom_dsp/arm/transpose_neon.h"
|
||||
#include "aom_ports/mem.h"
|
||||
#include "av1/common/arm/highbd_convolve_sve2.h"
|
||||
#include "av1/common/arm/convolve_neon_i8mm.h"
|
||||
|
||||
static INLINE int32x4_t highbd_convolve12_4_2d_v(int16x8_t s0[2],
|
||||
int16x8_t s1[2],
|
||||
int16x8_t s2[2],
|
||||
int16x8_t filter_0_7,
|
||||
int16x8_t filter_4_11) {
|
||||
int64x2_t sum01 = aom_svdot_lane_s16(vdupq_n_s64(0), s0[0], filter_0_7, 0);
|
||||
sum01 = aom_svdot_lane_s16(sum01, s1[0], filter_0_7, 1);
|
||||
sum01 = aom_svdot_lane_s16(sum01, s2[0], filter_4_11, 1);
|
||||
|
||||
int64x2_t sum23 = aom_svdot_lane_s16(vdupq_n_s64(0), s0[1], filter_0_7, 0);
|
||||
sum23 = aom_svdot_lane_s16(sum23, s1[1], filter_0_7, 1);
|
||||
sum23 = aom_svdot_lane_s16(sum23, s2[1], filter_4_11, 1);
|
||||
|
||||
return vcombine_s32(vmovn_s64(sum01), vmovn_s64(sum23));
|
||||
}
|
||||
|
||||
static INLINE void convolve_2d_sr_vert_12tap_sve2(
|
||||
const int16_t *src_ptr, int src_stride, uint8_t *dst_ptr,
|
||||
const int dst_stride, int w, int h, const int16x8_t y_filter_0_7,
|
||||
const int16x8_t y_filter_4_11) {
|
||||
// The no-op filter should never be used here.
|
||||
assert(vgetq_lane_s16(y_filter_0_7, 5) != 128);
|
||||
|
||||
const int bd = 8;
|
||||
const int16x8_t sub_const = vdupq_n_s16(1 << (bd - 1));
|
||||
|
||||
uint16x8x3_t merge_block_tbl = vld1q_u16_x3(kDotProdMergeBlockTbl);
|
||||
// Scale indices by size of the true vector length to avoid reading from an
|
||||
// 'undefined' portion of a vector on a system with SVE vectors > 128-bit.
|
||||
uint16x8_t correction0 =
|
||||
vreinterpretq_u16_u64(vdupq_n_u64(svcnth() * 0x0001000000000000ULL));
|
||||
merge_block_tbl.val[0] = vaddq_u16(merge_block_tbl.val[0], correction0);
|
||||
|
||||
uint16x8_t correction1 =
|
||||
vreinterpretq_u16_u64(vdupq_n_u64(svcnth() * 0x0001000100000000ULL));
|
||||
merge_block_tbl.val[1] = vaddq_u16(merge_block_tbl.val[1], correction1);
|
||||
|
||||
uint16x8_t correction2 =
|
||||
vreinterpretq_u16_u64(vdupq_n_u64(svcnth() * 0x0001000100010000ULL));
|
||||
merge_block_tbl.val[2] = vaddq_u16(merge_block_tbl.val[2], correction2);
|
||||
|
||||
do {
|
||||
int16_t *s = (int16_t *)src_ptr;
|
||||
uint8_t *d = (uint8_t *)dst_ptr;
|
||||
int height = h;
|
||||
|
||||
int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA;
|
||||
load_s16_4x11(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7, &s8,
|
||||
&s9, &sA);
|
||||
s += 11 * src_stride;
|
||||
|
||||
int16x8_t s0123[2], s1234[2], s2345[2], s3456[2], s4567[2], s5678[2],
|
||||
s6789[2], s789A[2];
|
||||
// This operation combines a conventional transpose and the sample permute
|
||||
// required before computing the dot product.
|
||||
transpose_concat_4x4(s0, s1, s2, s3, s0123);
|
||||
transpose_concat_4x4(s1, s2, s3, s4, s1234);
|
||||
transpose_concat_4x4(s2, s3, s4, s5, s2345);
|
||||
transpose_concat_4x4(s3, s4, s5, s6, s3456);
|
||||
transpose_concat_4x4(s4, s5, s6, s7, s4567);
|
||||
transpose_concat_4x4(s5, s6, s7, s8, s5678);
|
||||
transpose_concat_4x4(s6, s7, s8, s9, s6789);
|
||||
transpose_concat_4x4(s7, s8, s9, sA, s789A);
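// After these calls, each of the two int16x8_t halves of s0123 ... s789A holds
// the four vertical samples for two output columns, matching the pair of
// 4-element dot products that aom_svdot_lane_s16 produces per call below.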
|
||||
|
||||
do {
|
||||
int16x4_t sB, sC, sD, sE;
|
||||
load_s16_4x4(s, src_stride, &sB, &sC, &sD, &sE);
|
||||
|
||||
int16x8_t s89AB[2], s9ABC[2], sABCD[2], sBCDE[2];
|
||||
transpose_concat_4x4(sB, sC, sD, sE, sBCDE);
|
||||
|
||||
// Merge new data into block from previous iteration.
|
||||
aom_tbl2x2_s16(s789A, sBCDE, merge_block_tbl.val[0], s89AB);
|
||||
aom_tbl2x2_s16(s789A, sBCDE, merge_block_tbl.val[1], s9ABC);
|
||||
aom_tbl2x2_s16(s789A, sBCDE, merge_block_tbl.val[2], sABCD);
|
||||
|
||||
int32x4_t d0 = highbd_convolve12_4_2d_v(s0123, s4567, s89AB, y_filter_0_7,
|
||||
y_filter_4_11);
|
||||
int32x4_t d1 = highbd_convolve12_4_2d_v(s1234, s5678, s9ABC, y_filter_0_7,
|
||||
y_filter_4_11);
|
||||
int32x4_t d2 = highbd_convolve12_4_2d_v(s2345, s6789, sABCD, y_filter_0_7,
|
||||
y_filter_4_11);
|
||||
int32x4_t d3 = highbd_convolve12_4_2d_v(s3456, s789A, sBCDE, y_filter_0_7,
|
||||
y_filter_4_11);
|
||||
|
||||
int16x8_t dd01 =
|
||||
vcombine_s16(vqrshrn_n_s32(d0, 2 * FILTER_BITS - ROUND0_BITS),
|
||||
vqrshrn_n_s32(d1, 2 * FILTER_BITS - ROUND0_BITS));
|
||||
int16x8_t dd23 =
|
||||
vcombine_s16(vqrshrn_n_s32(d2, 2 * FILTER_BITS - ROUND0_BITS),
|
||||
vqrshrn_n_s32(d3, 2 * FILTER_BITS - ROUND0_BITS));
|
||||
|
||||
dd01 = vsubq_s16(dd01, sub_const);
|
||||
dd23 = vsubq_s16(dd23, sub_const);
|
||||
|
||||
uint8x8_t d01 = vqmovun_s16(dd01);
|
||||
uint8x8_t d23 = vqmovun_s16(dd23);
|
||||
|
||||
store_u8x4_strided_x2(d + 0 * dst_stride, dst_stride, d01);
|
||||
store_u8x4_strided_x2(d + 2 * dst_stride, dst_stride, d23);
|
||||
|
||||
// Prepare block for next iteration - re-using as much as possible.
|
||||
// Shuffle everything up four rows.
|
||||
s0123[0] = s4567[0];
|
||||
s0123[1] = s4567[1];
|
||||
s1234[0] = s5678[0];
|
||||
s1234[1] = s5678[1];
|
||||
s2345[0] = s6789[0];
|
||||
s2345[1] = s6789[1];
|
||||
s3456[0] = s789A[0];
|
||||
s3456[1] = s789A[1];
|
||||
s4567[0] = s89AB[0];
|
||||
s4567[1] = s89AB[1];
|
||||
s5678[0] = s9ABC[0];
|
||||
s5678[1] = s9ABC[1];
|
||||
s6789[0] = sABCD[0];
|
||||
s6789[1] = sABCD[1];
|
||||
s789A[0] = sBCDE[0];
|
||||
s789A[1] = sBCDE[1];
|
||||
|
||||
s += 4 * src_stride;
|
||||
d += 4 * dst_stride;
|
||||
height -= 4;
|
||||
} while (height != 0);
|
||||
src_ptr += 4;
|
||||
dst_ptr += 4;
|
||||
w -= 4;
|
||||
} while (w != 0);
|
||||
}
|
||||
|
||||
void av1_convolve_2d_sr_sve2(const uint8_t *src, int src_stride, uint8_t *dst,
|
||||
int dst_stride, int w, int h,
|
||||
const InterpFilterParams *filter_params_x,
|
||||
const InterpFilterParams *filter_params_y,
|
||||
const int subpel_x_qn, const int subpel_y_qn,
|
||||
ConvolveParams *conv_params) {
|
||||
if (w == 2 || h == 2) {
|
||||
av1_convolve_2d_sr_c(src, src_stride, dst, dst_stride, w, h,
|
||||
filter_params_x, filter_params_y, subpel_x_qn,
|
||||
subpel_y_qn, conv_params);
|
||||
return;
|
||||
}
|
||||
|
||||
if (filter_params_x->taps > 8) {
|
||||
const int im_h = h + filter_params_y->taps - 1;
|
||||
const int im_stride = MAX_SB_SIZE;
|
||||
const int vert_offset = filter_params_x->taps / 2 - 1;
|
||||
const int horiz_offset = filter_params_x->taps / 2 - 1;
|
||||
const uint8_t *src_ptr = src - vert_offset * src_stride - horiz_offset;
|
||||
|
||||
const int16_t *x_filter_ptr = av1_get_interp_filter_subpel_kernel(
|
||||
filter_params_x, subpel_x_qn & SUBPEL_MASK);
|
||||
const int16_t *y_filter_ptr = av1_get_interp_filter_subpel_kernel(
|
||||
filter_params_y, subpel_y_qn & SUBPEL_MASK);
|
||||
|
||||
DECLARE_ALIGNED(16, int16_t,
|
||||
im_block[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * MAX_SB_SIZE]);
|
||||
|
||||
const int16x8_t x_filter_0_7 = vld1q_s16(x_filter_ptr);
|
||||
const int16x4_t x_filter_8_11 = vld1_s16(x_filter_ptr + 8);
|
||||
const int16x8_t y_filter_0_7 = vld1q_s16(y_filter_ptr);
|
||||
const int16x8_t y_filter_4_11 = vld1q_s16(y_filter_ptr + 4);
|
||||
|
||||
convolve_2d_sr_horiz_12tap_neon_i8mm(src_ptr, src_stride, im_block,
|
||||
im_stride, w, im_h, x_filter_0_7,
|
||||
x_filter_8_11);
|
||||
|
||||
convolve_2d_sr_vert_12tap_sve2(im_block, im_stride, dst, dst_stride, w, h,
|
||||
y_filter_0_7, y_filter_4_11);
|
||||
} else {
|
||||
av1_convolve_2d_sr_neon_i8mm(src, src_stride, dst, dst_stride, w, h,
|
||||
filter_params_x, filter_params_y, subpel_x_qn,
|
||||
subpel_y_qn, conv_params);
|
||||
}
|
||||
}
|
|
@ -562,11 +562,12 @@ static INLINE uint16x8_t highbd_convolve8_8_y(int16x8_t samples_lo[4],
|
|||
return vminq_u16(res, max);
|
||||
}
|
||||
|
||||
void highbd_convolve_y_sr_8tap_sve2(const uint16_t *src, ptrdiff_t src_stride,
|
||||
uint16_t *dst, ptrdiff_t dst_stride,
|
||||
int width, int height,
|
||||
const int16_t *filter_y, int bd) {
|
||||
assert(w >= 4 && h >= 4);
|
||||
static void highbd_convolve_y_sr_8tap_sve2(const uint16_t *src,
|
||||
ptrdiff_t src_stride, uint16_t *dst,
|
||||
ptrdiff_t dst_stride, int width,
|
||||
int height, const int16_t *filter_y,
|
||||
int bd) {
|
||||
assert(width >= 4 && height >= 4);
|
||||
|
||||
const int16x8_t y_filter = vld1q_s16(filter_y);
|
||||
|
||||
|
@ -731,11 +732,12 @@ static INLINE uint16x8_t highbd_convolve4_8_y(int16x8_t samples[4],
|
|||
return vminq_u16(res, max);
|
||||
}
|
||||
|
||||
void highbd_convolve_y_sr_4tap_sve2(const uint16_t *src, ptrdiff_t src_stride,
|
||||
uint16_t *dst, ptrdiff_t dst_stride,
|
||||
int width, int height,
|
||||
const int16_t *filter_y, int bd) {
|
||||
assert(w >= 4 && h >= 4);
|
||||
static void highbd_convolve_y_sr_4tap_sve2(const uint16_t *src,
|
||||
ptrdiff_t src_stride, uint16_t *dst,
|
||||
ptrdiff_t dst_stride, int width,
|
||||
int height, const int16_t *filter_y,
|
||||
int bd) {
|
||||
assert(width >= 4 && height >= 4);
|
||||
|
||||
const int16x8_t y_filter =
|
||||
vcombine_s16(vld1_s16(filter_y + 2), vdup_n_s16(0));
|
||||
|
@ -1346,13 +1348,11 @@ static INLINE uint16x8_t highbd_convolve8_8_2d_v(
|
|||
return vminq_u16(res, max);
|
||||
}
|
||||
|
||||
void highbd_convolve_2d_sr_vert_8tap_sve2(const uint16_t *src,
|
||||
ptrdiff_t src_stride, uint16_t *dst,
|
||||
ptrdiff_t dst_stride, int width,
|
||||
int height, const int16_t *filter_y,
|
||||
ConvolveParams *conv_params, int bd,
|
||||
const int y_offset) {
|
||||
assert(w >= 4 && h >= 4);
|
||||
static void highbd_convolve_2d_sr_vert_8tap_sve2(
|
||||
const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst,
|
||||
ptrdiff_t dst_stride, int width, int height, const int16_t *filter_y,
|
||||
ConvolveParams *conv_params, int bd, const int y_offset) {
|
||||
assert(width >= 4 && height >= 4);
|
||||
const int64x2_t offset = vdupq_n_s64(y_offset);
|
||||
const int32x4_t shift = vdupq_n_s32(-conv_params->round_1);
|
||||
const int16x8_t y_filter = vld1q_s16(filter_y);
|
||||
|
@ -1536,13 +1536,11 @@ static INLINE uint16x8_t highbd_convolve4_8_2d_v(int16x8_t samples[4],
|
|||
return vminq_u16(res, max);
|
||||
}
|
||||
|
||||
void highbd_convolve_2d_sr_vert_4tap_sve2(const uint16_t *src,
|
||||
ptrdiff_t src_stride, uint16_t *dst,
|
||||
ptrdiff_t dst_stride, int width,
|
||||
int height, const int16_t *filter_y,
|
||||
ConvolveParams *conv_params, int bd,
|
||||
const int y_offset) {
|
||||
assert(w >= 4 && h >= 4);
|
||||
static void highbd_convolve_2d_sr_vert_4tap_sve2(
|
||||
const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst,
|
||||
ptrdiff_t dst_stride, int width, int height, const int16_t *filter_y,
|
||||
ConvolveParams *conv_params, int bd, const int y_offset) {
|
||||
assert(width >= 4 && height >= 4);
|
||||
const int64x2_t offset = vdupq_n_s64(y_offset);
|
||||
const int32x4_t shift = vdupq_n_s32(-conv_params->round_1);
|
||||
|
||||
|
|
|
@ -13,6 +13,7 @@
|
|||
#include <assert.h>
|
||||
|
||||
#include "aom_dsp/arm/sum_neon.h"
|
||||
#include "config/av1_rtcd.h"
|
||||
|
||||
#define MAX_UPSAMPLE_SZ 16
|
||||
|
||||
|
|
|
@ -13,6 +13,7 @@
|
|||
#include <assert.h>
|
||||
|
||||
#include "config/aom_config.h"
|
||||
#include "config/av1_rtcd.h"
|
||||
|
||||
#include "aom/aom_integer.h"
|
||||
#include "aom_dsp/arm/mem_neon.h"
|
||||
|
|
|
@ -16,6 +16,7 @@
|
|||
#include "aom_dsp/arm/transpose_neon.h"
|
||||
#include "av1/common/resize.h"
|
||||
#include "config/av1_rtcd.h"
|
||||
#include "config/aom_dsp_rtcd.h"
|
||||
#include "config/aom_scale_rtcd.h"
|
||||
|
||||
static INLINE int16x4_t convolve8_4(const int16x4_t s0, const int16x4_t s1,
|
||||
|
|
|
@ -1124,10 +1124,10 @@ static void final_filter_fast_internal(uint16_t *A, int32_t *B,
|
|||
} while (h > 0);
|
||||
}
|
||||
|
||||
void final_filter_internal(uint16_t *A, int32_t *B, const int buf_stride,
|
||||
int16_t *src, const int src_stride, int32_t *dst,
|
||||
const int dst_stride, const int width,
|
||||
const int height) {
|
||||
static void final_filter_internal(uint16_t *A, int32_t *B, const int buf_stride,
|
||||
int16_t *src, const int src_stride,
|
||||
int32_t *dst, const int dst_stride,
|
||||
const int width, const int height) {
|
||||
int16x8_t s0;
|
||||
int32_t *B_tmp, *dst_ptr;
|
||||
uint16_t *A_tmp;
|
||||
|
|
|
@ -470,7 +470,7 @@ if (aom_config("CONFIG_AV1_ENCODER") eq "yes") {
|
|||
add_proto qw/int64_t av1_highbd_pixel_proj_error/, "const uint8_t *src8, int width, int height, int src_stride, const uint8_t *dat8, int dat_stride, int32_t *flt0, int flt0_stride, int32_t *flt1, int flt1_stride, int xq[2], const sgr_params_type *params";
|
||||
specialize qw/av1_highbd_pixel_proj_error sse4_1 avx2 neon/;
|
||||
add_proto qw/void av1_compute_stats_highbd/, "int wiener_win, const uint8_t *dgd8, const uint8_t *src8, int16_t *dgd_avg, int16_t *src_avg, int h_start, int h_end, int v_start, int v_end, int dgd_stride, int src_stride, int64_t *M, int64_t *H, aom_bit_depth_t bit_depth";
|
||||
specialize qw/av1_compute_stats_highbd sse4_1 avx2 neon/;
|
||||
specialize qw/av1_compute_stats_highbd sse4_1 avx2 neon sve/;
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -554,8 +554,13 @@ if (aom_config("CONFIG_AV1_HIGHBITDEPTH") eq "yes") {
|
|||
specialize qw/av1_highbd_warp_affine sse4_1 avx2 neon sve/;
|
||||
}
|
||||
|
||||
add_proto qw/bool resize_vert_dir/, "uint8_t *intbuf, uint8_t *output, int out_stride, int height, int height2, int width2, int start_col";
|
||||
specialize qw/resize_vert_dir avx2/;
|
||||
add_proto qw/bool av1_resize_vert_dir/, "uint8_t *intbuf, uint8_t *output, int out_stride, int height, int height2, int width2, int start_col";
|
||||
specialize qw/av1_resize_vert_dir sse2 avx2/;
|
||||
|
||||
add_proto qw/void av1_resize_horz_dir/, "const uint8_t *const input, int in_stride, uint8_t *intbuf, int height, int filteredlength, int width2";
|
||||
# TODO(https://crbug.com/aomedia/3575): Restore sse2 after SSE2/AV1ResizeXTest
|
||||
# passes under 32-bit valgrind.
|
||||
specialize qw/av1_resize_horz_dir avx2/;
|
||||
|
||||
add_proto qw/void av1_warp_affine/, "const int32_t *mat, const uint8_t *ref, int width, int height, int stride, uint8_t *pred, int p_col, int p_row, int p_width, int p_height, int p_stride, int subsampling_x, int subsampling_y, ConvolveParams *conv_params, int16_t alpha, int16_t beta, int16_t gamma, int16_t delta";
|
||||
specialize qw/av1_warp_affine sse4_1 avx2 neon neon_i8mm sve/;
|
||||
|
@ -597,13 +602,13 @@ if(aom_config("CONFIG_AV1_HIGHBITDEPTH") eq "yes") {
|
|||
|
||||
add_proto qw/void av1_convolve_2d_scale/, "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int x_step_qn, const int subpel_y_qn, const int y_step_qn, ConvolveParams *conv_params";
|
||||
|
||||
specialize qw/av1_convolve_2d_sr sse2 avx2 neon neon_dotprod neon_i8mm/;
|
||||
specialize qw/av1_convolve_2d_sr sse2 avx2 neon neon_dotprod neon_i8mm sve2/;
|
||||
specialize qw/av1_convolve_2d_sr_intrabc neon/;
|
||||
specialize qw/av1_convolve_x_sr sse2 avx2 neon neon_dotprod neon_i8mm/;
|
||||
specialize qw/av1_convolve_x_sr_intrabc neon/;
|
||||
specialize qw/av1_convolve_y_sr sse2 avx2 neon/;
|
||||
specialize qw/av1_convolve_y_sr sse2 avx2 neon neon_dotprod neon_i8mm/;
|
||||
specialize qw/av1_convolve_y_sr_intrabc neon/;
|
||||
specialize qw/av1_convolve_2d_scale sse4_1/;
|
||||
specialize qw/av1_convolve_2d_scale sse4_1 neon/;
|
||||
specialize qw/av1_dist_wtd_convolve_2d ssse3 avx2 neon neon_dotprod neon_i8mm/;
|
||||
specialize qw/av1_dist_wtd_convolve_2d_copy sse2 avx2 neon/;
|
||||
specialize qw/av1_dist_wtd_convolve_x sse2 avx2 neon neon_dotprod neon_i8mm/;
|
||||
|
|
|
@ -159,8 +159,9 @@ static INLINE void cfl_predict_lbd_c(const int16_t *ac_buf_q3, uint8_t *dst,
|
|||
CFL_PREDICT_FN(c, lbd)
|
||||
|
||||
#if CONFIG_AV1_HIGHBITDEPTH
|
||||
void cfl_predict_hbd_c(const int16_t *ac_buf_q3, uint16_t *dst, int dst_stride,
|
||||
int alpha_q3, int bit_depth, int width, int height) {
|
||||
static INLINE void cfl_predict_hbd_c(const int16_t *ac_buf_q3, uint16_t *dst,
|
||||
int dst_stride, int alpha_q3,
|
||||
int bit_depth, int width, int height) {
|
||||
for (int j = 0; j < height; j++) {
|
||||
for (int i = 0; i < width; i++) {
|
||||
dst[i] = clip_pixel_highbd(
|
||||
|
|
|
@ -95,6 +95,8 @@ void cfl_load_dc_pred(MACROBLOCKD *const xd, uint8_t *dst, int dst_stride,
|
|||
// will be constant allowing for loop unrolling and other constant propagated
|
||||
// goodness.
|
||||
#define CFL_SUBSAMPLE(arch, sub, bd, width, height) \
|
||||
void cfl_subsample_##bd##_##sub##_##width##x##height##_##arch( \
|
||||
const CFL_##bd##_TYPE, int input_stride, uint16_t *output_q3); \
|
||||
void cfl_subsample_##bd##_##sub##_##width##x##height##_##arch( \
|
||||
const CFL_##bd##_TYPE, int input_stride, uint16_t *output_q3) { \
|
||||
cfl_luma_subsampling_##sub##_##bd##_##arch(cfl_type, input_stride, \
|
||||
|
@ -170,6 +172,8 @@ void cfl_load_dc_pred(MACROBLOCKD *const xd, uint8_t *dst, int dst_stride,
|
|||
// will be constant allowing for loop unrolling and other constant propagated
|
||||
// goodness.
|
||||
#define CFL_SUB_AVG_X(arch, width, height, round_offset, num_pel_log2) \
|
||||
void cfl_subtract_average_##width##x##height##_##arch(const uint16_t *src, \
|
||||
int16_t *dst); \
|
||||
void cfl_subtract_average_##width##x##height##_##arch(const uint16_t *src, \
|
||||
int16_t *dst) { \
|
||||
subtract_average_##arch(src, dst, width, height, round_offset, \
|
||||
|
@ -220,22 +224,21 @@ void cfl_load_dc_pred(MACROBLOCKD *const xd, uint8_t *dst, int dst_stride,
|
|||
return sub_avg[tx_size % TX_SIZES_ALL]; \
|
||||
}
|
||||
|
||||
// For VSX SIMD optimization, the C versions of width == 4 subtract are
|
||||
// faster than the VSX. As such, the VSX code calls the C versions.
|
||||
void cfl_subtract_average_4x4_c(const uint16_t *src, int16_t *dst);
|
||||
void cfl_subtract_average_4x8_c(const uint16_t *src, int16_t *dst);
|
||||
void cfl_subtract_average_4x16_c(const uint16_t *src, int16_t *dst);
|
||||
|
||||
#define CFL_PREDICT_lbd(arch, width, height) \
|
||||
void cfl_predict_lbd_##width##x##height##_##arch( \
|
||||
const int16_t *pred_buf_q3, uint8_t *dst, int dst_stride, \
|
||||
int alpha_q3) { \
|
||||
cfl_predict_lbd_##arch(pred_buf_q3, dst, dst_stride, alpha_q3, width, \
|
||||
height); \
|
||||
#define CFL_PREDICT_lbd(arch, width, height) \
|
||||
void cfl_predict_lbd_##width##x##height##_##arch( \
|
||||
const int16_t *pred_buf_q3, uint8_t *dst, int dst_stride, int alpha_q3); \
|
||||
void cfl_predict_lbd_##width##x##height##_##arch( \
|
||||
const int16_t *pred_buf_q3, uint8_t *dst, int dst_stride, \
|
||||
int alpha_q3) { \
|
||||
cfl_predict_lbd_##arch(pred_buf_q3, dst, dst_stride, alpha_q3, width, \
|
||||
height); \
|
||||
}
|
||||
|
||||
#if CONFIG_AV1_HIGHBITDEPTH
|
||||
#define CFL_PREDICT_hbd(arch, width, height) \
|
||||
void cfl_predict_hbd_##width##x##height##_##arch( \
|
||||
const int16_t *pred_buf_q3, uint16_t *dst, int dst_stride, int alpha_q3, \
|
||||
int bd); \
|
||||
void cfl_predict_hbd_##width##x##height##_##arch( \
|
||||
const int16_t *pred_buf_q3, uint16_t *dst, int dst_stride, int alpha_q3, \
|
||||
int bd) { \
|
||||
|
|
|
@ -9,17 +9,21 @@
|
|||
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
|
||||
*/
|
||||
|
||||
#include "av1/common/debugmodes.h"
|
||||
|
||||
#include <stdio.h>
|
||||
|
||||
#include "av1/common/av1_common_int.h"
|
||||
#include "av1/common/blockd.h"
|
||||
#include "av1/common/enums.h"
|
||||
|
||||
#if 0
|
||||
static void log_frame_info(AV1_COMMON *cm, const char *str, FILE *f) {
|
||||
fprintf(f, "%s", str);
|
||||
fprintf(f, "(Frame %u, Show:%d, Q:%d): \n", cm->current_frame.frame_number,
|
||||
cm->show_frame, cm->quant_params.base_qindex);
|
||||
}
|
||||
|
||||
/* This function dereferences a pointer to the mbmi structure
|
||||
* and uses the passed in member offset to print out the value of an integer
|
||||
* for each mbmi member value in the mi structure.
|
||||
|
@ -87,6 +91,7 @@ void av1_print_modes_and_motion_vectors(AV1_COMMON *cm, const char *file) {
|
|||
|
||||
fclose(mvs);
|
||||
}
|
||||
#endif // 0
|
||||
|
||||
void av1_print_uncompressed_frame_header(const uint8_t *data, int size,
|
||||
const char *filename) {
|
||||
|
|
|
@ -0,0 +1,24 @@
|
|||
/*
|
||||
* Copyright (c) 2024, Alliance for Open Media. All rights reserved
|
||||
*
|
||||
* This source code is subject to the terms of the BSD 2 Clause License and
|
||||
* the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
|
||||
* was not distributed with this source code in the LICENSE file, you can
|
||||
* obtain it at www.aomedia.org/license/software. If the Alliance for Open
|
||||
* Media Patent License 1.0 was not distributed with this source code in the
|
||||
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
|
||||
*/
|
||||
|
||||
#ifndef AOM_AV1_COMMON_DEBUGMODES_H_
|
||||
#define AOM_AV1_COMMON_DEBUGMODES_H_
|
||||
|
||||
#include "av1/common/av1_common_int.h"
|
||||
#include "av1/common/blockd.h"
|
||||
#include "av1/common/enums.h"
|
||||
|
||||
void av1_print_modes_and_motion_vectors(AV1_COMMON *cm, const char *file);
|
||||
void av1_print_uncompressed_frame_header(const uint8_t *data, int size,
|
||||
const char *filename);
|
||||
void av1_print_frame_contexts(const FRAME_CONTEXT *fc, const char *filename);
|
||||
|
||||
#endif // AOM_AV1_COMMON_DEBUGMODES_H_
|
|
@ -124,6 +124,10 @@ CFL_SUB_AVG_X(vsx, 32, 32, 512, 10)
|
|||
|
||||
// Based on observation, for small blocks VSX does not outperform C (no 64bit
|
||||
// load and store intrinsics). So we call the C code for blocks of width 4.
|
||||
extern void cfl_subtract_average_4x4_c(const uint16_t *src, int16_t *dst);
|
||||
extern void cfl_subtract_average_4x8_c(const uint16_t *src, int16_t *dst);
|
||||
extern void cfl_subtract_average_4x16_c(const uint16_t *src, int16_t *dst);
|
||||
|
||||
cfl_subtract_average_fn cfl_get_subtract_average_fn_vsx(TX_SIZE tx_size) {
|
||||
static const cfl_subtract_average_fn sub_avg[TX_SIZES_ALL] = {
|
||||
cfl_subtract_average_4x4_c, /* 4x4 */
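The comment above explains why the VSX dispatch table keeps the C kernels for 4-wide blocks: each row is only four samples, so vector loads buy little. For reference, here is a hedged scalar sketch of the operation a subtract-average kernel of this shape performs (block mean removed from every sample); the exact rounding and fixed-point details of the real CfL code are omitted.

#include <stdint.h>
#include <stdio.h>

/* Simplified subtract-average for a width x height block of Q3 samples.
   Mirrors the shape of cfl_subtract_average_*_c, but ignores the exact
   round_offset/num_pel_log2 arithmetic used by libaom. */
static void subtract_average_scalar(const uint16_t *src, int16_t *dst,
                                    int width, int height) {
  int64_t sum = 0;
  for (int r = 0; r < height; ++r)
    for (int c = 0; c < width; ++c) sum += src[r * width + c];
  const int16_t avg = (int16_t)(sum / (width * height));
  for (int r = 0; r < height; ++r)
    for (int c = 0; c < width; ++c)
      dst[r * width + c] = (int16_t)(src[r * width + c] - avg);
}

int main(void) {
  uint16_t src[4 * 4];
  int16_t dst[4 * 4];
  for (int i = 0; i < 16; ++i) src[i] = (uint16_t)(i * 8);
  subtract_average_scalar(src, dst, 4, 4);
  printf("dst[0]=%d dst[15]=%d\n", dst[0], dst[15]);
  return 0;
}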
|
||||
|
|
|
@ -337,8 +337,8 @@ static int32_t get_upscale_convolve_x0(int in_length, int out_length,
|
|||
return (int32_t)((uint32_t)x0 & RS_SCALE_SUBPEL_MASK);
|
||||
}
|
||||
|
||||
static void down2_symeven(const uint8_t *const input, int length,
|
||||
uint8_t *output) {
|
||||
void down2_symeven(const uint8_t *const input, int length, uint8_t *output,
|
||||
int start_offset) {
|
||||
// Actual filter len = 2 * filter_len_half.
|
||||
const int16_t *filter = av1_down2_symeven_half_filter;
|
||||
const int filter_len_half = sizeof(av1_down2_symeven_half_filter) / 2;
|
||||
|
@ -350,7 +350,7 @@ static void down2_symeven(const uint8_t *const input, int length,
|
|||
l2 += (l2 & 1);
|
||||
if (l1 > l2) {
|
||||
// Short input length.
|
||||
for (i = 0; i < length; i += 2) {
|
||||
for (i = start_offset; i < length; i += 2) {
|
||||
int sum = (1 << (FILTER_BITS - 1));
|
||||
for (j = 0; j < filter_len_half; ++j) {
|
||||
sum +=
|
||||
|
@ -362,7 +362,7 @@ static void down2_symeven(const uint8_t *const input, int length,
|
|||
}
|
||||
} else {
|
||||
// Initial part.
|
||||
for (i = 0; i < l1; i += 2) {
|
||||
for (i = start_offset; i < l1; i += 2) {
|
||||
int sum = (1 << (FILTER_BITS - 1));
|
||||
for (j = 0; j < filter_len_half; ++j) {
|
||||
sum += (input[AOMMAX(i - j, 0)] + input[i + 1 + j]) * filter[j];
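Both branches of down2_symeven apply one half of a symmetric even-length filter: for each even position i, the taps pair samples mirrored around the gap between i and i+1, i.e. the sum over j of (input[i - j] + input[i + 1 + j]) * filter[j], with the row ends handled by clamping or by the separate initial/end loops. A hedged scalar restatement follows; the 4-tap half filter used here is made up for illustration, not av1_down2_symeven_half_filter.

#include <stdint.h>
#include <stdio.h>

#define EX_FILTER_BITS 7  /* illustrative; libaom uses FILTER_BITS */

static int clampi(int v, int lo, int hi) { return v < lo ? lo : (v > hi ? hi : v); }

/* Decimate-by-2 with a symmetric 8-tap filter given as its 4-tap half.
   The taps are placeholders; each half sums to 64 so the full filter has
   unity gain at 2^EX_FILTER_BITS. */
static void down2_symeven_scalar(const uint8_t *in, int length, uint8_t *out) {
  static const int16_t half[4] = { 56, 12, -3, -1 };
  for (int i = 0; i < length; i += 2) {
    int sum = 1 << (EX_FILTER_BITS - 1);
    for (int j = 0; j < 4; ++j) {
      const int left = clampi(i - j, 0, length - 1);
      const int right = clampi(i + 1 + j, 0, length - 1);
      sum += (in[left] + in[right]) * half[j];
    }
    sum >>= EX_FILTER_BITS;
    out[i / 2] = (uint8_t)clampi(sum, 0, 255);
  }
}

int main(void) {
  uint8_t in[16], out[8];
  for (int i = 0; i < 16; ++i) in[i] = (uint8_t)(i * 16);
  down2_symeven_scalar(in, 16, out);
  printf("out[0]=%u out[7]=%u\n", out[0], out[7]);
  return 0;
}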
|
||||
|
@ -492,7 +492,7 @@ static void resize_multistep(const uint8_t *const input, int length,
|
|||
if (filteredlength & 1)
|
||||
down2_symodd(in, filteredlength, out);
|
||||
else
|
||||
down2_symeven(in, filteredlength, out);
|
||||
down2_symeven(in, filteredlength, out, 0);
|
||||
filteredlength = proj_filteredlength;
|
||||
}
|
||||
if (filteredlength != olength) {
|
||||
|
@ -521,8 +521,8 @@ static void fill_arr_to_col(uint8_t *img, int stride, int len, uint8_t *arr) {
|
|||
}
|
||||
}
|
||||
|
||||
bool resize_vert_dir_c(uint8_t *intbuf, uint8_t *output, int out_stride,
|
||||
int height, int height2, int width2, int start_col) {
|
||||
bool av1_resize_vert_dir_c(uint8_t *intbuf, uint8_t *output, int out_stride,
|
||||
int height, int height2, int width2, int start_col) {
|
||||
bool mem_status = true;
|
||||
uint8_t *arrbuf = (uint8_t *)aom_malloc(sizeof(*arrbuf) * height);
|
||||
uint8_t *arrbuf2 = (uint8_t *)aom_malloc(sizeof(*arrbuf2) * height2);
|
||||
|
@ -533,7 +533,7 @@ bool resize_vert_dir_c(uint8_t *intbuf, uint8_t *output, int out_stride,
|
|||
|
||||
for (int i = start_col; i < width2; ++i) {
|
||||
fill_col_to_arr(intbuf + i, width2, height, arrbuf);
|
||||
down2_symeven(arrbuf, height, arrbuf2);
|
||||
down2_symeven(arrbuf, height, arrbuf2, 0);
|
||||
fill_arr_to_col(output + i, out_stride, height2, arrbuf2);
|
||||
}
|
||||
|
||||
|
@ -543,10 +543,12 @@ Error:
|
|||
return mem_status;
|
||||
}
|
||||
|
||||
void resize_horz_dir(const uint8_t *const input, int in_stride, uint8_t *intbuf,
|
||||
int height, int filtered_length, int width2) {
|
||||
void av1_resize_horz_dir_c(const uint8_t *const input, int in_stride,
|
||||
uint8_t *intbuf, int height, int filtered_length,
|
||||
int width2) {
|
||||
for (int i = 0; i < height; ++i)
|
||||
down2_symeven(input + in_stride * i, filtered_length, intbuf + width2 * i);
|
||||
down2_symeven(input + in_stride * i, filtered_length, intbuf + width2 * i,
|
||||
0);
|
||||
}
|
||||
|
||||
bool av1_resize_plane_to_half(const uint8_t *const input, int height, int width,
|
||||
|
@ -558,10 +560,10 @@ bool av1_resize_plane_to_half(const uint8_t *const input, int height, int width,
|
|||
}
|
||||
|
||||
// Resize in the horizontal direction
|
||||
resize_horz_dir(input, in_stride, intbuf, height, width, width2);
|
||||
av1_resize_horz_dir(input, in_stride, intbuf, height, width, width2);
|
||||
// Resize in the vertical direction
|
||||
bool mem_status = resize_vert_dir(intbuf, output, out_stride, height, height2,
|
||||
width2, 0 /*start_col*/);
|
||||
bool mem_status = av1_resize_vert_dir(intbuf, output, out_stride, height,
|
||||
height2, width2, 0 /*start_col*/);
|
||||
aom_free(intbuf);
|
||||
return mem_status;
|
||||
}
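av1_resize_plane_to_half above is a two-pass driver: the horizontal pass shrinks every row from width to width2 into the intermediate buffer, then the vertical pass shrinks every column of that buffer from height to height2 into the output. A hedged sketch of the same dataflow, with plain 2:1 averaging standing in for the real symmetric filter, looks like this:

#include <stdbool.h>
#include <stdint.h>
#include <stdlib.h>

/* Half-resolution resize split into a horizontal and a vertical pass.
   Simple pixel averaging is used purely to show the buffer flow; the
   real code runs down2_symeven() in both directions. */
static bool resize_to_half_sketch(const uint8_t *in, int width, int height,
                                  int in_stride, uint8_t *out, int out_stride) {
  const int width2 = width / 2, height2 = height / 2;
  uint8_t *intbuf = (uint8_t *)malloc((size_t)width2 * height);
  if (!intbuf) return false;

  /* Horizontal pass: height rows of width pixels -> height rows of width2. */
  for (int r = 0; r < height; ++r)
    for (int c = 0; c < width2; ++c)
      intbuf[r * width2 + c] = (uint8_t)((in[r * in_stride + 2 * c] +
                                          in[r * in_stride + 2 * c + 1] + 1) >> 1);

  /* Vertical pass: width2 columns of height pixels -> height2 output rows. */
  for (int r = 0; r < height2; ++r)
    for (int c = 0; c < width2; ++c)
      out[r * out_stride + c] = (uint8_t)((intbuf[2 * r * width2 + c] +
                                           intbuf[(2 * r + 1) * width2 + c] + 1) >> 1);

  free(intbuf);
  return true;
}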
|
||||
|
|
|
@ -101,6 +101,9 @@ bool av1_resize_plane_to_half(const uint8_t *const input, int height, int width,
|
|||
int in_stride, uint8_t *output, int height2,
|
||||
int width2, int out_stride);
|
||||
|
||||
void down2_symeven(const uint8_t *const input, int length, uint8_t *output,
|
||||
int start_offset);
|
||||
|
||||
bool should_resize_by_half(int height, int width, int height2, int width2);
|
||||
|
||||
// Returns 1 if a superres upscaled frame is scaled and 0 otherwise.
|
||||
|
|
|
@ -16,6 +16,7 @@
|
|||
#include "aom_dsp/aom_dsp_common.h"
|
||||
#include "aom_dsp/aom_filter.h"
|
||||
#include "aom_dsp/x86/convolve_common_intrin.h"
|
||||
#include "aom_dsp/x86/synonyms.h"
|
||||
#include "av1/common/convolve.h"
|
||||
|
||||
static INLINE void prepare_coeffs(const InterpFilterParams *const filter_params,
|
||||
|
@ -200,31 +201,23 @@ void av1_convolve_y_sr_sse2(const uint8_t *src, int src_stride, uint8_t *dst,
|
|||
if (w <= 4) {
|
||||
__m128i s[8], src6, res, res_round, res16;
|
||||
int res_int;
|
||||
src6 = _mm_cvtsi32_si128(*(int *)(src_ptr + 6 * src_stride));
|
||||
s[0] = _mm_unpacklo_epi8(
|
||||
_mm_cvtsi32_si128(*(int *)(src_ptr + 0 * src_stride)),
|
||||
_mm_cvtsi32_si128(*(int *)(src_ptr + 1 * src_stride)));
|
||||
s[1] = _mm_unpacklo_epi8(
|
||||
_mm_cvtsi32_si128(*(int *)(src_ptr + 1 * src_stride)),
|
||||
_mm_cvtsi32_si128(*(int *)(src_ptr + 2 * src_stride)));
|
||||
s[2] = _mm_unpacklo_epi8(
|
||||
_mm_cvtsi32_si128(*(int *)(src_ptr + 2 * src_stride)),
|
||||
_mm_cvtsi32_si128(*(int *)(src_ptr + 3 * src_stride)));
|
||||
s[3] = _mm_unpacklo_epi8(
|
||||
_mm_cvtsi32_si128(*(int *)(src_ptr + 3 * src_stride)),
|
||||
_mm_cvtsi32_si128(*(int *)(src_ptr + 4 * src_stride)));
|
||||
s[4] = _mm_unpacklo_epi8(
|
||||
_mm_cvtsi32_si128(*(int *)(src_ptr + 4 * src_stride)),
|
||||
_mm_cvtsi32_si128(*(int *)(src_ptr + 5 * src_stride)));
|
||||
s[5] = _mm_unpacklo_epi8(
|
||||
_mm_cvtsi32_si128(*(int *)(src_ptr + 5 * src_stride)), src6);
|
||||
s[0] = _mm_unpacklo_epi8(xx_loadl_32(src_ptr + 0 * src_stride),
|
||||
xx_loadl_32(src_ptr + 1 * src_stride));
|
||||
s[1] = _mm_unpacklo_epi8(xx_loadl_32(src_ptr + 1 * src_stride),
|
||||
xx_loadl_32(src_ptr + 2 * src_stride));
|
||||
s[2] = _mm_unpacklo_epi8(xx_loadl_32(src_ptr + 2 * src_stride),
|
||||
xx_loadl_32(src_ptr + 3 * src_stride));
|
||||
s[3] = _mm_unpacklo_epi8(xx_loadl_32(src_ptr + 3 * src_stride),
|
||||
xx_loadl_32(src_ptr + 4 * src_stride));
|
||||
s[4] = _mm_unpacklo_epi8(xx_loadl_32(src_ptr + 4 * src_stride),
|
||||
xx_loadl_32(src_ptr + 5 * src_stride));
|
||||
src6 = xx_loadl_32(src_ptr + 6 * src_stride);
|
||||
s[5] = _mm_unpacklo_epi8(xx_loadl_32(src_ptr + 5 * src_stride), src6);
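The rewrite above replaces raw *(int *) dereferences with the xx_loadl_32 helper from synonyms.h when pulling four pixels into the low lane of an XMM register. Presumably the helper wraps the same _mm_cvtsi32_si128 idea behind a memcpy, which keeps the load alignment- and aliasing-safe; a sketch of such a helper (an assumption about its shape, not the verbatim libaom definition) is:

#include <emmintrin.h>
#include <string.h>

/* Load 4 bytes from a possibly unaligned address into the low 32 bits of
   an XMM register. memcpy sidesteps the strict-aliasing and alignment
   hazards of casting the pointer to int and dereferencing it. */
static inline __m128i loadl_32_example(const void *a) {
  int val;
  memcpy(&val, a, sizeof(val));
  return _mm_cvtsi32_si128(val);
}

With such a helper, _mm_unpacklo_epi8(loadl_32_example(p), loadl_32_example(p + stride)) interleaves two 4-pixel rows exactly as the rewritten lines do.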
|
||||
|
||||
do {
|
||||
s[6] = _mm_unpacklo_epi8(
|
||||
src6, _mm_cvtsi32_si128(*(int *)(src_ptr + 7 * src_stride)));
|
||||
src6 = _mm_cvtsi32_si128(*(int *)(src_ptr + 8 * src_stride));
|
||||
s[7] = _mm_unpacklo_epi8(
|
||||
_mm_cvtsi32_si128(*(int *)(src_ptr + 7 * src_stride)), src6);
|
||||
s[6] = _mm_unpacklo_epi8(src6, xx_loadl_32(src_ptr + 7 * src_stride));
|
||||
src6 = xx_loadl_32(src_ptr + 8 * src_stride);
|
||||
s[7] = _mm_unpacklo_epi8(xx_loadl_32(src_ptr + 7 * src_stride), src6);
|
||||
|
||||
res = convolve_lo_y(s + 0, coeffs);
|
||||
res_round = _mm_sra_epi32(_mm_add_epi32(res, round_const), round_shift);
|
||||
|
|
|
@ -15,6 +15,7 @@
|
|||
|
||||
#include "aom_dsp/aom_filter.h"
|
||||
#include "aom_dsp/x86/convolve_sse2.h"
|
||||
#include "aom_dsp/x86/synonyms.h"
|
||||
|
||||
void av1_dist_wtd_convolve_x_sse2(const uint8_t *src, int src_stride,
|
||||
uint8_t *dst0, int dst_stride0, int w, int h,
|
||||
|
@ -178,31 +179,23 @@ void av1_dist_wtd_convolve_y_sse2(const uint8_t *src, int src_stride,
|
|||
|
||||
if (w == 4) {
|
||||
__m128i s[8], src6, res, res_shift;
|
||||
src6 = _mm_cvtsi32_si128(*(int *)(src_ptr + 6 * src_stride));
|
||||
s[0] = _mm_unpacklo_epi8(
|
||||
_mm_cvtsi32_si128(*(int *)(src_ptr + 0 * src_stride)),
|
||||
_mm_cvtsi32_si128(*(int *)(src_ptr + 1 * src_stride)));
|
||||
s[1] = _mm_unpacklo_epi8(
|
||||
_mm_cvtsi32_si128(*(int *)(src_ptr + 1 * src_stride)),
|
||||
_mm_cvtsi32_si128(*(int *)(src_ptr + 2 * src_stride)));
|
||||
s[2] = _mm_unpacklo_epi8(
|
||||
_mm_cvtsi32_si128(*(int *)(src_ptr + 2 * src_stride)),
|
||||
_mm_cvtsi32_si128(*(int *)(src_ptr + 3 * src_stride)));
|
||||
s[3] = _mm_unpacklo_epi8(
|
||||
_mm_cvtsi32_si128(*(int *)(src_ptr + 3 * src_stride)),
|
||||
_mm_cvtsi32_si128(*(int *)(src_ptr + 4 * src_stride)));
|
||||
s[4] = _mm_unpacklo_epi8(
|
||||
_mm_cvtsi32_si128(*(int *)(src_ptr + 4 * src_stride)),
|
||||
_mm_cvtsi32_si128(*(int *)(src_ptr + 5 * src_stride)));
|
||||
s[5] = _mm_unpacklo_epi8(
|
||||
_mm_cvtsi32_si128(*(int *)(src_ptr + 5 * src_stride)), src6);
|
||||
s[0] = _mm_unpacklo_epi8(xx_loadl_32(src_ptr + 0 * src_stride),
|
||||
xx_loadl_32(src_ptr + 1 * src_stride));
|
||||
s[1] = _mm_unpacklo_epi8(xx_loadl_32(src_ptr + 1 * src_stride),
|
||||
xx_loadl_32(src_ptr + 2 * src_stride));
|
||||
s[2] = _mm_unpacklo_epi8(xx_loadl_32(src_ptr + 2 * src_stride),
|
||||
xx_loadl_32(src_ptr + 3 * src_stride));
|
||||
s[3] = _mm_unpacklo_epi8(xx_loadl_32(src_ptr + 3 * src_stride),
|
||||
xx_loadl_32(src_ptr + 4 * src_stride));
|
||||
s[4] = _mm_unpacklo_epi8(xx_loadl_32(src_ptr + 4 * src_stride),
|
||||
xx_loadl_32(src_ptr + 5 * src_stride));
|
||||
src6 = xx_loadl_32(src_ptr + 6 * src_stride);
|
||||
s[5] = _mm_unpacklo_epi8(xx_loadl_32(src_ptr + 5 * src_stride), src6);
|
||||
|
||||
do {
|
||||
s[6] = _mm_unpacklo_epi8(
|
||||
src6, _mm_cvtsi32_si128(*(int *)(src_ptr + 7 * src_stride)));
|
||||
src6 = _mm_cvtsi32_si128(*(int *)(src_ptr + 8 * src_stride));
|
||||
s[7] = _mm_unpacklo_epi8(
|
||||
_mm_cvtsi32_si128(*(int *)(src_ptr + 7 * src_stride)), src6);
|
||||
s[6] = _mm_unpacklo_epi8(src6, xx_loadl_32(src_ptr + 7 * src_stride));
|
||||
src6 = xx_loadl_32(src_ptr + 8 * src_stride);
|
||||
s[7] = _mm_unpacklo_epi8(xx_loadl_32(src_ptr + 7 * src_stride), src6);
|
||||
|
||||
res = convolve_lo_y(s + 0, coeffs);
|
||||
res_shift = _mm_sll_epi32(res, left_shift);
|
||||
|
|
|
@ -576,7 +576,7 @@ void av1_build_compound_diffwtd_mask_highbd_avx2(
|
|||
}
|
||||
}
|
||||
} else {
|
||||
const __m128i xshift = xx_set1_64_from_32i(bd - 8 + DIFF_FACTOR_LOG2);
|
||||
const __m128i xshift = _mm_set1_epi64x(bd - 8 + DIFF_FACTOR_LOG2);
|
||||
if (mask_type == DIFFWTD_38_INV) {
|
||||
for (int i = 0; i < h; ++i) {
|
||||
for (int j = 0; j < w; j += 16) {
|
||||
|
|
|
@ -76,7 +76,7 @@ void av1_build_compound_diffwtd_mask_highbd_ssse3(
|
|||
}
|
||||
}
|
||||
} else {
|
||||
const __m128i xshift = xx_set1_64_from_32i(bd - 8 + DIFF_FACTOR_LOG2);
|
||||
const __m128i xshift = _mm_set1_epi64x(bd - 8 + DIFF_FACTOR_LOG2);
|
||||
if (mask_type == DIFFWTD_38_INV) {
|
||||
for (int i = 0; i < h; ++i) {
|
||||
for (int j = 0; j < w; j += 8) {
|
||||
|
|
|
@ -41,7 +41,7 @@
|
|||
s[8] = _mm256_unpackhi_epi8(s68, s79); \
|
||||
\
|
||||
__m256i res_out[2] = { 0 }; \
|
||||
resize_y_convolve(s, coeffs_y, res_out); \
|
||||
resize_convolve(s, coeffs_y, res_out); \
|
||||
\
|
||||
/* r00... r07 */ \
|
||||
__m256i res_a_round_1 = _mm256_add_epi32(res_out[0], round_const_bits); \
|
||||
|
@ -52,7 +52,7 @@
|
|||
res_a_round_2 = _mm256_sra_epi32(res_a_round_2, round_shift_bits); \
|
||||
\
|
||||
__m256i res_out_b[2] = { 0 }; \
|
||||
resize_y_convolve(s + 5, coeffs_y, res_out_b); \
|
||||
resize_convolve(s + 5, coeffs_y, res_out_b); \
|
||||
\
|
||||
/* r08... r015 */ \
|
||||
__m256i res_b_round_1 = _mm256_add_epi32(res_out_b[0], round_const_bits); \
|
||||
|
@ -91,7 +91,7 @@
|
|||
s[3] = _mm256_permute2x128_si256(CAST_HI(s67), CAST_HI(s89), 0x20); \
|
||||
\
|
||||
__m256i res_out[2] = { 0 }; \
|
||||
resize_y_convolve(s, coeffs_y, res_out); \
|
||||
resize_convolve(s, coeffs_y, res_out); \
|
||||
\
|
||||
/* r00... r07 */ \
|
||||
__m256i res_a_round_1 = _mm256_add_epi32(res_out[0], round_const_bits); \
|
||||
|
@ -108,9 +108,107 @@
|
|||
res_a_round_1 = _mm256_min_epu8(res_a_round_1, clip_pixel); \
|
||||
res_a_round_1 = _mm256_max_epu8(res_a_round_1, zero);
|
||||
|
||||
static INLINE void resize_y_convolve(const __m256i *const s,
|
||||
const __m256i *const coeffs,
|
||||
__m256i *res_out) {
|
||||
#define PROCESS_RESIZE_X_WD32 \
|
||||
/* a0 a1 ..... a30 a31 */ \
|
||||
__m256i row0 = _mm256_loadu_si256( \
|
||||
(__m256i *)&input[i * in_stride + j - filter_offset]); \
|
||||
/* b0 b1 ..... b30 b31 */ \
|
||||
__m256i row1 = _mm256_loadu_si256( \
|
||||
(__m256i *)&input[(i + 1) * in_stride + j - filter_offset]); \
|
||||
/* a0 .... a15 || b0.... b15 */ \
|
||||
__m256i r0 = _mm256_permute2x128_si256(row0, row1, 0x20); \
|
||||
/* a16 .... a31 || b16 .... b31 */ \
|
||||
__m256i r1 = _mm256_permute2x128_si256(row0, row1, 0x31); \
|
||||
filter_offset = 3; \
|
||||
\
|
||||
/* Pad start pixels to the left, while processing the first pixels in the \
|
||||
row. */ \
|
||||
if (j == 0) { \
|
||||
/* a0 a0 a0 a0 .... a12 || b0 b0 b0 b0 .... b12 */ \
|
||||
row0 = _mm256_shuffle_epi8(r0, wd32_start_pad_mask); \
|
||||
/* a13 a14 a15 a16.....a28 || b13 b14 b15 b16.....b28 */ \
|
||||
row1 = _mm256_alignr_epi8(r1, r0, 13); \
|
||||
r0 = row0; \
|
||||
r1 = row1; \
|
||||
} \
|
||||
\
|
||||
/* a29 a30 a31 a32 a33 a34 a35 a36 0 0 ....*/ \
|
||||
__m128i row0_0 = _mm_loadl_epi64( \
|
||||
(__m128i *)&input[i * in_stride + 32 + j - filter_offset]); \
|
||||
/* b29 b30 b31 b32 b33 b34 b35 b36 0 0 .... */ \
|
||||
__m128i row1_0 = _mm_loadl_epi64( \
|
||||
(__m128i *)&input[(i + 1) * in_stride + 32 + j - filter_offset]); \
|
||||
__m256i r2 = _mm256_permute2x128_si256( \
|
||||
_mm256_castsi128_si256(row0_0), _mm256_castsi128_si256(row1_0), 0x20); \
|
||||
\
|
||||
/* Pad end pixels to the right, while processing the last pixels in the \
|
||||
row. */ \
|
||||
const int is_last_cols32 = (j + 32 == filtered_length); \
|
||||
if (is_last_cols32) { \
|
||||
r2 = _mm256_shuffle_epi8(r2, wd32_end_pad_mask); \
|
||||
} \
|
||||
\
|
||||
/* Process even pixels of the first row */ \
|
||||
/* a0 a0 a0 a0 a1 a2 .... a12 | b0 b0 b0 b0 b1 b2 .... b12 */ \
|
||||
s0[0] = _mm256_alignr_epi8(r1, r0, 0); \
|
||||
/* a0 a0 a1 a2 a3 a4 .... a14 | b0 b0 b1 b2 b3 b4 .... b14 */ \
|
||||
s0[1] = _mm256_alignr_epi8(r1, r0, 2); \
|
||||
/* a1 a2 a3 a4 a5 a6 .... a16 | b1 b2 b3 b4 b5 b6 .... b16 */ \
|
||||
s0[2] = _mm256_alignr_epi8(r1, r0, 4); \
|
||||
/* a3 a4 a5 a6 a7 a8 .... a18 | b3 b4 b5 b6 b7 b8 .... b18 */ \
|
||||
s0[3] = _mm256_alignr_epi8(r1, r0, 6); \
|
||||
\
|
||||
/* Process even pixels of the second row */ \
|
||||
/* a13 a14 a15 a16 ..... a28 | b13 b14 b15 b16 ..... b28 */ \
|
||||
s1[0] = _mm256_alignr_epi8(r2, r1, 0); \
|
||||
/* a15 a16 a17 a18 ..... a30 | b15 b16 b17 b18 ..... b30 */ \
|
||||
s1[1] = _mm256_alignr_epi8(r2, r1, 2); \
|
||||
/* a17 a18 a19 a20 ..... a32 | b17 b18 b19 b20 ..... b32 */ \
|
||||
s1[2] = _mm256_alignr_epi8(r2, r1, 4); \
|
||||
/* a19 a20 a21 a22 ..... a34 | b19 b20 b21 b22 ..... b34 */ \
|
||||
s1[3] = _mm256_alignr_epi8(r2, r1, 6); \
|
||||
\
|
||||
/* The register res_out_0 stores the result of start-16 pixels corresponding \
|
||||
to the first and second rows whereas res_out_1 stores the end-16 pixels. */ \
|
||||
__m256i res_out_0[2], res_out_1[2]; \
|
||||
res_out_1[0] = res_out_1[1] = zero; \
|
||||
res_out_0[0] = res_out_0[1] = zero; \
|
||||
resize_convolve(s0, coeffs_x, res_out_0); \
|
||||
resize_convolve(s1, coeffs_x, res_out_1); \
|
||||
\
|
||||
/* Result of 32 pixels of row0 (a0 to a32) */ \
|
||||
res_out_0[0] = _mm256_sra_epi32( \
|
||||
_mm256_add_epi32(res_out_0[0], round_const_bits), round_shift_bits); \
|
||||
res_out_1[0] = _mm256_sra_epi32( \
|
||||
_mm256_add_epi32(res_out_1[0], round_const_bits), round_shift_bits); \
|
||||
/* r00-r03 r08-r011 | r04-r07 r012-r015 */ \
|
||||
__m256i res_out_r0 = _mm256_packus_epi32(res_out_0[0], res_out_1[0]); \
|
||||
\
|
||||
/* result of 32 pixels of row1 (b0 to b32) */ \
|
||||
res_out_0[1] = _mm256_sra_epi32( \
|
||||
_mm256_add_epi32(res_out_0[1], round_const_bits), round_shift_bits); \
|
||||
res_out_1[1] = _mm256_sra_epi32( \
|
||||
_mm256_add_epi32(res_out_1[1], round_const_bits), round_shift_bits); \
|
||||
/* r10-r13 r18-r111 | r14-r17 r112-r115 */ \
|
||||
__m256i res_out_r1 = _mm256_packus_epi32(res_out_0[1], res_out_1[1]); \
|
||||
\
|
||||
/* Convert the result from 16bit to 8bit */ \
|
||||
/* r00-r03 r08-r011 r10-r13 r18-r111 | r04-r07 r012-r015 r14-r17 r112-r115 \
|
||||
*/ \
|
||||
__m256i res_out_r01 = _mm256_packus_epi16(res_out_r0, res_out_r1); \
|
||||
__m256i res_out_row01 = _mm256_min_epu8(res_out_r01, clip_pixel); \
|
||||
res_out_row01 = _mm256_max_epu8(res_out_r01, zero); \
|
||||
__m128i low_128 = CAST_LOW(res_out_row01); \
|
||||
__m128i high_128 = _mm256_extracti128_si256(res_out_row01, 1); \
|
||||
\
|
||||
_mm_storeu_si128((__m128i *)&intbuf[i * dst_stride + j / 2], \
|
||||
_mm_unpacklo_epi32(low_128, high_128)); \
|
||||
_mm_storeu_si128((__m128i *)&intbuf[(i + 1) * dst_stride + j / 2], \
|
||||
_mm_unpackhi_epi32(low_128, high_128));
|
||||
|
||||
static INLINE void resize_convolve(const __m256i *const s,
|
||||
const __m256i *const coeffs,
|
||||
__m256i *res_out) {
|
||||
const __m256i res_0 = _mm256_maddubs_epi16(s[0], coeffs[0]);
|
||||
const __m256i res_1 = _mm256_maddubs_epi16(s[1], coeffs[1]);
|
||||
const __m256i res_2 = _mm256_maddubs_epi16(s[2], coeffs[2]);
|
||||
|
@ -152,8 +250,9 @@ static INLINE void prepare_filter_coeffs(const int16_t *filter,
|
|||
coeffs[1] = _mm256_broadcastw_epi16(_mm_bsrli_si128(filter_8bit, 4));
|
||||
}
|
||||
|
||||
bool resize_vert_dir_avx2(uint8_t *intbuf, uint8_t *output, int out_stride,
|
||||
int height, int height2, int stride, int start_col) {
|
||||
bool av1_resize_vert_dir_avx2(uint8_t *intbuf, uint8_t *output, int out_stride,
|
||||
int height, int height2, int stride,
|
||||
int start_col) {
|
||||
assert(start_col <= stride);
|
||||
// For the GM tool, the input layer height or width is assured to be an even
|
||||
// number. Hence the function 'down2_symodd()' is not invoked and SIMD
|
||||
|
@ -164,8 +263,8 @@ bool resize_vert_dir_avx2(uint8_t *intbuf, uint8_t *output, int out_stride,
|
|||
// eliminate the need for conditional statements within the subsequent SIMD
|
||||
// code to manage these cases.
|
||||
if (height & 1 || height < 8) {
|
||||
return resize_vert_dir_c(intbuf, output, out_stride, height, height2,
|
||||
stride, start_col);
|
||||
return av1_resize_vert_dir_c(intbuf, output, out_stride, height, height2,
|
||||
stride, start_col);
|
||||
}
|
||||
|
||||
__m256i s[10], coeffs_y[4];
|
||||
|
@ -174,7 +273,7 @@ bool resize_vert_dir_avx2(uint8_t *intbuf, uint8_t *output, int out_stride,
|
|||
const __m128i round_shift_bits = _mm_cvtsi32_si128(bits);
|
||||
const __m256i round_const_bits = _mm256_set1_epi32((1 << bits) >> 1);
|
||||
const uint8_t max_pixel = 255;
|
||||
const __m256i clip_pixel = _mm256_set1_epi8(max_pixel);
|
||||
const __m256i clip_pixel = _mm256_set1_epi8((char)max_pixel);
|
||||
const __m256i zero = _mm256_setzero_si256();
|
||||
|
||||
prepare_filter_coeffs(av1_down2_symeven_half_filter, coeffs_y);
|
||||
|
@ -404,8 +503,212 @@ bool resize_vert_dir_avx2(uint8_t *intbuf, uint8_t *output, int out_stride,
|
|||
}
|
||||
|
||||
if (remain_col)
|
||||
return resize_vert_dir_c(intbuf, output, out_stride, height, height2,
|
||||
stride, stride - remain_col);
|
||||
return av1_resize_vert_dir_c(intbuf, output, out_stride, height, height2,
|
||||
stride, stride - remain_col);
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
// Masks used for width 32 and 8 pixels, with left and right padding
|
||||
// requirements
|
||||
static const uint8_t wd32_left_padding_mask[32] = { 0, 0, 0, 0, 1, 2, 3, 4,
|
||||
5, 6, 7, 8, 9, 10, 11, 12,
|
||||
0, 0, 0, 0, 1, 2, 3, 4,
|
||||
5, 6, 7, 8, 9, 10, 11, 12 };
|
||||
|
||||
static const uint8_t wd32_right_padding_mask[32] = { 0, 1, 2, 2, 2, 2, 2, 2,
|
||||
2, 2, 2, 2, 2, 2, 2, 2,
|
||||
0, 1, 2, 2, 2, 2, 2, 2,
|
||||
2, 2, 2, 2, 2, 2, 2, 2 };
|
||||
|
||||
static const uint8_t wd8_right_padding_mask[32] = {
|
||||
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 10, 10, 10, 10, 10,
|
||||
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 10, 10, 10, 10, 10
|
||||
};
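These byte-shuffle masks replicate a row's first pixel into the lanes to its left and its last pixel into the lanes to its right, so the filter never reads outside the row. The scalar equivalent of that edge handling is to filter from a padded copy of the row, as in this hedged sketch; the pad widths depend on the filter reach and are left as parameters rather than asserting the exact values the masks encode.

#include <stdint.h>
#include <string.h>

/* Make a padded copy of one row so the downsampling filter can read a few
   samples beyond both row ends without bounds checks; this is the scalar
   counterpart of what the shuffle masks above do inside vector registers. */
static void pad_row_replicate(const uint8_t *row, int length,
                              uint8_t *padded, int pad_left, int pad_right) {
  memset(padded, row[0], (size_t)pad_left);
  memcpy(padded + pad_left, row, (size_t)length);
  memset(padded + pad_left + length, row[length - 1], (size_t)pad_right);
}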
|
||||
|
||||
void av1_resize_horz_dir_avx2(const uint8_t *const input, int in_stride,
|
||||
uint8_t *intbuf, int height, int filtered_length,
|
||||
int width2) {
|
||||
assert(height % 2 == 0);
|
||||
// Invoke C for width less than 32.
|
||||
// TODO(https://crbug.com/aomedia/3575): Use sse2 after SSE2/AV1ResizeXTest
|
||||
// passes under 32-bit valgrind.
|
||||
if (filtered_length < 32) {
|
||||
av1_resize_horz_dir_c(input, in_stride, intbuf, height, filtered_length,
|
||||
width2);
|
||||
return;
|
||||
}
|
||||
|
||||
const int filt_length = sizeof(av1_down2_symeven_half_filter);
|
||||
assert(filt_length % 2 == 0);
|
||||
(void)filt_length;
|
||||
|
||||
__m256i s0[4], s1[4], coeffs_x[4];
|
||||
|
||||
const int bits = FILTER_BITS;
|
||||
const int dst_stride = width2;
|
||||
const __m128i round_shift_bits = _mm_cvtsi32_si128(bits);
|
||||
const __m256i round_const_bits = _mm256_set1_epi32((1 << bits) >> 1);
|
||||
|
||||
const uint8_t max_pixel = 255;
|
||||
const __m256i clip_pixel = _mm256_set1_epi8((char)max_pixel);
|
||||
const __m256i zero = _mm256_setzero_si256();
|
||||
|
||||
const __m256i wd32_start_pad_mask =
|
||||
_mm256_loadu_si256((__m256i *)wd32_left_padding_mask);
|
||||
const __m256i wd32_end_pad_mask =
|
||||
_mm256_loadu_si256((__m256i *)wd32_right_padding_mask);
|
||||
const __m256i wd8_end_pad_mask =
|
||||
_mm256_loadu_si256((__m256i *)wd8_right_padding_mask);
|
||||
prepare_filter_coeffs(av1_down2_symeven_half_filter, coeffs_x);
|
||||
|
||||
// The core horizontal SIMD processes 32 input pixels of 2 rows simultaneously
|
||||
// to generate output corresponding to 2 rows. To streamline the core loop and
|
||||
// eliminate the need for conditional checks, the remaining columns (16 or 8)
|
||||
// are processed separately.
|
||||
if (filtered_length % 32 == 0) {
|
||||
for (int i = 0; i < height; i += 2) {
|
||||
int filter_offset = 0;
|
||||
for (int j = 0; j < filtered_length; j += 32) {
|
||||
PROCESS_RESIZE_X_WD32
|
||||
}
|
||||
}
|
||||
} else {
|
||||
for (int i = 0; i < height; i += 2) {
|
||||
int filter_offset = 0;
|
||||
int remain_col = filtered_length % 32;
|
||||
for (int j = 0; j + 32 <= filtered_length; j += 32) {
|
||||
PROCESS_RESIZE_X_WD32
|
||||
}
|
||||
|
||||
int wd_processed = filtered_length - remain_col;
|
||||
if (remain_col > 15) {
|
||||
remain_col = filtered_length % 16;
|
||||
const int in_idx = i * in_stride + wd_processed - filter_offset;
|
||||
const int out_idx = (i * dst_stride) + wd_processed / 2;
|
||||
// a0 a1 --- a15
|
||||
__m128i row0 = _mm_loadu_si128((__m128i *)&input[in_idx]);
|
||||
// b0 b1 --- b15
|
||||
__m128i row1 = _mm_loadu_si128((__m128i *)&input[in_idx + in_stride]);
|
||||
// a0 a1 --- a15 || b0 b1 --- b15
|
||||
__m256i r0 =
|
||||
_mm256_permute2x128_si256(CAST_HI(row0), CAST_HI(row1), 0x20);
|
||||
|
||||
// a16 a17 --- a23
|
||||
row0 = _mm_loadl_epi64((__m128i *)&input[in_idx + 16]);
|
||||
// b16 b17 --- b23
|
||||
row1 = _mm_loadl_epi64((__m128i *)&input[in_idx + 16 + in_stride]);
|
||||
|
||||
// a16-a23 x x x x| b16-b23 x x x x
|
||||
__m256i r1 =
|
||||
_mm256_permute2x128_si256(CAST_HI(row0), CAST_HI(row1), 0x20);
|
||||
|
||||
// Pad end pixels to the right, while processing the last pixels in the
|
||||
// row.
|
||||
const int is_last_cols16 = wd_processed + 16 == filtered_length;
|
||||
if (is_last_cols16) {
|
||||
r1 = _mm256_shuffle_epi8(r1, wd32_end_pad_mask);
|
||||
}
|
||||
|
||||
// a0 a1 --- a15 || b0 b1 --- b15
|
||||
s0[0] = r0;
|
||||
// a2 a3 --- a17 || b2 b3 --- b17
|
||||
s0[1] = _mm256_alignr_epi8(r1, r0, 2);
|
||||
// a4 a5 --- a19 || b4 b5 --- b19
|
||||
s0[2] = _mm256_alignr_epi8(r1, r0, 4);
|
||||
// a6 a7 --- a21 || b6 b7 --- b21
|
||||
s0[3] = _mm256_alignr_epi8(r1, r0, 6);
|
||||
|
||||
// result for 16 pixels (a0 to a15) of row0 and row1
|
||||
__m256i res_out_0[2];
|
||||
res_out_0[0] = res_out_0[1] = zero;
|
||||
resize_convolve(s0, coeffs_x, res_out_0);
|
||||
|
||||
// r00 -r07
|
||||
res_out_0[0] = _mm256_sra_epi32(
|
||||
_mm256_add_epi32(res_out_0[0], round_const_bits), round_shift_bits);
|
||||
// r10-r17
|
||||
res_out_0[1] = _mm256_sra_epi32(
|
||||
_mm256_add_epi32(res_out_0[1], round_const_bits), round_shift_bits);
|
||||
// r00-r03 r10-r13 r04-r07 r14-r17
|
||||
__m256i res_out_row01 = _mm256_packus_epi32(res_out_0[0], res_out_0[1]);
|
||||
// r00-r03 r10-r13 r00-r03 r10-r13 | r04-r07 r14-r17 r04-r07 r14-r17
|
||||
res_out_row01 = _mm256_packus_epi16(res_out_row01, res_out_row01);
|
||||
res_out_row01 = _mm256_min_epu8(res_out_row01, clip_pixel);
|
||||
res_out_row01 = _mm256_max_epu8(res_out_row01, zero);
|
||||
// r00-r03 r10-r13 r04-r07 r14-r17
|
||||
__m128i low_result =
|
||||
CAST_LOW(_mm256_permute4x64_epi64(res_out_row01, 0xd8));
|
||||
// r00-r03 r04-r07 r10-r13 r14-r17
|
||||
low_result = _mm_shuffle_epi32(low_result, 0xd8);
|
||||
|
||||
_mm_storel_epi64((__m128i *)&intbuf[out_idx], low_result);
|
||||
_mm_storel_epi64((__m128i *)&intbuf[out_idx + dst_stride],
|
||||
_mm_unpackhi_epi64(low_result, low_result));
|
||||
}
|
||||
|
||||
wd_processed = filtered_length - remain_col;
|
||||
if (remain_col > 7) {
|
||||
remain_col = filtered_length % 8;
|
||||
const int in_idx = i * in_stride + wd_processed - filter_offset;
|
||||
const int out_idx = (i * dst_stride) + wd_processed / 2;
|
||||
// a0 a1 --- a15
|
||||
__m128i row0 = _mm_loadu_si128((__m128i *)&input[in_idx]);
|
||||
// b0 b1 --- b15
|
||||
__m128i row1 = _mm_loadu_si128((__m128i *)&input[in_idx + in_stride]);
|
||||
// a0 a1 --- a15 || b0 b1 --- b15
|
||||
__m256i r0 =
|
||||
_mm256_permute2x128_si256(CAST_HI(row0), CAST_HI(row1), 0x20);
|
||||
|
||||
// Pad end pixels to the right, while processing the last pixels in the
|
||||
// row.
|
||||
const int is_last_cols_8 = wd_processed + 8 == filtered_length;
|
||||
if (is_last_cols_8) r0 = _mm256_shuffle_epi8(r0, wd8_end_pad_mask);
|
||||
|
||||
// a0 a1 a2 a3 a4 a5 a6 a7 | b0 b1 b2 b3 b4 b5 b6 b7
|
||||
s0[0] = r0;
|
||||
// a2 a3 a4 a5 a6 a7 a8 a9 | b2 b3 b4 b5 b6 b7 b8 b9
|
||||
s0[1] = _mm256_bsrli_epi128(r0, 2);
|
||||
// a4 a5 a6 a7 a8 a9 a10 a10 | b4 b5 b6 b7 b8 b9 b10 b10
|
||||
s0[2] = _mm256_bsrli_epi128(r0, 4);
|
||||
// a6 a7 a8 a9 a10 a10 a10 a10 | b6 b7 b8 b9 b10 b10 b10 b10
|
||||
s0[3] = _mm256_bsrli_epi128(r0, 6);
|
||||
__m256i res_out_0[2];
|
||||
res_out_0[0] = res_out_0[1] = zero;
|
||||
resize_convolve(s0, coeffs_x, res_out_0);
|
||||
|
||||
// r00 - r03 | r10 - r13
|
||||
__m256i res_out =
|
||||
_mm256_permute2x128_si256(res_out_0[0], res_out_0[1], 0x20);
|
||||
// r00 - r03 | r10 - r13
|
||||
res_out = _mm256_sra_epi32(_mm256_add_epi32(res_out, round_const_bits),
|
||||
round_shift_bits);
|
||||
// r00-r03 r00-r03 r10-r13 r10-r13
|
||||
__m256i res_out_row01 = _mm256_packus_epi32(res_out, res_out);
|
||||
// r00-r03 r00-r03 r00-r03 r00-r03 r10-r13 r10-r13 r10-r13 r10-r13
|
||||
res_out_row01 = _mm256_packus_epi16(res_out_row01, res_out_row01);
|
||||
res_out_row01 = _mm256_min_epu8(res_out_row01, clip_pixel);
|
||||
res_out_row01 = _mm256_max_epu8(res_out_row01, zero);
|
||||
|
||||
xx_storel_32(intbuf + out_idx, CAST_LOW(res_out_row01));
|
||||
xx_storel_32(intbuf + out_idx + dst_stride,
|
||||
_mm256_extracti128_si256(res_out_row01, 1));
|
||||
}
|
||||
|
||||
wd_processed = filtered_length - remain_col;
|
||||
// When the remaining width is 2, the above code would not have taken
|
||||
// care of padding required for (filtered_length - 4)th pixel. Hence,
|
||||
// process that pixel again with the C code.
|
||||
wd_processed = (remain_col == 2) ? wd_processed - 2 : wd_processed;
|
||||
if (remain_col) {
|
||||
const int in_idx = (in_stride * i);
|
||||
const int out_idx = (wd_processed / 2) + width2 * i;
|
||||
|
||||
down2_symeven(input + in_idx, filtered_length, intbuf + out_idx,
|
||||
wd_processed);
|
||||
down2_symeven(input + in_idx + in_stride, filtered_length,
|
||||
intbuf + out_idx + width2, wd_processed);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -0,0 +1,333 @@
|
|||
/*
|
||||
* Copyright (c) 2024, Alliance for Open Media. All rights reserved
|
||||
*
|
||||
* This source code is subject to the terms of the BSD 2 Clause License and
|
||||
* the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
|
||||
* was not distributed with this source code in the LICENSE file, you can
|
||||
* obtain it at www.aomedia.org/license/software. If the Alliance for Open
|
||||
* Media Patent License 1.0 was not distributed with this source code in the
|
||||
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
|
||||
*/
|
||||
#include <immintrin.h>
|
||||
|
||||
#include "config/av1_rtcd.h"
|
||||
|
||||
#include "av1/common/resize.h"
|
||||
|
||||
#include "aom_dsp/x86/synonyms.h"
|
||||
|
||||
#define PROCESS_RESIZE_Y_WD8 \
|
||||
/* ah0 ah1 ... ah7 */ \
|
||||
const __m128i AH = _mm_add_epi16(l0, l7); \
|
||||
/* bg0 bg1 ... bg7 */ \
|
||||
const __m128i BG = _mm_add_epi16(l1, l6); \
|
||||
/* cf0 cf1 ... cf7 */ \
|
||||
const __m128i CF = _mm_add_epi16(l2, l5); \
|
||||
/* de0 de1 ... de7 */ \
|
||||
const __m128i DE = _mm_add_epi16(l3, l4); \
|
||||
\
|
||||
/* ah0 bg0 ... ah3 bg3 */ \
|
||||
const __m128i AHBG_low = _mm_unpacklo_epi16(AH, BG); \
|
||||
/*cf0 de0 ... cf2 de2 */ \
|
||||
const __m128i CFDE_low = _mm_unpacklo_epi16(CF, DE); \
|
||||
\
|
||||
/* ah4 bg4... ah7 bg7 */ \
|
||||
const __m128i AHBG_hi = _mm_unpackhi_epi16(AH, BG); \
|
||||
/* cf4 de4... cf7 de7 */ \
|
||||
const __m128i CFDE_hi = _mm_unpackhi_epi16(CF, DE); \
|
||||
\
|
||||
/* r00 r01 r02 r03 */ \
|
||||
const __m128i r00 = _mm_madd_epi16(AHBG_low, coeffs_y[0]); \
|
||||
const __m128i r01 = _mm_madd_epi16(CFDE_low, coeffs_y[1]); \
|
||||
__m128i r0 = _mm_add_epi32(r00, r01); \
|
||||
/* r04 r05 r06 r07 */ \
|
||||
const __m128i r10 = _mm_madd_epi16(AHBG_hi, coeffs_y[0]); \
|
||||
const __m128i r11 = _mm_madd_epi16(CFDE_hi, coeffs_y[1]); \
|
||||
__m128i r1 = _mm_add_epi32(r10, r11); \
|
||||
\
|
||||
r0 = _mm_add_epi32(r0, round_const_bits); \
|
||||
r1 = _mm_add_epi32(r1, round_const_bits); \
|
||||
r0 = _mm_sra_epi32(r0, round_shift_bits); \
|
||||
r1 = _mm_sra_epi32(r1, round_shift_bits); \
|
||||
\
|
||||
/* r00 ... r07 (8 values of each 16bit) */ \
|
||||
const __m128i res_16b = _mm_packs_epi32(r0, r1); \
|
||||
/* r00 ... r07 | r00 ... r07 (16 values of each 8bit) */ \
|
||||
const __m128i res_8b0 = _mm_packus_epi16(res_16b, res_16b); \
|
||||
\
|
||||
__m128i res = _mm_min_epu8(res_8b0, clip_pixel); \
|
||||
res = _mm_max_epu8(res, zero); \
|
||||
_mm_storel_epi64((__m128i *)&output[(i / 2) * out_stride + j], res); \
|
||||
\
|
||||
l0 = l2; \
|
||||
l1 = l3; \
|
||||
l2 = l4; \
|
||||
l3 = l5; \
|
||||
l4 = l6; \
|
||||
l5 = l7; \
|
||||
data += 2 * stride;
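PROCESS_RESIZE_Y_WD8 exploits the filter's symmetry: with eight vertically adjacent samples a..h and the 4-tap half filter f0..f3, one output sample is f0*(d+e) + f1*(c+f) + f2*(b+g) + f3*(a+h), which is what the AH/BG/CF/DE sums and the two madd steps compute per lane, halving the multiply count. A scalar restatement of one lane follows; the filter pointer and bit precision are placeholders for av1_down2_symeven_half_filter and FILTER_BITS.

#include <stdint.h>

/* One output sample of the vertical 2:1 decimation, computed by folding
   the eight input rows into four symmetric pairs. */
static uint8_t fold_symmetric_pairs(int16_t a, int16_t b, int16_t c, int16_t d,
                                    int16_t e, int16_t f, int16_t g, int16_t h,
                                    const int16_t *filter, int bits) {
  int sum = 1 << (bits - 1);
  sum += (d + e) * filter[0] + (c + f) * filter[1] +
         (b + g) * filter[2] + (a + h) * filter[3];
  sum >>= bits;
  if (sum < 0) sum = 0;
  if (sum > 255) sum = 255;
  return (uint8_t)sum;
}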
|
||||
|
||||
static INLINE void prepare_filter_coeffs(const int16_t *filter,
|
||||
__m128i *const coeffs /* [2] */) {
|
||||
// f0 f1 f2 f3 x x x x
|
||||
const __m128i sym_even_filter = _mm_loadl_epi64((__m128i *)filter);
|
||||
|
||||
// f1 f0 f3 f2 x x x x
|
||||
const __m128i tmp1 = _mm_shufflelo_epi16(sym_even_filter, 0xb1);
|
||||
|
||||
// f3 f2 f3 f2 ...
|
||||
coeffs[0] = _mm_shuffle_epi32(tmp1, 0x55);
|
||||
// f1 f0 f1 f0 ...
|
||||
coeffs[1] = _mm_shuffle_epi32(tmp1, 0x00);
|
||||
}
|
||||
|
||||
bool av1_resize_vert_dir_sse2(uint8_t *intbuf, uint8_t *output, int out_stride,
|
||||
int height, int height2, int stride,
|
||||
int start_col) {
|
||||
// For the GM tool, the input layer height or width is assured to be an even
|
||||
// number. Hence the function 'down2_symodd()' is not invoked and SIMD
|
||||
// optimization of the same is not implemented.
|
||||
// When the input height is less than 8 and even, the potential input
|
||||
// heights are limited to 2, 4, or 6. These scenarios require separate
|
||||
// handling due to padding requirements. Invoking the C function here will
|
||||
// eliminate the need for conditional statements within the subsequent SIMD
|
||||
// code to manage these cases.
|
||||
if (height & 1 || height < 8) {
|
||||
return av1_resize_vert_dir_c(intbuf, output, out_stride, height, height2,
|
||||
stride, start_col);
|
||||
}
|
||||
|
||||
__m128i coeffs_y[2];
|
||||
const int bits = FILTER_BITS;
|
||||
const __m128i round_const_bits = _mm_set1_epi32((1 << bits) >> 1);
|
||||
const __m128i round_shift_bits = _mm_cvtsi32_si128(bits);
|
||||
const uint8_t max_pixel = 255;
|
||||
const __m128i clip_pixel = _mm_set1_epi8((char)max_pixel);
|
||||
const __m128i zero = _mm_setzero_si128();
|
||||
prepare_filter_coeffs(av1_down2_symeven_half_filter, coeffs_y);
|
||||
|
||||
const int remain_col = stride % 8;
|
||||
|
||||
for (int j = start_col; j < stride - remain_col; j += 8) {
|
||||
uint8_t *data = &intbuf[j];
|
||||
// d0 ... d7
|
||||
const __m128i l8_3 = _mm_loadl_epi64((__m128i *)(data + 0 * stride));
|
||||
// Pad the top 3 rows by replicating the first available (topmost) row.
|
||||
// a0 ... a7
|
||||
const __m128i l8_0 = l8_3;
|
||||
// b0 ... b7
|
||||
const __m128i l8_1 = l8_3;
|
||||
// c0 ... c7
|
||||
const __m128i l8_2 = l8_3;
|
||||
// e0 ... e7
|
||||
const __m128i l8_4 = _mm_loadl_epi64((__m128i *)(data + 1 * stride));
|
||||
// f0 ... f7
|
||||
const __m128i l8_5 = _mm_loadl_epi64((__m128i *)(data + 2 * stride));
|
||||
|
||||
// Convert to 16 bit, as the addition of two source pixels can exceed 8 bits.
|
||||
__m128i l0 = _mm_unpacklo_epi8(l8_0, zero); // A(128bit) = a0 - a7(16 bit)
|
||||
__m128i l1 = _mm_unpacklo_epi8(l8_1, zero); // B(128bit) = b0 - b7(16 bit)
|
||||
__m128i l2 = _mm_unpacklo_epi8(l8_2, zero); // C(128bit) = c0 - c7(16 bit)
|
||||
__m128i l3 = _mm_unpacklo_epi8(l8_3, zero); // D(128bit) = d0 - d7(16 bit)
|
||||
__m128i l4 = _mm_unpacklo_epi8(l8_4, zero); // E(128bit) = e0 - e7(16 bit)
|
||||
__m128i l5 = _mm_unpacklo_epi8(l8_5, zero); // F(128bit) = f0 - f7(16 bit)
|
||||
|
||||
// Increment the pointer such that the loading starts from row G.
|
||||
data = data + 3 * stride;
|
||||
// The core vertical SIMD processes 2 input rows simultaneously to generate
|
||||
// output corresponding to 1 row. To streamline the core loop and eliminate
|
||||
// the need for conditional checks, the remaining 4 rows are processed
|
||||
// separately.
|
||||
for (int i = 0; i < height - 4; i += 2) {
|
||||
// g0 ... g7
|
||||
__m128i l8_6 = _mm_loadl_epi64((__m128i *)(data));
|
||||
// h0 ... h7
|
||||
__m128i l8_7 = _mm_loadl_epi64((__m128i *)(data + stride));
|
||||
__m128i l6 = _mm_unpacklo_epi8(l8_6, zero); // G(128bit):g0-g7(16b)
|
||||
__m128i l7 = _mm_unpacklo_epi8(l8_7, zero); // H(128bit):h0-h7(16b)
|
||||
|
||||
PROCESS_RESIZE_Y_WD8
|
||||
}
|
||||
|
||||
__m128i l8_6 = _mm_loadl_epi64((__m128i *)(data));
|
||||
__m128i l6 = _mm_unpacklo_epi8(l8_6, zero);
|
||||
// Process the last 4 input rows here.
|
||||
for (int i = height - 4; i < height; i += 2) {
|
||||
__m128i l7 = l6;
|
||||
PROCESS_RESIZE_Y_WD8
|
||||
}
|
||||
}
|
||||
|
||||
if (remain_col)
|
||||
return av1_resize_vert_dir_c(intbuf, output, out_stride, height, height2,
|
||||
stride, stride - remain_col);
|
||||
|
||||
return true;
|
||||
}
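The column loop above strides over whole groups of 8, and any leftover columns (stride % 8) are finished by re-entering the C reference with start_col pointing at the first unprocessed column. A hedged sketch of that split follows; both kernels are stand-ins, not libaom functions.

#include <stdbool.h>
#include <stdint.h>

/* Reference path: finishes every column from start_col onward. */
static bool columns_c(uint8_t *buf, int stride, int start_col) {
  for (int j = start_col; j < stride; ++j) buf[j] = (uint8_t)(buf[j] / 2);
  return true;
}

/* Vector-style path: whole groups of 8 columns, then a C tail via start_col. */
static bool columns_simd_then_c_tail(uint8_t *buf, int stride) {
  const int remain_col = stride % 8;
  for (int j = 0; j < stride - remain_col; j += 8) {
    /* ...8-wide vector work on columns j..j+7 would go here... */
    for (int k = 0; k < 8; ++k) buf[j + k] = (uint8_t)(buf[j + k] / 2);
  }
  if (remain_col) return columns_c(buf, stride, stride - remain_col);
  return true;
}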
|
||||
|
||||
// Blends a and b using mask and returns the result.
|
||||
static INLINE __m128i blend(__m128i a, __m128i b, __m128i mask) {
|
||||
const __m128i masked_b = _mm_and_si128(mask, b);
|
||||
const __m128i masked_a = _mm_andnot_si128(mask, a);
|
||||
return (_mm_or_si128(masked_a, masked_b));
|
||||
}
|
||||
|
||||
// Masks used for width 16 pixels, with left and right padding
|
||||
// requirements.
|
||||
static const uint8_t left_padding_mask[16] = {
|
||||
255, 255, 255, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
};
|
||||
|
||||
static const uint8_t right_padding_mask[16] = { 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 255, 255,
|
||||
255, 255, 255, 255 };
|
||||
|
||||
static const uint8_t mask_16[16] = {
|
||||
255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0,
|
||||
};
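blend() above is the classic SSE2 bitwise select (there is no byte blend before SSE4.1): result = (a AND NOT mask) OR (b AND mask), so byte lanes where the mask is 0xFF take b and lanes where it is 0x00 keep a. A scalar restatement of one lane:

#include <stdint.h>

/* Per-byte select: mask bytes of 0xFF pick 'b', bytes of 0x00 keep 'a'.
   This is what the andnot/and/or sequence in blend() computes lane by lane. */
static uint8_t select_byte(uint8_t a, uint8_t b, uint8_t mask) {
  return (uint8_t)((a & (uint8_t)~mask) | (b & mask));
}

Combined with left_padding_mask, the call blend(_mm_slli_si128(row00, 3), start_pixel_row0, start_pad_mask) shifts the row right by three byte lanes and fills the vacated lanes with the row's first pixel.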
|
||||
|
||||
void av1_resize_horz_dir_sse2(const uint8_t *const input, int in_stride,
|
||||
uint8_t *intbuf, int height, int filtered_length,
|
||||
int width2) {
|
||||
assert(height % 2 == 0);
|
||||
// Invoke C for width less than 16.
|
||||
if (filtered_length < 16) {
|
||||
av1_resize_horz_dir_c(input, in_stride, intbuf, height, filtered_length,
|
||||
width2);
|
||||
return;
|
||||
}
|
||||
|
||||
__m128i coeffs_x[2];
|
||||
const int bits = FILTER_BITS;
|
||||
const int dst_stride = width2;
|
||||
const int remain_col = filtered_length % 16;
|
||||
const __m128i round_const_bits = _mm_set1_epi32((1 << bits) >> 1);
|
||||
const __m128i round_shift_bits = _mm_cvtsi32_si128(bits);
|
||||
|
||||
const uint8_t max_pixel = 255;
|
||||
const __m128i clip_pixel = _mm_set1_epi8((char)max_pixel);
|
||||
const __m128i zero = _mm_setzero_si128();
|
||||
|
||||
const __m128i start_pad_mask = _mm_loadu_si128((__m128i *)left_padding_mask);
|
||||
const __m128i end_pad_mask = _mm_loadu_si128((__m128i *)right_padding_mask);
|
||||
const __m128i mask_even = _mm_loadu_si128((__m128i *)mask_16);
|
||||
prepare_filter_coeffs(av1_down2_symeven_half_filter, coeffs_x);
|
||||
|
||||
for (int i = 0; i < height; ++i) {
|
||||
int filter_offset = 0;
|
||||
for (int j = 0; j <= filtered_length - 16; j += 16) {
|
||||
const int in_idx = i * in_stride + j - filter_offset;
|
||||
const int out_idx = i * dst_stride + j / 2;
|
||||
|
||||
// a0 a1 a2 a3 .... a15
|
||||
__m128i row00 = _mm_loadu_si128((__m128i *)&input[in_idx]);
|
||||
// a8 a9 a10 a11 .... a23
|
||||
__m128i row01 =
|
||||
_mm_loadu_si128((__m128i *)&input[in_idx + 5 + filter_offset]);
|
||||
filter_offset = 3;
|
||||
|
||||
// Pad start pixels to the left, while processing the first pixels in the
|
||||
// row.
|
||||
if (j == 0) {
|
||||
const __m128i start_pixel_row0 =
|
||||
_mm_set1_epi8((char)input[i * in_stride]);
|
||||
row00 =
|
||||
blend(_mm_slli_si128(row00, 3), start_pixel_row0, start_pad_mask);
|
||||
}
|
||||
|
||||
// Pad end pixels to the right, while processing the last pixels in the
|
||||
// row.
|
||||
const int is_last_cols16 = (j == filtered_length - 16);
|
||||
if (is_last_cols16) {
|
||||
const __m128i end_pixel_row0 =
|
||||
_mm_set1_epi8((char)input[i * in_stride + filtered_length - 1]);
|
||||
row01 = blend(row01, end_pixel_row0, end_pad_mask);
|
||||
}
|
||||
|
||||
// a2 a3 a4 a5 a6 a7 a8 a9 .... a17
|
||||
const __m128i row0_1 = _mm_unpacklo_epi64(_mm_srli_si128(row00, 2),
|
||||
_mm_srli_si128(row01, 2));
|
||||
// a4 a5 a6 a7 a8 a9 a10 a11 .... a19
|
||||
const __m128i row0_2 = _mm_unpacklo_epi64(_mm_srli_si128(row00, 4),
|
||||
_mm_srli_si128(row01, 4));
|
||||
// a6 a7 a8 a9 a10 a11 a12 a13 .... a21
|
||||
const __m128i row0_3 = _mm_unpacklo_epi64(_mm_srli_si128(row00, 6),
|
||||
_mm_srli_si128(row01, 6));
|
||||
|
||||
// a0 a2 a4 a6 a8 a10 a12 a14 (each 16 bit)
|
||||
const __m128i s0 = _mm_and_si128(row00, mask_even);
|
||||
// a1 a3 a5 a7 a9 a11 a13 a15
|
||||
const __m128i s1 = _mm_and_si128(_mm_srli_epi16(row00, 8), mask_even);
|
||||
// a2 a4 a6 a8 a10 a12 a14 a16
|
||||
const __m128i s2 = _mm_and_si128(row0_1, mask_even);
|
||||
// a3 a5 a7 a9 a11 a13 a15 a17
|
||||
const __m128i s3 = _mm_and_si128(_mm_srli_epi16(row0_1, 8), mask_even);
|
||||
// a4 a6 a8 a10 a12 a14 a16 a18
|
||||
const __m128i s4 = _mm_and_si128(row0_2, mask_even);
|
||||
// a5 a7 a9 a11 a13 a15 a17 a19
|
||||
const __m128i s5 = _mm_and_si128(_mm_srli_epi16(row0_2, 8), mask_even);
|
||||
// a6 a8 a10 a12 a14 a16 a18 a20
|
||||
const __m128i s6 = _mm_and_si128(row0_3, mask_even);
|
||||
// a7 a9 a11 a13 a15 a17 a19 a21
|
||||
const __m128i s7 = _mm_and_si128(_mm_srli_epi16(row0_3, 8), mask_even);
|
||||
|
||||
// a0a7 a2a9 a4a11 .... a12a19 a14a21
|
||||
const __m128i s07 = _mm_add_epi16(s0, s7);
|
||||
// a1a6 a3a8 a5a10 .... a13a18 a15a20
|
||||
const __m128i s16 = _mm_add_epi16(s1, s6);
|
||||
// a2a5 a4a7 a6a9 .... a14a17 a16a19
|
||||
const __m128i s25 = _mm_add_epi16(s2, s5);
|
||||
// a3a4 a5a6 a7a8 .... a15a16 a17a18
|
||||
const __m128i s34 = _mm_add_epi16(s3, s4);
|
||||
|
||||
// a0a7 a1a6 a2a9 a3a8 a4a11 a5a10 a6a13 a7a12
|
||||
const __m128i s1607_low = _mm_unpacklo_epi16(s07, s16);
|
||||
// a2a5 a3a4 a4a7 a5a6 a6a9 a7a8 a8a11 a9a10
|
||||
const __m128i s3425_low = _mm_unpacklo_epi16(s25, s34);
|
||||
|
||||
// a8a15 a9a14 a10a17 a11a16 a12a19 a13a18 a14a21 a15a20
|
||||
const __m128i s1607_high = _mm_unpackhi_epi16(s07, s16);
|
||||
// a10a13 a11a12 a12a15 a13a14 a14a17 a15a16 a16a19 a17a18
|
||||
const __m128i s3425_high = _mm_unpackhi_epi16(s25, s34);
|
||||
|
||||
const __m128i r01_0 = _mm_madd_epi16(s3425_low, coeffs_x[1]);
|
||||
const __m128i r01_1 = _mm_madd_epi16(s1607_low, coeffs_x[0]);
|
||||
const __m128i r01_2 = _mm_madd_epi16(s3425_high, coeffs_x[1]);
|
||||
const __m128i r01_3 = _mm_madd_epi16(s1607_high, coeffs_x[0]);
|
||||
|
||||
// Result of first 8 pixels of row0 (a0 to a7).
|
||||
// r0_0 r0_1 r0_2 r0_3
|
||||
__m128i r00 = _mm_add_epi32(r01_0, r01_1);
|
||||
r00 = _mm_add_epi32(r00, round_const_bits);
|
||||
r00 = _mm_sra_epi32(r00, round_shift_bits);
|
||||
|
||||
// Result of next 8 pixels of row0 (a8 to 15).
|
||||
// r0_4 r0_5 r0_6 r0_7
|
||||
__m128i r01 = _mm_add_epi32(r01_2, r01_3);
|
||||
r01 = _mm_add_epi32(r01, round_const_bits);
|
||||
r01 = _mm_sra_epi32(r01, round_shift_bits);
|
||||
|
||||
// r0_0 r0_1 r0_2 r0_3 r0_4 r0_5 r0_6 r0_7
|
||||
const __m128i res_16 = _mm_packs_epi32(r00, r01);
|
||||
const __m128i res_8 = _mm_packus_epi16(res_16, res_16);
|
||||
__m128i res = _mm_min_epu8(res_8, clip_pixel);
|
||||
res = _mm_max_epu8(res, zero);
|
||||
|
||||
// r0_0 r0_1 r0_2 r0_3 r0_4 r0_5 r0_6 r0_7
|
||||
_mm_storel_epi64((__m128i *)&intbuf[out_idx], res);
|
||||
}
|
||||
|
||||
int wd_processed = filtered_length - remain_col;
|
||||
// When the remaining width is 2, the above code would not have taken
|
||||
// care of padding required for (filtered_length - 4)th pixel. Hence,
|
||||
// process that pixel again with the C code.
|
||||
wd_processed = (remain_col == 2) ? wd_processed - 2 : wd_processed;
|
||||
if (remain_col) {
|
||||
const int in_idx = (in_stride * i);
|
||||
const int out_idx = (wd_processed / 2) + width2 * i;
|
||||
|
||||
down2_symeven(input + in_idx, filtered_length, intbuf + out_idx,
|
||||
wd_processed);
|
||||
}
|
||||
}
|
||||
}
|
|
@ -2241,6 +2241,12 @@ static AOM_INLINE void get_ls_tile_buffer(
|
|||
if (tile_copy_mode && (size >> (tile_size_bytes * 8 - 1)) == 1) {
|
||||
// The remaining bits in the top byte signal the row offset
|
||||
int offset = (size >> (tile_size_bytes - 1) * 8) & 0x7f;
|
||||
if (offset > row) {
|
||||
aom_internal_error(
|
||||
error_info, AOM_CODEC_CORRUPT_FRAME,
|
||||
"Invalid row offset in tile copy mode: row=%d offset=%d", row,
|
||||
offset);
|
||||
}
|
||||
|
||||
// Currently, only use tiles in same column as reference tiles.
|
||||
copy_data = tile_buffers[row - offset][col].data;
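In the copy-mode branch above, the top bit of the tile size field flags a copy and the remaining bits of its top byte carry the row offset; the new check rejects offsets that point above the first tile row. A hedged worked example of the extraction, assuming tile_size_bytes == 4 and treating the field as unsigned, with an illustrative field value:

#include <stdint.h>
#include <stdio.h>

int main(void) {
  const int tile_size_bytes = 4;
  const uint32_t size = 0x83000000u;  /* illustrative field value */
  const int is_copy = (size >> (tile_size_bytes * 8 - 1)) == 1;        /* top bit */
  const int offset = (int)((size >> ((tile_size_bytes - 1) * 8)) & 0x7f);
  printf("copy=%d offset=%d\n", is_copy, offset);  /* prints copy=1 offset=3 */
  return 0;
}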
|
||||
|
|
|
@ -13,6 +13,7 @@
|
|||
#include <assert.h>
|
||||
|
||||
#include "config/aom_config.h"
|
||||
#include "config/av1_rtcd.h"
|
||||
|
||||
#include "aom_dsp/aom_dsp_common.h"
|
||||
#include "aom_dsp/arm/mem_neon.h"
|
||||
|
@ -63,7 +64,7 @@ int64_t av1_block_error_neon(const tran_low_t *coeff, const tran_low_t *dqcoeff,
|
|||
}
|
||||
|
||||
int64_t av1_block_error_lp_neon(const int16_t *coeff, const int16_t *dqcoeff,
|
||||
int block_size) {
|
||||
intptr_t block_size) {
|
||||
uint64x2_t err_u64 = vdupq_n_u64(0);
|
||||
|
||||
assert(block_size >= 16);
|
|
@ -12,6 +12,7 @@
|
|||
#include <assert.h>
|
||||
|
||||
#include "config/aom_config.h"
|
||||
#include "config/av1_rtcd.h"
|
||||
|
||||
#include "aom_dsp/aom_dsp_common.h"
|
||||
#include "aom_dsp/arm/aom_neon_sve_bridge.h"
|
||||
|
@ -49,7 +50,7 @@ int64_t av1_block_error_sve(const tran_low_t *coeff, const tran_low_t *dqcoeff,
|
|||
}
|
||||
|
||||
int64_t av1_block_error_lp_sve(const int16_t *coeff, const int16_t *dqcoeff,
|
||||
int block_size) {
|
||||
intptr_t block_size) {
|
||||
if (block_size % 32 == 0) {
|
||||
int64x2_t error[4] = { vdupq_n_s64(0), vdupq_n_s64(0), vdupq_n_s64(0),
|
||||
vdupq_n_s64(0) };
|
|
@ -12,7 +12,7 @@
|
|||
|
||||
#include "aom_dsp/arm/sum_neon.h"
|
||||
#include "config/aom_config.h"
|
||||
#include "config/aom_dsp_rtcd.h"
|
||||
#include "config/av1_rtcd.h"
|
||||
|
||||
static int32x4_t k_means_multiply_add_neon(const int16x8_t a) {
|
||||
const int32x4_t l = vmull_s16(vget_low_s16(a), vget_low_s16(a));
|
|
@ -19,6 +19,7 @@
|
|||
#include <stdint.h>
|
||||
|
||||
#include "config/aom_config.h"
|
||||
#include "config/av1_rtcd.h"
|
||||
|
||||
#define CRC_LOOP(op, crc, type, buf, len) \
|
||||
while ((len) >= sizeof(type)) { \
|
|
@ -15,7 +15,7 @@
|
|||
|
||||
#include "aom_dsp/arm/mem_neon.h"
|
||||
#include "aom_dsp/arm/sum_neon.h"
|
||||
#include "av1/encoder/arm/neon/pickrst_neon.h"
|
||||
#include "av1/encoder/arm/pickrst_neon.h"
|
||||
#include "av1/encoder/pickrst.h"
|
||||
|
||||
static INLINE void highbd_calc_proj_params_r0_r1_neon(
|
|
@ -0,0 +1,441 @@
|
|||
/*
|
||||
* Copyright (c) 2024, Alliance for Open Media. All rights reserved
|
||||
*
|
||||
* This source code is subject to the terms of the BSD 2 Clause License and
|
||||
* the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
|
||||
* was not distributed with this source code in the LICENSE file, you can
|
||||
* obtain it at www.aomedia.org/license/software. If the Alliance for Open
|
||||
* Media Patent License 1.0 was not distributed with this source code in the
|
||||
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
|
||||
*/
|
||||
|
||||
#include <arm_neon.h>
|
||||
#include <arm_sve.h>
|
||||
#include <string.h>
|
||||
|
||||
#include "config/aom_config.h"
|
||||
#include "config/av1_rtcd.h"
|
||||
|
||||
#include "aom_dsp/arm/aom_neon_sve_bridge.h"
|
||||
#include "aom_dsp/arm/mem_neon.h"
|
||||
#include "aom_dsp/arm/sum_neon.h"
|
||||
#include "aom_dsp/arm/transpose_neon.h"
|
||||
#include "av1/common/restoration.h"
|
||||
#include "av1/encoder/pickrst.h"
|
||||
#include "av1/encoder/arm/pickrst_sve.h"
|
||||
|
||||
static INLINE uint16_t find_average_sve(const uint16_t *src, int src_stride,
|
||||
int width, int height) {
|
||||
uint64x2_t avg_u64 = vdupq_n_u64(0);
|
||||
uint16x8_t ones = vdupq_n_u16(1);
|
||||
|
||||
// Use a predicate to compute the last columns.
|
||||
svbool_t pattern = svwhilelt_b16_u32(0, width % 8 == 0 ? 8 : width % 8);
|
||||
|
||||
int h = height;
|
||||
do {
|
||||
int j = width;
|
||||
const uint16_t *src_ptr = src;
|
||||
while (j > 8) {
|
||||
uint16x8_t s = vld1q_u16(src_ptr);
|
||||
avg_u64 = aom_udotq_u16(avg_u64, s, ones);
|
||||
|
||||
j -= 8;
|
||||
src_ptr += 8;
|
||||
}
|
||||
uint16x8_t s_end = svget_neonq_u16(svld1_u16(pattern, src_ptr));
|
||||
avg_u64 = aom_udotq_u16(avg_u64, s_end, ones);
|
||||
|
||||
src += src_stride;
|
||||
} while (--h != 0);
|
||||
return (uint16_t)(vaddvq_u64(avg_u64) / (width * height));
|
||||
}
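find_average_sve handles the ragged right edge with an SVE predicate so the final partial vector reads only the valid columns. The scalar intent: full 8-lane groups accumulate directly, and the tail contributes width % 8 samples (or a full 8 when the width divides evenly). A hedged model of that logic without intrinsics:

#include <stdint.h>

/* Scalar model of the predicated-tail averaging above: whole groups of 8
   samples per row, then a masked tail of (width % 8) samples, or 8 when
   the width is a multiple of 8, matching the svwhilelt pattern. */
static uint16_t average_with_tail(const uint16_t *src, int src_stride,
                                  int width, int height) {
  const int tail = (width % 8 == 0) ? 8 : width % 8;
  uint64_t sum = 0;
  for (int r = 0; r < height; ++r) {
    const uint16_t *p = src + r * src_stride;
    int j = width;
    while (j > 8) {                                /* full vectors */
      for (int k = 0; k < 8; ++k) sum += p[k];
      p += 8;
      j -= 8;
    }
    for (int k = 0; k < tail; ++k) sum += p[k];    /* predicated tail */
  }
  return (uint16_t)(sum / (uint64_t)((uint64_t)width * height));
}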
|
||||
|
||||
static INLINE void compute_sub_avg(const uint16_t *buf, int buf_stride,
|
||||
int16_t avg, int16_t *buf_avg,
|
||||
int buf_avg_stride, int width, int height) {
|
||||
uint16x8_t avg_u16 = vdupq_n_u16(avg);
|
||||
|
||||
// Use a predicate to compute the last columns.
|
||||
svbool_t pattern = svwhilelt_b16_u32(0, width % 8 == 0 ? 8 : width % 8);
|
||||
|
||||
uint16x8_t avg_end = svget_neonq_u16(svdup_n_u16_z(pattern, avg));
|
||||
|
||||
do {
|
||||
int j = width;
|
||||
const uint16_t *buf_ptr = buf;
|
||||
int16_t *buf_avg_ptr = buf_avg;
|
||||
while (j > 8) {
|
||||
uint16x8_t d = vld1q_u16(buf_ptr);
|
||||
vst1q_s16(buf_avg_ptr, vreinterpretq_s16_u16(vsubq_u16(d, avg_u16)));
|
||||
|
||||
j -= 8;
|
||||
buf_ptr += 8;
|
||||
buf_avg_ptr += 8;
|
||||
}
|
||||
uint16x8_t d_end = svget_neonq_u16(svld1_u16(pattern, buf_ptr));
|
||||
vst1q_s16(buf_avg_ptr, vreinterpretq_s16_u16(vsubq_u16(d_end, avg_end)));
|
||||
|
||||
buf += buf_stride;
|
||||
buf_avg += buf_avg_stride;
|
||||
} while (--height > 0);
|
||||
}
|
||||
|
||||
static INLINE void copy_upper_triangle(int64_t *H, int64_t *H_tmp,
|
||||
const int wiener_win2,
|
||||
const int divider) {
|
||||
for (int i = 0; i < wiener_win2 - 2; i = i + 2) {
|
||||
// Transpose the first 2x2 square. It needs a special case as the element
|
||||
// of the bottom left is on the diagonal.
|
||||
int64x2_t row0 = vld1q_s64(H_tmp + i * wiener_win2 + i + 1);
|
||||
int64x2_t row1 = vld1q_s64(H_tmp + (i + 1) * wiener_win2 + i + 1);
|
||||
|
||||
int64x2_t tr_row = aom_vtrn2q_s64(row0, row1);
|
||||
|
||||
vst1_s64(H_tmp + (i + 1) * wiener_win2 + i, vget_low_s64(row0));
|
||||
vst1q_s64(H_tmp + (i + 2) * wiener_win2 + i, tr_row);
|
||||
|
||||
// Transpose and store all the remaining 2x2 squares of the line.
|
||||
for (int j = i + 3; j < wiener_win2; j = j + 2) {
|
||||
row0 = vld1q_s64(H_tmp + i * wiener_win2 + j);
|
||||
row1 = vld1q_s64(H_tmp + (i + 1) * wiener_win2 + j);
|
||||
|
||||
int64x2_t tr_row0 = aom_vtrn1q_s64(row0, row1);
|
||||
int64x2_t tr_row1 = aom_vtrn2q_s64(row0, row1);
|
||||
|
||||
vst1q_s64(H_tmp + (j + 0) * wiener_win2 + i, tr_row0);
|
||||
vst1q_s64(H_tmp + (j + 1) * wiener_win2 + i, tr_row1);
|
||||
}
|
||||
}
|
||||
for (int i = 0; i < wiener_win2 * wiener_win2; i++) {
|
||||
H[i] += H_tmp[i] / divider;
|
||||
}
|
||||
}
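copy_upper_triangle mirrors the upper triangle of the symmetric H matrix into the lower one using 2x2 NEON transposes and then folds the scaled result into H. The scalar equivalent of both steps is simply:

#include <stdint.h>

/* Reflect the upper triangle of an n x n matrix across the diagonal, then
   accumulate into H with the bit-depth divider applied, as the NEON
   version above does blockwise. */
static void copy_upper_triangle_scalar(int64_t *H, int64_t *H_tmp, int n,
                                       int divider) {
  for (int i = 0; i < n; ++i)
    for (int j = i + 1; j < n; ++j)
      H_tmp[j * n + i] = H_tmp[i * n + j];
  for (int k = 0; k < n * n; ++k) H[k] += H_tmp[k] / divider;
}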
|
||||
|
||||
// Transpose the matrix that has just been computed and accumulate it in M.
|
||||
static INLINE void acc_transpose_M(int64_t *M, const int64_t *M_trn,
|
||||
const int wiener_win, const int divider) {
|
||||
for (int i = 0; i < wiener_win; ++i) {
|
||||
for (int j = 0; j < wiener_win; ++j) {
|
||||
int tr_idx = j * wiener_win + i;
|
||||
*M++ += (int64_t)(M_trn[tr_idx] / divider);
|
||||
}
|
||||
}
|
||||
}

// This function computes two matrices: the cross-correlation between the src
// buffer and dgd buffer (M), and the auto-covariance of the dgd buffer (H).
//
// M is of size 7 * 7. It needs to be filled such that multiplying one element
// from src with each element of a row of the wiener window will fill one
// column of M. However this is not very convenient in terms of memory
// accesses, as it means we do contiguous loads of dgd but strided stores to M.
// As a result, we use an intermediate matrix M_trn which is instead filled
// such that one row of the wiener window gives one row of M_trn. Once fully
// computed, M_trn is then transposed to return M.
//
// H is of size 49 * 49. It is filled by multiplying every pair of elements of
// the wiener window together. Since it is a symmetric matrix, we only compute
// the upper triangle, and then copy it down to the lower one. Here we fill it
// by taking each different pair of columns, and multiplying all the elements of
// the first one with all the elements of the second one, with a special case
// when multiplying a column by itself.
static INLINE void highbd_compute_stats_win7_sve(
    int16_t *dgd_avg, int dgd_avg_stride, int16_t *src_avg, int src_avg_stride,
    int width, int height, int64_t *M, int64_t *H, int bit_depth_divider) {
  const int wiener_win = 7;
  const int wiener_win2 = wiener_win * wiener_win;

  // Use a predicate to compute the last columns of the block for H.
  svbool_t pattern = svwhilelt_b16_u32(0, width % 8 == 0 ? 8 : width % 8);

  // Use intermediate matrices for H and M to perform the computation, they
  // will be accumulated into the original H and M at the end.
  int64_t M_trn[49];
  memset(M_trn, 0, sizeof(M_trn));

  int64_t H_tmp[49 * 49];
  memset(H_tmp, 0, sizeof(H_tmp));

  do {
    // Cross-correlation (M).
    for (int row = 0; row < wiener_win; row++) {
      int j = 0;
      while (j < width) {
        int16x8_t dgd[7];
        load_s16_8x7(dgd_avg + row * dgd_avg_stride + j, 1, &dgd[0], &dgd[1],
                     &dgd[2], &dgd[3], &dgd[4], &dgd[5], &dgd[6]);
        int16x8_t s = vld1q_s16(src_avg + j);

        // Compute all the elements of one row of M.
        compute_M_one_row_win7(s, dgd, M_trn, row);

        j += 8;
      }
    }

    // Auto-covariance (H).
    int j = 0;
    while (j < width - 8) {
      for (int col0 = 0; col0 < wiener_win; col0++) {
        int16x8_t dgd0[7];
        load_s16_8x7(dgd_avg + j + col0, dgd_avg_stride, &dgd0[0], &dgd0[1],
                     &dgd0[2], &dgd0[3], &dgd0[4], &dgd0[5], &dgd0[6]);

        // Perform computation of the first column with itself (28 elements).
        // For the first column this will fill the upper triangle of the 7x7
        // matrix at the top left of the H matrix. For the next columns this
        // will fill the upper triangle of the other 7x7 matrices around H's
        // diagonal.
        compute_H_one_col(dgd0, col0, H_tmp, wiener_win, wiener_win2);

        // All computation next to the matrix diagonal has already been done.
        for (int col1 = col0 + 1; col1 < wiener_win; col1++) {
          // Load second column and scale based on downsampling factor.
          int16x8_t dgd1[7];
          load_s16_8x7(dgd_avg + j + col1, dgd_avg_stride, &dgd1[0], &dgd1[1],
                       &dgd1[2], &dgd1[3], &dgd1[4], &dgd1[5], &dgd1[6]);

          // Compute all elements from the combination of both columns (49
          // elements).
          compute_H_two_rows_win7(dgd0, dgd1, col0, col1, H_tmp);
        }
      }
      j += 8;
    }

    // Process remaining columns using a predicate to discard excess elements.
    for (int col0 = 0; col0 < wiener_win; col0++) {
      // Load first column.
      int16x8_t dgd0[7];
      dgd0[0] = svget_neonq_s16(
          svld1_s16(pattern, dgd_avg + 0 * dgd_avg_stride + j + col0));
      dgd0[1] = svget_neonq_s16(
          svld1_s16(pattern, dgd_avg + 1 * dgd_avg_stride + j + col0));
      dgd0[2] = svget_neonq_s16(
          svld1_s16(pattern, dgd_avg + 2 * dgd_avg_stride + j + col0));
      dgd0[3] = svget_neonq_s16(
          svld1_s16(pattern, dgd_avg + 3 * dgd_avg_stride + j + col0));
      dgd0[4] = svget_neonq_s16(
          svld1_s16(pattern, dgd_avg + 4 * dgd_avg_stride + j + col0));
      dgd0[5] = svget_neonq_s16(
          svld1_s16(pattern, dgd_avg + 5 * dgd_avg_stride + j + col0));
      dgd0[6] = svget_neonq_s16(
          svld1_s16(pattern, dgd_avg + 6 * dgd_avg_stride + j + col0));

      // Perform computation of the first column with itself (28 elements).
      // For the first column this will fill the upper triangle of the 7x7
      // matrix at the top left of the H matrix. For the next columns this
      // will fill the upper triangle of the other 7x7 matrices around H's
      // diagonal.
      compute_H_one_col(dgd0, col0, H_tmp, wiener_win, wiener_win2);

      // All computation next to the matrix diagonal has already been done.
      for (int col1 = col0 + 1; col1 < wiener_win; col1++) {
        // Load second column and scale based on downsampling factor.
        int16x8_t dgd1[7];
        load_s16_8x7(dgd_avg + j + col1, dgd_avg_stride, &dgd1[0], &dgd1[1],
                     &dgd1[2], &dgd1[3], &dgd1[4], &dgd1[5], &dgd1[6]);

        // Compute all elements from the combination of both columns (49
        // elements).
        compute_H_two_rows_win7(dgd0, dgd1, col0, col1, H_tmp);
      }
    }
    dgd_avg += dgd_avg_stride;
    src_avg += src_avg_stride;
  } while (--height != 0);

  // Transpose M_trn.
  acc_transpose_M(M, M_trn, 7, bit_depth_divider);

  // Copy upper triangle of H in the lower one.
  copy_upper_triangle(H, H_tmp, wiener_win2, bit_depth_divider);
}
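
// In terms of the quantities accumulated, the kernel above (and its 5x5
// counterpart below) computes, for every pixel p of the restoration block
// (sketch only; the exact index ordering matches the generic C
// implementation):
//
//   M[k]                   += win_k(p) * src_avg(p)
//   H[k * wiener_win2 + l] += win_k(p) * win_l(p)
//
// where win_k(p) is the k-th element of the wiener window of dgd_avg centred
// on p, and the totals are divided by bit_depth_divider before being added to
// the output M and H.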

// This function computes two matrices: the cross-correlation between the src
// buffer and dgd buffer (M), and the auto-covariance of the dgd buffer (H).
//
// M is of size 5 * 5. It needs to be filled such that multiplying one element
// from src with each element of a row of the wiener window will fill one
// column of M. However this is not very convenient in terms of memory
// accesses, as it means we do contiguous loads of dgd but strided stores to M.
// As a result, we use an intermediate matrix M_trn which is instead filled
// such that one row of the wiener window gives one row of M_trn. Once fully
// computed, M_trn is then transposed to return M.
//
// H is of size 25 * 25. It is filled by multiplying every pair of elements of
// the wiener window together. Since it is a symmetric matrix, we only compute
// the upper triangle, and then copy it down to the lower one. Here we fill it
// by taking each different pair of columns, and multiplying all the elements of
// the first one with all the elements of the second one, with a special case
// when multiplying a column by itself.
static INLINE void highbd_compute_stats_win5_sve(
    int16_t *dgd_avg, int dgd_avg_stride, int16_t *src_avg, int src_avg_stride,
    int width, int height, int64_t *M, int64_t *H, int bit_depth_divider) {
  const int wiener_win = 5;
  const int wiener_win2 = wiener_win * wiener_win;

  // Use a predicate to compute the last columns of the block for H.
  svbool_t pattern = svwhilelt_b16_u32(0, width % 8 == 0 ? 8 : width % 8);

  // Use intermediate matrices for H and M to perform the computation, they
  // will be accumulated into the original H and M at the end.
  int64_t M_trn[25];
  memset(M_trn, 0, sizeof(M_trn));

  int64_t H_tmp[25 * 25];
  memset(H_tmp, 0, sizeof(H_tmp));

  do {
    // Cross-correlation (M).
    for (int row = 0; row < wiener_win; row++) {
      int j = 0;
      while (j < width) {
        int16x8_t dgd[5];
        load_s16_8x5(dgd_avg + row * dgd_avg_stride + j, 1, &dgd[0], &dgd[1],
                     &dgd[2], &dgd[3], &dgd[4]);
        int16x8_t s = vld1q_s16(src_avg + j);

        // Compute all the elements of one row of M.
        compute_M_one_row_win5(s, dgd, M_trn, row);

        j += 8;
      }
    }

    // Auto-covariance (H).
    int j = 0;
    while (j < width - 8) {
      for (int col0 = 0; col0 < wiener_win; col0++) {
        // Load first column.
        int16x8_t dgd0[5];
        load_s16_8x5(dgd_avg + j + col0, dgd_avg_stride, &dgd0[0], &dgd0[1],
                     &dgd0[2], &dgd0[3], &dgd0[4]);

        // Perform computation of the first column with itself (15 elements).
        // For the first column this will fill the upper triangle of the 5x5
        // matrix at the top left of the H matrix. For the next columns this
        // will fill the upper triangle of the other 5x5 matrices around H's
        // diagonal.
        compute_H_one_col(dgd0, col0, H_tmp, wiener_win, wiener_win2);

        // All computation next to the matrix diagonal has already been done.
        for (int col1 = col0 + 1; col1 < wiener_win; col1++) {
          // Load second column and scale based on downsampling factor.
          int16x8_t dgd1[5];
          load_s16_8x5(dgd_avg + j + col1, dgd_avg_stride, &dgd1[0], &dgd1[1],
                       &dgd1[2], &dgd1[3], &dgd1[4]);

          // Compute all elements from the combination of both columns (25
          // elements).
          compute_H_two_rows_win5(dgd0, dgd1, col0, col1, H_tmp);
        }
      }
      j += 8;
    }

    // Process remaining columns using a predicate to discard excess elements.
    for (int col0 = 0; col0 < wiener_win; col0++) {
      int16x8_t dgd0[5];
      dgd0[0] = svget_neonq_s16(
          svld1_s16(pattern, dgd_avg + 0 * dgd_avg_stride + j + col0));
      dgd0[1] = svget_neonq_s16(
          svld1_s16(pattern, dgd_avg + 1 * dgd_avg_stride + j + col0));
      dgd0[2] = svget_neonq_s16(
          svld1_s16(pattern, dgd_avg + 2 * dgd_avg_stride + j + col0));
      dgd0[3] = svget_neonq_s16(
          svld1_s16(pattern, dgd_avg + 3 * dgd_avg_stride + j + col0));
      dgd0[4] = svget_neonq_s16(
          svld1_s16(pattern, dgd_avg + 4 * dgd_avg_stride + j + col0));

      // Perform computation of the first column with itself (15 elements).
      // For the first column this will fill the upper triangle of the 5x5
      // matrix at the top left of the H matrix. For the next columns this
      // will fill the upper triangle of the other 5x5 matrices around H's
      // diagonal.
      compute_H_one_col(dgd0, col0, H_tmp, wiener_win, wiener_win2);

      // All computation next to the matrix diagonal has already been done.
      for (int col1 = col0 + 1; col1 < wiener_win; col1++) {
        // Load second column and scale based on downsampling factor.
        int16x8_t dgd1[5];
        load_s16_8x5(dgd_avg + j + col1, dgd_avg_stride, &dgd1[0], &dgd1[1],
                     &dgd1[2], &dgd1[3], &dgd1[4]);

        // Compute all elements from the combination of both columns (25
        // elements).
        compute_H_two_rows_win5(dgd0, dgd1, col0, col1, H_tmp);
      }
    }
    dgd_avg += dgd_avg_stride;
    src_avg += src_avg_stride;
  } while (--height != 0);

  // Transpose M_trn.
  acc_transpose_M(M, M_trn, 5, bit_depth_divider);

  // Copy upper triangle of H in the lower one.
  copy_upper_triangle(H, H_tmp, wiener_win2, bit_depth_divider);
}

void av1_compute_stats_highbd_sve(int wiener_win, const uint8_t *dgd8,
                                  const uint8_t *src8, int16_t *dgd_avg,
                                  int16_t *src_avg, int h_start, int h_end,
                                  int v_start, int v_end, int dgd_stride,
                                  int src_stride, int64_t *M, int64_t *H,
                                  aom_bit_depth_t bit_depth) {
  assert(wiener_win == WIENER_WIN || wiener_win == WIENER_WIN_CHROMA);

  const uint16_t *src = CONVERT_TO_SHORTPTR(src8);
  const uint16_t *dgd = CONVERT_TO_SHORTPTR(dgd8);
  const int wiener_win2 = wiener_win * wiener_win;
  const int wiener_halfwin = wiener_win >> 1;
  const int32_t width = h_end - h_start;
  const int32_t height = v_end - v_start;

  uint8_t bit_depth_divider = 1;
  if (bit_depth == AOM_BITS_12)
    bit_depth_divider = 16;
  else if (bit_depth == AOM_BITS_10)
    bit_depth_divider = 4;

  const uint16_t *dgd_start = &dgd[v_start * dgd_stride + h_start];
  memset(H, 0, sizeof(*H) * wiener_win2 * wiener_win2);
  memset(M, 0, sizeof(*M) * wiener_win * wiener_win);

  const uint16_t avg = find_average_sve(dgd_start, dgd_stride, width, height);

  // dgd_avg and src_avg have been memset to zero before calling this function
  // so round up the stride to the next multiple of 8 so that we don't have to
  // worry about a tail loop when computing M.
  const int dgd_avg_stride = ((width + 2 * wiener_halfwin) & ~7) + 8;
  const int src_avg_stride = (width & ~7) + 8;
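  // For example (illustrative numbers only): with width == 28 and
  // wiener_halfwin == 3 the padded row holds 28 + 2 * 3 = 34 samples, and
  // ((34 & ~7) + 8) == 40, the next multiple of 8 strictly above 34, so an
  // 8-lane load that starts at the last valid column stays inside the
  // zero-initialised padding of its own row.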

  // Compute (dgd - avg) and store it in dgd_avg.
  // The wiener window will slide along the dgd frame, centered on each pixel.
  // For the top left pixel and all the pixels on the side of the frame this
  // means half of the window will be outside of the frame. As such the actual
  // buffer that we need to subtract the avg from will be 2 * wiener_halfwin
  // wider and 2 * wiener_halfwin higher than the original dgd buffer.
  const int vert_offset = v_start - wiener_halfwin;
  const int horiz_offset = h_start - wiener_halfwin;
  const uint16_t *dgd_win = dgd + horiz_offset + vert_offset * dgd_stride;
  compute_sub_avg(dgd_win, dgd_stride, avg, dgd_avg, dgd_avg_stride,
                  width + 2 * wiener_halfwin, height + 2 * wiener_halfwin);

  // Compute (src - avg), downsample if necessary and store in src-avg.
  const uint16_t *src_start = src + h_start + v_start * src_stride;
  compute_sub_avg(src_start, src_stride, avg, src_avg, src_avg_stride, width,
                  height);

  if (wiener_win == WIENER_WIN) {
    highbd_compute_stats_win7_sve(dgd_avg, dgd_avg_stride, src_avg,
                                  src_avg_stride, width, height, M, H,
                                  bit_depth_divider);
  } else {
    highbd_compute_stats_win5_sve(dgd_avg, dgd_avg_stride, src_avg,
                                  src_avg_stride, width, height, M, H,
                                  bit_depth_divider);
  }
}

@ -12,6 +12,7 @@
#include <arm_neon.h>

#include "aom_dsp/txfm_common.h"
#include "config/av1_rtcd.h"

static void transpose4x4(int16x8_t in[2], int16x4_t out[4]) {
  int32x4x2_t b0 =