Bug 1899864 - Update libaom to a7ef80c44bfb34b08254194b1ab72d4e93ff4b07 r=media-playback-reviewers,alwu

This patch simply runs the command below
```
./mach vendor media/libaom/moz.yaml --patch-mode=none
```
to update the libaom source.

Differential Revision: https://phabricator.services.mozilla.com/D212162
Chun-Min Chang 2024-05-31 00:33:33 +00:00
Parent 26cb5d7483
Commit c3dcb83cf6
160 changed files with 8414 additions and 2782 deletions

View file

@ -532,6 +532,12 @@ void av1_quantize_lp_c(const int16_t *coeff_ptr, intptr_t n_coeffs, const int16_
void av1_resize_and_extend_frame_c(const YV12_BUFFER_CONFIG *src, YV12_BUFFER_CONFIG *dst, const InterpFilter filter, const int phase, const int num_planes);
#define av1_resize_and_extend_frame av1_resize_and_extend_frame_c
void av1_resize_horz_dir_c(const uint8_t *const input, int in_stride, uint8_t *intbuf, int height, int filteredlength, int width2);
#define av1_resize_horz_dir av1_resize_horz_dir_c
bool av1_resize_vert_dir_c(uint8_t *intbuf, uint8_t *output, int out_stride, int height, int height2, int width2, int start_col);
#define av1_resize_vert_dir av1_resize_vert_dir_c
void av1_round_shift_array_c(int32_t *arr, int size, int bit);
#define av1_round_shift_array av1_round_shift_array_c
@ -624,9 +630,6 @@ cfl_predict_lbd_fn cfl_get_predict_lbd_fn_c(TX_SIZE tx_size);
cfl_subtract_average_fn cfl_get_subtract_average_fn_c(TX_SIZE tx_size);
#define cfl_get_subtract_average_fn cfl_get_subtract_average_fn_c
bool resize_vert_dir_c(uint8_t *intbuf, uint8_t *output, int out_stride, int height, int height2, int width2, int start_col);
#define resize_vert_dir resize_vert_dir_c
void av1_rtcd(void);
#include "config/aom_config.h"

View file

@ -221,7 +221,8 @@ void av1_compute_stats_highbd_neon(int wiener_win, const uint8_t *dgd8, const ui
RTCD_EXTERN void (*av1_compute_stats_highbd)(int wiener_win, const uint8_t *dgd8, const uint8_t *src8, int16_t *dgd_avg, int16_t *src_avg, int h_start, int h_end, int v_start, int v_end, int dgd_stride, int src_stride, int64_t *M, int64_t *H, aom_bit_depth_t bit_depth);
void av1_convolve_2d_scale_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int x_step_qn, const int subpel_y_qn, const int y_step_qn, ConvolveParams *conv_params);
#define av1_convolve_2d_scale av1_convolve_2d_scale_c
void av1_convolve_2d_scale_neon(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int x_step_qn, const int subpel_y_qn, const int y_step_qn, ConvolveParams *conv_params);
RTCD_EXTERN void (*av1_convolve_2d_scale)(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int x_step_qn, const int subpel_y_qn, const int y_step_qn, ConvolveParams *conv_params);
void av1_convolve_2d_sr_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int subpel_y_qn, ConvolveParams *conv_params);
void av1_convolve_2d_sr_neon(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int subpel_y_qn, ConvolveParams *conv_params);
@ -687,6 +688,12 @@ void av1_resize_and_extend_frame_c(const YV12_BUFFER_CONFIG *src, YV12_BUFFER_CO
void av1_resize_and_extend_frame_neon(const YV12_BUFFER_CONFIG *src, YV12_BUFFER_CONFIG *dst, const InterpFilter filter, const int phase, const int num_planes);
RTCD_EXTERN void (*av1_resize_and_extend_frame)(const YV12_BUFFER_CONFIG *src, YV12_BUFFER_CONFIG *dst, const InterpFilter filter, const int phase, const int num_planes);
void av1_resize_horz_dir_c(const uint8_t *const input, int in_stride, uint8_t *intbuf, int height, int filteredlength, int width2);
#define av1_resize_horz_dir av1_resize_horz_dir_c
bool av1_resize_vert_dir_c(uint8_t *intbuf, uint8_t *output, int out_stride, int height, int height2, int width2, int start_col);
#define av1_resize_vert_dir av1_resize_vert_dir_c
void av1_round_shift_array_c(int32_t *arr, int size, int bit);
void av1_round_shift_array_neon(int32_t *arr, int size, int bit);
RTCD_EXTERN void (*av1_round_shift_array)(int32_t *arr, int size, int bit);
@ -813,9 +820,6 @@ cfl_subtract_average_fn cfl_get_subtract_average_fn_c(TX_SIZE tx_size);
cfl_subtract_average_fn cfl_get_subtract_average_fn_neon(TX_SIZE tx_size);
RTCD_EXTERN cfl_subtract_average_fn (*cfl_get_subtract_average_fn)(TX_SIZE tx_size);
bool resize_vert_dir_c(uint8_t *intbuf, uint8_t *output, int out_stride, int height, int height2, int width2, int start_col);
#define resize_vert_dir resize_vert_dir_c
void av1_rtcd(void);
#include "config/aom_config.h"
@ -870,6 +874,8 @@ static void setup_rtcd_internal(void)
if (flags & HAS_NEON) av1_compute_stats = av1_compute_stats_neon;
av1_compute_stats_highbd = av1_compute_stats_highbd_c;
if (flags & HAS_NEON) av1_compute_stats_highbd = av1_compute_stats_highbd_neon;
av1_convolve_2d_scale = av1_convolve_2d_scale_c;
if (flags & HAS_NEON) av1_convolve_2d_scale = av1_convolve_2d_scale_neon;
av1_convolve_2d_sr = av1_convolve_2d_sr_c;
if (flags & HAS_NEON) av1_convolve_2d_sr = av1_convolve_2d_sr_neon;
av1_convolve_2d_sr_intrabc = av1_convolve_2d_sr_intrabc_c;

View file

@ -711,6 +711,15 @@ void av1_resize_and_extend_frame_c(const YV12_BUFFER_CONFIG *src, YV12_BUFFER_CO
void av1_resize_and_extend_frame_ssse3(const YV12_BUFFER_CONFIG *src, YV12_BUFFER_CONFIG *dst, const InterpFilter filter, const int phase, const int num_planes);
RTCD_EXTERN void (*av1_resize_and_extend_frame)(const YV12_BUFFER_CONFIG *src, YV12_BUFFER_CONFIG *dst, const InterpFilter filter, const int phase, const int num_planes);
void av1_resize_horz_dir_c(const uint8_t *const input, int in_stride, uint8_t *intbuf, int height, int filteredlength, int width2);
void av1_resize_horz_dir_avx2(const uint8_t *const input, int in_stride, uint8_t *intbuf, int height, int filteredlength, int width2);
RTCD_EXTERN void (*av1_resize_horz_dir)(const uint8_t *const input, int in_stride, uint8_t *intbuf, int height, int filteredlength, int width2);
bool av1_resize_vert_dir_c(uint8_t *intbuf, uint8_t *output, int out_stride, int height, int height2, int width2, int start_col);
bool av1_resize_vert_dir_sse2(uint8_t *intbuf, uint8_t *output, int out_stride, int height, int height2, int width2, int start_col);
bool av1_resize_vert_dir_avx2(uint8_t *intbuf, uint8_t *output, int out_stride, int height, int height2, int width2, int start_col);
RTCD_EXTERN bool (*av1_resize_vert_dir)(uint8_t *intbuf, uint8_t *output, int out_stride, int height, int height2, int width2, int start_col);
void av1_round_shift_array_c(int32_t *arr, int size, int bit);
void av1_round_shift_array_sse4_1(int32_t *arr, int size, int bit);
RTCD_EXTERN void (*av1_round_shift_array)(int32_t *arr, int size, int bit);
@ -879,10 +888,6 @@ cfl_subtract_average_fn cfl_get_subtract_average_fn_sse2(TX_SIZE tx_size);
cfl_subtract_average_fn cfl_get_subtract_average_fn_avx2(TX_SIZE tx_size);
RTCD_EXTERN cfl_subtract_average_fn (*cfl_get_subtract_average_fn)(TX_SIZE tx_size);
bool resize_vert_dir_c(uint8_t *intbuf, uint8_t *output, int out_stride, int height, int height2, int width2, int start_col);
bool resize_vert_dir_avx2(uint8_t *intbuf, uint8_t *output, int out_stride, int height, int height2, int width2, int start_col);
RTCD_EXTERN bool (*resize_vert_dir)(uint8_t *intbuf, uint8_t *output, int out_stride, int height, int height2, int width2, int start_col);
void av1_rtcd(void);
#ifdef RTCD_C
@ -1140,6 +1145,11 @@ static void setup_rtcd_internal(void)
if (flags & HAS_AVX2) av1_quantize_lp = av1_quantize_lp_avx2;
av1_resize_and_extend_frame = av1_resize_and_extend_frame_c;
if (flags & HAS_SSSE3) av1_resize_and_extend_frame = av1_resize_and_extend_frame_ssse3;
av1_resize_horz_dir = av1_resize_horz_dir_c;
if (flags & HAS_AVX2) av1_resize_horz_dir = av1_resize_horz_dir_avx2;
av1_resize_vert_dir = av1_resize_vert_dir_c;
if (flags & HAS_SSE2) av1_resize_vert_dir = av1_resize_vert_dir_sse2;
if (flags & HAS_AVX2) av1_resize_vert_dir = av1_resize_vert_dir_avx2;
av1_round_shift_array = av1_round_shift_array_c;
if (flags & HAS_SSE4_1) av1_round_shift_array = av1_round_shift_array_sse4_1;
av1_selfguided_restoration = av1_selfguided_restoration_c;
@ -1240,8 +1250,6 @@ static void setup_rtcd_internal(void)
cfl_get_subtract_average_fn = cfl_get_subtract_average_fn_c;
if (flags & HAS_SSE2) cfl_get_subtract_average_fn = cfl_get_subtract_average_fn_sse2;
if (flags & HAS_AVX2) cfl_get_subtract_average_fn = cfl_get_subtract_average_fn_avx2;
resize_vert_dir = resize_vert_dir_c;
if (flags & HAS_AVX2) resize_vert_dir = resize_vert_dir_avx2;
}
#endif
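
The setup_rtcd_internal hunks above follow libaom's usual runtime-dispatch pattern: each RTCD_EXTERN function pointer is first assigned a baseline implementation and is then overridden whenever the CPU flags report a faster variant, so the last matching assignment wins. A minimal sketch of that pattern, reusing the HAS_* flag names from the hunks above but with hypothetical resize_* functions standing in for the real kernels:

```
/* Sketch of libaom-style flag-based runtime dispatch (hypothetical kernel names). */
#include <stdio.h>

#define HAS_SSE2 (1 << 0)
#define HAS_AVX2 (1 << 1)

static void resize_c(void)    { puts("C"); }
static void resize_sse2(void) { puts("SSE2"); }
static void resize_avx2(void) { puts("AVX2"); }

/* Analogous to an RTCD_EXTERN function pointer in the generated headers. */
static void (*resize)(void);

static void setup_dispatch(int flags) {
  resize = resize_c;                           /* baseline */
  if (flags & HAS_SSE2) resize = resize_sse2;  /* better */
  if (flags & HAS_AVX2) resize = resize_avx2;  /* best available wins */
}

int main(void) {
  setup_dispatch(HAS_SSE2 | HAS_AVX2);  /* pretend the CPU reports both */
  resize();                             /* calls the AVX2 stand-in */
  return 0;
}
```

Note that in some of the generated x86-64 headers later in this patch the new av1_resize_vert_dir pointer starts at the SSE2 variant rather than the C one, because SSE2 is assumed on those targets.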

View file

@ -701,6 +701,15 @@ void av1_resize_and_extend_frame_c(const YV12_BUFFER_CONFIG *src, YV12_BUFFER_CO
void av1_resize_and_extend_frame_ssse3(const YV12_BUFFER_CONFIG *src, YV12_BUFFER_CONFIG *dst, const InterpFilter filter, const int phase, const int num_planes);
RTCD_EXTERN void (*av1_resize_and_extend_frame)(const YV12_BUFFER_CONFIG *src, YV12_BUFFER_CONFIG *dst, const InterpFilter filter, const int phase, const int num_planes);
void av1_resize_horz_dir_c(const uint8_t *const input, int in_stride, uint8_t *intbuf, int height, int filteredlength, int width2);
void av1_resize_horz_dir_avx2(const uint8_t *const input, int in_stride, uint8_t *intbuf, int height, int filteredlength, int width2);
RTCD_EXTERN void (*av1_resize_horz_dir)(const uint8_t *const input, int in_stride, uint8_t *intbuf, int height, int filteredlength, int width2);
bool av1_resize_vert_dir_c(uint8_t *intbuf, uint8_t *output, int out_stride, int height, int height2, int width2, int start_col);
bool av1_resize_vert_dir_sse2(uint8_t *intbuf, uint8_t *output, int out_stride, int height, int height2, int width2, int start_col);
bool av1_resize_vert_dir_avx2(uint8_t *intbuf, uint8_t *output, int out_stride, int height, int height2, int width2, int start_col);
RTCD_EXTERN bool (*av1_resize_vert_dir)(uint8_t *intbuf, uint8_t *output, int out_stride, int height, int height2, int width2, int start_col);
void av1_round_shift_array_c(int32_t *arr, int size, int bit);
void av1_round_shift_array_sse4_1(int32_t *arr, int size, int bit);
RTCD_EXTERN void (*av1_round_shift_array)(int32_t *arr, int size, int bit);
@ -857,10 +866,6 @@ cfl_subtract_average_fn cfl_get_subtract_average_fn_sse2(TX_SIZE tx_size);
cfl_subtract_average_fn cfl_get_subtract_average_fn_avx2(TX_SIZE tx_size);
RTCD_EXTERN cfl_subtract_average_fn (*cfl_get_subtract_average_fn)(TX_SIZE tx_size);
bool resize_vert_dir_c(uint8_t *intbuf, uint8_t *output, int out_stride, int height, int height2, int width2, int start_col);
bool resize_vert_dir_avx2(uint8_t *intbuf, uint8_t *output, int out_stride, int height, int height2, int width2, int start_col);
RTCD_EXTERN bool (*resize_vert_dir)(uint8_t *intbuf, uint8_t *output, int out_stride, int height, int height2, int width2, int start_col);
void av1_rtcd(void);
#ifdef RTCD_C
@ -1090,6 +1095,10 @@ static void setup_rtcd_internal(void)
if (flags & HAS_AVX2) av1_quantize_lp = av1_quantize_lp_avx2;
av1_resize_and_extend_frame = av1_resize_and_extend_frame_c;
if (flags & HAS_SSSE3) av1_resize_and_extend_frame = av1_resize_and_extend_frame_ssse3;
av1_resize_horz_dir = av1_resize_horz_dir_c;
if (flags & HAS_AVX2) av1_resize_horz_dir = av1_resize_horz_dir_avx2;
av1_resize_vert_dir = av1_resize_vert_dir_sse2;
if (flags & HAS_AVX2) av1_resize_vert_dir = av1_resize_vert_dir_avx2;
av1_round_shift_array = av1_round_shift_array_c;
if (flags & HAS_SSE4_1) av1_round_shift_array = av1_round_shift_array_sse4_1;
av1_selfguided_restoration = av1_selfguided_restoration_c;
@ -1173,8 +1182,6 @@ static void setup_rtcd_internal(void)
if (flags & HAS_AVX2) cfl_get_predict_lbd_fn = cfl_get_predict_lbd_fn_avx2;
cfl_get_subtract_average_fn = cfl_get_subtract_average_fn_sse2;
if (flags & HAS_AVX2) cfl_get_subtract_average_fn = cfl_get_subtract_average_fn_avx2;
resize_vert_dir = resize_vert_dir_c;
if (flags & HAS_AVX2) resize_vert_dir = resize_vert_dir_avx2;
}
#endif

View file

@ -207,7 +207,8 @@ void av1_compute_stats_highbd_neon(int wiener_win, const uint8_t *dgd8, const ui
#define av1_compute_stats_highbd av1_compute_stats_highbd_neon
void av1_convolve_2d_scale_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int x_step_qn, const int subpel_y_qn, const int y_step_qn, ConvolveParams *conv_params);
#define av1_convolve_2d_scale av1_convolve_2d_scale_c
void av1_convolve_2d_scale_neon(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int x_step_qn, const int subpel_y_qn, const int y_step_qn, ConvolveParams *conv_params);
#define av1_convolve_2d_scale av1_convolve_2d_scale_neon
void av1_convolve_2d_sr_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int subpel_y_qn, ConvolveParams *conv_params);
void av1_convolve_2d_sr_neon(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int subpel_y_qn, ConvolveParams *conv_params);
@ -234,7 +235,9 @@ void av1_convolve_x_sr_intrabc_neon(const uint8_t *src, int src_stride, uint8_t
void av1_convolve_y_sr_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_y, const int subpel_y_qn);
void av1_convolve_y_sr_neon(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_y, const int subpel_y_qn);
#define av1_convolve_y_sr av1_convolve_y_sr_neon
void av1_convolve_y_sr_neon_dotprod(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_y, const int subpel_y_qn);
void av1_convolve_y_sr_neon_i8mm(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_y, const int subpel_y_qn);
RTCD_EXTERN void (*av1_convolve_y_sr)(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_y, const int subpel_y_qn);
void av1_convolve_y_sr_intrabc_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_y, const int subpel_y_qn);
void av1_convolve_y_sr_intrabc_neon(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_y, const int subpel_y_qn);
@ -682,6 +685,12 @@ void av1_resize_and_extend_frame_c(const YV12_BUFFER_CONFIG *src, YV12_BUFFER_CO
void av1_resize_and_extend_frame_neon(const YV12_BUFFER_CONFIG *src, YV12_BUFFER_CONFIG *dst, const InterpFilter filter, const int phase, const int num_planes);
#define av1_resize_and_extend_frame av1_resize_and_extend_frame_neon
void av1_resize_horz_dir_c(const uint8_t *const input, int in_stride, uint8_t *intbuf, int height, int filteredlength, int width2);
#define av1_resize_horz_dir av1_resize_horz_dir_c
bool av1_resize_vert_dir_c(uint8_t *intbuf, uint8_t *output, int out_stride, int height, int height2, int width2, int start_col);
#define av1_resize_vert_dir av1_resize_vert_dir_c
void av1_round_shift_array_c(int32_t *arr, int size, int bit);
void av1_round_shift_array_neon(int32_t *arr, int size, int bit);
#define av1_round_shift_array av1_round_shift_array_neon
@ -807,9 +816,6 @@ cfl_subtract_average_fn cfl_get_subtract_average_fn_c(TX_SIZE tx_size);
cfl_subtract_average_fn cfl_get_subtract_average_fn_neon(TX_SIZE tx_size);
#define cfl_get_subtract_average_fn cfl_get_subtract_average_fn_neon
bool resize_vert_dir_c(uint8_t *intbuf, uint8_t *output, int out_stride, int height, int height2, int width2, int start_col);
#define resize_vert_dir resize_vert_dir_c
void av1_rtcd(void);
#include "config/aom_config.h"
@ -830,6 +836,9 @@ static void setup_rtcd_internal(void)
av1_convolve_x_sr = av1_convolve_x_sr_neon;
if (flags & HAS_NEON_DOTPROD) av1_convolve_x_sr = av1_convolve_x_sr_neon_dotprod;
if (flags & HAS_NEON_I8MM) av1_convolve_x_sr = av1_convolve_x_sr_neon_i8mm;
av1_convolve_y_sr = av1_convolve_y_sr_neon;
if (flags & HAS_NEON_DOTPROD) av1_convolve_y_sr = av1_convolve_y_sr_neon_dotprod;
if (flags & HAS_NEON_I8MM) av1_convolve_y_sr = av1_convolve_y_sr_neon_i8mm;
av1_dist_wtd_convolve_2d = av1_dist_wtd_convolve_2d_neon;
if (flags & HAS_NEON_DOTPROD) av1_dist_wtd_convolve_2d = av1_dist_wtd_convolve_2d_neon_dotprod;
if (flags & HAS_NEON_I8MM) av1_dist_wtd_convolve_2d = av1_dist_wtd_convolve_2d_neon_i8mm;

View file

@ -701,6 +701,15 @@ void av1_resize_and_extend_frame_c(const YV12_BUFFER_CONFIG *src, YV12_BUFFER_CO
void av1_resize_and_extend_frame_ssse3(const YV12_BUFFER_CONFIG *src, YV12_BUFFER_CONFIG *dst, const InterpFilter filter, const int phase, const int num_planes);
RTCD_EXTERN void (*av1_resize_and_extend_frame)(const YV12_BUFFER_CONFIG *src, YV12_BUFFER_CONFIG *dst, const InterpFilter filter, const int phase, const int num_planes);
void av1_resize_horz_dir_c(const uint8_t *const input, int in_stride, uint8_t *intbuf, int height, int filteredlength, int width2);
void av1_resize_horz_dir_avx2(const uint8_t *const input, int in_stride, uint8_t *intbuf, int height, int filteredlength, int width2);
RTCD_EXTERN void (*av1_resize_horz_dir)(const uint8_t *const input, int in_stride, uint8_t *intbuf, int height, int filteredlength, int width2);
bool av1_resize_vert_dir_c(uint8_t *intbuf, uint8_t *output, int out_stride, int height, int height2, int width2, int start_col);
bool av1_resize_vert_dir_sse2(uint8_t *intbuf, uint8_t *output, int out_stride, int height, int height2, int width2, int start_col);
bool av1_resize_vert_dir_avx2(uint8_t *intbuf, uint8_t *output, int out_stride, int height, int height2, int width2, int start_col);
RTCD_EXTERN bool (*av1_resize_vert_dir)(uint8_t *intbuf, uint8_t *output, int out_stride, int height, int height2, int width2, int start_col);
void av1_round_shift_array_c(int32_t *arr, int size, int bit);
void av1_round_shift_array_sse4_1(int32_t *arr, int size, int bit);
RTCD_EXTERN void (*av1_round_shift_array)(int32_t *arr, int size, int bit);
@ -857,10 +866,6 @@ cfl_subtract_average_fn cfl_get_subtract_average_fn_sse2(TX_SIZE tx_size);
cfl_subtract_average_fn cfl_get_subtract_average_fn_avx2(TX_SIZE tx_size);
RTCD_EXTERN cfl_subtract_average_fn (*cfl_get_subtract_average_fn)(TX_SIZE tx_size);
bool resize_vert_dir_c(uint8_t *intbuf, uint8_t *output, int out_stride, int height, int height2, int width2, int start_col);
bool resize_vert_dir_avx2(uint8_t *intbuf, uint8_t *output, int out_stride, int height, int height2, int width2, int start_col);
RTCD_EXTERN bool (*resize_vert_dir)(uint8_t *intbuf, uint8_t *output, int out_stride, int height, int height2, int width2, int start_col);
void av1_rtcd(void);
#ifdef RTCD_C
@ -1090,6 +1095,10 @@ static void setup_rtcd_internal(void)
if (flags & HAS_AVX2) av1_quantize_lp = av1_quantize_lp_avx2;
av1_resize_and_extend_frame = av1_resize_and_extend_frame_c;
if (flags & HAS_SSSE3) av1_resize_and_extend_frame = av1_resize_and_extend_frame_ssse3;
av1_resize_horz_dir = av1_resize_horz_dir_c;
if (flags & HAS_AVX2) av1_resize_horz_dir = av1_resize_horz_dir_avx2;
av1_resize_vert_dir = av1_resize_vert_dir_sse2;
if (flags & HAS_AVX2) av1_resize_vert_dir = av1_resize_vert_dir_avx2;
av1_round_shift_array = av1_round_shift_array_c;
if (flags & HAS_SSE4_1) av1_round_shift_array = av1_round_shift_array_sse4_1;
av1_selfguided_restoration = av1_selfguided_restoration_c;
@ -1173,8 +1182,6 @@ static void setup_rtcd_internal(void)
if (flags & HAS_AVX2) cfl_get_predict_lbd_fn = cfl_get_predict_lbd_fn_avx2;
cfl_get_subtract_average_fn = cfl_get_subtract_average_fn_sse2;
if (flags & HAS_AVX2) cfl_get_subtract_average_fn = cfl_get_subtract_average_fn_avx2;
resize_vert_dir = resize_vert_dir_c;
if (flags & HAS_AVX2) resize_vert_dir = resize_vert_dir_avx2;
}
#endif

View file

@ -711,6 +711,15 @@ void av1_resize_and_extend_frame_c(const YV12_BUFFER_CONFIG *src, YV12_BUFFER_CO
void av1_resize_and_extend_frame_ssse3(const YV12_BUFFER_CONFIG *src, YV12_BUFFER_CONFIG *dst, const InterpFilter filter, const int phase, const int num_planes);
RTCD_EXTERN void (*av1_resize_and_extend_frame)(const YV12_BUFFER_CONFIG *src, YV12_BUFFER_CONFIG *dst, const InterpFilter filter, const int phase, const int num_planes);
void av1_resize_horz_dir_c(const uint8_t *const input, int in_stride, uint8_t *intbuf, int height, int filteredlength, int width2);
void av1_resize_horz_dir_avx2(const uint8_t *const input, int in_stride, uint8_t *intbuf, int height, int filteredlength, int width2);
RTCD_EXTERN void (*av1_resize_horz_dir)(const uint8_t *const input, int in_stride, uint8_t *intbuf, int height, int filteredlength, int width2);
bool av1_resize_vert_dir_c(uint8_t *intbuf, uint8_t *output, int out_stride, int height, int height2, int width2, int start_col);
bool av1_resize_vert_dir_sse2(uint8_t *intbuf, uint8_t *output, int out_stride, int height, int height2, int width2, int start_col);
bool av1_resize_vert_dir_avx2(uint8_t *intbuf, uint8_t *output, int out_stride, int height, int height2, int width2, int start_col);
RTCD_EXTERN bool (*av1_resize_vert_dir)(uint8_t *intbuf, uint8_t *output, int out_stride, int height, int height2, int width2, int start_col);
void av1_round_shift_array_c(int32_t *arr, int size, int bit);
void av1_round_shift_array_sse4_1(int32_t *arr, int size, int bit);
RTCD_EXTERN void (*av1_round_shift_array)(int32_t *arr, int size, int bit);
@ -879,10 +888,6 @@ cfl_subtract_average_fn cfl_get_subtract_average_fn_sse2(TX_SIZE tx_size);
cfl_subtract_average_fn cfl_get_subtract_average_fn_avx2(TX_SIZE tx_size);
RTCD_EXTERN cfl_subtract_average_fn (*cfl_get_subtract_average_fn)(TX_SIZE tx_size);
bool resize_vert_dir_c(uint8_t *intbuf, uint8_t *output, int out_stride, int height, int height2, int width2, int start_col);
bool resize_vert_dir_avx2(uint8_t *intbuf, uint8_t *output, int out_stride, int height, int height2, int width2, int start_col);
RTCD_EXTERN bool (*resize_vert_dir)(uint8_t *intbuf, uint8_t *output, int out_stride, int height, int height2, int width2, int start_col);
void av1_rtcd(void);
#ifdef RTCD_C
@ -1140,6 +1145,11 @@ static void setup_rtcd_internal(void)
if (flags & HAS_AVX2) av1_quantize_lp = av1_quantize_lp_avx2;
av1_resize_and_extend_frame = av1_resize_and_extend_frame_c;
if (flags & HAS_SSSE3) av1_resize_and_extend_frame = av1_resize_and_extend_frame_ssse3;
av1_resize_horz_dir = av1_resize_horz_dir_c;
if (flags & HAS_AVX2) av1_resize_horz_dir = av1_resize_horz_dir_avx2;
av1_resize_vert_dir = av1_resize_vert_dir_c;
if (flags & HAS_SSE2) av1_resize_vert_dir = av1_resize_vert_dir_sse2;
if (flags & HAS_AVX2) av1_resize_vert_dir = av1_resize_vert_dir_avx2;
av1_round_shift_array = av1_round_shift_array_c;
if (flags & HAS_SSE4_1) av1_round_shift_array = av1_round_shift_array_sse4_1;
av1_selfguided_restoration = av1_selfguided_restoration_c;
@ -1240,8 +1250,6 @@ static void setup_rtcd_internal(void)
cfl_get_subtract_average_fn = cfl_get_subtract_average_fn_c;
if (flags & HAS_SSE2) cfl_get_subtract_average_fn = cfl_get_subtract_average_fn_sse2;
if (flags & HAS_AVX2) cfl_get_subtract_average_fn = cfl_get_subtract_average_fn_avx2;
resize_vert_dir = resize_vert_dir_c;
if (flags & HAS_AVX2) resize_vert_dir = resize_vert_dir_avx2;
}
#endif

View file

@ -701,6 +701,15 @@ void av1_resize_and_extend_frame_c(const YV12_BUFFER_CONFIG *src, YV12_BUFFER_CO
void av1_resize_and_extend_frame_ssse3(const YV12_BUFFER_CONFIG *src, YV12_BUFFER_CONFIG *dst, const InterpFilter filter, const int phase, const int num_planes);
RTCD_EXTERN void (*av1_resize_and_extend_frame)(const YV12_BUFFER_CONFIG *src, YV12_BUFFER_CONFIG *dst, const InterpFilter filter, const int phase, const int num_planes);
void av1_resize_horz_dir_c(const uint8_t *const input, int in_stride, uint8_t *intbuf, int height, int filteredlength, int width2);
void av1_resize_horz_dir_avx2(const uint8_t *const input, int in_stride, uint8_t *intbuf, int height, int filteredlength, int width2);
RTCD_EXTERN void (*av1_resize_horz_dir)(const uint8_t *const input, int in_stride, uint8_t *intbuf, int height, int filteredlength, int width2);
bool av1_resize_vert_dir_c(uint8_t *intbuf, uint8_t *output, int out_stride, int height, int height2, int width2, int start_col);
bool av1_resize_vert_dir_sse2(uint8_t *intbuf, uint8_t *output, int out_stride, int height, int height2, int width2, int start_col);
bool av1_resize_vert_dir_avx2(uint8_t *intbuf, uint8_t *output, int out_stride, int height, int height2, int width2, int start_col);
RTCD_EXTERN bool (*av1_resize_vert_dir)(uint8_t *intbuf, uint8_t *output, int out_stride, int height, int height2, int width2, int start_col);
void av1_round_shift_array_c(int32_t *arr, int size, int bit);
void av1_round_shift_array_sse4_1(int32_t *arr, int size, int bit);
RTCD_EXTERN void (*av1_round_shift_array)(int32_t *arr, int size, int bit);
@ -857,10 +866,6 @@ cfl_subtract_average_fn cfl_get_subtract_average_fn_sse2(TX_SIZE tx_size);
cfl_subtract_average_fn cfl_get_subtract_average_fn_avx2(TX_SIZE tx_size);
RTCD_EXTERN cfl_subtract_average_fn (*cfl_get_subtract_average_fn)(TX_SIZE tx_size);
bool resize_vert_dir_c(uint8_t *intbuf, uint8_t *output, int out_stride, int height, int height2, int width2, int start_col);
bool resize_vert_dir_avx2(uint8_t *intbuf, uint8_t *output, int out_stride, int height, int height2, int width2, int start_col);
RTCD_EXTERN bool (*resize_vert_dir)(uint8_t *intbuf, uint8_t *output, int out_stride, int height, int height2, int width2, int start_col);
void av1_rtcd(void);
#ifdef RTCD_C
@ -1090,6 +1095,10 @@ static void setup_rtcd_internal(void)
if (flags & HAS_AVX2) av1_quantize_lp = av1_quantize_lp_avx2;
av1_resize_and_extend_frame = av1_resize_and_extend_frame_c;
if (flags & HAS_SSSE3) av1_resize_and_extend_frame = av1_resize_and_extend_frame_ssse3;
av1_resize_horz_dir = av1_resize_horz_dir_c;
if (flags & HAS_AVX2) av1_resize_horz_dir = av1_resize_horz_dir_avx2;
av1_resize_vert_dir = av1_resize_vert_dir_sse2;
if (flags & HAS_AVX2) av1_resize_vert_dir = av1_resize_vert_dir_avx2;
av1_round_shift_array = av1_round_shift_array_c;
if (flags & HAS_SSE4_1) av1_round_shift_array = av1_round_shift_array_sse4_1;
av1_selfguided_restoration = av1_selfguided_restoration_c;
@ -1173,8 +1182,6 @@ static void setup_rtcd_internal(void)
if (flags & HAS_AVX2) cfl_get_predict_lbd_fn = cfl_get_predict_lbd_fn_avx2;
cfl_get_subtract_average_fn = cfl_get_subtract_average_fn_sse2;
if (flags & HAS_AVX2) cfl_get_subtract_average_fn = cfl_get_subtract_average_fn_avx2;
resize_vert_dir = resize_vert_dir_c;
if (flags & HAS_AVX2) resize_vert_dir = resize_vert_dir_avx2;
}
#endif

View file

@ -20,11 +20,11 @@ origin:
# Human-readable identifier for this version/release
# Generally "version NNN", "tag SSS", "bookmark SSS"
release: 23c94347d84241c322f3b40daf120047ff4f8d56 (Wed Apr 17 11:05:14 2024 +0000).
release: a7ef80c44bfb34b08254194b1ab72d4e93ff4b07 (Wed May 29 23:21:38 2024 +0000).
# Revision to pull in
# Must be a long or short commit SHA (long preferred)
revision: 23c94347d84241c322f3b40daf120047ff4f8d56
revision: a7ef80c44bfb34b08254194b1ab72d4e93ff4b07
# The package's license, where possible using the mnemonic from
# https://spdx.org/licenses/

View file

@ -117,6 +117,7 @@ files = {
'../../third_party/aom/av1/av1_cx_iface.c',
'../../third_party/aom/av1/av1_dx_iface.c',
'../../third_party/aom/av1/common/alloccommon.c',
'../../third_party/aom/av1/common/arm/av1_convolve_scale_neon.c',
'../../third_party/aom/av1/common/arm/av1_inv_txfm_neon.c',
'../../third_party/aom/av1/common/arm/av1_txfm_neon.c',
'../../third_party/aom/av1/common/arm/blend_a64_hmask_neon.c',
@ -184,24 +185,24 @@ files = {
'../../third_party/aom/av1/encoder/aq_complexity.c',
'../../third_party/aom/av1/encoder/aq_cyclicrefresh.c',
'../../third_party/aom/av1/encoder/aq_variance.c',
'../../third_party/aom/av1/encoder/arm/neon/av1_error_neon.c',
'../../third_party/aom/av1/encoder/arm/neon/av1_fwd_txfm2d_neon.c',
'../../third_party/aom/av1/encoder/arm/neon/av1_highbd_quantize_neon.c',
'../../third_party/aom/av1/encoder/arm/neon/av1_k_means_neon.c',
'../../third_party/aom/av1/encoder/arm/neon/cnn_neon.c',
'../../third_party/aom/av1/encoder/arm/neon/encodetxb_neon.c',
'../../third_party/aom/av1/encoder/arm/neon/highbd_fwd_txfm_neon.c',
'../../third_party/aom/av1/encoder/arm/neon/highbd_pickrst_neon.c',
'../../third_party/aom/av1/encoder/arm/neon/highbd_rdopt_neon.c',
'../../third_party/aom/av1/encoder/arm/neon/highbd_temporal_filter_neon.c',
'../../third_party/aom/av1/encoder/arm/neon/hybrid_fwd_txfm_neon.c',
'../../third_party/aom/av1/encoder/arm/neon/ml_neon.c',
'../../third_party/aom/av1/encoder/arm/neon/pickrst_neon.c',
'../../third_party/aom/av1/encoder/arm/neon/quantize_neon.c',
'../../third_party/aom/av1/encoder/arm/neon/rdopt_neon.c',
'../../third_party/aom/av1/encoder/arm/neon/reconinter_enc_neon.c',
'../../third_party/aom/av1/encoder/arm/neon/temporal_filter_neon.c',
'../../third_party/aom/av1/encoder/arm/neon/wedge_utils_neon.c',
'../../third_party/aom/av1/encoder/arm/av1_error_neon.c',
'../../third_party/aom/av1/encoder/arm/av1_fwd_txfm2d_neon.c',
'../../third_party/aom/av1/encoder/arm/av1_highbd_quantize_neon.c',
'../../third_party/aom/av1/encoder/arm/av1_k_means_neon.c',
'../../third_party/aom/av1/encoder/arm/cnn_neon.c',
'../../third_party/aom/av1/encoder/arm/encodetxb_neon.c',
'../../third_party/aom/av1/encoder/arm/highbd_fwd_txfm_neon.c',
'../../third_party/aom/av1/encoder/arm/highbd_pickrst_neon.c',
'../../third_party/aom/av1/encoder/arm/highbd_rdopt_neon.c',
'../../third_party/aom/av1/encoder/arm/highbd_temporal_filter_neon.c',
'../../third_party/aom/av1/encoder/arm/hybrid_fwd_txfm_neon.c',
'../../third_party/aom/av1/encoder/arm/ml_neon.c',
'../../third_party/aom/av1/encoder/arm/pickrst_neon.c',
'../../third_party/aom/av1/encoder/arm/quantize_neon.c',
'../../third_party/aom/av1/encoder/arm/rdopt_neon.c',
'../../third_party/aom/av1/encoder/arm/reconinter_enc_neon.c',
'../../third_party/aom/av1/encoder/arm/temporal_filter_neon.c',
'../../third_party/aom/av1/encoder/arm/wedge_utils_neon.c',
'../../third_party/aom/av1/encoder/av1_fwd_txfm1d.c',
'../../third_party/aom/av1/encoder/av1_fwd_txfm2d.c',
'../../third_party/aom/av1/encoder/av1_noise_estimate.c',
@ -394,6 +395,7 @@ files = {
'../../third_party/aom/av1/av1_cx_iface.c',
'../../third_party/aom/av1/av1_dx_iface.c',
'../../third_party/aom/av1/common/alloccommon.c',
'../../third_party/aom/av1/common/arm/av1_convolve_scale_neon.c',
'../../third_party/aom/av1/common/arm/av1_inv_txfm_neon.c',
'../../third_party/aom/av1/common/arm/av1_txfm_neon.c',
'../../third_party/aom/av1/common/arm/blend_a64_hmask_neon.c',
@ -466,26 +468,26 @@ files = {
'../../third_party/aom/av1/encoder/aq_complexity.c',
'../../third_party/aom/av1/encoder/aq_cyclicrefresh.c',
'../../third_party/aom/av1/encoder/aq_variance.c',
'../../third_party/aom/av1/encoder/arm/crc32/hash_arm_crc32.c',
'../../third_party/aom/av1/encoder/arm/neon/av1_error_neon.c',
'../../third_party/aom/av1/encoder/arm/neon/av1_fwd_txfm2d_neon.c',
'../../third_party/aom/av1/encoder/arm/neon/av1_highbd_quantize_neon.c',
'../../third_party/aom/av1/encoder/arm/neon/av1_k_means_neon.c',
'../../third_party/aom/av1/encoder/arm/neon/cnn_neon.c',
'../../third_party/aom/av1/encoder/arm/neon/encodetxb_neon.c',
'../../third_party/aom/av1/encoder/arm/neon/highbd_fwd_txfm_neon.c',
'../../third_party/aom/av1/encoder/arm/neon/highbd_pickrst_neon.c',
'../../third_party/aom/av1/encoder/arm/neon/highbd_rdopt_neon.c',
'../../third_party/aom/av1/encoder/arm/neon/highbd_temporal_filter_neon.c',
'../../third_party/aom/av1/encoder/arm/neon/hybrid_fwd_txfm_neon.c',
'../../third_party/aom/av1/encoder/arm/neon/ml_neon.c',
'../../third_party/aom/av1/encoder/arm/neon/pickrst_neon.c',
'../../third_party/aom/av1/encoder/arm/neon/quantize_neon.c',
'../../third_party/aom/av1/encoder/arm/neon/rdopt_neon.c',
'../../third_party/aom/av1/encoder/arm/neon/reconinter_enc_neon.c',
'../../third_party/aom/av1/encoder/arm/neon/temporal_filter_neon.c',
'../../third_party/aom/av1/encoder/arm/neon/temporal_filter_neon_dotprod.c',
'../../third_party/aom/av1/encoder/arm/neon/wedge_utils_neon.c',
'../../third_party/aom/av1/encoder/arm/av1_error_neon.c',
'../../third_party/aom/av1/encoder/arm/av1_fwd_txfm2d_neon.c',
'../../third_party/aom/av1/encoder/arm/av1_highbd_quantize_neon.c',
'../../third_party/aom/av1/encoder/arm/av1_k_means_neon.c',
'../../third_party/aom/av1/encoder/arm/cnn_neon.c',
'../../third_party/aom/av1/encoder/arm/encodetxb_neon.c',
'../../third_party/aom/av1/encoder/arm/hash_arm_crc32.c',
'../../third_party/aom/av1/encoder/arm/highbd_fwd_txfm_neon.c',
'../../third_party/aom/av1/encoder/arm/highbd_pickrst_neon.c',
'../../third_party/aom/av1/encoder/arm/highbd_rdopt_neon.c',
'../../third_party/aom/av1/encoder/arm/highbd_temporal_filter_neon.c',
'../../third_party/aom/av1/encoder/arm/hybrid_fwd_txfm_neon.c',
'../../third_party/aom/av1/encoder/arm/ml_neon.c',
'../../third_party/aom/av1/encoder/arm/pickrst_neon.c',
'../../third_party/aom/av1/encoder/arm/quantize_neon.c',
'../../third_party/aom/av1/encoder/arm/rdopt_neon.c',
'../../third_party/aom/av1/encoder/arm/reconinter_enc_neon.c',
'../../third_party/aom/av1/encoder/arm/temporal_filter_neon.c',
'../../third_party/aom/av1/encoder/arm/temporal_filter_neon_dotprod.c',
'../../third_party/aom/av1/encoder/arm/wedge_utils_neon.c',
'../../third_party/aom/av1/encoder/av1_fwd_txfm1d.c',
'../../third_party/aom/av1/encoder/av1_fwd_txfm2d.c',
'../../third_party/aom/av1/encoder/av1_noise_estimate.c',
@ -811,7 +813,6 @@ files = {
'../../third_party/aom/aom_dsp/variance.c',
'../../third_party/aom/aom_dsp/x86/adaptive_quantize_avx2.c',
'../../third_party/aom/aom_dsp/x86/adaptive_quantize_sse2.c',
'../../third_party/aom/aom_dsp/x86/aom_asm_stubs.c',
'../../third_party/aom/aom_dsp/x86/aom_convolve_copy_avx2.c',
'../../third_party/aom/aom_dsp/x86/aom_convolve_copy_sse2.c',
'../../third_party/aom/aom_dsp/x86/aom_high_subpixel_8t_sse2.asm',
@ -969,6 +970,7 @@ files = {
'../../third_party/aom/av1/common/x86/reconinter_sse4.c',
'../../third_party/aom/av1/common/x86/reconinter_ssse3.c',
'../../third_party/aom/av1/common/x86/resize_avx2.c',
'../../third_party/aom/av1/common/x86/resize_sse2.c',
'../../third_party/aom/av1/common/x86/resize_ssse3.c',
'../../third_party/aom/av1/common/x86/selfguided_avx2.c',
'../../third_party/aom/av1/common/x86/selfguided_sse4.c',
@ -1162,7 +1164,6 @@ files = {
'../../third_party/aom/aom_dsp/variance.c',
'../../third_party/aom/aom_dsp/x86/adaptive_quantize_avx2.c',
'../../third_party/aom/aom_dsp/x86/adaptive_quantize_sse2.c',
'../../third_party/aom/aom_dsp/x86/aom_asm_stubs.c',
'../../third_party/aom/aom_dsp/x86/aom_convolve_copy_avx2.c',
'../../third_party/aom/aom_dsp/x86/aom_convolve_copy_sse2.c',
'../../third_party/aom/aom_dsp/x86/aom_high_subpixel_8t_sse2.asm',
@ -1322,6 +1323,7 @@ files = {
'../../third_party/aom/av1/common/x86/reconinter_sse4.c',
'../../third_party/aom/av1/common/x86/reconinter_ssse3.c',
'../../third_party/aom/av1/common/x86/resize_avx2.c',
'../../third_party/aom/av1/common/x86/resize_sse2.c',
'../../third_party/aom/av1/common/x86/resize_ssse3.c',
'../../third_party/aom/av1/common/x86/selfguided_avx2.c',
'../../third_party/aom/av1/common/x86/selfguided_sse4.c',

1
third_party/aom/.mailmap vendored
View file

@ -40,6 +40,7 @@ Iole Moccagatta <iole.moccagatta@gmail.com>
Jacky Chen <jackychen@google.com>
James Zern <jzern@google.com> <jzern@google.cOm>
Jean-Marc Valin <jmvalin@jmvalin.ca> <jmvalin@mozilla.com>
Jian Zhou <zhoujian@fb.com> <zhoujian@google.com>
Jim Bankoski <jimbankoski@google.com>
Johann Koenig <johannkoenig@google.com>
Johann Koenig <johannkoenig@google.com> <johann.koenig@duck.com>

5
third_party/aom/AUTHORS vendored
View file

@ -51,6 +51,7 @@ Cyril Concolato <cconcolato@netflix.com>
Dake He <dkhe@google.com>
Damon Shen <yjshen@google.com>
Dandan Ding <vickyddding@gmail.com>
Daniel Cheng <dcheng@chromium.org>
Daniele Castagna <dcastagna@chromium.org>
Daniel Kang <ddkang@google.com>
Daniel Max Valenzuela <daniel.vt@samsung.com>
@ -94,6 +95,7 @@ Guillermo Ballester Valor <gbvalor@gmail.com>
Hamsalekha S <hamsalekha.s@ittiam.com>
Hangyu Kuang <hkuang@google.com>
Hanno Böck <hanno@hboeck.de>
Hari Limaye <hari.limaye@arm.com>
Harish Mahendrakar <harish.mahendrakar@ittiam.com>
Henrik Lundin <hlundin@google.com>
Hien Ho <hienho@google.com>
@ -124,7 +126,7 @@ Jeff Muizelaar <jmuizelaar@mozilla.com>
Jeff Petkau <jpet@chromium.org>
Jerome Jiang <jianj@google.com>
Jia Jia <jia.jia@linaro.org>
Jian Zhou <zhoujian@google.com>
Jian Zhou <zhoujian@fb.com>
Jim Bankoski <jimbankoski@google.com>
Jingning Han <jingning@google.com>
Joe Young <joeyoung@google.com>
@ -216,6 +218,7 @@ Peter Boström <pbos@google.com>
Peter de Rivaz <peter.derivaz@gmail.com>
Peter Kasting <pkasting@chromium.org>
Philip Jägenstedt <philipj@opera.com>
Philippe Antoine <p.antoine@catenacyber.fr>
Priit Laes <plaes@plaes.org>
Qiu Jianlin <jianlin.qiu@intel.com>
Rachel Barker <rachelbarker@google.com>

88
third_party/aom/CHANGELOG vendored
View file

@ -1,3 +1,91 @@
2024-04-09 v3.9.0
This release includes new codec interfaces, compression efficiency and
perceptual improvements, speedup for RTC for both video and screen content,
and many bug fixes. This release is ABI compatible with the previous release.
- New Features
* New codec control
* AV1E_SET_SVC_FRAME_DROP_MODE is added to configure the SVC encoder to
only drop spatial layers or the whole superframe.
* Active Map is fixed and tested for RTC.
* CONFIG_QUANT_MATRIX is added to disable quantization matrices when aom
decoder is disabled with CONFIG_AV1_DECODER. Reduces ~10% binary size when
both are disabled.
* libwebm is updated to libwebm-1.0.0.31-1-gaffd7f4.
- Compression Efficiency Improvements
* RTC encoding improvements
* 1-2% BD-rate gain for screen content with temporal layers; 5% BD-rate
gain on scrolling content.
- Perceptual Quality Improvements
* For RTC screen content
* Reduced color artifacts for RTC screen content
* Visual quality improved for scene changes for SVC with quality layers.
* Removed visual artifacts for speed 11
- Speedups:
* RTC Speed 11: aggressive speedup setting added for video mode,
resolutions <= VGA: ~30% faster than speed 10.
* 5-9% speed up for high bit-depth encoding with good mode on Arm, half of
which comes from SVE/SVE2 optimizations.
- Other improvements
* Further improvements to global motion estimation.
* Documented minimum required SIMD support: SSE4.1 on x86, Neon on Arm.
* Remove unneeded SIMD functions, saving >100 KiB from binary size.
* Cleaned up and improved pattern_search.
* Added end-to-end c vs SIMD bit-exactness test.
* Added config flag to calc psnr using libvmaf peak: use a slightly
different peak value for PSNR (1020 and 2040 for 10- and 12-bit)
- Bug Fixes
* Fuzzing bug fixes
* b/329485898 Null-dereference WRITE in av1_cdef_frame_mt
* b/329810149 Null-dereference WRITE in av1_cdef_copy_sb8_16
* b/329813868 Ill in av1_cdef_frame_mt
* chromium:327882824 Null-dereference WRITE in av1_cdef_init_fb_row
* b/330014723 Null-dereference WRITE in
cdef_copy_rect8_16bit_to_16bit_avx2
* b/310455204 Null-dereference WRITE in prepare_enc_workers
* b/314858909 Heap-buffer-overflow in aom_variance64x64_avx2
* oss-fuzz:67132 av1_dec_fuzzer: ASSERT: (pbi->tile_count_minus_1 + 1) <=
(pbi->output_frame_width_in_tiles_minus_1 + 1)
* oss-fuzz:67058 av1_dec_fuzzer: ASSERT: i == 0 || tile_w == *w
* oss-fuzz:67161 av1_dec_fuzzer: ASSERT: i == 0 || tile_h == *h
* oss-fuzz:67059 av1_dec_fuzzer: Crash in mem_get_varsize
* oss-fuzz:67162 av1_dec_fuzzer: Use-of-uninitialized-value in
od_ec_decode_bool_q15
* oss-fuzz:67184 av1_dec_fuzzer: Heap-buffer-overflow in od_ec_dec_init
* oss-fuzz:67216 av1_dec_fuzzer: Heap-buffer-overflow in
od_ec_dec_normalize
* oss-fuzz:67055 av1_dec_fuzzer: Heap-buffer-overflow in
get_ls_tile_buffers
* libaom library
* aomedia:3510 Large value of duration could cause encoder overflow
* chromium:328105513 Fix build conflicts between Abseil and libaom/libvpx
in Win ARM64 builds
* aomedia:3544 AV1/SharpnessTestLarge.SharpnessPSNRTest failures after
59c592bb8
* aomedia:3531 Exception encountered with PSNR calculation
* aomedia:3541 Can not compile correctly by CYGWIN
* chromium:41482688 heap-buffer-overflow write in vpx_img_read()
(tools_common.c) with VPX_IMG_FMT_NV12
* aomedia:3521 Assertion failures on Arm in CNNTest.* in
av1_cnn_convolve_no_maxpool_padding_valid_2x2_neon and
av1_cnn_convolve_no_maxpool_padding_valid_5x5_neon
* aomedia:3486 C vs NEON mismatch in AV1 encoder
* aomedia:3536 Over write in highbd_dr_prediction_z3_upsample1_neon()
* aomedia:3276 Significant progress on ensuring all allocations are
checked
* aomedia:3491 heap-buffer-overflow encoding frames of size 256x256,
512x512 in good quality usage mode using 4 threads
* aomedia:3322 PSNR number discrepancy
* aomedia:3493 Cmake generates garbage symbols for libaom_srcs.gni
* aomedia:3478 GCC 12.2.0 emits a -Wstringop-overflow warning on
aom/av1/encoder/motion_search_facade.c
* aomedia:3484 C vs NEON mismatch in AV1 encoder for high-bitdepth case
2024-03-08 v3.8.2
This release includes several bug fixes. This release is ABI
compatible with the last release. See

6
third_party/aom/CMakeLists.txt vendored
View file

@ -58,9 +58,9 @@ endif()
# passed to libtool.
#
# We set SO_FILE_VERSION = [c-a].a.r
set(LT_CURRENT 11)
set(LT_REVISION 2)
set(LT_AGE 8)
set(LT_CURRENT 12)
set(LT_REVISION 0)
set(LT_AGE 9)
math(EXPR SO_VERSION "${LT_CURRENT} - ${LT_AGE}")
set(SO_FILE_VERSION "${SO_VERSION}.${LT_AGE}.${LT_REVISION}")
unset(LT_CURRENT)
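
With the bumped values, SO_VERSION evaluates to 12 - 9 = 3, so SO_FILE_VERSION for this release works out to 3.9.0 (the previous 11/2/8 values gave 3.8.2), matching the v3.9.0 entry added to the CHANGELOG above.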

1
third_party/aom/aom/aom_encoder.h vendored
View file

@ -637,6 +637,7 @@ typedef struct aom_codec_enc_cfg {
/*!\brief Target data rate
*
* Target bitrate to use for this stream, in kilobits per second.
* Max allowed value is 2000000
*/
unsigned int rc_target_bitrate;
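
For context, rc_target_bitrate is filled in on the encoder configuration before the codec is initialized. A minimal sketch using the standard libaom encoder API (the dimensions and bitrate are illustrative):

```
#include <aom/aom_encoder.h>
#include <aom/aomcx.h>

/* Sketch: request a target bitrate (in kilobits per second) for an AV1 encode. */
static int init_av1_encoder(aom_codec_ctx_t *ctx, unsigned int w, unsigned int h) {
  aom_codec_enc_cfg_t cfg;
  if (aom_codec_enc_config_default(aom_codec_av1_cx(), &cfg, AOM_USAGE_GOOD_QUALITY))
    return -1;
  cfg.g_w = w;
  cfg.g_h = h;
  cfg.rc_target_bitrate = 1000;  /* 1000 kbps; must stay within the documented maximum */
  return aom_codec_enc_init(ctx, aom_codec_av1_cx(), &cfg, 0) ? -1 : 0;
}
```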

4
third_party/aom/aom/src/aom_image.c vendored
View file

@ -182,7 +182,9 @@ static aom_image_t *img_alloc_helper(
/* Default viewport to entire image. (This aom_img_set_rect call always
* succeeds.) */
aom_img_set_rect(img, 0, 0, d_w, d_h, border);
int ret = aom_img_set_rect(img, 0, 0, d_w, d_h, border);
assert(ret == 0);
(void)ret;
return img;
fail:
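
The same return-value check matters for external callers; a minimal caller-side sketch against the public aom_image.h API (dimensions are illustrative):

```
#include <aom/aom_image.h>
#include <stdio.h>

int main(void) {
  aom_image_t img;
  if (!aom_img_alloc(&img, AOM_IMG_FMT_I420, 640, 480, 16)) return 1;
  /* Restrict the viewport to a 320x240 window at (8, 8). A nonzero return
   * means the rectangle does not fit inside the allocated image. */
  if (aom_img_set_rect(&img, 8, 8, 320, 240, 0)) {
    fprintf(stderr, "aom_img_set_rect failed\n");
  }
  aom_img_free(&img);
  return 0;
}
```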

1
third_party/aom/aom_dsp/aom_dsp.cmake vendored
View file

@ -58,7 +58,6 @@ list(APPEND AOM_DSP_COMMON_ASM_SSE2
list(APPEND AOM_DSP_COMMON_INTRIN_SSE2
"${AOM_ROOT}/aom_dsp/x86/aom_convolve_copy_sse2.c"
"${AOM_ROOT}/aom_dsp/x86/aom_asm_stubs.c"
"${AOM_ROOT}/aom_dsp/x86/convolve.h"
"${AOM_ROOT}/aom_dsp/x86/convolve_sse2.h"
"${AOM_ROOT}/aom_dsp/x86/fft_sse2.c"

View file

@ -20,6 +20,7 @@
#include "aom/aom_integer.h"
#include "aom_dsp/aom_dsp_common.h"
#include "aom_dsp/aom_filter.h"
#include "aom_dsp/arm/aom_convolve8_neon.h"
#include "aom_dsp/arm/aom_filter.h"
#include "aom_dsp/arm/mem_neon.h"
#include "aom_dsp/arm/transpose_neon.h"
@ -231,29 +232,6 @@ static INLINE void convolve8_horiz_8tap_neon(const uint8_t *src,
}
}
static INLINE int16x4_t convolve4_4(const int16x4_t s0, const int16x4_t s1,
const int16x4_t s2, const int16x4_t s3,
const int16x4_t filter) {
int16x4_t sum = vmul_lane_s16(s0, filter, 0);
sum = vmla_lane_s16(sum, s1, filter, 1);
sum = vmla_lane_s16(sum, s2, filter, 2);
sum = vmla_lane_s16(sum, s3, filter, 3);
return sum;
}
static INLINE uint8x8_t convolve4_8(const int16x8_t s0, const int16x8_t s1,
const int16x8_t s2, const int16x8_t s3,
const int16x4_t filter) {
int16x8_t sum = vmulq_lane_s16(s0, filter, 0);
sum = vmlaq_lane_s16(sum, s1, filter, 1);
sum = vmlaq_lane_s16(sum, s2, filter, 2);
sum = vmlaq_lane_s16(sum, s3, filter, 3);
// We halved the filter values so -1 from right shift.
return vqrshrun_n_s16(sum, FILTER_BITS - 1);
}
static INLINE void convolve8_horiz_4tap_neon(const uint8_t *src,
ptrdiff_t src_stride, uint8_t *dst,
ptrdiff_t dst_stride,
@ -265,26 +243,20 @@ static INLINE void convolve8_horiz_4tap_neon(const uint8_t *src,
if (w == 4) {
do {
int16x8_t t0 =
vreinterpretq_s16_u16(vmovl_u8(vld1_u8(src + 0 * src_stride)));
int16x8_t t1 =
vreinterpretq_s16_u16(vmovl_u8(vld1_u8(src + 1 * src_stride)));
uint8x8_t t01[4];
int16x4_t s0[4], s1[4];
s0[0] = vget_low_s16(t0);
s0[1] = vget_low_s16(vextq_s16(t0, t0, 1));
s0[2] = vget_low_s16(vextq_s16(t0, t0, 2));
s0[3] = vget_low_s16(vextq_s16(t0, t0, 3));
t01[0] = load_unaligned_u8(src + 0, (int)src_stride);
t01[1] = load_unaligned_u8(src + 1, (int)src_stride);
t01[2] = load_unaligned_u8(src + 2, (int)src_stride);
t01[3] = load_unaligned_u8(src + 3, (int)src_stride);
s1[0] = vget_low_s16(t1);
s1[1] = vget_low_s16(vextq_s16(t1, t1, 1));
s1[2] = vget_low_s16(vextq_s16(t1, t1, 2));
s1[3] = vget_low_s16(vextq_s16(t1, t1, 3));
int16x8_t s01[4];
s01[0] = vreinterpretq_s16_u16(vmovl_u8(t01[0]));
s01[1] = vreinterpretq_s16_u16(vmovl_u8(t01[1]));
s01[2] = vreinterpretq_s16_u16(vmovl_u8(t01[2]));
s01[3] = vreinterpretq_s16_u16(vmovl_u8(t01[3]));
int16x4_t d0 = convolve4_4(s0[0], s0[1], s0[2], s0[3], filter);
int16x4_t d1 = convolve4_4(s1[0], s1[1], s1[2], s1[3], filter);
// We halved the filter values so -1 from right shift.
uint8x8_t d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS - 1);
uint8x8_t d01 = convolve4_8(s01[0], s01[1], s01[2], s01[3], filter);
store_u8x4_strided_x2(dst + 0 * dst_stride, dst_stride, d01);
@ -298,37 +270,27 @@ static INLINE void convolve8_horiz_4tap_neon(const uint8_t *src,
const uint8_t *s = src;
uint8_t *d = dst;
int16x8_t t0 =
vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s + 0 * src_stride)));
int16x8_t t1 =
vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s + 1 * src_stride)));
s += 8;
do {
int16x8_t t2 =
vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s + 0 * src_stride)));
int16x8_t t3 =
vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s + 1 * src_stride)));
uint8x8_t t0[4], t1[4];
load_u8_8x4(s + 0 * src_stride, 1, &t0[0], &t0[1], &t0[2], &t0[3]);
load_u8_8x4(s + 1 * src_stride, 1, &t1[0], &t1[1], &t1[2], &t1[3]);
int16x8_t s0[4], s1[4];
s0[0] = t0;
s0[1] = vextq_s16(t0, t2, 1);
s0[2] = vextq_s16(t0, t2, 2);
s0[3] = vextq_s16(t0, t2, 3);
s0[0] = vreinterpretq_s16_u16(vmovl_u8(t0[0]));
s0[1] = vreinterpretq_s16_u16(vmovl_u8(t0[1]));
s0[2] = vreinterpretq_s16_u16(vmovl_u8(t0[2]));
s0[3] = vreinterpretq_s16_u16(vmovl_u8(t0[3]));
s1[0] = t1;
s1[1] = vextq_s16(t1, t3, 1);
s1[2] = vextq_s16(t1, t3, 2);
s1[3] = vextq_s16(t1, t3, 3);
s1[0] = vreinterpretq_s16_u16(vmovl_u8(t1[0]));
s1[1] = vreinterpretq_s16_u16(vmovl_u8(t1[1]));
s1[2] = vreinterpretq_s16_u16(vmovl_u8(t1[2]));
s1[3] = vreinterpretq_s16_u16(vmovl_u8(t1[3]));
uint8x8_t d0 = convolve4_8(s0[0], s0[1], s0[2], s0[3], filter);
uint8x8_t d1 = convolve4_8(s1[0], s1[1], s1[2], s1[3], filter);
store_u8_8x2(d, dst_stride, d0, d1);
t0 = t2;
t1 = t3;
s += 8;
d += 8;
width -= 8;
@ -354,7 +316,12 @@ void aom_convolve8_horiz_neon(const uint8_t *src, ptrdiff_t src_stride,
src -= ((SUBPEL_TAPS / 2) - 1);
if (get_filter_taps_convolve8(filter_x) <= 4) {
int filter_taps = get_filter_taps_convolve8(filter_x);
if (filter_taps == 2) {
convolve8_horiz_2tap_neon(src + 3, src_stride, dst, dst_stride, filter_x, w,
h);
} else if (filter_taps == 4) {
convolve8_horiz_4tap_neon(src + 2, src_stride, dst, dst_stride, filter_x, w,
h);
} else {
@ -362,22 +329,13 @@ void aom_convolve8_horiz_neon(const uint8_t *src, ptrdiff_t src_stride,
}
}
void aom_convolve8_vert_neon(const uint8_t *src, ptrdiff_t src_stride,
uint8_t *dst, ptrdiff_t dst_stride,
const int16_t *filter_x, int x_step_q4,
const int16_t *filter_y, int y_step_q4, int w,
int h) {
static INLINE void convolve8_vert_8tap_neon(const uint8_t *src,
ptrdiff_t src_stride, uint8_t *dst,
ptrdiff_t dst_stride,
const int16_t *filter_y, int w,
int h) {
const int16x8_t filter = vld1q_s16(filter_y);
assert((intptr_t)dst % 4 == 0);
assert(dst_stride % 4 == 0);
(void)filter_x;
(void)x_step_q4;
(void)y_step_q4;
src -= ((SUBPEL_TAPS / 2) - 1) * src_stride;
if (w == 4) {
uint8x8_t t0, t1, t2, t3, t4, t5, t6;
load_u8_8x7(src, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6);
@ -472,3 +430,30 @@ void aom_convolve8_vert_neon(const uint8_t *src, ptrdiff_t src_stride,
} while (w != 0);
}
}
void aom_convolve8_vert_neon(const uint8_t *src, ptrdiff_t src_stride,
uint8_t *dst, ptrdiff_t dst_stride,
const int16_t *filter_x, int x_step_q4,
const int16_t *filter_y, int y_step_q4, int w,
int h) {
assert((intptr_t)dst % 4 == 0);
assert(dst_stride % 4 == 0);
(void)filter_x;
(void)x_step_q4;
(void)y_step_q4;
src -= ((SUBPEL_TAPS / 2) - 1) * src_stride;
int filter_taps = get_filter_taps_convolve8(filter_y);
if (filter_taps == 2) {
convolve8_vert_2tap_neon(src + 3 * src_stride, src_stride, dst, dst_stride,
filter_y, w, h);
} else if (filter_taps == 4) {
convolve8_vert_4tap_neon(src + 2 * src_stride, src_stride, dst, dst_stride,
filter_y, w, h);
} else {
convolve8_vert_8tap_neon(src, src_stride, dst, dst_stride, filter_y, w, h);
}
}

285
third_party/aom/aom_dsp/arm/aom_convolve8_neon.h vendored Normal file
View file

@ -0,0 +1,285 @@
/*
* Copyright (c) 2024, Alliance for Open Media. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#ifndef AOM_AOM_DSP_ARM_AOM_CONVOLVE8_NEON_H_
#define AOM_AOM_DSP_ARM_AOM_CONVOLVE8_NEON_H_
#include <arm_neon.h>
#include "config/aom_config.h"
#include "aom_dsp/arm/mem_neon.h"
static INLINE void convolve8_horiz_2tap_neon(const uint8_t *src,
ptrdiff_t src_stride, uint8_t *dst,
ptrdiff_t dst_stride,
const int16_t *filter_x, int w,
int h) {
// Bilinear filter values are all positive.
const uint8x8_t f0 = vdup_n_u8((uint8_t)filter_x[3]);
const uint8x8_t f1 = vdup_n_u8((uint8_t)filter_x[4]);
if (w == 4) {
do {
uint8x8_t s0 =
load_unaligned_u8(src + 0 * src_stride + 0, (int)src_stride);
uint8x8_t s1 =
load_unaligned_u8(src + 0 * src_stride + 1, (int)src_stride);
uint8x8_t s2 =
load_unaligned_u8(src + 2 * src_stride + 0, (int)src_stride);
uint8x8_t s3 =
load_unaligned_u8(src + 2 * src_stride + 1, (int)src_stride);
uint16x8_t sum0 = vmull_u8(s0, f0);
sum0 = vmlal_u8(sum0, s1, f1);
uint16x8_t sum1 = vmull_u8(s2, f0);
sum1 = vmlal_u8(sum1, s3, f1);
uint8x8_t d0 = vqrshrn_n_u16(sum0, FILTER_BITS);
uint8x8_t d1 = vqrshrn_n_u16(sum1, FILTER_BITS);
store_u8x4_strided_x2(dst + 0 * dst_stride, dst_stride, d0);
store_u8x4_strided_x2(dst + 2 * dst_stride, dst_stride, d1);
src += 4 * src_stride;
dst += 4 * dst_stride;
h -= 4;
} while (h > 0);
} else if (w == 8) {
do {
uint8x8_t s0 = vld1_u8(src + 0 * src_stride + 0);
uint8x8_t s1 = vld1_u8(src + 0 * src_stride + 1);
uint8x8_t s2 = vld1_u8(src + 1 * src_stride + 0);
uint8x8_t s3 = vld1_u8(src + 1 * src_stride + 1);
uint16x8_t sum0 = vmull_u8(s0, f0);
sum0 = vmlal_u8(sum0, s1, f1);
uint16x8_t sum1 = vmull_u8(s2, f0);
sum1 = vmlal_u8(sum1, s3, f1);
uint8x8_t d0 = vqrshrn_n_u16(sum0, FILTER_BITS);
uint8x8_t d1 = vqrshrn_n_u16(sum1, FILTER_BITS);
vst1_u8(dst + 0 * dst_stride, d0);
vst1_u8(dst + 1 * dst_stride, d1);
src += 2 * src_stride;
dst += 2 * dst_stride;
h -= 2;
} while (h > 0);
} else {
do {
int width = w;
const uint8_t *s = src;
uint8_t *d = dst;
do {
uint8x16_t s0 = vld1q_u8(s + 0);
uint8x16_t s1 = vld1q_u8(s + 1);
uint16x8_t sum0 = vmull_u8(vget_low_u8(s0), f0);
sum0 = vmlal_u8(sum0, vget_low_u8(s1), f1);
uint16x8_t sum1 = vmull_u8(vget_high_u8(s0), f0);
sum1 = vmlal_u8(sum1, vget_high_u8(s1), f1);
uint8x8_t d0 = vqrshrn_n_u16(sum0, FILTER_BITS);
uint8x8_t d1 = vqrshrn_n_u16(sum1, FILTER_BITS);
vst1q_u8(d, vcombine_u8(d0, d1));
s += 16;
d += 16;
width -= 16;
} while (width != 0);
src += src_stride;
dst += dst_stride;
} while (--h > 0);
}
}
static INLINE uint8x8_t convolve4_8(const int16x8_t s0, const int16x8_t s1,
const int16x8_t s2, const int16x8_t s3,
const int16x4_t filter) {
int16x8_t sum = vmulq_lane_s16(s0, filter, 0);
sum = vmlaq_lane_s16(sum, s1, filter, 1);
sum = vmlaq_lane_s16(sum, s2, filter, 2);
sum = vmlaq_lane_s16(sum, s3, filter, 3);
// We halved the filter values so -1 from right shift.
return vqrshrun_n_s16(sum, FILTER_BITS - 1);
}
static INLINE void convolve8_vert_4tap_neon(const uint8_t *src,
ptrdiff_t src_stride, uint8_t *dst,
ptrdiff_t dst_stride,
const int16_t *filter_y, int w,
int h) {
// All filter values are even, halve to reduce intermediate precision
// requirements.
const int16x4_t filter = vshr_n_s16(vld1_s16(filter_y + 2), 1);
if (w == 4) {
uint8x8_t t01 = load_unaligned_u8(src + 0 * src_stride, (int)src_stride);
uint8x8_t t12 = load_unaligned_u8(src + 1 * src_stride, (int)src_stride);
int16x8_t s01 = vreinterpretq_s16_u16(vmovl_u8(t01));
int16x8_t s12 = vreinterpretq_s16_u16(vmovl_u8(t12));
src += 2 * src_stride;
do {
uint8x8_t t23 = load_unaligned_u8(src + 0 * src_stride, (int)src_stride);
uint8x8_t t34 = load_unaligned_u8(src + 1 * src_stride, (int)src_stride);
uint8x8_t t45 = load_unaligned_u8(src + 2 * src_stride, (int)src_stride);
uint8x8_t t56 = load_unaligned_u8(src + 3 * src_stride, (int)src_stride);
int16x8_t s23 = vreinterpretq_s16_u16(vmovl_u8(t23));
int16x8_t s34 = vreinterpretq_s16_u16(vmovl_u8(t34));
int16x8_t s45 = vreinterpretq_s16_u16(vmovl_u8(t45));
int16x8_t s56 = vreinterpretq_s16_u16(vmovl_u8(t56));
uint8x8_t d01 = convolve4_8(s01, s12, s23, s34, filter);
uint8x8_t d23 = convolve4_8(s23, s34, s45, s56, filter);
store_u8x4_strided_x2(dst + 0 * dst_stride, dst_stride, d01);
store_u8x4_strided_x2(dst + 2 * dst_stride, dst_stride, d23);
s01 = s45;
s12 = s56;
src += 4 * src_stride;
dst += 4 * dst_stride;
h -= 4;
} while (h != 0);
} else {
do {
uint8x8_t t0, t1, t2;
load_u8_8x3(src, src_stride, &t0, &t1, &t2);
int16x8_t s0 = vreinterpretq_s16_u16(vmovl_u8(t0));
int16x8_t s1 = vreinterpretq_s16_u16(vmovl_u8(t1));
int16x8_t s2 = vreinterpretq_s16_u16(vmovl_u8(t2));
int height = h;
const uint8_t *s = src + 3 * src_stride;
uint8_t *d = dst;
do {
uint8x8_t t3;
load_u8_8x4(s, src_stride, &t0, &t1, &t2, &t3);
int16x8_t s3 = vreinterpretq_s16_u16(vmovl_u8(t0));
int16x8_t s4 = vreinterpretq_s16_u16(vmovl_u8(t1));
int16x8_t s5 = vreinterpretq_s16_u16(vmovl_u8(t2));
int16x8_t s6 = vreinterpretq_s16_u16(vmovl_u8(t3));
uint8x8_t d0 = convolve4_8(s0, s1, s2, s3, filter);
uint8x8_t d1 = convolve4_8(s1, s2, s3, s4, filter);
uint8x8_t d2 = convolve4_8(s2, s3, s4, s5, filter);
uint8x8_t d3 = convolve4_8(s3, s4, s5, s6, filter);
store_u8_8x4(d, dst_stride, d0, d1, d2, d3);
s0 = s4;
s1 = s5;
s2 = s6;
s += 4 * src_stride;
d += 4 * dst_stride;
height -= 4;
} while (height != 0);
src += 8;
dst += 8;
w -= 8;
} while (w != 0);
}
}
static INLINE void convolve8_vert_2tap_neon(const uint8_t *src,
ptrdiff_t src_stride, uint8_t *dst,
ptrdiff_t dst_stride,
const int16_t *filter_y, int w,
int h) {
// Bilinear filter values are all positive.
uint8x8_t f0 = vdup_n_u8((uint8_t)filter_y[3]);
uint8x8_t f1 = vdup_n_u8((uint8_t)filter_y[4]);
if (w == 4) {
do {
uint8x8_t s0 = load_unaligned_u8(src + 0 * src_stride, (int)src_stride);
uint8x8_t s1 = load_unaligned_u8(src + 1 * src_stride, (int)src_stride);
uint8x8_t s2 = load_unaligned_u8(src + 2 * src_stride, (int)src_stride);
uint8x8_t s3 = load_unaligned_u8(src + 3 * src_stride, (int)src_stride);
uint16x8_t sum0 = vmull_u8(s0, f0);
sum0 = vmlal_u8(sum0, s1, f1);
uint16x8_t sum1 = vmull_u8(s2, f0);
sum1 = vmlal_u8(sum1, s3, f1);
uint8x8_t d0 = vqrshrn_n_u16(sum0, FILTER_BITS);
uint8x8_t d1 = vqrshrn_n_u16(sum1, FILTER_BITS);
store_u8x4_strided_x2(dst + 0 * dst_stride, dst_stride, d0);
store_u8x4_strided_x2(dst + 2 * dst_stride, dst_stride, d1);
src += 4 * src_stride;
dst += 4 * dst_stride;
h -= 4;
} while (h > 0);
} else if (w == 8) {
do {
uint8x8_t s0, s1, s2;
load_u8_8x3(src, src_stride, &s0, &s1, &s2);
uint16x8_t sum0 = vmull_u8(s0, f0);
sum0 = vmlal_u8(sum0, s1, f1);
uint16x8_t sum1 = vmull_u8(s1, f0);
sum1 = vmlal_u8(sum1, s2, f1);
uint8x8_t d0 = vqrshrn_n_u16(sum0, FILTER_BITS);
uint8x8_t d1 = vqrshrn_n_u16(sum1, FILTER_BITS);
vst1_u8(dst + 0 * dst_stride, d0);
vst1_u8(dst + 1 * dst_stride, d1);
src += 2 * src_stride;
dst += 2 * dst_stride;
h -= 2;
} while (h > 0);
} else {
do {
int width = w;
const uint8_t *s = src;
uint8_t *d = dst;
do {
uint8x16_t s0 = vld1q_u8(s + 0 * src_stride);
uint8x16_t s1 = vld1q_u8(s + 1 * src_stride);
uint16x8_t sum0 = vmull_u8(vget_low_u8(s0), f0);
sum0 = vmlal_u8(sum0, vget_low_u8(s1), f1);
uint16x8_t sum1 = vmull_u8(vget_high_u8(s0), f0);
sum1 = vmlal_u8(sum1, vget_high_u8(s1), f1);
uint8x8_t d0 = vqrshrn_n_u16(sum0, FILTER_BITS);
uint8x8_t d1 = vqrshrn_n_u16(sum1, FILTER_BITS);
vst1q_u8(d, vcombine_u8(d0, d1));
s += 16;
d += 16;
width -= 16;
} while (width != 0);
src += src_stride;
dst += dst_stride;
} while (--h > 0);
}
}
#endif // AOM_AOM_DSP_ARM_AOM_CONVOLVE8_NEON_H_
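
For the 2-tap (bilinear) case only taps 3 and 4 of the 8-tap array are non-zero, so each output pixel is a rounded blend of two neighbouring samples; because the taps sum to 128 the result never exceeds 255 and no clamp is needed. A scalar sketch of what the vmull_u8/vmlal_u8/vqrshrn_n_u16 sequence computes (bilinear_2tap is an illustrative name):

```
#include <stdint.h>

#define FILTER_BITS 7

// out = (filter[3] * s0 + filter[4] * s1 + 64) >> 7, with filter[3] + filter[4] == 128.
static uint8_t bilinear_2tap(uint8_t s0, uint8_t s1, const int16_t filter[8]) {
  uint32_t sum = (uint32_t)filter[3] * s0 + (uint32_t)filter[4] * s1;
  return (uint8_t)((sum + (1 << (FILTER_BITS - 1))) >> FILTER_BITS);
}
```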


@ -20,6 +20,8 @@
#include "aom/aom_integer.h"
#include "aom_dsp/aom_dsp_common.h"
#include "aom_dsp/aom_filter.h"
#include "aom_dsp/arm/aom_convolve8_neon.h"
#include "aom_dsp/arm/aom_filter.h"
#include "aom_dsp/arm/mem_neon.h"
#include "aom_dsp/arm/transpose_neon.h"
#include "aom_ports/mem.h"
@ -93,22 +95,11 @@ static INLINE uint8x8_t convolve8_8_h(const uint8x16_t samples,
return vqrshrun_n_s16(sum, FILTER_BITS);
}
void aom_convolve8_horiz_neon_dotprod(const uint8_t *src, ptrdiff_t src_stride,
uint8_t *dst, ptrdiff_t dst_stride,
const int16_t *filter_x, int x_step_q4,
const int16_t *filter_y, int y_step_q4,
int w, int h) {
static INLINE void convolve8_horiz_8tap_neon_dotprod(
const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
ptrdiff_t dst_stride, const int16_t *filter_x, int w, int h) {
const int8x8_t filter = vmovn_s16(vld1q_s16(filter_x));
assert((intptr_t)dst % 4 == 0);
assert(dst_stride % 4 == 0);
(void)x_step_q4;
(void)filter_y;
(void)y_step_q4;
src -= ((SUBPEL_TAPS / 2) - 1);
if (w == 4) {
const uint8x16x2_t perm_tbl = vld1q_u8_x2(kDotProdPermuteTbl);
do {
@ -158,6 +149,141 @@ void aom_convolve8_horiz_neon_dotprod(const uint8_t *src, ptrdiff_t src_stride,
}
}
static INLINE int16x4_t convolve4_4_h(const uint8x16_t samples,
const int8x8_t filters,
const uint8x16_t permute_tbl) {
// Transform sample range to [-128, 127] for 8-bit signed dot product.
int8x16_t samples_128 =
vreinterpretq_s8_u8(vsubq_u8(samples, vdupq_n_u8(128)));
// Permute samples ready for dot product.
// { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 }
int8x16_t perm_samples = vqtbl1q_s8(samples_128, permute_tbl);
// Accumulate into 128 * FILTER_WEIGHT to account for range transform.
// (Divide by 2 since we halved the filter values.)
int32x4_t acc = vdupq_n_s32(128 * FILTER_WEIGHT / 2);
int32x4_t sum = vdotq_lane_s32(acc, perm_samples, filters, 0);
// Further narrowing and packing is performed by the caller.
return vmovn_s32(sum);
}
static INLINE uint8x8_t convolve4_8_h(const uint8x16_t samples,
const int8x8_t filters,
const uint8x16x2_t permute_tbl) {
// Transform sample range to [-128, 127] for 8-bit signed dot product.
int8x16_t samples_128 =
vreinterpretq_s8_u8(vsubq_u8(samples, vdupq_n_u8(128)));
// Permute samples ready for dot product.
// { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 }
// { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 }
int8x16_t perm_samples[2] = { vqtbl1q_s8(samples_128, permute_tbl.val[0]),
vqtbl1q_s8(samples_128, permute_tbl.val[1]) };
// Accumulate into 128 * FILTER_WEIGHT to account for range transform.
// (Divide by 2 since we halved the filter values.)
int32x4_t acc = vdupq_n_s32(128 * FILTER_WEIGHT / 2);
// First 4 output values.
int32x4_t sum0 = vdotq_lane_s32(acc, perm_samples[0], filters, 0);
// Second 4 output values.
int32x4_t sum1 = vdotq_lane_s32(acc, perm_samples[1], filters, 0);
// Narrow and re-pack.
int16x8_t sum = vcombine_s16(vmovn_s32(sum0), vmovn_s32(sum1));
// We halved the filter values so -1 from right shift.
return vqrshrun_n_s16(sum, FILTER_BITS - 1);
}
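
The SDOT-based helpers above operate on signed 8-bit data, so the unsigned samples are shifted into [-128, 127] and the accumulator is pre-biased to cancel the shift. A scalar sketch of the identity, assuming FILTER_WEIGHT is the sum of the unhalved taps (128), which is why the bias above is 128 * FILTER_WEIGHT / 2 once the taps have been halved (dot4_signed_with_bias is an illustrative name):

```
#include <stdint.h>

// sum(f[k] * s[k]) == sum(f[k] * (s[k] - 128)) + 128 * sum(f[k])
static int32_t dot4_signed_with_bias(const uint8_t s[4], const int8_t f[4],
                                     int tap_sum) {
  int32_t acc = 128 * tap_sum;      // bias: 128 * (sum of the taps actually used)
  for (int k = 0; k < 4; ++k) {
    acc += f[k] * (s[k] - 128);     // samples moved into the signed 8-bit range
  }
  return acc;                       // equals the plain unsigned dot product
}
```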
static INLINE void convolve8_horiz_4tap_neon_dotprod(
const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
ptrdiff_t dst_stride, const int16_t *filter_x, int width, int height) {
const int16x4_t x_filter = vld1_s16(filter_x + 2);
// All 4-tap and bilinear filter values are even, so halve them to reduce
// intermediate precision requirements.
const int8x8_t filter = vshrn_n_s16(vcombine_s16(x_filter, vdup_n_s16(0)), 1);
if (width == 4) {
const uint8x16_t permute_tbl = vld1q_u8(kDotProdPermuteTbl);
do {
uint8x16_t s0, s1, s2, s3;
load_u8_16x4(src, src_stride, &s0, &s1, &s2, &s3);
int16x4_t t0 = convolve4_4_h(s0, filter, permute_tbl);
int16x4_t t1 = convolve4_4_h(s1, filter, permute_tbl);
int16x4_t t2 = convolve4_4_h(s2, filter, permute_tbl);
int16x4_t t3 = convolve4_4_h(s3, filter, permute_tbl);
// We halved the filter values so -1 from right shift.
uint8x8_t d01 = vqrshrun_n_s16(vcombine_s16(t0, t1), FILTER_BITS - 1);
uint8x8_t d23 = vqrshrun_n_s16(vcombine_s16(t2, t3), FILTER_BITS - 1);
store_u8x4_strided_x2(dst + 0 * dst_stride, dst_stride, d01);
store_u8x4_strided_x2(dst + 2 * dst_stride, dst_stride, d23);
src += 4 * src_stride;
dst += 4 * dst_stride;
height -= 4;
} while (height > 0);
} else {
const uint8x16x2_t permute_tbl = vld1q_u8_x2(kDotProdPermuteTbl);
do {
const uint8_t *s = src;
uint8_t *d = dst;
int w = width;
do {
uint8x16_t s0, s1, s2, s3;
load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3);
uint8x8_t d0 = convolve4_8_h(s0, filter, permute_tbl);
uint8x8_t d1 = convolve4_8_h(s1, filter, permute_tbl);
uint8x8_t d2 = convolve4_8_h(s2, filter, permute_tbl);
uint8x8_t d3 = convolve4_8_h(s3, filter, permute_tbl);
store_u8_8x4(d, dst_stride, d0, d1, d2, d3);
s += 8;
d += 8;
w -= 8;
} while (w != 0);
src += 4 * src_stride;
dst += 4 * dst_stride;
height -= 4;
} while (height > 0);
}
}
void aom_convolve8_horiz_neon_dotprod(const uint8_t *src, ptrdiff_t src_stride,
uint8_t *dst, ptrdiff_t dst_stride,
const int16_t *filter_x, int x_step_q4,
const int16_t *filter_y, int y_step_q4,
int w, int h) {
assert((intptr_t)dst % 4 == 0);
assert(dst_stride % 4 == 0);
(void)x_step_q4;
(void)filter_y;
(void)y_step_q4;
src -= ((SUBPEL_TAPS / 2) - 1);
int filter_taps = get_filter_taps_convolve8(filter_x);
if (filter_taps == 2) {
convolve8_horiz_2tap_neon(src + 3, src_stride, dst, dst_stride, filter_x, w,
h);
} else if (filter_taps == 4) {
convolve8_horiz_4tap_neon_dotprod(src + 2, src_stride, dst, dst_stride,
filter_x, w, h);
} else {
convolve8_horiz_8tap_neon_dotprod(src, src_stride, dst, dst_stride,
filter_x, w, h);
}
}
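
The new wrapper dispatches on the effective tap count and advances src so the reduced kernels start at the first non-zero tap (+2 for 4-tap, +3 for 2-tap). get_filter_taps_convolve8() comes from aom_dsp/arm/aom_filter.h and is not shown in this patch; the probe below is only a hedged illustration of how such a count could be derived from the zero outer taps:

```
#include <stdint.h>

// Illustrative only: an 8-tap kernel with zero outer taps degenerates to a
// 4-tap one, and one with only taps 3 and 4 non-zero is bilinear (2-tap).
static int count_effective_taps(const int16_t filter[8]) {
  if (filter[0] | filter[1] | filter[6] | filter[7]) return 8;
  if (filter[2] | filter[5]) return 4;
  return 2;
}
```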
static INLINE void transpose_concat_4x4(int8x8_t a0, int8x8_t a1, int8x8_t a2,
int8x8_t a3, int8x16_t *b) {
// Transpose 8-bit elements and concatenate result rows as follows:
@ -244,24 +370,13 @@ static INLINE uint8x8_t convolve8_8_v(const int8x16_t samples0_lo,
return vqrshrun_n_s16(sum, FILTER_BITS);
}
void aom_convolve8_vert_neon_dotprod(const uint8_t *src, ptrdiff_t src_stride,
uint8_t *dst, ptrdiff_t dst_stride,
const int16_t *filter_x, int x_step_q4,
const int16_t *filter_y, int y_step_q4,
int w, int h) {
static INLINE void convolve8_vert_8tap_neon_dotprod(
const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
ptrdiff_t dst_stride, const int16_t *filter_y, int w, int h) {
const int8x8_t filter = vmovn_s16(vld1q_s16(filter_y));
const uint8x16x3_t merge_block_tbl = vld1q_u8_x3(kDotProdMergeBlockTbl);
int8x16x2_t samples_LUT;
assert((intptr_t)dst % 4 == 0);
assert(dst_stride % 4 == 0);
(void)filter_x;
(void)x_step_q4;
(void)y_step_q4;
src -= ((SUBPEL_TAPS / 2) - 1) * src_stride;
if (w == 4) {
uint8x8_t t0, t1, t2, t3, t4, t5, t6;
load_u8_8x7(src, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6);
@ -410,3 +525,31 @@ void aom_convolve8_vert_neon_dotprod(const uint8_t *src, ptrdiff_t src_stride,
} while (w != 0);
}
}
void aom_convolve8_vert_neon_dotprod(const uint8_t *src, ptrdiff_t src_stride,
uint8_t *dst, ptrdiff_t dst_stride,
const int16_t *filter_x, int x_step_q4,
const int16_t *filter_y, int y_step_q4,
int w, int h) {
assert((intptr_t)dst % 4 == 0);
assert(dst_stride % 4 == 0);
(void)filter_x;
(void)x_step_q4;
(void)y_step_q4;
src -= ((SUBPEL_TAPS / 2) - 1) * src_stride;
int filter_taps = get_filter_taps_convolve8(filter_y);
if (filter_taps == 2) {
convolve8_vert_2tap_neon(src + 3 * src_stride, src_stride, dst, dst_stride,
filter_y, w, h);
} else if (filter_taps == 4) {
convolve8_vert_4tap_neon(src + 2 * src_stride, src_stride, dst, dst_stride,
filter_y, w, h);
} else {
convolve8_vert_8tap_neon_dotprod(src, src_stride, dst, dst_stride, filter_y,
w, h);
}
}


@ -19,6 +19,8 @@
#include "aom/aom_integer.h"
#include "aom_dsp/aom_dsp_common.h"
#include "aom_dsp/aom_filter.h"
#include "aom_dsp/arm/aom_convolve8_neon.h"
#include "aom_dsp/arm/aom_filter.h"
#include "aom_dsp/arm/mem_neon.h"
#include "aom_dsp/arm/transpose_neon.h"
#include "aom_ports/mem.h"
@ -80,22 +82,11 @@ static INLINE uint8x8_t convolve8_8_h(const uint8x16_t samples,
return vqrshrun_n_s16(sum, FILTER_BITS);
}
void aom_convolve8_horiz_neon_i8mm(const uint8_t *src, ptrdiff_t src_stride,
uint8_t *dst, ptrdiff_t dst_stride,
const int16_t *filter_x, int x_step_q4,
const int16_t *filter_y, int y_step_q4,
int w, int h) {
static INLINE void convolve8_horiz_8tap_neon_i8mm(
const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
ptrdiff_t dst_stride, const int16_t *filter_x, int w, int h) {
const int8x8_t filter = vmovn_s16(vld1q_s16(filter_x));
assert((intptr_t)dst % 4 == 0);
assert(dst_stride % 4 == 0);
(void)x_step_q4;
(void)filter_y;
(void)y_step_q4;
src -= ((SUBPEL_TAPS / 2) - 1);
if (w == 4) {
const uint8x16x2_t perm_tbl = vld1q_u8_x2(kDotProdPermuteTbl);
do {
@ -145,6 +136,128 @@ void aom_convolve8_horiz_neon_i8mm(const uint8_t *src, ptrdiff_t src_stride,
}
}
static INLINE int16x4_t convolve4_4_h(const uint8x16_t samples,
const int8x8_t filters,
const uint8x16_t permute_tbl) {
// Permute samples ready for dot product.
// { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 }
uint8x16_t permuted_samples = vqtbl1q_u8(samples, permute_tbl);
int32x4_t sum =
vusdotq_lane_s32(vdupq_n_s32(0), permuted_samples, filters, 0);
// Further narrowing and packing is performed by the caller.
return vmovn_s32(sum);
}
static INLINE uint8x8_t convolve4_8_h(const uint8x16_t samples,
const int8x8_t filters,
const uint8x16x2_t permute_tbl) {
// Permute samples ready for dot product.
// { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 }
// { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 }
uint8x16_t permuted_samples[2] = { vqtbl1q_u8(samples, permute_tbl.val[0]),
vqtbl1q_u8(samples, permute_tbl.val[1]) };
// First 4 output values.
int32x4_t sum0 =
vusdotq_lane_s32(vdupq_n_s32(0), permuted_samples[0], filters, 0);
// Second 4 output values.
int32x4_t sum1 =
vusdotq_lane_s32(vdupq_n_s32(0), permuted_samples[1], filters, 0);
// Narrow and re-pack.
int16x8_t sum = vcombine_s16(vmovn_s32(sum0), vmovn_s32(sum1));
// We halved the filter values so -1 from right shift.
return vqrshrun_n_s16(sum, FILTER_BITS - 1);
}
static INLINE void convolve8_horiz_4tap_neon_i8mm(
const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
ptrdiff_t dst_stride, const int16_t *filter_x, int width, int height) {
const int16x4_t x_filter = vld1_s16(filter_x + 2);
// All 4-tap and bilinear filter values are even, so halve them to reduce
// intermediate precision requirements.
const int8x8_t filter = vshrn_n_s16(vcombine_s16(x_filter, vdup_n_s16(0)), 1);
if (width == 4) {
const uint8x16_t perm_tbl = vld1q_u8(kDotProdPermuteTbl);
do {
uint8x16_t s0, s1, s2, s3;
load_u8_16x4(src, src_stride, &s0, &s1, &s2, &s3);
int16x4_t t0 = convolve4_4_h(s0, filter, perm_tbl);
int16x4_t t1 = convolve4_4_h(s1, filter, perm_tbl);
int16x4_t t2 = convolve4_4_h(s2, filter, perm_tbl);
int16x4_t t3 = convolve4_4_h(s3, filter, perm_tbl);
// We halved the filter values so -1 from right shift.
uint8x8_t d01 = vqrshrun_n_s16(vcombine_s16(t0, t1), FILTER_BITS - 1);
uint8x8_t d23 = vqrshrun_n_s16(vcombine_s16(t2, t3), FILTER_BITS - 1);
store_u8x4_strided_x2(dst + 0 * dst_stride, dst_stride, d01);
store_u8x4_strided_x2(dst + 2 * dst_stride, dst_stride, d23);
src += 4 * src_stride;
dst += 4 * dst_stride;
height -= 4;
} while (height > 0);
} else {
const uint8x16x2_t perm_tbl = vld1q_u8_x2(kDotProdPermuteTbl);
do {
int w = width;
const uint8_t *s = src;
uint8_t *d = dst;
do {
uint8x16_t s0, s1, s2, s3;
load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3);
uint8x8_t d0 = convolve4_8_h(s0, filter, perm_tbl);
uint8x8_t d1 = convolve4_8_h(s1, filter, perm_tbl);
uint8x8_t d2 = convolve4_8_h(s2, filter, perm_tbl);
uint8x8_t d3 = convolve4_8_h(s3, filter, perm_tbl);
store_u8_8x4(d, dst_stride, d0, d1, d2, d3);
s += 8;
d += 8;
w -= 8;
} while (w != 0);
src += 4 * src_stride;
dst += 4 * dst_stride;
height -= 4;
} while (height > 0);
}
}
void aom_convolve8_horiz_neon_i8mm(const uint8_t *src, ptrdiff_t src_stride,
uint8_t *dst, ptrdiff_t dst_stride,
const int16_t *filter_x, int x_step_q4,
const int16_t *filter_y, int y_step_q4,
int w, int h) {
assert((intptr_t)dst % 4 == 0);
assert(dst_stride % 4 == 0);
(void)x_step_q4;
(void)filter_y;
(void)y_step_q4;
src -= ((SUBPEL_TAPS / 2) - 1);
int filter_taps = get_filter_taps_convolve8(filter_x);
if (filter_taps == 2) {
convolve8_horiz_2tap_neon(src + 3, src_stride, dst, dst_stride, filter_x, w,
h);
} else if (filter_taps == 4) {
convolve8_horiz_4tap_neon_i8mm(src + 2, src_stride, dst, dst_stride,
filter_x, w, h);
} else {
convolve8_horiz_8tap_neon_i8mm(src, src_stride, dst, dst_stride, filter_x,
w, h);
}
}
static INLINE void transpose_concat_4x4(uint8x8_t a0, uint8x8_t a1,
uint8x8_t a2, uint8x8_t a3,
uint8x16_t *b) {
@ -227,24 +340,13 @@ static INLINE uint8x8_t convolve8_8_v(const uint8x16_t samples0_lo,
return vqrshrun_n_s16(sum, FILTER_BITS);
}
void aom_convolve8_vert_neon_i8mm(const uint8_t *src, ptrdiff_t src_stride,
uint8_t *dst, ptrdiff_t dst_stride,
const int16_t *filter_x, int x_step_q4,
const int16_t *filter_y, int y_step_q4, int w,
int h) {
static INLINE void convolve8_vert_8tap_neon_i8mm(
const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
ptrdiff_t dst_stride, const int16_t *filter_y, int w, int h) {
const int8x8_t filter = vmovn_s16(vld1q_s16(filter_y));
const uint8x16x3_t merge_block_tbl = vld1q_u8_x3(kDotProdMergeBlockTbl);
uint8x16x2_t samples_LUT;
assert((intptr_t)dst % 4 == 0);
assert(dst_stride % 4 == 0);
(void)filter_x;
(void)x_step_q4;
(void)y_step_q4;
src -= ((SUBPEL_TAPS / 2) - 1) * src_stride;
if (w == 4) {
uint8x8_t s0, s1, s2, s3, s4, s5, s6;
load_u8_8x7(src, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6);
@ -365,3 +467,31 @@ void aom_convolve8_vert_neon_i8mm(const uint8_t *src, ptrdiff_t src_stride,
} while (w != 0);
}
}
void aom_convolve8_vert_neon_i8mm(const uint8_t *src, ptrdiff_t src_stride,
uint8_t *dst, ptrdiff_t dst_stride,
const int16_t *filter_x, int x_step_q4,
const int16_t *filter_y, int y_step_q4, int w,
int h) {
assert((intptr_t)dst % 4 == 0);
assert(dst_stride % 4 == 0);
(void)filter_x;
(void)x_step_q4;
(void)y_step_q4;
src -= ((SUBPEL_TAPS / 2) - 1) * src_stride;
int filter_taps = get_filter_taps_convolve8(filter_y);
if (filter_taps == 2) {
convolve8_vert_2tap_neon(src + 3 * src_stride, src_stride, dst, dst_stride,
filter_y, w, h);
} else if (filter_taps == 4) {
convolve8_vert_4tap_neon(src + 2 * src_stride, src_stride, dst, dst_stride,
filter_y, w, h);
} else {
convolve8_vert_8tap_neon_i8mm(src, src_stride, dst, dst_stride, filter_y, w,
h);
}
}


@ -20,8 +20,9 @@
#include "aom_dsp/arm/mem_neon.h"
#include "aom_dsp/blend.h"
uint8x8_t alpha_blend_a64_d16_u16x8(uint16x8_t m, uint16x8_t a, uint16x8_t b,
uint16x8_t round_offset) {
static uint8x8_t alpha_blend_a64_d16_u16x8(uint16x8_t m, uint16x8_t a,
uint16x8_t b,
uint16x8_t round_offset) {
const uint16x8_t m_inv = vsubq_u16(vdupq_n_u16(AOM_BLEND_A64_MAX_ALPHA), m);
uint32x4_t blend_u32_lo = vmull_u16(vget_low_u16(m), vget_low_u16(a));

third_party/aom/aom_dsp/arm/fwd_txfm_neon.c (vendored, 3 lines changed)

@ -12,6 +12,7 @@
#include <arm_neon.h>
#include "config/aom_config.h"
#include "config/aom_dsp_rtcd.h"
#include "aom_dsp/txfm_common.h"
#include "aom_dsp/arm/mem_neon.h"
@ -115,6 +116,7 @@ void aom_fdct4x4_lp_neon(const int16_t *input, int16_t *final_output,
vst1q_s16(final_output + 1 * 8, out_23);
}
#if CONFIG_INTERNAL_STATS
void aom_fdct8x8_neon(const int16_t *input, int16_t *final_output, int stride) {
// stage 1
int16x8_t input_0 = vshlq_n_s16(vld1q_s16(&input[0 * stride]), 2);
@ -302,3 +304,4 @@ void aom_fdct8x8_neon(const int16_t *input, int16_t *final_output, int stride) {
vst1q_s16(&final_output[7 * 8], input_7);
}
}
#endif // CONFIG_INTERNAL_STATS


@ -19,199 +19,208 @@
#include "aom/aom_integer.h"
#include "aom_dsp/aom_dsp_common.h"
#include "aom_dsp/aom_filter.h"
#include "aom_dsp/arm/aom_filter.h"
#include "aom_dsp/arm/highbd_convolve8_neon.h"
#include "aom_dsp/arm/mem_neon.h"
#include "aom_dsp/arm/transpose_neon.h"
#include "aom_ports/mem.h"
static INLINE int32x4_t highbd_convolve8_4_s32(
const int16x4_t s0, const int16x4_t s1, const int16x4_t s2,
const int16x4_t s3, const int16x4_t s4, const int16x4_t s5,
const int16x4_t s6, const int16x4_t s7, const int16x8_t y_filter) {
const int16x4_t y_filter_lo = vget_low_s16(y_filter);
const int16x4_t y_filter_hi = vget_high_s16(y_filter);
static INLINE uint16x4_t
highbd_convolve8_4(const int16x4_t s0, const int16x4_t s1, const int16x4_t s2,
const int16x4_t s3, const int16x4_t s4, const int16x4_t s5,
const int16x4_t s6, const int16x4_t s7,
const int16x8_t filter, const uint16x4_t max) {
const int16x4_t filter_lo = vget_low_s16(filter);
const int16x4_t filter_hi = vget_high_s16(filter);
int32x4_t sum = vmull_lane_s16(s0, y_filter_lo, 0);
sum = vmlal_lane_s16(sum, s1, y_filter_lo, 1);
sum = vmlal_lane_s16(sum, s2, y_filter_lo, 2);
sum = vmlal_lane_s16(sum, s3, y_filter_lo, 3);
sum = vmlal_lane_s16(sum, s4, y_filter_hi, 0);
sum = vmlal_lane_s16(sum, s5, y_filter_hi, 1);
sum = vmlal_lane_s16(sum, s6, y_filter_hi, 2);
sum = vmlal_lane_s16(sum, s7, y_filter_hi, 3);
int32x4_t sum = vmull_lane_s16(s0, filter_lo, 0);
sum = vmlal_lane_s16(sum, s1, filter_lo, 1);
sum = vmlal_lane_s16(sum, s2, filter_lo, 2);
sum = vmlal_lane_s16(sum, s3, filter_lo, 3);
sum = vmlal_lane_s16(sum, s4, filter_hi, 0);
sum = vmlal_lane_s16(sum, s5, filter_hi, 1);
sum = vmlal_lane_s16(sum, s6, filter_hi, 2);
sum = vmlal_lane_s16(sum, s7, filter_hi, 3);
return sum;
uint16x4_t res = vqrshrun_n_s32(sum, FILTER_BITS);
return vmin_u16(res, max);
}
static INLINE uint16x4_t highbd_convolve8_4_s32_s16(
const int16x4_t s0, const int16x4_t s1, const int16x4_t s2,
const int16x4_t s3, const int16x4_t s4, const int16x4_t s5,
const int16x4_t s6, const int16x4_t s7, const int16x8_t y_filter) {
int32x4_t sum =
highbd_convolve8_4_s32(s0, s1, s2, s3, s4, s5, s6, s7, y_filter);
static INLINE uint16x8_t
highbd_convolve8_8(const int16x8_t s0, const int16x8_t s1, const int16x8_t s2,
const int16x8_t s3, const int16x8_t s4, const int16x8_t s5,
const int16x8_t s6, const int16x8_t s7,
const int16x8_t filter, const uint16x8_t max) {
const int16x4_t filter_lo = vget_low_s16(filter);
const int16x4_t filter_hi = vget_high_s16(filter);
return vqrshrun_n_s32(sum, FILTER_BITS);
int32x4_t sum0 = vmull_lane_s16(vget_low_s16(s0), filter_lo, 0);
sum0 = vmlal_lane_s16(sum0, vget_low_s16(s1), filter_lo, 1);
sum0 = vmlal_lane_s16(sum0, vget_low_s16(s2), filter_lo, 2);
sum0 = vmlal_lane_s16(sum0, vget_low_s16(s3), filter_lo, 3);
sum0 = vmlal_lane_s16(sum0, vget_low_s16(s4), filter_hi, 0);
sum0 = vmlal_lane_s16(sum0, vget_low_s16(s5), filter_hi, 1);
sum0 = vmlal_lane_s16(sum0, vget_low_s16(s6), filter_hi, 2);
sum0 = vmlal_lane_s16(sum0, vget_low_s16(s7), filter_hi, 3);
int32x4_t sum1 = vmull_lane_s16(vget_high_s16(s0), filter_lo, 0);
sum1 = vmlal_lane_s16(sum1, vget_high_s16(s1), filter_lo, 1);
sum1 = vmlal_lane_s16(sum1, vget_high_s16(s2), filter_lo, 2);
sum1 = vmlal_lane_s16(sum1, vget_high_s16(s3), filter_lo, 3);
sum1 = vmlal_lane_s16(sum1, vget_high_s16(s4), filter_hi, 0);
sum1 = vmlal_lane_s16(sum1, vget_high_s16(s5), filter_hi, 1);
sum1 = vmlal_lane_s16(sum1, vget_high_s16(s6), filter_hi, 2);
sum1 = vmlal_lane_s16(sum1, vget_high_s16(s7), filter_hi, 3);
uint16x8_t res = vcombine_u16(vqrshrun_n_s32(sum0, FILTER_BITS),
vqrshrun_n_s32(sum1, FILTER_BITS));
return vminq_u16(res, max);
}
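
The reworked highbd_convolve8_4()/highbd_convolve8_8() helpers fold the pixel-range clamp into the convolution instead of leaving it to the caller. A scalar reference for one output sample (highbd_convolve8_scalar is an illustrative name; FILTER_BITS assumed to be 7):

```
#include <stdint.h>

#define FILTER_BITS 7

static uint16_t highbd_convolve8_scalar(const int16_t s[8],
                                        const int16_t filter[8], int bd) {
  int32_t sum = 0;
  for (int k = 0; k < 8; ++k) sum += (int32_t)filter[k] * s[k];
  // Rounded shift, then clamp to the valid pixel range for this bit depth.
  int32_t res = (sum + (1 << (FILTER_BITS - 1))) >> FILTER_BITS;
  const int32_t max = (1 << bd) - 1;
  return (uint16_t)(res < 0 ? 0 : (res > max ? max : res));
}
```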
static INLINE int32x4_t highbd_convolve8_horiz4_s32(
const int16x8_t s0, const int16x8_t s1, const int16x8_t x_filter_0_7) {
const int16x8_t s2 = vextq_s16(s0, s1, 1);
const int16x8_t s3 = vextq_s16(s0, s1, 2);
const int16x8_t s4 = vextq_s16(s0, s1, 3);
const int16x4_t s0_lo = vget_low_s16(s0);
const int16x4_t s1_lo = vget_low_s16(s2);
const int16x4_t s2_lo = vget_low_s16(s3);
const int16x4_t s3_lo = vget_low_s16(s4);
const int16x4_t s4_lo = vget_high_s16(s0);
const int16x4_t s5_lo = vget_high_s16(s2);
const int16x4_t s6_lo = vget_high_s16(s3);
const int16x4_t s7_lo = vget_high_s16(s4);
return highbd_convolve8_4_s32(s0_lo, s1_lo, s2_lo, s3_lo, s4_lo, s5_lo, s6_lo,
s7_lo, x_filter_0_7);
}
static INLINE uint16x4_t highbd_convolve8_horiz4_s32_s16(
const int16x8_t s0, const int16x8_t s1, const int16x8_t x_filter_0_7) {
int32x4_t sum = highbd_convolve8_horiz4_s32(s0, s1, x_filter_0_7);
return vqrshrun_n_s32(sum, FILTER_BITS);
}
static INLINE void highbd_convolve8_8_s32(
const int16x8_t s0, const int16x8_t s1, const int16x8_t s2,
const int16x8_t s3, const int16x8_t s4, const int16x8_t s5,
const int16x8_t s6, const int16x8_t s7, const int16x8_t y_filter,
int32x4_t *sum0, int32x4_t *sum1) {
const int16x4_t y_filter_lo = vget_low_s16(y_filter);
const int16x4_t y_filter_hi = vget_high_s16(y_filter);
*sum0 = vmull_lane_s16(vget_low_s16(s0), y_filter_lo, 0);
*sum0 = vmlal_lane_s16(*sum0, vget_low_s16(s1), y_filter_lo, 1);
*sum0 = vmlal_lane_s16(*sum0, vget_low_s16(s2), y_filter_lo, 2);
*sum0 = vmlal_lane_s16(*sum0, vget_low_s16(s3), y_filter_lo, 3);
*sum0 = vmlal_lane_s16(*sum0, vget_low_s16(s4), y_filter_hi, 0);
*sum0 = vmlal_lane_s16(*sum0, vget_low_s16(s5), y_filter_hi, 1);
*sum0 = vmlal_lane_s16(*sum0, vget_low_s16(s6), y_filter_hi, 2);
*sum0 = vmlal_lane_s16(*sum0, vget_low_s16(s7), y_filter_hi, 3);
*sum1 = vmull_lane_s16(vget_high_s16(s0), y_filter_lo, 0);
*sum1 = vmlal_lane_s16(*sum1, vget_high_s16(s1), y_filter_lo, 1);
*sum1 = vmlal_lane_s16(*sum1, vget_high_s16(s2), y_filter_lo, 2);
*sum1 = vmlal_lane_s16(*sum1, vget_high_s16(s3), y_filter_lo, 3);
*sum1 = vmlal_lane_s16(*sum1, vget_high_s16(s4), y_filter_hi, 0);
*sum1 = vmlal_lane_s16(*sum1, vget_high_s16(s5), y_filter_hi, 1);
*sum1 = vmlal_lane_s16(*sum1, vget_high_s16(s6), y_filter_hi, 2);
*sum1 = vmlal_lane_s16(*sum1, vget_high_s16(s7), y_filter_hi, 3);
}
static INLINE void highbd_convolve8_horiz8_s32(const int16x8_t s0,
const int16x8_t s0_hi,
const int16x8_t x_filter_0_7,
int32x4_t *sum0,
int32x4_t *sum1) {
const int16x8_t s1 = vextq_s16(s0, s0_hi, 1);
const int16x8_t s2 = vextq_s16(s0, s0_hi, 2);
const int16x8_t s3 = vextq_s16(s0, s0_hi, 3);
const int16x8_t s4 = vextq_s16(s0, s0_hi, 4);
const int16x8_t s5 = vextq_s16(s0, s0_hi, 5);
const int16x8_t s6 = vextq_s16(s0, s0_hi, 6);
const int16x8_t s7 = vextq_s16(s0, s0_hi, 7);
highbd_convolve8_8_s32(s0, s1, s2, s3, s4, s5, s6, s7, x_filter_0_7, sum0,
sum1);
}
static INLINE uint16x8_t highbd_convolve8_horiz8_s32_s16(
const int16x8_t s0, const int16x8_t s1, const int16x8_t x_filter_0_7) {
int32x4_t sum0, sum1;
highbd_convolve8_horiz8_s32(s0, s1, x_filter_0_7, &sum0, &sum1);
return vcombine_u16(vqrshrun_n_s32(sum0, FILTER_BITS),
vqrshrun_n_s32(sum1, FILTER_BITS));
}
static INLINE uint16x8_t highbd_convolve8_8_s32_s16(
const int16x8_t s0, const int16x8_t s1, const int16x8_t s2,
const int16x8_t s3, const int16x8_t s4, const int16x8_t s5,
const int16x8_t s6, const int16x8_t s7, const int16x8_t y_filter) {
int32x4_t sum0;
int32x4_t sum1;
highbd_convolve8_8_s32(s0, s1, s2, s3, s4, s5, s6, s7, y_filter, &sum0,
&sum1);
return vcombine_u16(vqrshrun_n_s32(sum0, FILTER_BITS),
vqrshrun_n_s32(sum1, FILTER_BITS));
}
static void highbd_convolve_horiz_neon(const uint16_t *src_ptr,
ptrdiff_t src_stride, uint16_t *dst_ptr,
ptrdiff_t dst_stride,
const int16_t *x_filter_ptr,
int x_step_q4, int w, int h, int bd) {
static void highbd_convolve_horiz_8tap_neon(
const uint16_t *src_ptr, ptrdiff_t src_stride, uint16_t *dst_ptr,
ptrdiff_t dst_stride, const int16_t *x_filter_ptr, int w, int h, int bd) {
assert(w >= 4 && h >= 4);
const uint16x8_t max = vdupq_n_u16((1 << bd) - 1);
const int16x8_t x_filter = vld1q_s16(x_filter_ptr);
if (w == 4) {
const uint16x4_t max = vdup_n_u16((1 << bd) - 1);
const int16_t *s = (const int16_t *)src_ptr;
uint16_t *d = dst_ptr;
do {
int16x8_t s0, s1, s2, s3;
load_s16_8x2(s, src_stride, &s0, &s2);
load_s16_8x2(s + 8, src_stride, &s1, &s3);
int16x4_t s0[8], s1[8], s2[8], s3[8];
load_s16_4x8(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3],
&s0[4], &s0[5], &s0[6], &s0[7]);
load_s16_4x8(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3],
&s1[4], &s1[5], &s1[6], &s1[7]);
load_s16_4x8(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3],
&s2[4], &s2[5], &s2[6], &s2[7]);
load_s16_4x8(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3],
&s3[4], &s3[5], &s3[6], &s3[7]);
uint16x4_t d0 = highbd_convolve8_horiz4_s32_s16(s0, s1, x_filter);
uint16x4_t d1 = highbd_convolve8_horiz4_s32_s16(s2, s3, x_filter);
uint16x4_t d0 = highbd_convolve8_4(s0[0], s0[1], s0[2], s0[3], s0[4],
s0[5], s0[6], s0[7], x_filter, max);
uint16x4_t d1 = highbd_convolve8_4(s1[0], s1[1], s1[2], s1[3], s1[4],
s1[5], s1[6], s1[7], x_filter, max);
uint16x4_t d2 = highbd_convolve8_4(s2[0], s2[1], s2[2], s2[3], s2[4],
s2[5], s2[6], s2[7], x_filter, max);
uint16x4_t d3 = highbd_convolve8_4(s3[0], s3[1], s3[2], s3[3], s3[4],
s3[5], s3[6], s3[7], x_filter, max);
uint16x8_t d01 = vcombine_u16(d0, d1);
d01 = vminq_u16(d01, max);
store_u16_4x4(d, dst_stride, d0, d1, d2, d3);
vst1_u16(d + 0 * dst_stride, vget_low_u16(d01));
vst1_u16(d + 1 * dst_stride, vget_high_u16(d01));
s += 2 * src_stride;
d += 2 * dst_stride;
h -= 2;
s += 4 * src_stride;
d += 4 * dst_stride;
h -= 4;
} while (h > 0);
} else {
const uint16x8_t max = vdupq_n_u16((1 << bd) - 1);
int height = h;
do {
int width = w;
const int16_t *s = (const int16_t *)src_ptr;
uint16_t *d = dst_ptr;
int x_q4 = 0;
const int16_t *src_x = &s[x_q4 >> SUBPEL_BITS];
int16x8_t s0, s2, s4, s6;
load_s16_8x4(src_x, src_stride, &s0, &s2, &s4, &s6);
src_x += 8;
do {
int16x8_t s1, s3, s5, s7;
load_s16_8x4(src_x, src_stride, &s1, &s3, &s5, &s7);
int16x8_t s0[8], s1[8], s2[8], s3[8];
load_s16_8x8(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3],
&s0[4], &s0[5], &s0[6], &s0[7]);
load_s16_8x8(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3],
&s1[4], &s1[5], &s1[6], &s1[7]);
load_s16_8x8(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3],
&s2[4], &s2[5], &s2[6], &s2[7]);
load_s16_8x8(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3],
&s3[4], &s3[5], &s3[6], &s3[7]);
uint16x8_t d0 = highbd_convolve8_horiz8_s32_s16(s0, s1, x_filter);
uint16x8_t d1 = highbd_convolve8_horiz8_s32_s16(s2, s3, x_filter);
uint16x8_t d2 = highbd_convolve8_horiz8_s32_s16(s4, s5, x_filter);
uint16x8_t d3 = highbd_convolve8_horiz8_s32_s16(s6, s7, x_filter);
d0 = vminq_u16(d0, max);
d1 = vminq_u16(d1, max);
d2 = vminq_u16(d2, max);
d3 = vminq_u16(d3, max);
uint16x8_t d0 = highbd_convolve8_8(s0[0], s0[1], s0[2], s0[3], s0[4],
s0[5], s0[6], s0[7], x_filter, max);
uint16x8_t d1 = highbd_convolve8_8(s1[0], s1[1], s1[2], s1[3], s1[4],
s1[5], s1[6], s1[7], x_filter, max);
uint16x8_t d2 = highbd_convolve8_8(s2[0], s2[1], s2[2], s2[3], s2[4],
s2[5], s2[6], s2[7], x_filter, max);
uint16x8_t d3 = highbd_convolve8_8(s3[0], s3[1], s3[2], s3[3], s3[4],
s3[5], s3[6], s3[7], x_filter, max);
store_u16_8x4(d, dst_stride, d0, d1, d2, d3);
s0 = s1;
s2 = s3;
s4 = s5;
s6 = s7;
src_x += 8;
s += 8;
d += 8;
width -= 8;
} while (width > 0);
src_ptr += 4 * src_stride;
dst_ptr += 4 * dst_stride;
height -= 4;
} while (height > 0);
}
}
static void highbd_convolve_horiz_4tap_neon(
const uint16_t *src_ptr, ptrdiff_t src_stride, uint16_t *dst_ptr,
ptrdiff_t dst_stride, const int16_t *x_filter_ptr, int w, int h, int bd) {
assert(w >= 4 && h >= 4);
const int16x4_t x_filter = vld1_s16(x_filter_ptr + 2);
if (w == 4) {
const uint16x4_t max = vdup_n_u16((1 << bd) - 1);
const int16_t *s = (const int16_t *)src_ptr;
uint16_t *d = dst_ptr;
do {
int16x4_t s0[4], s1[4], s2[4], s3[4];
load_s16_4x4(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3]);
load_s16_4x4(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3]);
load_s16_4x4(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3]);
load_s16_4x4(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3]);
uint16x4_t d0 =
highbd_convolve4_4(s0[0], s0[1], s0[2], s0[3], x_filter, max);
uint16x4_t d1 =
highbd_convolve4_4(s1[0], s1[1], s1[2], s1[3], x_filter, max);
uint16x4_t d2 =
highbd_convolve4_4(s2[0], s2[1], s2[2], s2[3], x_filter, max);
uint16x4_t d3 =
highbd_convolve4_4(s3[0], s3[1], s3[2], s3[3], x_filter, max);
store_u16_4x4(d, dst_stride, d0, d1, d2, d3);
s += 4 * src_stride;
d += 4 * dst_stride;
h -= 4;
} while (h > 0);
} else {
const uint16x8_t max = vdupq_n_u16((1 << bd) - 1);
int height = h;
do {
int width = w;
const int16_t *s = (const int16_t *)src_ptr;
uint16_t *d = dst_ptr;
do {
int16x8_t s0[4], s1[4], s2[4], s3[4];
load_s16_8x4(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3]);
load_s16_8x4(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3]);
load_s16_8x4(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3]);
load_s16_8x4(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3]);
uint16x8_t d0 =
highbd_convolve4_8(s0[0], s0[1], s0[2], s0[3], x_filter, max);
uint16x8_t d1 =
highbd_convolve4_8(s1[0], s1[1], s1[2], s1[3], x_filter, max);
uint16x8_t d2 =
highbd_convolve4_8(s2[0], s2[1], s2[2], s2[3], x_filter, max);
uint16x8_t d3 =
highbd_convolve4_8(s3[0], s3[1], s3[2], s3[3], x_filter, max);
store_u16_8x4(d, dst_stride, d0, d1, d2, d3);
s += 8;
d += 8;
width -= 8;
x_q4 += 8 * x_step_q4;
} while (width > 0);
src_ptr += 4 * src_stride;
dst_ptr += 4 * dst_stride;
@ -236,21 +245,30 @@ void aom_highbd_convolve8_horiz_neon(const uint8_t *src8, ptrdiff_t src_stride,
uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
src -= SUBPEL_TAPS / 2 - 1;
highbd_convolve_horiz_neon(src, src_stride, dst, dst_stride, filter_x,
x_step_q4, w, h, bd);
const int filter_taps = get_filter_taps_convolve8(filter_x);
if (filter_taps == 2) {
highbd_convolve8_horiz_2tap_neon(src + 3, src_stride, dst, dst_stride,
filter_x, w, h, bd);
} else if (filter_taps == 4) {
highbd_convolve_horiz_4tap_neon(src + 2, src_stride, dst, dst_stride,
filter_x, w, h, bd);
} else {
highbd_convolve_horiz_8tap_neon(src, src_stride, dst, dst_stride,
filter_x, w, h, bd);
}
}
}
static void highbd_convolve_vert_neon(const uint16_t *src_ptr,
ptrdiff_t src_stride, uint16_t *dst_ptr,
ptrdiff_t dst_stride,
const int16_t *y_filter_ptr, int w, int h,
int bd) {
static void highbd_convolve_vert_8tap_neon(
const uint16_t *src_ptr, ptrdiff_t src_stride, uint16_t *dst_ptr,
ptrdiff_t dst_stride, const int16_t *y_filter_ptr, int w, int h, int bd) {
assert(w >= 4 && h >= 4);
const int16x8_t y_filter = vld1q_s16(y_filter_ptr);
const uint16x8_t max = vdupq_n_u16((1 << bd) - 1);
if (w == 4) {
const uint16x4_t max = vdup_n_u16((1 << bd) - 1);
const int16_t *s = (const int16_t *)src_ptr;
uint16_t *d = dst_ptr;
@ -263,24 +281,15 @@ static void highbd_convolve_vert_neon(const uint16_t *src_ptr,
load_s16_4x4(s, src_stride, &s7, &s8, &s9, &s10);
uint16x4_t d0 =
highbd_convolve8_4_s32_s16(s0, s1, s2, s3, s4, s5, s6, s7, y_filter);
highbd_convolve8_4(s0, s1, s2, s3, s4, s5, s6, s7, y_filter, max);
uint16x4_t d1 =
highbd_convolve8_4_s32_s16(s1, s2, s3, s4, s5, s6, s7, s8, y_filter);
highbd_convolve8_4(s1, s2, s3, s4, s5, s6, s7, s8, y_filter, max);
uint16x4_t d2 =
highbd_convolve8_4_s32_s16(s2, s3, s4, s5, s6, s7, s8, s9, y_filter);
highbd_convolve8_4(s2, s3, s4, s5, s6, s7, s8, s9, y_filter, max);
uint16x4_t d3 =
highbd_convolve8_4_s32_s16(s3, s4, s5, s6, s7, s8, s9, s10, y_filter);
highbd_convolve8_4(s3, s4, s5, s6, s7, s8, s9, s10, y_filter, max);
uint16x8_t d01 = vcombine_u16(d0, d1);
uint16x8_t d23 = vcombine_u16(d2, d3);
d01 = vminq_u16(d01, max);
d23 = vminq_u16(d23, max);
vst1_u16(d + 0 * dst_stride, vget_low_u16(d01));
vst1_u16(d + 1 * dst_stride, vget_high_u16(d01));
vst1_u16(d + 2 * dst_stride, vget_low_u16(d23));
vst1_u16(d + 3 * dst_stride, vget_high_u16(d23));
store_u16_4x4(d, dst_stride, d0, d1, d2, d3);
s0 = s4;
s1 = s5;
@ -289,11 +298,14 @@ static void highbd_convolve_vert_neon(const uint16_t *src_ptr,
s4 = s8;
s5 = s9;
s6 = s10;
s += 4 * src_stride;
d += 4 * dst_stride;
h -= 4;
} while (h > 0);
} else {
const uint16x8_t max = vdupq_n_u16((1 << bd) - 1);
do {
int height = h;
const int16_t *s = (const int16_t *)src_ptr;
@ -307,19 +319,14 @@ static void highbd_convolve_vert_neon(const uint16_t *src_ptr,
int16x8_t s7, s8, s9, s10;
load_s16_8x4(s, src_stride, &s7, &s8, &s9, &s10);
uint16x8_t d0 = highbd_convolve8_8_s32_s16(s0, s1, s2, s3, s4, s5, s6,
s7, y_filter);
uint16x8_t d1 = highbd_convolve8_8_s32_s16(s1, s2, s3, s4, s5, s6, s7,
s8, y_filter);
uint16x8_t d2 = highbd_convolve8_8_s32_s16(s2, s3, s4, s5, s6, s7, s8,
s9, y_filter);
uint16x8_t d3 = highbd_convolve8_8_s32_s16(s3, s4, s5, s6, s7, s8, s9,
s10, y_filter);
d0 = vminq_u16(d0, max);
d1 = vminq_u16(d1, max);
d2 = vminq_u16(d2, max);
d3 = vminq_u16(d3, max);
uint16x8_t d0 =
highbd_convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, y_filter, max);
uint16x8_t d1 =
highbd_convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, y_filter, max);
uint16x8_t d2 =
highbd_convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, y_filter, max);
uint16x8_t d3 =
highbd_convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, y_filter, max);
store_u16_8x4(d, dst_stride, d0, d1, d2, d3);
@ -330,6 +337,7 @@ static void highbd_convolve_vert_neon(const uint16_t *src_ptr,
s4 = s8;
s5 = s9;
s6 = s10;
s += 4 * src_stride;
d += 4 * dst_stride;
height -= 4;
@ -357,7 +365,18 @@ void aom_highbd_convolve8_vert_neon(const uint8_t *src8, ptrdiff_t src_stride,
uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
src -= (SUBPEL_TAPS / 2 - 1) * src_stride;
highbd_convolve_vert_neon(src, src_stride, dst, dst_stride, filter_y, w, h,
bd);
const int filter_taps = get_filter_taps_convolve8(filter_y);
if (filter_taps == 2) {
highbd_convolve8_vert_2tap_neon(src + 3 * src_stride, src_stride, dst,
dst_stride, filter_y, w, h, bd);
} else if (filter_taps == 4) {
highbd_convolve8_vert_4tap_neon(src + 2 * src_stride, src_stride, dst,
dst_stride, filter_y, w, h, bd);
} else {
highbd_convolve_vert_8tap_neon(src, src_stride, dst, dst_stride, filter_y,
w, h, bd);
}
}
}

third_party/aom/aom_dsp/arm/highbd_convolve8_neon.h (vendored, new file, 279 lines)

@ -0,0 +1,279 @@
/*
* Copyright (c) 2024, Alliance for Open Media. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#ifndef AOM_AOM_DSP_ARM_HIGHBD_CONVOLVE8_NEON_H_
#define AOM_AOM_DSP_ARM_HIGHBD_CONVOLVE8_NEON_H_
#include <arm_neon.h>
#include "config/aom_config.h"
#include "aom_dsp/arm/mem_neon.h"
static INLINE void highbd_convolve8_horiz_2tap_neon(
const uint16_t *src_ptr, ptrdiff_t src_stride, uint16_t *dst_ptr,
ptrdiff_t dst_stride, const int16_t *x_filter_ptr, int w, int h, int bd) {
// Bilinear filter values are all positive and multiples of 8. Divide by 8 to
// reduce intermediate precision requirements and allow the use of a
// non-widening multiply.
const uint16x8_t f0 = vdupq_n_u16((uint16_t)x_filter_ptr[3] / 8);
const uint16x8_t f1 = vdupq_n_u16((uint16_t)x_filter_ptr[4] / 8);
const uint16x8_t max = vdupq_n_u16((1 << bd) - 1);
if (w == 4) {
do {
uint16x8_t s0 =
load_unaligned_u16_4x2(src_ptr + 0 * src_stride + 0, (int)src_stride);
uint16x8_t s1 =
load_unaligned_u16_4x2(src_ptr + 0 * src_stride + 1, (int)src_stride);
uint16x8_t s2 =
load_unaligned_u16_4x2(src_ptr + 2 * src_stride + 0, (int)src_stride);
uint16x8_t s3 =
load_unaligned_u16_4x2(src_ptr + 2 * src_stride + 1, (int)src_stride);
uint16x8_t sum01 = vmulq_u16(s0, f0);
sum01 = vmlaq_u16(sum01, s1, f1);
uint16x8_t sum23 = vmulq_u16(s2, f0);
sum23 = vmlaq_u16(sum23, s3, f1);
// We divided filter taps by 8 so subtract 3 from right shift.
sum01 = vrshrq_n_u16(sum01, FILTER_BITS - 3);
sum23 = vrshrq_n_u16(sum23, FILTER_BITS - 3);
sum01 = vminq_u16(sum01, max);
sum23 = vminq_u16(sum23, max);
store_u16x4_strided_x2(dst_ptr + 0 * dst_stride, (int)dst_stride, sum01);
store_u16x4_strided_x2(dst_ptr + 2 * dst_stride, (int)dst_stride, sum23);
src_ptr += 4 * src_stride;
dst_ptr += 4 * dst_stride;
h -= 4;
} while (h > 0);
} else {
do {
int width = w;
const uint16_t *s = src_ptr;
uint16_t *d = dst_ptr;
do {
uint16x8_t s0 = vld1q_u16(s + 0 * src_stride + 0);
uint16x8_t s1 = vld1q_u16(s + 0 * src_stride + 1);
uint16x8_t s2 = vld1q_u16(s + 1 * src_stride + 0);
uint16x8_t s3 = vld1q_u16(s + 1 * src_stride + 1);
uint16x8_t sum01 = vmulq_u16(s0, f0);
sum01 = vmlaq_u16(sum01, s1, f1);
uint16x8_t sum23 = vmulq_u16(s2, f0);
sum23 = vmlaq_u16(sum23, s3, f1);
// We divided filter taps by 8 so subtract 3 from right shift.
sum01 = vrshrq_n_u16(sum01, FILTER_BITS - 3);
sum23 = vrshrq_n_u16(sum23, FILTER_BITS - 3);
sum01 = vminq_u16(sum01, max);
sum23 = vminq_u16(sum23, max);
vst1q_u16(d + 0 * dst_stride, sum01);
vst1q_u16(d + 1 * dst_stride, sum23);
s += 8;
d += 8;
width -= 8;
} while (width != 0);
src_ptr += 2 * src_stride;
dst_ptr += 2 * dst_stride;
h -= 2;
} while (h > 0);
}
}
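
Dividing the bilinear taps by 8 is what keeps every product of the 2-tap high-bitdepth paths within 16 bits (16 * 4095 < 65536 for 12-bit input), allowing the non-widening vmulq_u16/vmlaq_u16 pair; the final shift drops by three bits to compensate. A scalar sketch of the equivalence (highbd_bilinear_2tap is an illustrative name):

```
#include <stdint.h>

#define FILTER_BITS 7

static uint16_t highbd_bilinear_2tap(uint16_t s0, uint16_t s1,
                                     const int16_t filter[8], int bd) {
  const uint16_t f0 = (uint16_t)filter[3] / 8;  // taps are multiples of 8
  const uint16_t f1 = (uint16_t)filter[4] / 8;
  uint16_t sum = (uint16_t)(f0 * s0 + f1 * s1);  // fits in 16 bits for bd <= 12
  // Taps were divided by 8, so shift by three bits fewer and round accordingly.
  uint16_t res = (uint16_t)((sum + (1 << (FILTER_BITS - 4))) >> (FILTER_BITS - 3));
  const uint16_t max = (uint16_t)((1 << bd) - 1);
  return res > max ? max : res;
}
```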
static INLINE uint16x4_t highbd_convolve4_4(
const int16x4_t s0, const int16x4_t s1, const int16x4_t s2,
const int16x4_t s3, const int16x4_t filter, const uint16x4_t max) {
int32x4_t sum = vmull_lane_s16(s0, filter, 0);
sum = vmlal_lane_s16(sum, s1, filter, 1);
sum = vmlal_lane_s16(sum, s2, filter, 2);
sum = vmlal_lane_s16(sum, s3, filter, 3);
uint16x4_t res = vqrshrun_n_s32(sum, FILTER_BITS);
return vmin_u16(res, max);
}
static INLINE uint16x8_t highbd_convolve4_8(
const int16x8_t s0, const int16x8_t s1, const int16x8_t s2,
const int16x8_t s3, const int16x4_t filter, const uint16x8_t max) {
int32x4_t sum0 = vmull_lane_s16(vget_low_s16(s0), filter, 0);
sum0 = vmlal_lane_s16(sum0, vget_low_s16(s1), filter, 1);
sum0 = vmlal_lane_s16(sum0, vget_low_s16(s2), filter, 2);
sum0 = vmlal_lane_s16(sum0, vget_low_s16(s3), filter, 3);
int32x4_t sum1 = vmull_lane_s16(vget_high_s16(s0), filter, 0);
sum1 = vmlal_lane_s16(sum1, vget_high_s16(s1), filter, 1);
sum1 = vmlal_lane_s16(sum1, vget_high_s16(s2), filter, 2);
sum1 = vmlal_lane_s16(sum1, vget_high_s16(s3), filter, 3);
uint16x8_t res = vcombine_u16(vqrshrun_n_s32(sum0, FILTER_BITS),
vqrshrun_n_s32(sum1, FILTER_BITS));
return vminq_u16(res, max);
}
static INLINE void highbd_convolve8_vert_4tap_neon(
const uint16_t *src_ptr, ptrdiff_t src_stride, uint16_t *dst_ptr,
ptrdiff_t dst_stride, const int16_t *y_filter_ptr, int w, int h, int bd) {
assert(w >= 4 && h >= 4);
const int16x4_t y_filter = vld1_s16(y_filter_ptr + 2);
if (w == 4) {
const uint16x4_t max = vdup_n_u16((1 << bd) - 1);
const int16_t *s = (const int16_t *)src_ptr;
uint16_t *d = dst_ptr;
int16x4_t s0, s1, s2;
load_s16_4x3(s, src_stride, &s0, &s1, &s2);
s += 3 * src_stride;
do {
int16x4_t s3, s4, s5, s6;
load_s16_4x4(s, src_stride, &s3, &s4, &s5, &s6);
uint16x4_t d0 = highbd_convolve4_4(s0, s1, s2, s3, y_filter, max);
uint16x4_t d1 = highbd_convolve4_4(s1, s2, s3, s4, y_filter, max);
uint16x4_t d2 = highbd_convolve4_4(s2, s3, s4, s5, y_filter, max);
uint16x4_t d3 = highbd_convolve4_4(s3, s4, s5, s6, y_filter, max);
store_u16_4x4(d, dst_stride, d0, d1, d2, d3);
s0 = s4;
s1 = s5;
s2 = s6;
s += 4 * src_stride;
d += 4 * dst_stride;
h -= 4;
} while (h > 0);
} else {
const uint16x8_t max = vdupq_n_u16((1 << bd) - 1);
do {
int height = h;
const int16_t *s = (const int16_t *)src_ptr;
uint16_t *d = dst_ptr;
int16x8_t s0, s1, s2;
load_s16_8x3(s, src_stride, &s0, &s1, &s2);
s += 3 * src_stride;
do {
int16x8_t s3, s4, s5, s6;
load_s16_8x4(s, src_stride, &s3, &s4, &s5, &s6);
uint16x8_t d0 = highbd_convolve4_8(s0, s1, s2, s3, y_filter, max);
uint16x8_t d1 = highbd_convolve4_8(s1, s2, s3, s4, y_filter, max);
uint16x8_t d2 = highbd_convolve4_8(s2, s3, s4, s5, y_filter, max);
uint16x8_t d3 = highbd_convolve4_8(s3, s4, s5, s6, y_filter, max);
store_u16_8x4(d, dst_stride, d0, d1, d2, d3);
s0 = s4;
s1 = s5;
s2 = s6;
s += 4 * src_stride;
d += 4 * dst_stride;
height -= 4;
} while (height > 0);
src_ptr += 8;
dst_ptr += 8;
w -= 8;
} while (w > 0);
}
}
static INLINE void highbd_convolve8_vert_2tap_neon(
const uint16_t *src_ptr, ptrdiff_t src_stride, uint16_t *dst_ptr,
ptrdiff_t dst_stride, const int16_t *x_filter_ptr, int w, int h, int bd) {
// Bilinear filter values are all positive and multiples of 8. Divide by 8 to
// reduce intermediate precision requirements and allow the use of a
// non-widening multiply.
const uint16x8_t f0 = vdupq_n_u16((uint16_t)x_filter_ptr[3] / 8);
const uint16x8_t f1 = vdupq_n_u16((uint16_t)x_filter_ptr[4] / 8);
const uint16x8_t max = vdupq_n_u16((1 << bd) - 1);
if (w == 4) {
do {
uint16x8_t s0 =
load_unaligned_u16_4x2(src_ptr + 0 * src_stride, (int)src_stride);
uint16x8_t s1 =
load_unaligned_u16_4x2(src_ptr + 1 * src_stride, (int)src_stride);
uint16x8_t s2 =
load_unaligned_u16_4x2(src_ptr + 2 * src_stride, (int)src_stride);
uint16x8_t s3 =
load_unaligned_u16_4x2(src_ptr + 3 * src_stride, (int)src_stride);
uint16x8_t sum01 = vmulq_u16(s0, f0);
sum01 = vmlaq_u16(sum01, s1, f1);
uint16x8_t sum23 = vmulq_u16(s2, f0);
sum23 = vmlaq_u16(sum23, s3, f1);
// We divided filter taps by 8 so subtract 3 from right shift.
sum01 = vrshrq_n_u16(sum01, FILTER_BITS - 3);
sum23 = vrshrq_n_u16(sum23, FILTER_BITS - 3);
sum01 = vminq_u16(sum01, max);
sum23 = vminq_u16(sum23, max);
store_u16x4_strided_x2(dst_ptr + 0 * dst_stride, (int)dst_stride, sum01);
store_u16x4_strided_x2(dst_ptr + 2 * dst_stride, (int)dst_stride, sum23);
src_ptr += 4 * src_stride;
dst_ptr += 4 * dst_stride;
h -= 4;
} while (h > 0);
} else {
do {
int width = w;
const uint16_t *s = src_ptr;
uint16_t *d = dst_ptr;
do {
uint16x8_t s0, s1, s2;
load_u16_8x3(s, src_stride, &s0, &s1, &s2);
uint16x8_t sum01 = vmulq_u16(s0, f0);
sum01 = vmlaq_u16(sum01, s1, f1);
uint16x8_t sum23 = vmulq_u16(s1, f0);
sum23 = vmlaq_u16(sum23, s2, f1);
// We divided filter taps by 8 so subtract 3 from right shift.
sum01 = vrshrq_n_u16(sum01, FILTER_BITS - 3);
sum23 = vrshrq_n_u16(sum23, FILTER_BITS - 3);
sum01 = vminq_u16(sum01, max);
sum23 = vminq_u16(sum23, max);
vst1q_u16(d + 0 * dst_stride, sum01);
vst1q_u16(d + 1 * dst_stride, sum23);
s += 8;
d += 8;
width -= 8;
} while (width != 0);
src_ptr += 2 * src_stride;
dst_ptr += 2 * dst_stride;
h -= 2;
} while (h > 0);
}
}
#endif // AOM_AOM_DSP_ARM_HIGHBD_CONVOLVE8_NEON_H_


@ -18,6 +18,7 @@
#include "aom_dsp/arm/aom_neon_sve_bridge.h"
#include "aom_dsp/arm/aom_filter.h"
#include "aom_dsp/arm/highbd_convolve8_neon.h"
#include "aom_dsp/arm/mem_neon.h"
static INLINE uint16x4_t highbd_convolve8_4_h(int16x8_t s[4], int16x8_t filter,
@ -252,7 +253,12 @@ void aom_highbd_convolve8_horiz_sve(const uint8_t *src8, ptrdiff_t src_stride,
src -= SUBPEL_TAPS / 2 - 1;
if (get_filter_taps_convolve8(filter_x) <= 4) {
const int filter_taps = get_filter_taps_convolve8(filter_x);
if (filter_taps == 2) {
highbd_convolve8_horiz_2tap_neon(src + 3, src_stride, dst, dst_stride,
filter_x, width, height, bd);
} else if (filter_taps == 4) {
highbd_convolve8_horiz_4tap_sve(src + 2, src_stride, dst, dst_stride,
filter_x, width, height, bd);
} else {
@ -534,134 +540,13 @@ static INLINE void highbd_convolve8_vert_8tap_sve(
}
}
static INLINE uint16x4_t highbd_convolve4_4_v(int16x8_t s[2], int16x8_t filter,
uint16x4_t max) {
int64x2_t sum01 = aom_svdot_lane_s16(vdupq_n_s64(0), s[0], filter, 0);
int64x2_t sum23 = aom_svdot_lane_s16(vdupq_n_s64(0), s[1], filter, 0);
int32x4_t sum0123 = vcombine_s32(vmovn_s64(sum01), vmovn_s64(sum23));
uint16x4_t res = vqrshrun_n_s32(sum0123, FILTER_BITS);
return vmin_u16(res, max);
}
static INLINE uint16x8_t highbd_convolve4_8_v(int16x8_t s[4], int16x8_t filter,
uint16x8_t max) {
int64x2_t sum01 = aom_svdot_lane_s16(vdupq_n_s64(0), s[0], filter, 0);
int64x2_t sum23 = aom_svdot_lane_s16(vdupq_n_s64(0), s[1], filter, 0);
int64x2_t sum45 = aom_svdot_lane_s16(vdupq_n_s64(0), s[2], filter, 0);
int64x2_t sum67 = aom_svdot_lane_s16(vdupq_n_s64(0), s[3], filter, 0);
int32x4_t s0123 = vcombine_s32(vmovn_s64(sum01), vmovn_s64(sum23));
int32x4_t s4567 = vcombine_s32(vmovn_s64(sum45), vmovn_s64(sum67));
uint16x8_t res = vcombine_u16(vqrshrun_n_s32(s0123, FILTER_BITS),
vqrshrun_n_s32(s4567, FILTER_BITS));
return vminq_u16(res, max);
}
static INLINE void highbd_convolve8_vert_4tap_sve(
const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst,
ptrdiff_t dst_stride, const int16_t *filter_y, int width, int height,
int bd) {
const int16x8_t y_filter =
vcombine_s16(vld1_s16(filter_y + 2), vdup_n_s16(0));
uint8x16_t merge_block_tbl[3];
merge_block_tbl[0] = vld1q_u8(kDotProdMergeBlockTbl);
merge_block_tbl[1] = vld1q_u8(kDotProdMergeBlockTbl + 16);
merge_block_tbl[2] = vld1q_u8(kDotProdMergeBlockTbl + 32);
if (width == 4) {
const uint16x4_t max = vdup_n_u16((1 << bd) - 1);
int16_t *s = (int16_t *)src;
int16x4_t s0, s1, s2;
load_s16_4x3(s, src_stride, &s0, &s1, &s2);
s += 3 * src_stride;
do {
int16x4_t s3, s4, s5, s6;
load_s16_4x4(s, src_stride, &s3, &s4, &s5, &s6);
// This operation combines a conventional transpose and the sample permute
// required before computing the dot product.
int16x8_t s0123[2], s1234[2], s2345[2], s3456[2];
transpose_concat_4x4(s0, s1, s2, s3, s0123);
transpose_concat_4x4(s1, s2, s3, s4, s1234);
transpose_concat_4x4(s2, s3, s4, s5, s2345);
transpose_concat_4x4(s3, s4, s5, s6, s3456);
uint16x4_t d0 = highbd_convolve4_4_v(s0123, y_filter, max);
uint16x4_t d1 = highbd_convolve4_4_v(s1234, y_filter, max);
uint16x4_t d2 = highbd_convolve4_4_v(s2345, y_filter, max);
uint16x4_t d3 = highbd_convolve4_4_v(s3456, y_filter, max);
store_u16_4x4(dst, dst_stride, d0, d1, d2, d3);
// Shuffle everything up four rows.
s0 = s4;
s1 = s5;
s2 = s6;
s += 4 * src_stride;
dst += 4 * dst_stride;
height -= 4;
} while (height != 0);
} else {
const uint16x8_t max = vdupq_n_u16((1 << bd) - 1);
do {
int h = height;
int16_t *s = (int16_t *)src;
uint16_t *d = dst;
int16x8_t s0, s1, s2;
load_s16_8x3(s, src_stride, &s0, &s1, &s2);
s += 3 * src_stride;
do {
int16x8_t s3, s4, s5, s6;
load_s16_8x4(s, src_stride, &s3, &s4, &s5, &s6);
// This operation combines a conventional transpose and the sample
// permute required before computing the dot product.
int16x8_t s0123[4], s1234[4], s2345[4], s3456[4];
transpose_concat_8x4(s0, s1, s2, s3, s0123);
transpose_concat_8x4(s1, s2, s3, s4, s1234);
transpose_concat_8x4(s2, s3, s4, s5, s2345);
transpose_concat_8x4(s3, s4, s5, s6, s3456);
uint16x8_t d0 = highbd_convolve4_8_v(s0123, y_filter, max);
uint16x8_t d1 = highbd_convolve4_8_v(s1234, y_filter, max);
uint16x8_t d2 = highbd_convolve4_8_v(s2345, y_filter, max);
uint16x8_t d3 = highbd_convolve4_8_v(s3456, y_filter, max);
store_u16_8x4(d, dst_stride, d0, d1, d2, d3);
// Shuffle everything up four rows.
s0 = s4;
s1 = s5;
s2 = s6;
s += 4 * src_stride;
d += 4 * dst_stride;
h -= 4;
} while (h != 0);
src += 8;
dst += 8;
width -= 8;
} while (width != 0);
}
}
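
The transpose_concat_4x4()/transpose_concat_8x4() step used by these vertical dot-product kernels regroups four consecutive rows so that each output pixel's vertical neighbours sit contiguously, turning the vertical filter into an ordinary per-pixel dot product. A scalar picture of that regrouping, assuming the same element ordering as the 8-bit variant documented earlier in this patch:

```
#include <stdint.h>

// Gather column i of rows r0..r3 into four adjacent output elements so a
// 4-tap vertical filter becomes a contiguous 4-element dot product.
static void transpose_concat_4x4_scalar(const int16_t r0[4], const int16_t r1[4],
                                        const int16_t r2[4], const int16_t r3[4],
                                        int16_t out[16]) {
  for (int i = 0; i < 4; ++i) {
    out[4 * i + 0] = r0[i];
    out[4 * i + 1] = r1[i];
    out[4 * i + 2] = r2[i];
    out[4 * i + 3] = r3[i];
  }
}
```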
void aom_highbd_convolve8_vert_sve(const uint8_t *src8, ptrdiff_t src_stride,
uint8_t *dst8, ptrdiff_t dst_stride,
const int16_t *filter_x, int x_step_q4,
const int16_t *filter_y, int y_step_q4,
int width, int height, int bd) {
assert(y_step_q4 == 16);
assert(w >= 4 && h >= 4);
assert(width >= 4 && height >= 4);
(void)filter_x;
(void)y_step_q4;
(void)x_step_q4;
@ -671,9 +556,14 @@ void aom_highbd_convolve8_vert_sve(const uint8_t *src8, ptrdiff_t src_stride,
src -= (SUBPEL_TAPS / 2 - 1) * src_stride;
if (get_filter_taps_convolve8(filter_y) <= 4) {
highbd_convolve8_vert_4tap_sve(src + 2 * src_stride, src_stride, dst,
dst_stride, filter_y, width, height, bd);
const int filter_taps = get_filter_taps_convolve8(filter_y);
if (filter_taps == 2) {
highbd_convolve8_vert_2tap_neon(src + 3 * src_stride, src_stride, dst,
dst_stride, filter_y, width, height, bd);
} else if (filter_taps == 4) {
highbd_convolve8_vert_4tap_neon(src + 2 * src_stride, src_stride, dst,
dst_stride, filter_y, width, height, bd);
} else {
highbd_convolve8_vert_8tap_sve(src, src_stride, dst, dst_stride, filter_y,
width, height, bd);


@ -1201,7 +1201,7 @@ HIGHBD_SMOOTH_H_NXM(8, 32)
// For width 16 and above.
#define HIGHBD_SMOOTH_H_PREDICTOR(W) \
void highbd_smooth_h_##W##xh_neon( \
static void highbd_smooth_h_##W##xh_neon( \
uint16_t *dst, ptrdiff_t stride, const uint16_t *const top_row, \
const uint16_t *const left_column, const int height) { \
const uint16_t top_right = top_row[(W)-1]; \
@ -1293,6 +1293,33 @@ static AOM_FORCE_INLINE uint16x8_t highbd_dr_z1_apply_shift_x8(uint16x8_t a0,
highbd_dr_z1_apply_shift_x4(vget_high_u16(a0), vget_high_u16(a1), shift));
}
// clang-format off
static const uint8_t kLoadMaxShuffles[] = {
14, 15, 14, 15, 14, 15, 14, 15, 14, 15, 14, 15, 14, 15, 14, 15,
12, 13, 14, 15, 14, 15, 14, 15, 14, 15, 14, 15, 14, 15, 14, 15,
10, 11, 12, 13, 14, 15, 14, 15, 14, 15, 14, 15, 14, 15, 14, 15,
8, 9, 10, 11, 12, 13, 14, 15, 14, 15, 14, 15, 14, 15, 14, 15,
6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 14, 15, 14, 15, 14, 15,
4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 14, 15, 14, 15,
2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 14, 15,
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
};
// clang-format on
static INLINE uint16x8_t zn_load_masked_neon(const uint16_t *ptr,
int shuffle_idx) {
uint8x16_t shuffle = vld1q_u8(&kLoadMaxShuffles[16 * shuffle_idx]);
uint8x16_t src = vreinterpretq_u8_u16(vld1q_u16(ptr));
#if AOM_ARCH_AARCH64
return vreinterpretq_u16_u8(vqtbl1q_u8(src, shuffle));
#else
uint8x8x2_t src2 = { { vget_low_u8(src), vget_high_u8(src) } };
uint8x8_t lo = vtbl2_u8(src2, vget_low_u8(shuffle));
uint8x8_t hi = vtbl2_u8(src2, vget_high_u8(shuffle));
return vreinterpretq_u16_u8(vcombine_u8(lo, hi));
#endif
}
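
zn_load_masked_neon() replaces the earlier per-lane select against the clamped boundary value: when a load would run past the last valid reference sample, it loads the eight samples ending at that boundary and uses kLoadMaxShuffles to replicate the last one into the out-of-range lanes. A scalar model of the intended semantics (load_masked_scalar and last_valid are illustrative names):

```
#include <stdint.h>

// out[i] = buf[min(pos + i, last_valid)]: lanes past the boundary are filled
// with the last valid sample instead of reading out of bounds.
static void load_masked_scalar(const uint16_t *buf, int pos, int last_valid,
                               uint16_t out[8]) {
  for (int i = 0; i < 8; ++i) {
    const int p = pos + i;
    out[i] = buf[p < last_valid ? p : last_valid];
  }
}
```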
static void highbd_dr_prediction_z1_upsample0_neon(uint16_t *dst,
ptrdiff_t stride, int bw,
int bh,
@ -1336,13 +1363,26 @@ static void highbd_dr_prediction_z1_upsample0_neon(uint16_t *dst,
} else {
int c = 0;
do {
const uint16x8_t a0 = vld1q_u16(&above[base + c]);
const uint16x8_t a1 = vld1q_u16(&above[base + c + 1]);
const uint16x8_t val = highbd_dr_z1_apply_shift_x8(a0, a1, shift);
const uint16x8_t cmp =
vcgtq_s16(vdupq_n_s16(max_base_x - base - c), iota1x8);
const uint16x8_t res = vbslq_u16(cmp, val, vdupq_n_u16(above_max));
vst1q_u16(dst + c, res);
uint16x8_t a0;
uint16x8_t a1;
if (base + c >= max_base_x) {
a0 = a1 = vdupq_n_u16(above_max);
} else {
if (base + c + 7 >= max_base_x) {
int shuffle_idx = max_base_x - base - c;
a0 = zn_load_masked_neon(above + (max_base_x - 7), shuffle_idx);
} else {
a0 = vld1q_u16(above + base + c);
}
if (base + c + 8 >= max_base_x) {
int shuffle_idx = max_base_x - base - c - 1;
a1 = zn_load_masked_neon(above + (max_base_x - 7), shuffle_idx);
} else {
a1 = vld1q_u16(above + base + c + 1);
}
}
vst1q_u16(dst + c, highbd_dr_z1_apply_shift_x8(a0, a1, shift));
c += 8;
} while (c < bw);
}
@ -2456,13 +2496,29 @@ void av1_highbd_dr_prediction_z2_neon(uint16_t *dst, ptrdiff_t stride, int bw,
val_lo = vmlal_lane_u16(val_lo, vget_low_u16(in1), (s1), (lane)); \
uint32x4_t val_hi = vmull_lane_u16(vget_high_u16(in0), (s0), (lane)); \
val_hi = vmlal_lane_u16(val_hi, vget_high_u16(in1), (s1), (lane)); \
const uint16x8_t cmp = vaddq_u16((iota), vdupq_n_u16(base)); \
const uint16x8_t res = vcombine_u16(vrshrn_n_u32(val_lo, (shift)), \
vrshrn_n_u32(val_hi, (shift))); \
*(out) = vbslq_u16(vcltq_u16(cmp, vdupq_n_u16(max_base_y)), res, \
vdupq_n_u16(left_max)); \
*(out) = vcombine_u16(vrshrn_n_u32(val_lo, (shift)), \
vrshrn_n_u32(val_hi, (shift))); \
} while (0)
static INLINE uint16x8x2_t z3_load_left_neon(const uint16_t *left0, int ofs,
int max_ofs) {
uint16x8_t r0;
uint16x8_t r1;
if (ofs + 7 >= max_ofs) {
int shuffle_idx = max_ofs - ofs;
r0 = zn_load_masked_neon(left0 + (max_ofs - 7), shuffle_idx);
} else {
r0 = vld1q_u16(left0 + ofs);
}
if (ofs + 8 >= max_ofs) {
int shuffle_idx = max_ofs - ofs - 1;
r1 = zn_load_masked_neon(left0 + (max_ofs - 7), shuffle_idx);
} else {
r1 = vld1q_u16(left0 + ofs + 1);
}
return (uint16x8x2_t){ { r0, r1 } };
}
static void highbd_dr_prediction_z3_upsample0_neon(uint16_t *dst,
ptrdiff_t stride, int bw,
int bh, const uint16_t *left,
@ -2561,34 +2617,30 @@ static void highbd_dr_prediction_z3_upsample0_neon(uint16_t *dst,
if (base0 >= max_base_y) {
out[0] = vdupq_n_u16(left_max);
} else {
const uint16x8_t l00 = vld1q_u16(left + base0);
const uint16x8_t l01 = vld1q_u16(left1 + base0);
HIGHBD_DR_PREDICTOR_Z3_STEP_X8(&out[0], iota1x8, base0, l00, l01,
shifts0, shifts1, 0, 6);
const uint16x8x2_t l0 = z3_load_left_neon(left, base0, max_base_y);
HIGHBD_DR_PREDICTOR_Z3_STEP_X8(&out[0], iota1x8, base0, l0.val[0],
l0.val[1], shifts0, shifts1, 0, 6);
}
if (base1 >= max_base_y) {
out[1] = vdupq_n_u16(left_max);
} else {
const uint16x8_t l10 = vld1q_u16(left + base1);
const uint16x8_t l11 = vld1q_u16(left1 + base1);
HIGHBD_DR_PREDICTOR_Z3_STEP_X8(&out[1], iota1x8, base1, l10, l11,
shifts0, shifts1, 1, 6);
const uint16x8x2_t l1 = z3_load_left_neon(left, base1, max_base_y);
HIGHBD_DR_PREDICTOR_Z3_STEP_X8(&out[1], iota1x8, base1, l1.val[0],
l1.val[1], shifts0, shifts1, 1, 6);
}
if (base2 >= max_base_y) {
out[2] = vdupq_n_u16(left_max);
} else {
const uint16x8_t l20 = vld1q_u16(left + base2);
const uint16x8_t l21 = vld1q_u16(left1 + base2);
HIGHBD_DR_PREDICTOR_Z3_STEP_X8(&out[2], iota1x8, base2, l20, l21,
shifts0, shifts1, 2, 6);
const uint16x8x2_t l2 = z3_load_left_neon(left, base2, max_base_y);
HIGHBD_DR_PREDICTOR_Z3_STEP_X8(&out[2], iota1x8, base2, l2.val[0],
l2.val[1], shifts0, shifts1, 2, 6);
}
if (base3 >= max_base_y) {
out[3] = vdupq_n_u16(left_max);
} else {
const uint16x8_t l30 = vld1q_u16(left + base3);
const uint16x8_t l31 = vld1q_u16(left1 + base3);
HIGHBD_DR_PREDICTOR_Z3_STEP_X8(&out[3], iota1x8, base3, l30, l31,
shifts0, shifts1, 3, 6);
const uint16x8x2_t l3 = z3_load_left_neon(left, base3, max_base_y);
HIGHBD_DR_PREDICTOR_Z3_STEP_X8(&out[3], iota1x8, base3, l3.val[0],
l3.val[1], shifts0, shifts1, 3, 6);
}
transpose_array_inplace_u16_4x8(out);
for (int r2 = 0; r2 < 4; ++r2) {


@ -14,6 +14,7 @@
#include <string.h>
#include "config/aom_config.h"
#include "config/aom_dsp_rtcd.h"
#include "aom_dsp/quantize.h"

third_party/aom/aom_dsp/arm/intrapred_neon.c (vendored, 66 lines changed)

@ -15,6 +15,7 @@
#include "config/aom_config.h"
#include "config/aom_dsp_rtcd.h"
#include "config/av1_rtcd.h"
#include "aom/aom_integer.h"
#include "aom_dsp/arm/mem_neon.h"
@ -1356,6 +1357,41 @@ static void dr_prediction_z1_32xN_neon(int N, uint8_t *dst, ptrdiff_t stride,
}
}
// clang-format off
static const uint8_t kLoadMaxShuffles[] = {
15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
14, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
13, 14, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
12, 13, 14, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
11, 12, 13, 14, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
10, 11, 12, 13, 14, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
9, 10, 11, 12, 13, 14, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
8, 9, 10, 11, 12, 13, 14, 15, 15, 15, 15, 15, 15, 15, 15, 15,
7, 8, 9, 10, 11, 12, 13, 14, 15, 15, 15, 15, 15, 15, 15, 15,
6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 15, 15, 15, 15, 15, 15,
5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 15, 15, 15, 15, 15,
4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 15, 15, 15, 15,
3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 15, 15, 15,
2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 15, 15,
1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 15,
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
};
// clang-format on
static INLINE uint8x16_t z1_load_masked_neon(const uint8_t *ptr,
int shuffle_idx) {
uint8x16_t shuffle = vld1q_u8(&kLoadMaxShuffles[16 * shuffle_idx]);
uint8x16_t src = vld1q_u8(ptr);
#if AOM_ARCH_AARCH64
return vqtbl1q_u8(src, shuffle);
#else
uint8x8x2_t src2 = { { vget_low_u8(src), vget_high_u8(src) } };
uint8x8_t lo = vtbl2_u8(src2, vget_low_u8(shuffle));
uint8x8_t hi = vtbl2_u8(src2, vget_high_u8(shuffle));
return vcombine_u8(lo, hi);
#endif
}
static void dr_prediction_z1_64xN_neon(int N, uint8_t *dst, ptrdiff_t stride,
const uint8_t *above, int dx) {
const int frac_bits = 6;
@ -1369,7 +1405,6 @@ static void dr_prediction_z1_64xN_neon(int N, uint8_t *dst, ptrdiff_t stride,
// (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
const uint8x16_t a_mbase_x = vdupq_n_u8(above[max_base_x]);
const uint8x16_t max_base_x128 = vdupq_n_u8(max_base_x);
int x = dx;
for (int r = 0; r < N; r++, dst += stride) {
@ -1391,12 +1426,24 @@ static void dr_prediction_z1_64xN_neon(int N, uint8_t *dst, ptrdiff_t stride,
vcreate_u8(0x0F0E0D0C0B0A0908)));
for (int j = 0; j < 64; j += 16) {
int mdif = max_base_x - (base + j);
if (mdif <= 0) {
if (base + j >= max_base_x) {
vst1q_u8(dst + j, a_mbase_x);
} else {
uint8x16_t a0_128 = vld1q_u8(above + base + j);
uint8x16_t a1_128 = vld1q_u8(above + base + 1 + j);
uint8x16_t a0_128;
uint8x16_t a1_128;
if (base + j + 15 >= max_base_x) {
int shuffle_idx = max_base_x - base - j;
a0_128 = z1_load_masked_neon(above + (max_base_x - 15), shuffle_idx);
} else {
a0_128 = vld1q_u8(above + base + j);
}
if (base + j + 16 >= max_base_x) {
int shuffle_idx = max_base_x - base - j - 1;
a1_128 = z1_load_masked_neon(above + (max_base_x - 15), shuffle_idx);
} else {
a1_128 = vld1q_u8(above + base + j + 1);
}
uint16x8_t diff_lo = vsubl_u8(vget_low_u8(a1_128), vget_low_u8(a0_128));
uint16x8_t diff_hi =
vsubl_u8(vget_high_u8(a1_128), vget_high_u8(a0_128));
@ -1406,13 +1453,8 @@ static void dr_prediction_z1_64xN_neon(int N, uint8_t *dst, ptrdiff_t stride,
vmlal_u8(vdupq_n_u16(16), vget_high_u8(a0_128), vdup_n_u8(32));
uint16x8_t res_lo = vmlaq_u16(a32_lo, diff_lo, shift);
uint16x8_t res_hi = vmlaq_u16(a32_hi, diff_hi, shift);
uint8x16_t v_temp =
vcombine_u8(vshrn_n_u16(res_lo, 5), vshrn_n_u16(res_hi, 5));
uint8x16_t mask128 =
vcgtq_u8(vqsubq_u8(max_base_x128, base_inc128), vdupq_n_u8(0));
uint8x16_t res128 = vbslq_u8(mask128, v_temp, a_mbase_x);
vst1q_u8(dst + j, res128);
vst1q_u8(dst + j,
vcombine_u8(vshrn_n_u16(res_lo, 5), vshrn_n_u16(res_hi, 5)));
base_inc128 = vaddq_u8(base_inc128, vdupq_n_u8(16));
}

third_party/aom/aom_dsp/arm/mem_neon.h (vendored)

@ -174,6 +174,16 @@ static INLINE void load_u8_8x4(const uint8_t *s, const ptrdiff_t p,
*s3 = vld1_u8(s);
}
static INLINE void load_u8_8x3(const uint8_t *s, const ptrdiff_t p,
uint8x8_t *const s0, uint8x8_t *const s1,
uint8x8_t *const s2) {
*s0 = vld1_u8(s);
s += p;
*s1 = vld1_u8(s);
s += p;
*s2 = vld1_u8(s);
}
static INLINE void load_u16_4x4(const uint16_t *s, const ptrdiff_t p,
uint16x4_t *const s0, uint16x4_t *const s1,
uint16x4_t *const s2, uint16x4_t *const s3) {
@ -221,6 +231,16 @@ static INLINE void load_u16_8x2(const uint16_t *s, const ptrdiff_t p,
*s1 = vld1q_u16(s);
}
static INLINE void load_u16_8x3(const uint16_t *s, const ptrdiff_t p,
uint16x8_t *const s0, uint16x8_t *const s1,
uint16x8_t *const s2) {
*s0 = vld1q_u16(s);
s += p;
*s1 = vld1q_u16(s);
s += p;
*s2 = vld1q_u16(s);
}
static INLINE void load_u16_8x4(const uint16_t *s, const ptrdiff_t p,
uint16x8_t *const s0, uint16x8_t *const s1,
uint16x8_t *const s2, uint16x8_t *const s3) {
@ -634,6 +654,13 @@ static INLINE void store_s16_8x4(int16_t *s, ptrdiff_t dst_stride,
vst1q_s16(s, s3);
}
static INLINE void store_s16_8x2(int16_t *s, ptrdiff_t dst_stride,
const int16x8_t s0, const int16x8_t s1) {
vst1q_s16(s, s0);
s += dst_stride;
vst1q_s16(s, s1);
}
static INLINE void load_u8_8x11(const uint8_t *s, ptrdiff_t p,
uint8x8_t *const s0, uint8x8_t *const s1,
uint8x8_t *const s2, uint8x8_t *const s3,
@ -1026,6 +1053,21 @@ static INLINE void load_u8_16x8(const uint8_t *s, ptrdiff_t p,
*s7 = vld1q_u8(s);
}
static INLINE void load_u8_16x5(const uint8_t *s, ptrdiff_t p,
uint8x16_t *const s0, uint8x16_t *const s1,
uint8x16_t *const s2, uint8x16_t *const s3,
uint8x16_t *const s4) {
*s0 = vld1q_u8(s);
s += p;
*s1 = vld1q_u8(s);
s += p;
*s2 = vld1q_u8(s);
s += p;
*s3 = vld1q_u8(s);
s += p;
*s4 = vld1q_u8(s);
}
static INLINE void load_u8_16x4(const uint8_t *s, ptrdiff_t p,
uint8x16_t *const s0, uint8x16_t *const s1,
uint8x16_t *const s2, uint8x16_t *const s3) {
@ -1038,6 +1080,16 @@ static INLINE void load_u8_16x4(const uint8_t *s, ptrdiff_t p,
*s3 = vld1q_u8(s);
}
static INLINE void load_u8_16x3(const uint8_t *s, ptrdiff_t p,
uint8x16_t *const s0, uint8x16_t *const s1,
uint8x16_t *const s2) {
*s0 = vld1q_u8(s);
s += p;
*s1 = vld1q_u8(s);
s += p;
*s2 = vld1q_u8(s);
}
static INLINE void load_u16_8x8(const uint16_t *s, const ptrdiff_t p,
uint16x8_t *s0, uint16x8_t *s1, uint16x8_t *s2,
uint16x8_t *s3, uint16x8_t *s4, uint16x8_t *s5,
@ -1228,6 +1280,12 @@ static INLINE uint8x8_t load_u8_gather_s16_x8(const uint8_t *src,
memcpy(dst, &a, 8); \
} while (0)
#define store_s16_4x1_lane(dst, src, lane) \
do { \
int64_t a = vgetq_lane_s64(vreinterpretq_s64_s16(src), lane); \
memcpy(dst, &a, 8); \
} while (0)
// Store the low 16-bits from a single vector.
static INLINE void store_u8_2x1(uint8_t *dst, const uint8x8_t src) {
store_u8_2x1_lane(dst, src, 0);
@ -1287,9 +1345,18 @@ static INLINE void store_u16x4_strided_x2(uint16_t *dst, uint32_t dst_stride,
store_u16_4x1_lane(dst, src, 1);
}
// Store two blocks of 64-bits from a single vector.
static INLINE void store_s16x4_strided_x2(int16_t *dst, int32_t dst_stride,
int16x8_t src) {
store_s16_4x1_lane(dst, src, 0);
dst += dst_stride;
store_s16_4x1_lane(dst, src, 1);
}
#undef store_u8_2x1_lane
#undef store_u8_4x1_lane
#undef store_u16_2x1_lane
#undef store_u16_4x1_lane
#undef store_s16_4x1_lane
#endif // AOM_AOM_DSP_ARM_MEM_NEON_H_

third_party/aom/aom_dsp/arm/subtract_neon.c (vendored)

@ -12,6 +12,7 @@
#include <arm_neon.h>
#include "config/aom_config.h"
#include "config/aom_dsp_rtcd.h"
#include "aom/aom_integer.h"
#include "aom_ports/mem.h"

third_party/aom/aom_dsp/x86/aom_asm_stubs.c (vendored)

@ -1,61 +0,0 @@
/*
* Copyright (c) 2016, Alliance for Open Media. All rights reserved
*
* This source code is subject to the terms of the BSD 2 Clause License and
* the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
* was not distributed with this source code in the LICENSE file, you can
* obtain it at www.aomedia.org/license/software. If the Alliance for Open
* Media Patent License 1.0 was not distributed with this source code in the
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
#include "config/aom_config.h"
#include "config/aom_dsp_rtcd.h"
#include "aom_dsp/x86/convolve.h"
#if HAVE_SSE2
#if CONFIG_AV1_HIGHBITDEPTH
highbd_filter8_1dfunction aom_highbd_filter_block1d16_v8_sse2;
highbd_filter8_1dfunction aom_highbd_filter_block1d16_h8_sse2;
highbd_filter8_1dfunction aom_highbd_filter_block1d8_v8_sse2;
highbd_filter8_1dfunction aom_highbd_filter_block1d8_h8_sse2;
highbd_filter8_1dfunction aom_highbd_filter_block1d4_v8_sse2;
highbd_filter8_1dfunction aom_highbd_filter_block1d4_h8_sse2;
highbd_filter8_1dfunction aom_highbd_filter_block1d16_v4_sse2;
highbd_filter8_1dfunction aom_highbd_filter_block1d16_h4_sse2;
highbd_filter8_1dfunction aom_highbd_filter_block1d8_v4_sse2;
highbd_filter8_1dfunction aom_highbd_filter_block1d8_h4_sse2;
highbd_filter8_1dfunction aom_highbd_filter_block1d4_v4_sse2;
highbd_filter8_1dfunction aom_highbd_filter_block1d4_h4_sse2;
highbd_filter8_1dfunction aom_highbd_filter_block1d16_v2_sse2;
highbd_filter8_1dfunction aom_highbd_filter_block1d16_h2_sse2;
highbd_filter8_1dfunction aom_highbd_filter_block1d8_v2_sse2;
highbd_filter8_1dfunction aom_highbd_filter_block1d8_h2_sse2;
highbd_filter8_1dfunction aom_highbd_filter_block1d4_v2_sse2;
highbd_filter8_1dfunction aom_highbd_filter_block1d4_h2_sse2;
// void aom_highbd_convolve8_horiz_sse2(const uint8_t *src,
// ptrdiff_t src_stride,
// uint8_t *dst,
// ptrdiff_t dst_stride,
// const int16_t *filter_x,
// int x_step_q4,
// const int16_t *filter_y,
// int y_step_q4,
// int w, int h, int bd);
// void aom_highbd_convolve8_vert_sse2(const uint8_t *src,
// ptrdiff_t src_stride,
// uint8_t *dst,
// ptrdiff_t dst_stride,
// const int16_t *filter_x,
// int x_step_q4,
// const int16_t *filter_y,
// int y_step_q4,
// int w, int h, int bd);
HIGH_FUN_CONV_1D(horiz, x_step_q4, filter_x, h, src, , sse2)
HIGH_FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , sse2)
#endif
#endif // HAVE_SSE2


@ -202,14 +202,15 @@
SECTION .text
;void aom_filter_block1d4_v8_sse2
;void aom_highbd_filter_block1d4_v8_sse2
;(
; unsigned char *src_ptr,
; unsigned int src_pitch,
; unsigned char *output_ptr,
; unsigned int out_pitch,
; unsigned int output_height,
; short *filter
; const uint16_t *src_ptr,
; const ptrdiff_t src_pitch,
; uint16_t *output_ptr,
; ptrdiff_t out_pitch,
; unsigned int output_height,
; const int16_t *filter,
; int bd
;)
globalsym(aom_highbd_filter_block1d4_v8_sse2)
sym(aom_highbd_filter_block1d4_v8_sse2):
@ -272,14 +273,15 @@ sym(aom_highbd_filter_block1d4_v8_sse2):
pop rbp
ret
;void aom_filter_block1d8_v8_sse2
;void aom_highbd_filter_block1d8_v8_sse2
;(
; unsigned char *src_ptr,
; unsigned int src_pitch,
; unsigned char *output_ptr,
; unsigned int out_pitch,
; unsigned int output_height,
; short *filter
; const uint16_t *src_ptr,
; const ptrdiff_t src_pitch,
; uint16_t *output_ptr,
; ptrdiff_t out_pitch,
; unsigned int output_height,
; const int16_t *filter,
; int bd
;)
globalsym(aom_highbd_filter_block1d8_v8_sse2)
sym(aom_highbd_filter_block1d8_v8_sse2):
@ -331,14 +333,15 @@ sym(aom_highbd_filter_block1d8_v8_sse2):
pop rbp
ret
;void aom_filter_block1d16_v8_sse2
;void aom_highbd_filter_block1d16_v8_sse2
;(
; unsigned char *src_ptr,
; unsigned int src_pitch,
; unsigned char *output_ptr,
; unsigned int out_pitch,
; unsigned int output_height,
; short *filter
; const uint16_t *src_ptr,
; const ptrdiff_t src_pitch,
; uint16_t *output_ptr,
; ptrdiff_t out_pitch,
; unsigned int output_height,
; const int16_t *filter,
; int bd
;)
globalsym(aom_highbd_filter_block1d16_v8_sse2)
sym(aom_highbd_filter_block1d16_v8_sse2):
@ -394,14 +397,15 @@ sym(aom_highbd_filter_block1d16_v8_sse2):
pop rbp
ret
;void aom_filter_block1d4_h8_sse2
;void aom_highbd_filter_block1d4_h8_sse2
;(
; unsigned char *src_ptr,
; unsigned int src_pixels_per_line,
; unsigned char *output_ptr,
; unsigned int output_pitch,
; unsigned int output_height,
; short *filter
; const uint16_t *src_ptr,
; const ptrdiff_t src_pitch,
; uint16_t *output_ptr,
; ptrdiff_t out_pitch,
; unsigned int output_height,
; const int16_t *filter,
; int bd
;)
globalsym(aom_highbd_filter_block1d4_h8_sse2)
sym(aom_highbd_filter_block1d4_h8_sse2):
@ -469,14 +473,15 @@ sym(aom_highbd_filter_block1d4_h8_sse2):
pop rbp
ret
;void aom_filter_block1d8_h8_sse2
;void aom_highbd_filter_block1d8_h8_sse2
;(
; unsigned char *src_ptr,
; unsigned int src_pixels_per_line,
; unsigned char *output_ptr,
; unsigned int output_pitch,
; unsigned int output_height,
; short *filter
; const uint16_t *src_ptr,
; const ptrdiff_t src_pitch,
; uint16_t *output_ptr,
; ptrdiff_t out_pitch,
; unsigned int output_height,
; const int16_t *filter,
; int bd
;)
globalsym(aom_highbd_filter_block1d8_h8_sse2)
sym(aom_highbd_filter_block1d8_h8_sse2):
@ -535,14 +540,15 @@ sym(aom_highbd_filter_block1d8_h8_sse2):
pop rbp
ret
;void aom_filter_block1d16_h8_sse2
;void aom_highbd_filter_block1d16_h8_sse2
;(
; unsigned char *src_ptr,
; unsigned int src_pixels_per_line,
; unsigned char *output_ptr,
; unsigned int output_pitch,
; unsigned int output_height,
; short *filter
; const uint16_t *src_ptr,
; const ptrdiff_t src_pitch,
; uint16_t *output_ptr,
; ptrdiff_t out_pitch,
; unsigned int output_height,
; const int16_t *filter,
; int bd
;)
globalsym(aom_highbd_filter_block1d16_h8_sse2)
sym(aom_highbd_filter_block1d16_h8_sse2):


@ -15,6 +15,7 @@
#include "aom/aom_integer.h"
#include "aom_dsp/x86/bitdepth_conversion_sse2.h"
#include "aom_dsp/x86/mem_sse2.h"
#include "aom_dsp/x86/synonyms.h"
#include "aom_ports/mem.h"
static INLINE void sign_extend_16bit_to_32bit_sse2(__m128i in, __m128i zero,
@ -171,10 +172,8 @@ unsigned int aom_avg_4x4_sse2(const uint8_t *s, int p) {
__m128i s0, s1, u0;
unsigned int avg = 0;
u0 = _mm_setzero_si128();
s0 = _mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)(s)),
_mm_cvtsi32_si128(*(const int *)(s + p)));
s1 = _mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)(s + p * 2)),
_mm_cvtsi32_si128(*(const int *)(s + p * 3)));
s0 = _mm_unpacklo_epi32(xx_loadl_32(s), xx_loadl_32(s + p));
s1 = _mm_unpacklo_epi32(xx_loadl_32(s + p * 2), xx_loadl_32(s + p * 3));
s0 = _mm_sad_epu8(s0, u0);
s1 = _mm_sad_epu8(s1, u0);
s0 = _mm_add_epi16(s0, s1);


@ -15,10 +15,9 @@
// -----------------------------------------------------------------------------
void aom_highbd_filter_block1d4_v4_sse2(const uint16_t *src_ptr,
ptrdiff_t src_pitch, uint16_t *dst_ptr,
ptrdiff_t dst_pitch, uint32_t height,
const int16_t *filter, int bd) {
static void aom_highbd_filter_block1d4_v4_sse2(
const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
__m128i filtersReg;
__m128i srcReg2, srcReg3, srcReg4, srcReg5, srcReg6;
__m128i srcReg23_lo, srcReg34_lo;
@ -101,10 +100,9 @@ void aom_highbd_filter_block1d4_v4_sse2(const uint16_t *src_ptr,
}
}
void aom_highbd_filter_block1d4_h4_sse2(const uint16_t *src_ptr,
ptrdiff_t src_pitch, uint16_t *dst_ptr,
ptrdiff_t dst_pitch, uint32_t height,
const int16_t *filter, int bd) {
static void aom_highbd_filter_block1d4_h4_sse2(
const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
__m128i filtersReg;
__m128i addFilterReg64;
__m128i secondFilters, thirdFilters;
@ -153,10 +151,9 @@ void aom_highbd_filter_block1d4_h4_sse2(const uint16_t *src_ptr,
}
}
void aom_highbd_filter_block1d8_v4_sse2(const uint16_t *src_ptr,
ptrdiff_t src_pitch, uint16_t *dst_ptr,
ptrdiff_t dst_pitch, uint32_t height,
const int16_t *filter, int bd) {
static void aom_highbd_filter_block1d8_v4_sse2(
const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
__m128i filtersReg;
__m128i srcReg2, srcReg3, srcReg4, srcReg5, srcReg6;
__m128i srcReg23_lo, srcReg23_hi, srcReg34_lo, srcReg34_hi;
@ -262,10 +259,9 @@ void aom_highbd_filter_block1d8_v4_sse2(const uint16_t *src_ptr,
}
}
void aom_highbd_filter_block1d8_h4_sse2(const uint16_t *src_ptr,
ptrdiff_t src_pitch, uint16_t *dst_ptr,
ptrdiff_t dst_pitch, uint32_t height,
const int16_t *filter, int bd) {
static void aom_highbd_filter_block1d8_h4_sse2(
const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
__m128i filtersReg;
__m128i addFilterReg64;
__m128i secondFilters, thirdFilters;
@ -330,22 +326,57 @@ void aom_highbd_filter_block1d8_h4_sse2(const uint16_t *src_ptr,
}
}
void aom_highbd_filter_block1d16_v4_sse2(const uint16_t *src_ptr,
ptrdiff_t src_pitch, uint16_t *dst_ptr,
ptrdiff_t dst_pitch, uint32_t height,
const int16_t *filter, int bd) {
static void aom_highbd_filter_block1d16_v4_sse2(
const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
aom_highbd_filter_block1d8_v4_sse2(src_ptr, src_pitch, dst_ptr, dst_pitch,
height, filter, bd);
aom_highbd_filter_block1d8_v4_sse2((src_ptr + 8), src_pitch, (dst_ptr + 8),
dst_pitch, height, filter, bd);
}
void aom_highbd_filter_block1d16_h4_sse2(const uint16_t *src_ptr,
ptrdiff_t src_pitch, uint16_t *dst_ptr,
ptrdiff_t dst_pitch, uint32_t height,
const int16_t *filter, int bd) {
static void aom_highbd_filter_block1d16_h4_sse2(
const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
aom_highbd_filter_block1d8_h4_sse2(src_ptr, src_pitch, dst_ptr, dst_pitch,
height, filter, bd);
aom_highbd_filter_block1d8_h4_sse2((src_ptr + 8), src_pitch, (dst_ptr + 8),
dst_pitch, height, filter, bd);
}
// From aom_dsp/x86/aom_high_subpixel_8t_sse2.asm
highbd_filter8_1dfunction aom_highbd_filter_block1d16_v8_sse2;
highbd_filter8_1dfunction aom_highbd_filter_block1d16_h8_sse2;
highbd_filter8_1dfunction aom_highbd_filter_block1d8_v8_sse2;
highbd_filter8_1dfunction aom_highbd_filter_block1d8_h8_sse2;
highbd_filter8_1dfunction aom_highbd_filter_block1d4_v8_sse2;
highbd_filter8_1dfunction aom_highbd_filter_block1d4_h8_sse2;
// From aom_dsp/x86/aom_high_subpixel_bilinear_sse2.asm
highbd_filter8_1dfunction aom_highbd_filter_block1d16_v2_sse2;
highbd_filter8_1dfunction aom_highbd_filter_block1d16_h2_sse2;
highbd_filter8_1dfunction aom_highbd_filter_block1d8_v2_sse2;
highbd_filter8_1dfunction aom_highbd_filter_block1d8_h2_sse2;
highbd_filter8_1dfunction aom_highbd_filter_block1d4_v2_sse2;
highbd_filter8_1dfunction aom_highbd_filter_block1d4_h2_sse2;
// void aom_highbd_convolve8_horiz_sse2(const uint8_t *src,
// ptrdiff_t src_stride,
// uint8_t *dst,
// ptrdiff_t dst_stride,
// const int16_t *filter_x,
// int x_step_q4,
// const int16_t *filter_y,
// int y_step_q4,
// int w, int h, int bd);
// void aom_highbd_convolve8_vert_sse2(const uint8_t *src,
// ptrdiff_t src_stride,
// uint8_t *dst,
// ptrdiff_t dst_stride,
// const int16_t *filter_x,
// int x_step_q4,
// const int16_t *filter_y,
// int y_step_q4,
// int w, int h, int bd);
HIGH_FUN_CONV_1D(horiz, x_step_q4, filter_x, h, src, , sse2)
HIGH_FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , sse2)


@ -551,7 +551,7 @@ unsigned int aom_highbd_sad128x128_avg_avx2(const uint8_t *src, int src_stride,
static INLINE void get_4d_sad_from_mm256_epi32(const __m256i *v,
uint32_t *res) {
__m256i u0, u1, u2, u3;
const __m256i mask = yy_set1_64_from_32i(~0);
const __m256i mask = _mm256_set1_epi64x(~0u);
__m128i sad;
// 8 32-bit summation


@ -17,16 +17,7 @@
#include "config/aom_dsp_rtcd.h"
#include "aom_dsp/x86/synonyms.h"
void aom_var_filter_block2d_bil_first_pass_ssse3(
const uint8_t *a, uint16_t *b, unsigned int src_pixels_per_line,
unsigned int pixel_step, unsigned int output_height,
unsigned int output_width, const uint8_t *filter);
void aom_var_filter_block2d_bil_second_pass_ssse3(
const uint16_t *a, uint8_t *b, unsigned int src_pixels_per_line,
unsigned int pixel_step, unsigned int output_height,
unsigned int output_width, const uint8_t *filter);
#include "aom_dsp/x86/variance_impl_ssse3.h"
static INLINE void compute_dist_wtd_avg(__m128i *p0, __m128i *p1,
const __m128i *w, const __m128i *r,


@ -9,7 +9,7 @@
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
#include <tmmintrin.h>
#include <immintrin.h>
#include "config/aom_config.h"
#include "config/aom_dsp_rtcd.h"


@ -15,6 +15,7 @@
#include <smmintrin.h>
#include "aom_dsp/x86/obmc_intrinsic_ssse3.h"
#include "aom_dsp/x86/synonyms.h"
static INLINE void obmc_variance_w4(const uint8_t *pre, const int pre_stride,
const int32_t *wsrc, const int32_t *mask,
@ -28,7 +29,7 @@ static INLINE void obmc_variance_w4(const uint8_t *pre, const int pre_stride,
assert(IS_POWER_OF_TWO(h));
do {
const __m128i v_p_b = _mm_cvtsi32_si128(*(const int *)(pre + n));
const __m128i v_p_b = xx_loadl_32(pre + n);
const __m128i v_m_d = _mm_load_si128((const __m128i *)(mask + n));
const __m128i v_w_d = _mm_load_si128((const __m128i *)(wsrc + n));


@ -22,21 +22,12 @@
#include "aom_dsp/aom_filter.h"
#include "aom_dsp/x86/obmc_intrinsic_sse4.h"
#include "aom_dsp/x86/synonyms.h"
#include "aom_dsp/x86/variance_impl_ssse3.h"
////////////////////////////////////////////////////////////////////////////////
// 8 bit
////////////////////////////////////////////////////////////////////////////////
void aom_var_filter_block2d_bil_first_pass_ssse3(
const uint8_t *a, uint16_t *b, unsigned int src_pixels_per_line,
unsigned int pixel_step, unsigned int output_height,
unsigned int output_width, const uint8_t *filter);
void aom_var_filter_block2d_bil_second_pass_ssse3(
const uint16_t *a, uint8_t *b, unsigned int src_pixels_per_line,
unsigned int pixel_step, unsigned int output_height,
unsigned int output_width, const uint8_t *filter);
static INLINE void obmc_variance_w8n(const uint8_t *pre, const int pre_stride,
const int32_t *wsrc, const int32_t *mask,
unsigned int *const sse, int *const sum,


@ -21,7 +21,7 @@ static uint64_t aom_sum_squares_2d_i16_nxn_avx2(const int16_t *src, int stride,
int width, int height) {
uint64_t result;
__m256i v_acc_q = _mm256_setzero_si256();
const __m256i v_zext_mask_q = yy_set1_64_from_32i(~0);
const __m256i v_zext_mask_q = _mm256_set1_epi64x(~0u);
for (int col = 0; col < height; col += 4) {
__m256i v_acc_d = _mm256_setzero_si256();
for (int row = 0; row < width; row += 16) {


@ -84,7 +84,7 @@ uint64_t aom_sum_squares_2d_i16_4xn_sse2(const int16_t *src, int stride,
src += stride << 2;
r += 4;
} while (r < height);
const __m128i v_zext_mask_q = xx_set1_64_from_32i(~0);
const __m128i v_zext_mask_q = _mm_set1_epi64x(~0u);
__m128i v_acc_64 = _mm_add_epi64(_mm_srli_epi64(v_acc_q, 32),
_mm_and_si128(v_acc_q, v_zext_mask_q));
v_acc_64 = _mm_add_epi64(v_acc_64, _mm_srli_si128(v_acc_64, 8));
@ -116,7 +116,7 @@ aom_sum_squares_2d_i16_nxn_sse2(const int16_t *src, int stride, int width,
int height) {
int r = 0;
const __m128i v_zext_mask_q = xx_set1_64_from_32i(~0);
const __m128i v_zext_mask_q = _mm_set1_epi64x(~0u);
__m128i v_acc_q = _mm_setzero_si128();
do {
@ -254,7 +254,7 @@ uint64_t aom_sum_sse_2d_i16_sse2(const int16_t *src, int src_stride, int width,
//////////////////////////////////////////////////////////////////////////////
static uint64_t aom_sum_squares_i16_64n_sse2(const int16_t *src, uint32_t n) {
const __m128i v_zext_mask_q = xx_set1_64_from_32i(~0);
const __m128i v_zext_mask_q = _mm_set1_epi64x(~0u);
__m128i v_acc0_q = _mm_setzero_si128();
__m128i v_acc1_q = _mm_setzero_si128();

third_party/aom/aom_dsp/x86/synonyms.h (vendored)

@ -12,7 +12,7 @@
#ifndef AOM_AOM_DSP_X86_SYNONYMS_H_
#define AOM_AOM_DSP_X86_SYNONYMS_H_
#include <immintrin.h>
#include <emmintrin.h>
#include <string.h>
#include "config/aom_config.h"
@ -46,23 +46,13 @@ static INLINE __m128i xx_loadu_128(const void *a) {
return _mm_loadu_si128((const __m128i *)a);
}
// _mm_loadu_si64 has been introduced in GCC 9, reimplement the function
// manually on older compilers.
#if !defined(__clang__) && __GNUC_MAJOR__ < 9
static INLINE __m128i xx_loadu_2x64(const void *hi, const void *lo) {
__m64 hi_, lo_;
memcpy(&hi_, hi, sizeof(hi_));
memcpy(&lo_, lo, sizeof(lo_));
return _mm_set_epi64(hi_, lo_);
}
#else
// Load 64 bits from each of hi and low, and pack into an SSE register
// Since directly loading as `int64_t`s and using _mm_set_epi64 may violate
// the strict aliasing rule, this takes a different approach
static INLINE __m128i xx_loadu_2x64(const void *hi, const void *lo) {
return _mm_unpacklo_epi64(_mm_loadu_si64(lo), _mm_loadu_si64(hi));
return _mm_unpacklo_epi64(_mm_loadl_epi64((const __m128i *)lo),
_mm_loadl_epi64((const __m128i *)hi));
}
#endif
static INLINE void xx_storel_32(void *const a, const __m128i v) {
const int val = _mm_cvtsi128_si32(v);
@ -81,28 +71,6 @@ static INLINE void xx_storeu_128(void *const a, const __m128i v) {
_mm_storeu_si128((__m128i *)a, v);
}
// The _mm_set_epi64x() intrinsic is undefined for some Visual Studio
// compilers. The following function is equivalent to _mm_set_epi64x()
// acting on 32-bit integers.
static INLINE __m128i xx_set_64_from_32i(int32_t e1, int32_t e0) {
#if defined(_MSC_VER) && _MSC_VER < 1900
return _mm_set_epi32(0, e1, 0, e0);
#else
return _mm_set_epi64x((uint32_t)e1, (uint32_t)e0);
#endif
}
// The _mm_set1_epi64x() intrinsic is undefined for some Visual Studio
// compilers. The following function is equivalent to _mm_set1_epi64x()
// acting on a 32-bit integer.
static INLINE __m128i xx_set1_64_from_32i(int32_t a) {
#if defined(_MSC_VER) && _MSC_VER < 1900
return _mm_set_epi32(0, a, 0, a);
#else
return _mm_set1_epi64x((uint32_t)a);
#endif
}
// Fill an SSE register using an interleaved pair of values, ie. set the
// 8 channels to {a, b, a, b, a, b, a, b}, using the same channel ordering
// as when a register is stored to / loaded from memory.

third_party/aom/aom_dsp/x86/synonyms_avx2.h (vendored)

@ -53,17 +53,6 @@ static INLINE __m256i yy_set2_epi16(int16_t a, int16_t b) {
return _mm256_setr_epi16(a, b, a, b, a, b, a, b, a, b, a, b, a, b, a, b);
}
// The _mm256_set1_epi64x() intrinsic is undefined for some Visual Studio
// compilers. The following function is equivalent to _mm256_set1_epi64x()
// acting on a 32-bit integer.
static INLINE __m256i yy_set1_64_from_32i(int32_t a) {
#if defined(_MSC_VER) && defined(_M_IX86) && _MSC_VER < 1900
return _mm256_set_epi32(0, a, 0, a, 0, a, 0, a);
#else
return _mm256_set1_epi64x((uint32_t)a);
#endif
}
// Some compilers don't have _mm256_set_m128i defined in immintrin.h. We
// therefore define an equivalent function using a different intrinsic.
// ([ hi ], [ lo ]) -> [ hi ][ lo ]
@ -71,26 +60,11 @@ static INLINE __m256i yy_set_m128i(__m128i hi, __m128i lo) {
return _mm256_insertf128_si256(_mm256_castsi128_si256(lo), hi, 1);
}
#define GCC_VERSION (__GNUC__ * 10000 \
+ __GNUC_MINOR__ * 100 \
+ __GNUC_PATCHLEVEL__)
// _mm256_loadu2_m128i has been introduced in GCC 10.1
#if !defined(__clang__) && GCC_VERSION < 101000
static INLINE __m256i yy_loadu2_128(const void *hi, const void *lo) {
__m128i mhi = _mm_loadu_si128((const __m128i *)(hi));
__m128i mlo = _mm_loadu_si128((const __m128i *)(lo));
return _mm256_set_m128i(mhi, mlo);
}
#else
static INLINE __m256i yy_loadu2_128(const void *hi, const void *lo) {
__m128i mhi = _mm_loadu_si128((const __m128i *)(hi));
__m128i mlo = _mm_loadu_si128((const __m128i *)(lo));
return yy_set_m128i(mhi, mlo);
}
#endif
#undef GCC_VERSION
static INLINE void yy_storeu2_128(void *hi, void *lo, const __m256i a) {
_mm_storeu_si128((__m128i *)hi, _mm256_extracti128_si256(a, 1));


@ -15,6 +15,7 @@
#include "config/aom_dsp_rtcd.h"
#include "aom_dsp/x86/synonyms.h"
#include "aom_dsp/x86/variance_impl_ssse3.h"
void aom_var_filter_block2d_bil_first_pass_ssse3(
const uint8_t *a, uint16_t *b, unsigned int src_pixels_per_line,

third_party/aom/aom_dsp/x86/variance_impl_ssse3.h (vendored, new file)

@ -0,0 +1,27 @@
/*
* Copyright (c) 2024, Alliance for Open Media. All rights reserved
*
* This source code is subject to the terms of the BSD 2 Clause License and
* the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
* was not distributed with this source code in the LICENSE file, you can
* obtain it at www.aomedia.org/license/software. If the Alliance for Open
* Media Patent License 1.0 was not distributed with this source code in the
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
#ifndef AOM_AOM_DSP_X86_VARIANCE_IMPL_SSSE3_H_
#define AOM_AOM_DSP_X86_VARIANCE_IMPL_SSSE3_H_
#include <stdint.h>
void aom_var_filter_block2d_bil_first_pass_ssse3(
const uint8_t *a, uint16_t *b, unsigned int src_pixels_per_line,
unsigned int pixel_step, unsigned int output_height,
unsigned int output_width, const uint8_t *filter);
void aom_var_filter_block2d_bil_second_pass_ssse3(
const uint16_t *a, uint8_t *b, unsigned int src_pixels_per_line,
unsigned int pixel_step, unsigned int output_height,
unsigned int output_width, const uint8_t *filter);
#endif // AOM_AOM_DSP_X86_VARIANCE_IMPL_SSSE3_H_


@ -44,7 +44,7 @@ static int arm_get_cpu_caps(void) {
return flags;
}
#elif defined(ANDROID_USE_CPU_FEATURES_LIB)
#elif defined(AOM_USE_ANDROID_CPU_FEATURES)
static int arm_get_cpu_caps(void) {
int flags = 0;


@ -89,7 +89,7 @@ static int arm_get_cpu_caps(void) {
return flags;
}
#elif defined(ANDROID_USE_CPU_FEATURES_LIB)
#elif defined(AOM_USE_ANDROID_CPU_FEATURES)
static int arm_get_cpu_caps(void) {
int flags = 0;

third_party/aom/aom_ports/aom_ports.cmake (vendored)

@ -18,7 +18,7 @@ list(APPEND AOM_PORTS_INCLUDES "${AOM_ROOT}/aom_ports/aom_once.h"
"${AOM_ROOT}/aom_ports/emmintrin_compat.h"
"${AOM_ROOT}/aom_ports/mem.h" "${AOM_ROOT}/aom_ports/mem_ops.h"
"${AOM_ROOT}/aom_ports/mem_ops_aligned.h"
"${AOM_ROOT}/aom_ports/msvc.h" "${AOM_ROOT}/aom_ports/sanitizer.h")
"${AOM_ROOT}/aom_ports/sanitizer.h")
list(APPEND AOM_PORTS_ASM_X86 "${AOM_ROOT}/aom_ports/float.asm")

third_party/aom/aom_ports/arm_cpudetect.h (vendored)

@ -32,7 +32,7 @@
#endif
#if defined(__ANDROID__) && (__ANDROID_API__ < 18)
#define ANDROID_USE_CPU_FEATURES_LIB 1
#define AOM_USE_ANDROID_CPU_FEATURES 1
// Use getauxval() when targeting (64-bit) Android with API level >= 18.
// getauxval() is supported since Android API level 18 (Android 4.3.)
// First Android version with 64-bit support was Android 5.x (API level 21).

third_party/aom/aom_ports/bitops.h (vendored)

@ -15,7 +15,6 @@
#include <assert.h>
#include <stdint.h>
#include "aom_ports/msvc.h"
#include "config/aom_config.h"
#ifdef _MSC_VER

third_party/aom/aom_ports/msvc.h (vendored)

@ -1,75 +0,0 @@
/*
* Copyright (c) 2016, Alliance for Open Media. All rights reserved
*
* This source code is subject to the terms of the BSD 2 Clause License and
* the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
* was not distributed with this source code in the LICENSE file, you can
* obtain it at www.aomedia.org/license/software. If the Alliance for Open
* Media Patent License 1.0 was not distributed with this source code in the
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
#ifndef AOM_AOM_PORTS_MSVC_H_
#define AOM_AOM_PORTS_MSVC_H_
#ifdef _MSC_VER
#include "config/aom_config.h"
#if _MSC_VER < 1900 // VS2015 provides snprintf
#define snprintf _snprintf
#endif // _MSC_VER < 1900
#if _MSC_VER < 1800 // VS2013 provides round
#include <math.h>
static INLINE double round(double x) {
if (x < 0)
return ceil(x - 0.5);
else
return floor(x + 0.5);
}
static INLINE float roundf(float x) {
if (x < 0)
return (float)ceil(x - 0.5f);
else
return (float)floor(x + 0.5f);
}
static INLINE long lroundf(float x) {
if (x < 0)
return (long)(x - 0.5f);
else
return (long)(x + 0.5f);
}
#endif // _MSC_VER < 1800
#if HAVE_AVX
#include <immintrin.h>
// Note:
// _mm256_insert_epi16 intrinsics is available from vs2017.
// We define this macro for vs2015 and earlier. The
// intrinsics used here are in vs2015 document:
// https://msdn.microsoft.com/en-us/library/hh977022.aspx
// Input parameters:
// a: __m256i,
// d: int16_t,
// indx: imm8 (0 - 15)
#if _MSC_VER <= 1900
#define _mm256_insert_epi16(a, d, indx) \
_mm256_insertf128_si256( \
a, \
_mm_insert_epi16(_mm256_extractf128_si256(a, indx >> 3), d, indx % 8), \
indx >> 3)
static INLINE int _mm256_extract_epi32(__m256i a, const int i) {
return a.m256i_i32[i & 7];
}
static INLINE __m256i _mm256_insert_epi32(__m256i a, int b, const int i) {
__m256i c = a;
c.m256i_i32[i & 7] = b;
return c;
}
#endif // _MSC_VER <= 1900
#endif // HAVE_AVX
#endif // _MSC_VER
#endif // AOM_AOM_PORTS_MSVC_H_

third_party/aom/aom_util/aom_pthread.h (vendored)

@ -36,8 +36,6 @@ typedef HANDLE pthread_t;
typedef int pthread_attr_t;
typedef CRITICAL_SECTION pthread_mutex_t;
#include <errno.h>
#if _WIN32_WINNT < 0x0600
#error _WIN32_WINNT must target Windows Vista / Server 2008 or newer.
#endif
@ -74,6 +72,20 @@ static INLINE int pthread_attr_destroy(pthread_attr_t *attr) {
return 0;
}
static INLINE int pthread_attr_getstacksize(const pthread_attr_t *attr,
size_t *stacksize) {
(void)attr;
(void)stacksize;
return EINVAL;
}
static INLINE int pthread_attr_setstacksize(pthread_attr_t *attr,
size_t stacksize) {
(void)attr;
(void)stacksize;
return EINVAL;
}
static INLINE int pthread_create(pthread_t *const thread,
const pthread_attr_t *attr,
unsigned int(__stdcall *start)(void *),

third_party/aom/aom_util/aom_thread.c (vendored)

@ -156,16 +156,18 @@ static int reset(AVxWorker *const worker) {
// See: https://crbug.com/aomedia/3379
#if defined(AOM_ADDRESS_SANITIZER) && defined(__APPLE__) && AOM_ARCH_ARM && \
!defined(NDEBUG)
const size_t kMinStackSize = 1024 * 1024;
#else
const size_t kMinStackSize = 256 * 1024;
#endif
size_t stacksize;
if (!pthread_attr_getstacksize(&attr, &stacksize)) {
const size_t kMinStackSize = 1 << 20; // 1 MiB
if (stacksize < kMinStackSize &&
pthread_attr_setstacksize(&attr, kMinStackSize)) {
pthread_attr_destroy(&attr);
goto Error2;
}
}
#endif
pthread_mutex_lock(&worker->impl_->mutex_);
ok = !pthread_create(&worker->impl_->thread_, &attr, thread_loop, worker);
if (ok) worker->status_ = AVX_WORKER_STATUS_OK;

third_party/aom/av1/av1.cmake (vendored)

@ -266,6 +266,7 @@ list(APPEND AOM_AV1_COMMON_INTRIN_SSE2
"${AOM_ROOT}/av1/common/x86/convolve_2d_sse2.c"
"${AOM_ROOT}/av1/common/x86/convolve_sse2.c"
"${AOM_ROOT}/av1/common/x86/jnt_convolve_sse2.c"
"${AOM_ROOT}/av1/common/x86/resize_sse2.c"
"${AOM_ROOT}/av1/common/x86/wiener_convolve_sse2.c")
list(APPEND AOM_AV1_COMMON_INTRIN_SSSE3
@ -354,35 +355,36 @@ list(APPEND AOM_AV1_ENCODER_INTRIN_AVX2
"${AOM_ROOT}/av1/encoder/x86/ml_avx2.c")
list(APPEND AOM_AV1_ENCODER_INTRIN_NEON
"${AOM_ROOT}/av1/encoder/arm/neon/av1_error_neon.c"
"${AOM_ROOT}/av1/encoder/arm/neon/av1_fwd_txfm2d_neon.c"
"${AOM_ROOT}/av1/encoder/arm/neon/av1_highbd_quantize_neon.c"
"${AOM_ROOT}/av1/encoder/arm/neon/av1_k_means_neon.c"
"${AOM_ROOT}/av1/encoder/arm/neon/cnn_neon.c"
"${AOM_ROOT}/av1/encoder/arm/neon/encodetxb_neon.c"
"${AOM_ROOT}/av1/encoder/arm/neon/highbd_fwd_txfm_neon.c"
"${AOM_ROOT}/av1/encoder/arm/neon/hybrid_fwd_txfm_neon.c"
"${AOM_ROOT}/av1/encoder/arm/neon/ml_neon.c"
"${AOM_ROOT}/av1/encoder/arm/neon/pickrst_neon.c"
"${AOM_ROOT}/av1/encoder/arm/neon/pickrst_neon.h"
"${AOM_ROOT}/av1/encoder/arm/neon/quantize_neon.c"
"${AOM_ROOT}/av1/encoder/arm/neon/rdopt_neon.c"
"${AOM_ROOT}/av1/encoder/arm/neon/reconinter_enc_neon.c"
"${AOM_ROOT}/av1/encoder/arm/neon/temporal_filter_neon.c"
"${AOM_ROOT}/av1/encoder/arm/neon/wedge_utils_neon.c")
"${AOM_ROOT}/av1/encoder/arm/av1_error_neon.c"
"${AOM_ROOT}/av1/encoder/arm/av1_fwd_txfm2d_neon.c"
"${AOM_ROOT}/av1/encoder/arm/av1_highbd_quantize_neon.c"
"${AOM_ROOT}/av1/encoder/arm/av1_k_means_neon.c"
"${AOM_ROOT}/av1/encoder/arm/cnn_neon.c"
"${AOM_ROOT}/av1/encoder/arm/encodetxb_neon.c"
"${AOM_ROOT}/av1/encoder/arm/highbd_fwd_txfm_neon.c"
"${AOM_ROOT}/av1/encoder/arm/hybrid_fwd_txfm_neon.c"
"${AOM_ROOT}/av1/encoder/arm/ml_neon.c"
"${AOM_ROOT}/av1/encoder/arm/pickrst_neon.c"
"${AOM_ROOT}/av1/encoder/arm/pickrst_neon.h"
"${AOM_ROOT}/av1/encoder/arm/quantize_neon.c"
"${AOM_ROOT}/av1/encoder/arm/rdopt_neon.c"
"${AOM_ROOT}/av1/encoder/arm/reconinter_enc_neon.c"
"${AOM_ROOT}/av1/encoder/arm/temporal_filter_neon.c"
"${AOM_ROOT}/av1/encoder/arm/wedge_utils_neon.c")
list(APPEND AOM_AV1_ENCODER_INTRIN_NEON_DOTPROD
"${AOM_ROOT}/av1/encoder/arm/neon/temporal_filter_neon_dotprod.c")
"${AOM_ROOT}/av1/encoder/arm/temporal_filter_neon_dotprod.c")
list(APPEND AOM_AV1_ENCODER_INTRIN_SVE
"${AOM_ROOT}/av1/encoder/arm/neon/av1_error_sve.c"
"${AOM_ROOT}/av1/encoder/arm/neon/pickrst_sve.c"
"${AOM_ROOT}/av1/encoder/arm/neon/wedge_utils_sve.c")
"${AOM_ROOT}/av1/encoder/arm/av1_error_sve.c"
"${AOM_ROOT}/av1/encoder/arm/pickrst_sve.c"
"${AOM_ROOT}/av1/encoder/arm/wedge_utils_sve.c")
list(APPEND AOM_AV1_ENCODER_INTRIN_ARM_CRC32
"${AOM_ROOT}/av1/encoder/arm/crc32/hash_arm_crc32.c")
"${AOM_ROOT}/av1/encoder/arm/hash_arm_crc32.c")
list(APPEND AOM_AV1_COMMON_INTRIN_NEON
"${AOM_ROOT}/av1/common/arm/av1_convolve_scale_neon.c"
"${AOM_ROOT}/av1/common/arm/av1_inv_txfm_neon.c"
"${AOM_ROOT}/av1/common/arm/av1_inv_txfm_neon.h"
"${AOM_ROOT}/av1/common/arm/av1_txfm_neon.c"
@ -414,6 +416,9 @@ list(APPEND AOM_AV1_COMMON_INTRIN_SVE
"${AOM_ROOT}/av1/common/arm/highbd_warp_plane_sve.c"
"${AOM_ROOT}/av1/common/arm/warp_plane_sve.c")
list(APPEND AOM_AV1_COMMON_INTRIN_SVE2
"${AOM_ROOT}/av1/common/arm/convolve_sve2.c")
list(APPEND AOM_AV1_ENCODER_INTRIN_SSE4_2
"${AOM_ROOT}/av1/encoder/x86/hash_sse42.c")
@ -452,7 +457,7 @@ if(CONFIG_AV1_TEMPORAL_DENOISING)
"${AOM_ROOT}/av1/encoder/x86/av1_temporal_denoiser_sse2.c")
list(APPEND AOM_AV1_ENCODER_INTRIN_NEON
"${AOM_ROOT}/av1/encoder/arm/neon/av1_temporal_denoiser_neon.c")
"${AOM_ROOT}/av1/encoder/arm/av1_temporal_denoiser_neon.c")
endif()
if(CONFIG_AV1_HIGHBITDEPTH)
@ -499,9 +504,12 @@ if(CONFIG_AV1_HIGHBITDEPTH)
"${AOM_ROOT}/av1/encoder/x86/highbd_temporal_filter_avx2.c")
list(APPEND AOM_AV1_ENCODER_INTRIN_NEON
"${AOM_ROOT}/av1/encoder/arm/neon/highbd_pickrst_neon.c"
"${AOM_ROOT}/av1/encoder/arm/neon/highbd_rdopt_neon.c"
"${AOM_ROOT}/av1/encoder/arm/neon/highbd_temporal_filter_neon.c")
"${AOM_ROOT}/av1/encoder/arm/highbd_pickrst_neon.c"
"${AOM_ROOT}/av1/encoder/arm/highbd_rdopt_neon.c"
"${AOM_ROOT}/av1/encoder/arm/highbd_temporal_filter_neon.c")
list(APPEND AOM_AV1_ENCODER_INTRIN_SVE
"${AOM_ROOT}/av1/encoder/arm/highbd_pickrst_sve.c")
endif()
if(CONFIG_ACCOUNTING)
@ -527,7 +535,7 @@ if(CONFIG_REALTIME_ONLY)
"${AOM_ROOT}/av1/encoder/x86/cnn_avx2.c")
list(REMOVE_ITEM AOM_AV1_ENCODER_INTRIN_NEON
"${AOM_ROOT}/av1/encoder/arm/neon/cnn_neon.c")
"${AOM_ROOT}/av1/encoder/arm/cnn_neon.c")
list(REMOVE_ITEM AOM_AV1_ENCODER_SOURCES
"${AOM_ROOT}/av1/encoder/cnn.c"

third_party/aom/av1/av1_cx_iface.c (vendored)

@ -674,6 +674,7 @@ static aom_codec_err_t validate_config(aom_codec_alg_priv_t *ctx,
RANGE_CHECK(cfg, g_timebase.num, 1, cfg->g_timebase.den);
RANGE_CHECK_HI(cfg, g_profile, MAX_PROFILES - 1);
RANGE_CHECK_HI(cfg, rc_target_bitrate, 2000000);
RANGE_CHECK_HI(cfg, rc_max_quantizer, 63);
RANGE_CHECK_HI(cfg, rc_min_quantizer, cfg->rc_max_quantizer);
RANGE_CHECK_BOOL(extra_cfg, lossless);
@ -1034,39 +1035,22 @@ static void set_encoder_config(AV1EncoderConfig *oxcf,
}
TuneCfg *const tune_cfg = &oxcf->tune_cfg;
FrameDimensionCfg *const frm_dim_cfg = &oxcf->frm_dim_cfg;
TileConfig *const tile_cfg = &oxcf->tile_cfg;
ResizeCfg *const resize_cfg = &oxcf->resize_cfg;
GFConfig *const gf_cfg = &oxcf->gf_cfg;
PartitionCfg *const part_cfg = &oxcf->part_cfg;
IntraModeCfg *const intra_mode_cfg = &oxcf->intra_mode_cfg;
TxfmSizeTypeCfg *const txfm_cfg = &oxcf->txfm_cfg;
CompoundTypeCfg *const comp_type_cfg = &oxcf->comp_type_cfg;
SuperResCfg *const superres_cfg = &oxcf->superres_cfg;
KeyFrameCfg *const kf_cfg = &oxcf->kf_cfg;
DecoderModelCfg *const dec_model_cfg = &oxcf->dec_model_cfg;
RateControlCfg *const rc_cfg = &oxcf->rc_cfg;
QuantizationCfg *const q_cfg = &oxcf->q_cfg;
ColorCfg *const color_cfg = &oxcf->color_cfg;
InputCfg *const input_cfg = &oxcf->input_cfg;
AlgoCfg *const algo_cfg = &oxcf->algo_cfg;
ToolCfg *const tool_cfg = &oxcf->tool_cfg;
const int is_vbr = cfg->rc_end_usage == AOM_VBR;
@ -1610,37 +1594,42 @@ static aom_codec_err_t ctrl_get_baseline_gf_interval(aom_codec_alg_priv_t *ctx,
return AOM_CODEC_OK;
}
static aom_codec_err_t update_encoder_cfg(aom_codec_alg_priv_t *ctx) {
set_encoder_config(&ctx->oxcf, &ctx->cfg, &ctx->extra_cfg);
av1_check_fpmt_config(ctx->ppi, &ctx->oxcf);
bool is_sb_size_changed = false;
av1_change_config_seq(ctx->ppi, &ctx->oxcf, &is_sb_size_changed);
for (int i = 0; i < ctx->ppi->num_fp_contexts; i++) {
AV1_COMP *const cpi = ctx->ppi->parallel_cpi[i];
struct aom_internal_error_info *const error = cpi->common.error;
if (setjmp(error->jmp)) {
error->setjmp = 0;
return error->error_code;
}
error->setjmp = 1;
av1_change_config(cpi, &ctx->oxcf, is_sb_size_changed);
error->setjmp = 0;
}
if (ctx->ppi->cpi_lap != NULL) {
AV1_COMP *const cpi_lap = ctx->ppi->cpi_lap;
struct aom_internal_error_info *const error = cpi_lap->common.error;
if (setjmp(error->jmp)) {
error->setjmp = 0;
return error->error_code;
}
error->setjmp = 1;
av1_change_config(cpi_lap, &ctx->oxcf, is_sb_size_changed);
error->setjmp = 0;
}
return AOM_CODEC_OK;
}
static aom_codec_err_t update_extra_cfg(aom_codec_alg_priv_t *ctx,
const struct av1_extracfg *extra_cfg) {
const aom_codec_err_t res = validate_config(ctx, &ctx->cfg, extra_cfg);
if (res == AOM_CODEC_OK) {
ctx->extra_cfg = *extra_cfg;
set_encoder_config(&ctx->oxcf, &ctx->cfg, &ctx->extra_cfg);
av1_check_fpmt_config(ctx->ppi, &ctx->oxcf);
bool is_sb_size_changed = false;
av1_change_config_seq(ctx->ppi, &ctx->oxcf, &is_sb_size_changed);
for (int i = 0; i < ctx->ppi->num_fp_contexts; i++) {
AV1_COMP *const cpi = ctx->ppi->parallel_cpi[i];
struct aom_internal_error_info *const error = cpi->common.error;
if (setjmp(error->jmp)) {
error->setjmp = 0;
return error->error_code;
}
error->setjmp = 1;
av1_change_config(cpi, &ctx->oxcf, is_sb_size_changed);
error->setjmp = 0;
}
if (ctx->ppi->cpi_lap != NULL) {
AV1_COMP *const cpi_lap = ctx->ppi->cpi_lap;
struct aom_internal_error_info *const error = cpi_lap->common.error;
if (setjmp(error->jmp)) {
error->setjmp = 0;
return error->error_code;
}
error->setjmp = 1;
av1_change_config(cpi_lap, &ctx->oxcf, is_sb_size_changed);
error->setjmp = 0;
}
return update_encoder_cfg(ctx);
}
return res;
}
@ -3343,7 +3332,7 @@ static aom_codec_err_t encoder_encode(aom_codec_alg_priv_t *ctx,
if (ppi->cpi->oxcf.pass != 1) {
ppi->total_time_compress_data += cpi->time_compress_data;
ppi->total_recode_hits += cpi->frame_recode_hits;
ppi->total_bytes += cpi->bytes;
ppi->total_bytes += (uint64_t)cpi->bytes;
for (int i = 0; i < MAX_MODES; i++) {
ppi->total_mode_chosen_counts[i] += cpi->mode_chosen_counts[i];
}
@ -3611,11 +3600,23 @@ static aom_codec_err_t ctrl_set_scale_mode(aom_codec_alg_priv_t *ctx,
aom_scaling_mode_t *const mode = va_arg(args, aom_scaling_mode_t *);
if (mode) {
const int res = av1_set_internal_size(
&ctx->ppi->cpi->oxcf, &ctx->ppi->cpi->resize_pending_params,
mode->h_scaling_mode, mode->v_scaling_mode);
av1_check_fpmt_config(ctx->ppi, &ctx->ppi->cpi->oxcf);
return (res == 0) ? AOM_CODEC_OK : AOM_CODEC_INVALID_PARAM;
AV1EncoderConfig *const oxcf =
ctx->ppi->seq_params_locked ? &ctx->ppi->cpi->oxcf : &ctx->oxcf;
const int res =
av1_set_internal_size(oxcf, &ctx->ppi->cpi->resize_pending_params,
mode->h_scaling_mode, mode->v_scaling_mode);
if (res == 0) {
// update_encoder_cfg() is somewhat costly and this control may be called
// multiple times, so update_encoder_cfg() is only called to ensure frame
// and superblock sizes are updated before they're fixed by the first
// encode call.
if (ctx->ppi->seq_params_locked) {
av1_check_fpmt_config(ctx->ppi, &ctx->ppi->cpi->oxcf);
return AOM_CODEC_OK;
}
return update_encoder_cfg(ctx);
}
return AOM_CODEC_INVALID_PARAM;
} else {
return AOM_CODEC_INVALID_PARAM;
}
@ -3636,6 +3637,13 @@ static aom_codec_err_t ctrl_set_number_spatial_layers(aom_codec_alg_priv_t *ctx,
if (number_spatial_layers > MAX_NUM_SPATIAL_LAYERS)
return AOM_CODEC_INVALID_PARAM;
ctx->ppi->number_spatial_layers = number_spatial_layers;
// update_encoder_cfg() is somewhat costly and this control may be called
// multiple times, so update_encoder_cfg() is only called to ensure frame and
// superblock sizes are updated before they're fixed by the first encode
// call.
if (!ctx->ppi->seq_params_locked) {
return update_encoder_cfg(ctx);
}
return AOM_CODEC_OK;
}
@ -3653,8 +3661,6 @@ static aom_codec_err_t ctrl_set_svc_params(aom_codec_alg_priv_t *ctx,
va_list args) {
AV1_PRIMARY *const ppi = ctx->ppi;
AV1_COMP *const cpi = ppi->cpi;
AV1_COMMON *const cm = &cpi->common;
AV1EncoderConfig *oxcf = &cpi->oxcf;
aom_svc_params_t *const params = va_arg(args, aom_svc_params_t *);
int64_t target_bandwidth = 0;
ppi->number_spatial_layers = params->number_spatial_layers;
@ -3694,19 +3700,38 @@ static aom_codec_err_t ctrl_set_svc_params(aom_codec_alg_priv_t *ctx,
target_bandwidth += lc->layer_target_bitrate;
}
}
if (cm->current_frame.frame_number == 0) {
if (!cpi->ppi->seq_params_locked) {
SequenceHeader *const seq_params = &ppi->seq_params;
seq_params->operating_points_cnt_minus_1 =
ppi->number_spatial_layers * ppi->number_temporal_layers - 1;
av1_init_seq_coding_tools(ppi, &cpi->oxcf, 1);
}
if (ppi->seq_params_locked) {
AV1EncoderConfig *const oxcf = &cpi->oxcf;
// Keep ctx->oxcf in sync in case further codec controls are made prior
// to encoding.
ctx->oxcf.rc_cfg.target_bandwidth = oxcf->rc_cfg.target_bandwidth =
target_bandwidth;
set_primary_rc_buffer_sizes(oxcf, ppi);
av1_update_layer_context_change_config(cpi, target_bandwidth);
check_reset_rc_flag(cpi);
} else {
// Note av1_init_layer_context() relies on cpi->oxcf. The order of that
// call and the ones in the other half of this block (which
// update_encoder_cfg() transitively makes) is important. So we keep
// ctx->oxcf and cpi->oxcf in sync here as update_encoder_cfg() will
// overwrite cpi->oxcf with ctx->oxcf.
ctx->oxcf.rc_cfg.target_bandwidth = cpi->oxcf.rc_cfg.target_bandwidth =
target_bandwidth;
SequenceHeader *const seq_params = &ppi->seq_params;
seq_params->operating_points_cnt_minus_1 =
ppi->number_spatial_layers * ppi->number_temporal_layers - 1;
av1_init_layer_context(cpi);
// update_encoder_cfg() is somewhat costly and this control may be called
// multiple times, so update_encoder_cfg() is only called to ensure frame
// and superblock sizes are updated before they're fixed by the first
// encode call.
return update_encoder_cfg(ctx);
}
oxcf->rc_cfg.target_bandwidth = target_bandwidth;
set_primary_rc_buffer_sizes(oxcf, cpi->ppi);
av1_update_layer_context_change_config(cpi, target_bandwidth);
check_reset_rc_flag(cpi);
} else if (!ppi->seq_params_locked) {
// Ensure frame and superblock sizes are updated.
return update_encoder_cfg(ctx);
}
av1_check_fpmt_config(ctx->ppi, &ctx->ppi->cpi->oxcf);
return AOM_CODEC_OK;

third_party/aom/av1/common/arm/av1_convolve_scale_neon.c (vendored, new file)

@ -0,0 +1,702 @@
/*
* Copyright (c) 2024, Alliance for Open Media. All rights reserved
*
* This source code is subject to the terms of the BSD 2 Clause License and
* the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
* was not distributed with this source code in the LICENSE file, you can
* obtain it at www.aomedia.org/license/software. If the Alliance for Open
* Media Patent License 1.0 was not distributed with this source code in the
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
#include <arm_neon.h>
#include <assert.h>
#include "config/aom_config.h"
#include "config/av1_rtcd.h"
#include "aom_dsp/arm/mem_neon.h"
#include "aom_dsp/arm/transpose_neon.h"
static INLINE int16x4_t compound_convolve8_4_v(
const int16x4_t s0, const int16x4_t s1, const int16x4_t s2,
const int16x4_t s3, const int16x4_t s4, const int16x4_t s5,
const int16x4_t s6, const int16x4_t s7, const int16x8_t filter,
const int32x4_t offset_const) {
const int16x4_t filter_0_3 = vget_low_s16(filter);
const int16x4_t filter_4_7 = vget_high_s16(filter);
int32x4_t sum = offset_const;
sum = vmlal_lane_s16(sum, s0, filter_0_3, 0);
sum = vmlal_lane_s16(sum, s1, filter_0_3, 1);
sum = vmlal_lane_s16(sum, s2, filter_0_3, 2);
sum = vmlal_lane_s16(sum, s3, filter_0_3, 3);
sum = vmlal_lane_s16(sum, s4, filter_4_7, 0);
sum = vmlal_lane_s16(sum, s5, filter_4_7, 1);
sum = vmlal_lane_s16(sum, s6, filter_4_7, 2);
sum = vmlal_lane_s16(sum, s7, filter_4_7, 3);
return vshrn_n_s32(sum, COMPOUND_ROUND1_BITS);
}
static INLINE int16x8_t compound_convolve8_8_v(
const int16x8_t s0, const int16x8_t s1, const int16x8_t s2,
const int16x8_t s3, const int16x8_t s4, const int16x8_t s5,
const int16x8_t s6, const int16x8_t s7, const int16x8_t filter,
const int32x4_t offset_const) {
const int16x4_t filter_0_3 = vget_low_s16(filter);
const int16x4_t filter_4_7 = vget_high_s16(filter);
int32x4_t sum0 = offset_const;
sum0 = vmlal_lane_s16(sum0, vget_low_s16(s0), filter_0_3, 0);
sum0 = vmlal_lane_s16(sum0, vget_low_s16(s1), filter_0_3, 1);
sum0 = vmlal_lane_s16(sum0, vget_low_s16(s2), filter_0_3, 2);
sum0 = vmlal_lane_s16(sum0, vget_low_s16(s3), filter_0_3, 3);
sum0 = vmlal_lane_s16(sum0, vget_low_s16(s4), filter_4_7, 0);
sum0 = vmlal_lane_s16(sum0, vget_low_s16(s5), filter_4_7, 1);
sum0 = vmlal_lane_s16(sum0, vget_low_s16(s6), filter_4_7, 2);
sum0 = vmlal_lane_s16(sum0, vget_low_s16(s7), filter_4_7, 3);
int32x4_t sum1 = offset_const;
sum1 = vmlal_lane_s16(sum1, vget_high_s16(s0), filter_0_3, 0);
sum1 = vmlal_lane_s16(sum1, vget_high_s16(s1), filter_0_3, 1);
sum1 = vmlal_lane_s16(sum1, vget_high_s16(s2), filter_0_3, 2);
sum1 = vmlal_lane_s16(sum1, vget_high_s16(s3), filter_0_3, 3);
sum1 = vmlal_lane_s16(sum1, vget_high_s16(s4), filter_4_7, 0);
sum1 = vmlal_lane_s16(sum1, vget_high_s16(s5), filter_4_7, 1);
sum1 = vmlal_lane_s16(sum1, vget_high_s16(s6), filter_4_7, 2);
sum1 = vmlal_lane_s16(sum1, vget_high_s16(s7), filter_4_7, 3);
int16x4_t res0 = vshrn_n_s32(sum0, COMPOUND_ROUND1_BITS);
int16x4_t res1 = vshrn_n_s32(sum1, COMPOUND_ROUND1_BITS);
return vcombine_s16(res0, res1);
}
static INLINE void compound_convolve_vert_scale_neon(
const int16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w,
int h, const int16_t *y_filter, int subpel_y_qn, int y_step_qn) {
const int bd = 8;
const int offset_bits = bd + 2 * FILTER_BITS - ROUND0_BITS;
// A shim of 1 << (COMPOUND_ROUND1_BITS - 1) enables us to use
// non-rounding shifts - which are generally faster than rounding shifts on
// modern CPUs.
const int32x4_t vert_offset =
vdupq_n_s32((1 << offset_bits) + (1 << (COMPOUND_ROUND1_BITS - 1)));
int y_qn = subpel_y_qn;
if (w == 4) {
do {
const int16_t *s = &src[(y_qn >> SCALE_SUBPEL_BITS) * src_stride];
const ptrdiff_t filter_offset =
SUBPEL_TAPS * ((y_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS);
const int16x8_t filter = vld1q_s16(y_filter + filter_offset);
int16x4_t s0, s1, s2, s3, s4, s5, s6, s7;
load_s16_4x8(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7);
int16x4_t d0 = compound_convolve8_4_v(s0, s1, s2, s3, s4, s5, s6, s7,
filter, vert_offset);
vst1_u16(dst, vreinterpret_u16_s16(d0));
dst += dst_stride;
y_qn += y_step_qn;
} while (--h != 0);
} else {
do {
const int16_t *s = &src[(y_qn >> SCALE_SUBPEL_BITS) * src_stride];
const ptrdiff_t filter_offset =
SUBPEL_TAPS * ((y_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS);
const int16x8_t filter = vld1q_s16(y_filter + filter_offset);
int width = w;
uint16_t *d = dst;
do {
int16x8_t s0, s1, s2, s3, s4, s5, s6, s7;
load_s16_8x8(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7);
int16x8_t d0 = compound_convolve8_8_v(s0, s1, s2, s3, s4, s5, s6, s7,
filter, vert_offset);
vst1q_u16(d, vreinterpretq_u16_s16(d0));
s += 8;
d += 8;
width -= 8;
} while (width != 0);
dst += dst_stride;
y_qn += y_step_qn;
} while (--h != 0);
}
}
static INLINE void compound_avg_convolve_vert_scale_neon(
const int16_t *src, int src_stride, uint8_t *dst8, int dst8_stride,
uint16_t *dst16, int dst16_stride, int w, int h, const int16_t *y_filter,
int subpel_y_qn, int y_step_qn) {
const int bd = 8;
const int offset_bits = bd + 2 * FILTER_BITS - ROUND0_BITS;
// A shim of 1 << (COMPOUND_ROUND1_BITS - 1) enables us to use
// non-rounding shifts - which are generally faster than rounding shifts
// on modern CPUs.
const int32_t vert_offset_bits =
(1 << offset_bits) + (1 << (COMPOUND_ROUND1_BITS - 1));
// For the averaging code path, subtract the round offset and convolve round.
const int32_t avg_offset_bits = (1 << (offset_bits + 1)) + (1 << offset_bits);
const int32x4_t vert_offset = vdupq_n_s32(vert_offset_bits - avg_offset_bits);
int y_qn = subpel_y_qn;
if (w == 4) {
do {
const int16_t *s = &src[(y_qn >> SCALE_SUBPEL_BITS) * src_stride];
const ptrdiff_t filter_offset =
SUBPEL_TAPS * ((y_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS);
const int16x8_t filter = vld1q_s16(y_filter + filter_offset);
int16x4_t s0, s1, s2, s3, s4, s5, s6, s7;
load_s16_4x8(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7);
int16x4_t d0 = compound_convolve8_4_v(s0, s1, s2, s3, s4, s5, s6, s7,
filter, vert_offset);
int16x4_t dd0 = vreinterpret_s16_u16(vld1_u16(dst16));
int16x4_t avg = vhadd_s16(dd0, d0);
int16x8_t d0_s16 = vcombine_s16(avg, vdup_n_s16(0));
uint8x8_t d0_u8 = vqrshrun_n_s16(
d0_s16, (2 * FILTER_BITS - ROUND0_BITS - COMPOUND_ROUND1_BITS));
store_u8_4x1(dst8, d0_u8);
dst16 += dst16_stride;
dst8 += dst8_stride;
y_qn += y_step_qn;
} while (--h != 0);
} else {
do {
const int16_t *s = &src[(y_qn >> SCALE_SUBPEL_BITS) * src_stride];
const ptrdiff_t filter_offset =
SUBPEL_TAPS * ((y_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS);
const int16x8_t filter = vld1q_s16(y_filter + filter_offset);
int width = w;
uint8_t *dst8_ptr = dst8;
uint16_t *dst16_ptr = dst16;
do {
int16x8_t s0, s1, s2, s3, s4, s5, s6, s7;
load_s16_8x8(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7);
int16x8_t d0 = compound_convolve8_8_v(s0, s1, s2, s3, s4, s5, s6, s7,
filter, vert_offset);
int16x8_t dd0 = vreinterpretq_s16_u16(vld1q_u16(dst16_ptr));
int16x8_t avg = vhaddq_s16(dd0, d0);
uint8x8_t d0_u8 = vqrshrun_n_s16(
avg, (2 * FILTER_BITS - ROUND0_BITS - COMPOUND_ROUND1_BITS));
vst1_u8(dst8_ptr, d0_u8);
s += 8;
dst8_ptr += 8;
dst16_ptr += 8;
width -= 8;
} while (width != 0);
dst16 += dst16_stride;
dst8 += dst8_stride;
y_qn += y_step_qn;
} while (--h != 0);
}
}
static INLINE void compound_dist_wtd_convolve_vert_scale_neon(
const int16_t *src, int src_stride, uint8_t *dst8, int dst8_stride,
uint16_t *dst16, int dst16_stride, int w, int h, const int16_t *y_filter,
ConvolveParams *conv_params, int subpel_y_qn, int y_step_qn) {
const int bd = 8;
const int offset_bits = bd + 2 * FILTER_BITS - ROUND0_BITS;
int y_qn = subpel_y_qn;
// A shim of 1 << (COMPOUND_ROUND1_BITS - 1) enables us to use
// non-rounding shifts - which are generally faster than rounding shifts on
// modern CPUs.
const int32x4_t vert_offset =
vdupq_n_s32((1 << offset_bits) + (1 << (COMPOUND_ROUND1_BITS - 1)));
// For the weighted averaging code path we have to subtract the round offset and
// convolve round. The shim of 1 << (2 * FILTER_BITS - ROUND0_BITS -
// COMPOUND_ROUND1_BITS - 1) enables us to use non-rounding shifts. The
// additional shift by DIST_PRECISION_BITS is needed in order to merge two
// shift calculations into one.
const int32x4_t dist_wtd_offset = vdupq_n_s32(
(1 << (2 * FILTER_BITS - ROUND0_BITS - COMPOUND_ROUND1_BITS - 1 +
DIST_PRECISION_BITS)) -
(1 << (offset_bits - COMPOUND_ROUND1_BITS + DIST_PRECISION_BITS)) -
(1 << (offset_bits - COMPOUND_ROUND1_BITS - 1 + DIST_PRECISION_BITS)));
const int16x4_t bck_offset = vdup_n_s16(conv_params->bck_offset);
const int16x4_t fwd_offset = vdup_n_s16(conv_params->fwd_offset);
if (w == 4) {
do {
const int16_t *s = &src[(y_qn >> SCALE_SUBPEL_BITS) * src_stride];
const ptrdiff_t filter_offset =
SUBPEL_TAPS * ((y_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS);
const int16x8_t filter = vld1q_s16(y_filter + filter_offset);
int16x4_t s0, s1, s2, s3, s4, s5, s6, s7;
load_s16_4x8(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7);
int16x4_t d0 = compound_convolve8_4_v(s0, s1, s2, s3, s4, s5, s6, s7,
filter, vert_offset);
int16x4_t dd0 = vreinterpret_s16_u16(vld1_u16(dst16));
int32x4_t dst_wtd_avg = vmlal_s16(dist_wtd_offset, bck_offset, d0);
dst_wtd_avg = vmlal_s16(dst_wtd_avg, fwd_offset, dd0);
int16x4_t d0_s16 = vshrn_n_s32(
dst_wtd_avg, 2 * FILTER_BITS - ROUND0_BITS - COMPOUND_ROUND1_BITS +
DIST_PRECISION_BITS);
uint8x8_t d0_u8 = vqmovun_s16(vcombine_s16(d0_s16, vdup_n_s16(0)));
store_u8_4x1(dst8, d0_u8);
dst16 += dst16_stride;
dst8 += dst8_stride;
y_qn += y_step_qn;
} while (--h != 0);
} else {
do {
const int16_t *s = &src[(y_qn >> SCALE_SUBPEL_BITS) * src_stride];
const ptrdiff_t filter_offset =
SUBPEL_TAPS * ((y_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS);
const int16x8_t filter = vld1q_s16(y_filter + filter_offset);
int width = w;
uint8_t *dst8_ptr = dst8;
uint16_t *dst16_ptr = dst16;
do {
int16x8_t s0, s1, s2, s3, s4, s5, s6, s7;
load_s16_8x8(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7);
int16x8_t d0 = compound_convolve8_8_v(s0, s1, s2, s3, s4, s5, s6, s7,
filter, vert_offset);
int16x8_t dd0 = vreinterpretq_s16_u16(vld1q_u16(dst16_ptr));
int32x4_t dst_wtd_avg0 =
vmlal_s16(dist_wtd_offset, bck_offset, vget_low_s16(d0));
int32x4_t dst_wtd_avg1 =
vmlal_s16(dist_wtd_offset, bck_offset, vget_high_s16(d0));
dst_wtd_avg0 = vmlal_s16(dst_wtd_avg0, fwd_offset, vget_low_s16(dd0));
dst_wtd_avg1 = vmlal_s16(dst_wtd_avg1, fwd_offset, vget_high_s16(dd0));
int16x4_t d0_s16_0 = vshrn_n_s32(
dst_wtd_avg0, 2 * FILTER_BITS - ROUND0_BITS - COMPOUND_ROUND1_BITS +
DIST_PRECISION_BITS);
int16x4_t d0_s16_1 = vshrn_n_s32(
dst_wtd_avg1, 2 * FILTER_BITS - ROUND0_BITS - COMPOUND_ROUND1_BITS +
DIST_PRECISION_BITS);
uint8x8_t d0_u8 = vqmovun_s16(vcombine_s16(d0_s16_0, d0_s16_1));
vst1_u8(dst8_ptr, d0_u8);
s += 8;
dst8_ptr += 8;
dst16_ptr += 8;
width -= 8;
} while (width != 0);
dst16 += dst16_stride;
dst8 += dst8_stride;
y_qn += y_step_qn;
} while (--h != 0);
}
}
static INLINE uint8x8_t convolve8_4_v(const int16x4_t s0, const int16x4_t s1,
const int16x4_t s2, const int16x4_t s3,
const int16x4_t s4, const int16x4_t s5,
const int16x4_t s6, const int16x4_t s7,
const int16x8_t filter,
const int32x4_t offset_const) {
const int16x4_t filter_0_3 = vget_low_s16(filter);
const int16x4_t filter_4_7 = vget_high_s16(filter);
int32x4_t sum = offset_const;
sum = vmlal_lane_s16(sum, s0, filter_0_3, 0);
sum = vmlal_lane_s16(sum, s1, filter_0_3, 1);
sum = vmlal_lane_s16(sum, s2, filter_0_3, 2);
sum = vmlal_lane_s16(sum, s3, filter_0_3, 3);
sum = vmlal_lane_s16(sum, s4, filter_4_7, 0);
sum = vmlal_lane_s16(sum, s5, filter_4_7, 1);
sum = vmlal_lane_s16(sum, s6, filter_4_7, 2);
sum = vmlal_lane_s16(sum, s7, filter_4_7, 3);
int16x4_t res = vshrn_n_s32(sum, 2 * FILTER_BITS - ROUND0_BITS);
return vqmovun_s16(vcombine_s16(res, vdup_n_s16(0)));
}
static INLINE uint8x8_t convolve8_8_v(const int16x8_t s0, const int16x8_t s1,
const int16x8_t s2, const int16x8_t s3,
const int16x8_t s4, const int16x8_t s5,
const int16x8_t s6, const int16x8_t s7,
const int16x8_t filter,
const int32x4_t offset_const) {
const int16x4_t filter_0_3 = vget_low_s16(filter);
const int16x4_t filter_4_7 = vget_high_s16(filter);
int32x4_t sum0 = offset_const;
sum0 = vmlal_lane_s16(sum0, vget_low_s16(s0), filter_0_3, 0);
sum0 = vmlal_lane_s16(sum0, vget_low_s16(s1), filter_0_3, 1);
sum0 = vmlal_lane_s16(sum0, vget_low_s16(s2), filter_0_3, 2);
sum0 = vmlal_lane_s16(sum0, vget_low_s16(s3), filter_0_3, 3);
sum0 = vmlal_lane_s16(sum0, vget_low_s16(s4), filter_4_7, 0);
sum0 = vmlal_lane_s16(sum0, vget_low_s16(s5), filter_4_7, 1);
sum0 = vmlal_lane_s16(sum0, vget_low_s16(s6), filter_4_7, 2);
sum0 = vmlal_lane_s16(sum0, vget_low_s16(s7), filter_4_7, 3);
int32x4_t sum1 = offset_const;
sum1 = vmlal_lane_s16(sum1, vget_high_s16(s0), filter_0_3, 0);
sum1 = vmlal_lane_s16(sum1, vget_high_s16(s1), filter_0_3, 1);
sum1 = vmlal_lane_s16(sum1, vget_high_s16(s2), filter_0_3, 2);
sum1 = vmlal_lane_s16(sum1, vget_high_s16(s3), filter_0_3, 3);
sum1 = vmlal_lane_s16(sum1, vget_high_s16(s4), filter_4_7, 0);
sum1 = vmlal_lane_s16(sum1, vget_high_s16(s5), filter_4_7, 1);
sum1 = vmlal_lane_s16(sum1, vget_high_s16(s6), filter_4_7, 2);
sum1 = vmlal_lane_s16(sum1, vget_high_s16(s7), filter_4_7, 3);
int16x4_t res0 = vshrn_n_s32(sum0, 2 * FILTER_BITS - ROUND0_BITS);
int16x4_t res1 = vshrn_n_s32(sum1, 2 * FILTER_BITS - ROUND0_BITS);
return vqmovun_s16(vcombine_s16(res0, res1));
}
static INLINE void convolve_vert_scale_neon(const int16_t *src, int src_stride,
uint8_t *dst, int dst_stride, int w,
int h, const int16_t *y_filter,
int subpel_y_qn, int y_step_qn) {
const int bd = 8;
const int offset_bits = bd + 2 * FILTER_BITS - ROUND0_BITS;
const int round_1 = 2 * FILTER_BITS - ROUND0_BITS;
// The shim of 1 << (round_1 - 1) enables us to use non-rounding shifts.
int32x4_t vert_offset =
vdupq_n_s32((1 << (round_1 - 1)) - (1 << (offset_bits - 1)));
int y_qn = subpel_y_qn;
if (w == 4) {
do {
const int16_t *s = &src[(y_qn >> SCALE_SUBPEL_BITS) * src_stride];
const ptrdiff_t filter_offset =
SUBPEL_TAPS * ((y_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS);
const int16x8_t filter = vld1q_s16(y_filter + filter_offset);
int16x4_t s0, s1, s2, s3, s4, s5, s6, s7;
load_s16_4x8(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7);
uint8x8_t d =
convolve8_4_v(s0, s1, s2, s3, s4, s5, s6, s7, filter, vert_offset);
store_u8_4x1(dst, d);
dst += dst_stride;
y_qn += y_step_qn;
} while (--h != 0);
} else if (w == 8) {
do {
const int16_t *s = &src[(y_qn >> SCALE_SUBPEL_BITS) * src_stride];
const ptrdiff_t filter_offset =
SUBPEL_TAPS * ((y_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS);
const int16x8_t filter = vld1q_s16(y_filter + filter_offset);
int16x8_t s0, s1, s2, s3, s4, s5, s6, s7;
load_s16_8x8(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7);
uint8x8_t d =
convolve8_8_v(s0, s1, s2, s3, s4, s5, s6, s7, filter, vert_offset);
vst1_u8(dst, d);
dst += dst_stride;
y_qn += y_step_qn;
} while (--h != 0);
} else {
do {
const int16_t *s = &src[(y_qn >> SCALE_SUBPEL_BITS) * src_stride];
uint8_t *d = dst;
int width = w;
const ptrdiff_t filter_offset =
SUBPEL_TAPS * ((y_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS);
const int16x8_t filter = vld1q_s16(y_filter + filter_offset);
do {
int16x8_t s0[2], s1[2], s2[2], s3[2], s4[2], s5[2], s6[2], s7[2];
load_s16_8x8(s, src_stride, &s0[0], &s1[0], &s2[0], &s3[0], &s4[0],
&s5[0], &s6[0], &s7[0]);
load_s16_8x8(s + 8, src_stride, &s0[1], &s1[1], &s2[1], &s3[1], &s4[1],
&s5[1], &s6[1], &s7[1]);
uint8x8_t d0 = convolve8_8_v(s0[0], s1[0], s2[0], s3[0], s4[0], s5[0],
s6[0], s7[0], filter, vert_offset);
uint8x8_t d1 = convolve8_8_v(s0[1], s1[1], s2[1], s3[1], s4[1], s5[1],
s6[1], s7[1], filter, vert_offset);
vst1q_u8(d, vcombine_u8(d0, d1));
s += 16;
d += 16;
width -= 16;
} while (width != 0);
dst += dst_stride;
y_qn += y_step_qn;
} while (--h != 0);
}
}
static INLINE int16x4_t convolve8_4_h(const int16x4_t s0, const int16x4_t s1,
const int16x4_t s2, const int16x4_t s3,
const int16x4_t s4, const int16x4_t s5,
const int16x4_t s6, const int16x4_t s7,
const int16x8_t filter,
const int32x4_t horiz_const) {
int16x4_t filter_lo = vget_low_s16(filter);
int16x4_t filter_hi = vget_high_s16(filter);
int32x4_t sum = horiz_const;
sum = vmlal_lane_s16(sum, s0, filter_lo, 0);
sum = vmlal_lane_s16(sum, s1, filter_lo, 1);
sum = vmlal_lane_s16(sum, s2, filter_lo, 2);
sum = vmlal_lane_s16(sum, s3, filter_lo, 3);
sum = vmlal_lane_s16(sum, s4, filter_hi, 0);
sum = vmlal_lane_s16(sum, s5, filter_hi, 1);
sum = vmlal_lane_s16(sum, s6, filter_hi, 2);
sum = vmlal_lane_s16(sum, s7, filter_hi, 3);
return vshrn_n_s32(sum, ROUND0_BITS);
}
static INLINE int16x8_t convolve8_8_h(const int16x8_t s0, const int16x8_t s1,
const int16x8_t s2, const int16x8_t s3,
const int16x8_t s4, const int16x8_t s5,
const int16x8_t s6, const int16x8_t s7,
const int16x8_t filter,
const int16x8_t horiz_const) {
int16x4_t filter_lo = vget_low_s16(filter);
int16x4_t filter_hi = vget_high_s16(filter);
int16x8_t sum = horiz_const;
sum = vmlaq_lane_s16(sum, s0, filter_lo, 0);
sum = vmlaq_lane_s16(sum, s1, filter_lo, 1);
sum = vmlaq_lane_s16(sum, s2, filter_lo, 2);
sum = vmlaq_lane_s16(sum, s3, filter_lo, 3);
sum = vmlaq_lane_s16(sum, s4, filter_hi, 0);
sum = vmlaq_lane_s16(sum, s5, filter_hi, 1);
sum = vmlaq_lane_s16(sum, s6, filter_hi, 2);
sum = vmlaq_lane_s16(sum, s7, filter_hi, 3);
return vshrq_n_s16(sum, ROUND0_BITS - 1);
}
static INLINE void convolve_horiz_scale_neon(const uint8_t *src, int src_stride,
int16_t *dst, int dst_stride,
int w, int h,
const int16_t *x_filter,
const int subpel_x_qn,
const int x_step_qn) {
DECLARE_ALIGNED(16, int16_t, temp[8 * 8]);
const int bd = 8;
if (w == 4) {
// The shim of 1 << (ROUND0_BITS - 1) enables us to use non-rounding shifts.
const int32x4_t horiz_offset =
vdupq_n_s32((1 << (bd + FILTER_BITS - 1)) + (1 << (ROUND0_BITS - 1)));
do {
int x_qn = subpel_x_qn;
// Process a 4x4 tile.
for (int r = 0; r < 4; ++r) {
const uint8_t *const s = &src[x_qn >> SCALE_SUBPEL_BITS];
const ptrdiff_t filter_offset =
SUBPEL_TAPS * ((x_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS);
const int16x8_t filter = vld1q_s16(x_filter + filter_offset);
uint8x8_t t0, t1, t2, t3;
load_u8_8x4(s, src_stride, &t0, &t1, &t2, &t3);
transpose_elems_inplace_u8_8x4(&t0, &t1, &t2, &t3);
int16x4_t s0 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
int16x4_t s1 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
int16x4_t s2 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));
int16x4_t s3 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t3)));
int16x4_t s4 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
int16x4_t s5 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
int16x4_t s6 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));
int16x4_t s7 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t3)));
int16x4_t d0 =
convolve8_4_h(s0, s1, s2, s3, s4, s5, s6, s7, filter, horiz_offset);
vst1_s16(&temp[r * 4], d0);
x_qn += x_step_qn;
}
// Transpose the 4x4 result tile and store.
int16x4_t d0, d1, d2, d3;
load_s16_4x4(temp, 4, &d0, &d1, &d2, &d3);
transpose_elems_inplace_s16_4x4(&d0, &d1, &d2, &d3);
store_s16_4x4(dst, dst_stride, d0, d1, d2, d3);
dst += 4 * dst_stride;
src += 4 * src_stride;
h -= 4;
} while (h > 0);
} else {
// The shim of 1 << (ROUND0_BITS - 1) enables us to use non-rounding shifts.
// The additional -1 is needed because we are halving the filter values.
const int16x8_t horiz_offset =
vdupq_n_s16((1 << (bd + FILTER_BITS - 2)) + (1 << (ROUND0_BITS - 2)));
do {
int x_qn = subpel_x_qn;
int16_t *d = dst;
int width = w;
do {
// Process an 8x8 tile.
for (int r = 0; r < 8; ++r) {
const uint8_t *const s = &src[(x_qn >> SCALE_SUBPEL_BITS)];
const ptrdiff_t filter_offset =
SUBPEL_TAPS * ((x_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS);
int16x8_t filter = vld1q_s16(x_filter + filter_offset);
// Filter values are all even so halve them to allow convolution
// kernel computations to stay in 16-bit element types.
filter = vshrq_n_s16(filter, 1);
uint8x8_t t0, t1, t2, t3, t4, t5, t6, t7;
load_u8_8x8(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
transpose_elems_u8_8x8(t0, t1, t2, t3, t4, t5, t6, t7, &t0, &t1, &t2,
&t3, &t4, &t5, &t6, &t7);
int16x8_t s0 = vreinterpretq_s16_u16(vmovl_u8(t0));
int16x8_t s1 = vreinterpretq_s16_u16(vmovl_u8(t1));
int16x8_t s2 = vreinterpretq_s16_u16(vmovl_u8(t2));
int16x8_t s3 = vreinterpretq_s16_u16(vmovl_u8(t3));
int16x8_t s4 = vreinterpretq_s16_u16(vmovl_u8(t4));
int16x8_t s5 = vreinterpretq_s16_u16(vmovl_u8(t5));
int16x8_t s6 = vreinterpretq_s16_u16(vmovl_u8(t6));
int16x8_t s7 = vreinterpretq_s16_u16(vmovl_u8(t7));
int16x8_t d0 = convolve8_8_h(s0, s1, s2, s3, s4, s5, s6, s7, filter,
horiz_offset);
vst1q_s16(&temp[r * 8], d0);
x_qn += x_step_qn;
}
// Transpose the 8x8 result tile and store.
int16x8_t d0, d1, d2, d3, d4, d5, d6, d7;
load_s16_8x8(temp, 8, &d0, &d1, &d2, &d3, &d4, &d5, &d6, &d7);
transpose_elems_inplace_s16_8x8(&d0, &d1, &d2, &d3, &d4, &d5, &d6, &d7);
store_s16_8x8(d, dst_stride, d0, d1, d2, d3, d4, d5, d6, d7);
d += 8;
width -= 8;
} while (width != 0);
dst += 8 * dst_stride;
src += 8 * src_stride;
h -= 8;
} while (h > 0);
}
}
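
Several comments in this file note that the filter coefficients are all even and are therefore halved, with the final right shift reduced by one. A small scalar check of why that is exact, assuming a hypothetical even 4-tap kernel (FILTER_BITS is 7 in libaom):

```
#include <assert.h>
#include <stdint.h>

enum { kFilterBits = 7 };  /* FILTER_BITS in libaom */

int main(void) {
  /* Hypothetical even kernel summing to 1 << kFilterBits. */
  const int16_t taps[4] = { -4, 36, 100, -4 };
  const uint8_t px[4] = { 17, 200, 33, 250 };
  int32_t full = 0, half = 0;
  for (int k = 0; k < 4; ++k) {
    full += taps[k] * px[k];
    half += (taps[k] / 2) * px[k];  /* halved coefficients, still exact */
  }
  /* Halving every (even) tap lets the final shift drop by one bit. */
  assert((full >> kFilterBits) == (half >> (kFilterBits - 1)));
  /* The halved partial sums also gain one bit of headroom, which is what
   * lets the wider kernels stay in 16-bit lanes. */
  return 0;
}
```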
void av1_convolve_2d_scale_neon(const uint8_t *src, int src_stride,
uint8_t *dst, int dst_stride, int w, int h,
const InterpFilterParams *filter_params_x,
const InterpFilterParams *filter_params_y,
const int subpel_x_qn, const int x_step_qn,
const int subpel_y_qn, const int y_step_qn,
ConvolveParams *conv_params) {
if (w < 4 || h < 4) {
av1_convolve_2d_scale_c(src, src_stride, dst, dst_stride, w, h,
filter_params_x, filter_params_y, subpel_x_qn,
x_step_qn, subpel_y_qn, y_step_qn, conv_params);
return;
}
// For the interpolation 8-tap filters are used.
assert(filter_params_y->taps <= 8 && filter_params_x->taps <= 8);
DECLARE_ALIGNED(32, int16_t,
im_block[(2 * MAX_SB_SIZE + MAX_FILTER_TAP) * MAX_SB_SIZE]);
int im_h = (((h - 1) * y_step_qn + subpel_y_qn) >> SCALE_SUBPEL_BITS) +
filter_params_y->taps;
int im_stride = MAX_SB_SIZE;
CONV_BUF_TYPE *dst16 = conv_params->dst;
const int dst16_stride = conv_params->dst_stride;
// Account for needing filter_taps / 2 - 1 lines prior and filter_taps / 2
// lines post both horizontally and vertically.
const ptrdiff_t horiz_offset = filter_params_x->taps / 2 - 1;
const ptrdiff_t vert_offset = (filter_params_y->taps / 2 - 1) * src_stride;
// Horizontal filter
convolve_horiz_scale_neon(
src - horiz_offset - vert_offset, src_stride, im_block, im_stride, w,
im_h, filter_params_x->filter_ptr, subpel_x_qn, x_step_qn);
// Vertical filter
if (UNLIKELY(conv_params->is_compound)) {
if (conv_params->do_average) {
if (conv_params->use_dist_wtd_comp_avg) {
compound_dist_wtd_convolve_vert_scale_neon(
im_block, im_stride, dst, dst_stride, dst16, dst16_stride, w, h,
filter_params_y->filter_ptr, conv_params, subpel_y_qn, y_step_qn);
} else {
compound_avg_convolve_vert_scale_neon(
im_block, im_stride, dst, dst_stride, dst16, dst16_stride, w, h,
filter_params_y->filter_ptr, subpel_y_qn, y_step_qn);
}
} else {
compound_convolve_vert_scale_neon(
im_block, im_stride, dst16, dst16_stride, w, h,
filter_params_y->filter_ptr, subpel_y_qn, y_step_qn);
}
} else {
convolve_vert_scale_neon(im_block, im_stride, dst, dst_stride, w, h,
filter_params_y->filter_ptr, subpel_y_qn,
y_step_qn);
}
}
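
A small standalone sketch, not part of the patch, of the buffer bookkeeping done just above: the intermediate block must hold every source row the scaled vertical filter can touch, and the source pointer is backed up by taps / 2 - 1 in each direction (SCALE_SUBPEL_BITS is 10 in libaom, so a step of 1 << 10 advances one full source pixel):

```
#include <stdio.h>

enum { kScaleSubpelBits = 10 };  /* SCALE_SUBPEL_BITS in libaom */

int main(void) {
  /* Hypothetical 2:1 vertical downscale of a 32-row block with 8-tap filters. */
  const int h = 32, taps = 8;
  const int y_step_qn = 2 << kScaleSubpelBits;  /* two source rows per output row */
  const int subpel_y_qn = 0;

  /* Rows of horizontally filtered data the vertical pass will read. */
  const int im_h = (((h - 1) * y_step_qn + subpel_y_qn) >> kScaleSubpelBits) + taps;

  /* The source pointer is moved back so the filter is centred on the pixel:
   * taps / 2 - 1 columns to the left and the same number of rows above. */
  const int horiz_back = taps / 2 - 1;
  const int vert_back = taps / 2 - 1;

  printf("im_h = %d, back up %d columns and %d rows\n", im_h, horiz_back, vert_back);
  return 0;
}
```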


@ -447,7 +447,7 @@ static INLINE void idct8_low1_neon(int16x8_t *in, int16x8_t *out,
out[7] = step1;
}
void av1_round_shift_array_16_neon(int16x8_t *arr, int size, int bit) {
static void round_shift_array_16_neon(int16x8_t *arr, int size, int bit) {
assert(!(size % 4));
if (!bit) return;
const int16x8_t dup_bits_n_16x8 = vdupq_n_s16((int16_t)(-bit));
@ -3661,7 +3661,7 @@ static INLINE void lowbd_inv_txfm2d_add_v_identity_neon(
round_shift_for_rect(cur_a, cur_a, buf_size_nonzero_w);
}
row_txfm(cur_a, cur_a, INV_COS_BIT);
av1_round_shift_array_16_neon(cur_a, txfm_size_col, -shift[0]);
round_shift_array_16_neon(cur_a, txfm_size_col, -shift[0]);
if (lr_flip == 1) {
for (int j = 0; j < buf_size_w_div8; ++j) {
flip_buf_ud_neon(&cur_a[j * 8], 8);
@ -3736,8 +3736,7 @@ static INLINE void lowbd_inv_txfm2d_add_h_identity_neon(
}
for (int j = 0; j < buf_size_w_div8; ++j) {
col_txfm(&b[j * txfm_size_row], &b[j * txfm_size_row], INV_COS_BIT);
av1_round_shift_array_16_neon(&b[j * txfm_size_row], txfm_size_row,
-shift[1]);
round_shift_array_16_neon(&b[j * txfm_size_row], txfm_size_row, -shift[1]);
}
if (txfm_size_col >= 16) {
for (int i = 0; i < (txfm_size_col >> 4); i++) {
@ -3814,8 +3813,9 @@ static INLINE void lowbd_inv_txfm2d_add_4x4_neon(const int32_t *input,
}
}
void lowbd_inv_txfm2d_add_4x8_neon(const int32_t *input, uint8_t *output,
int stride, TX_TYPE tx_type, int eob) {
static void lowbd_inv_txfm2d_add_4x8_neon(const int32_t *input, uint8_t *output,
int stride, TX_TYPE tx_type,
int eob) {
(void)eob;
TX_SIZE tx_size = TX_4X8;
DECLARE_ALIGNED(32, int, txfm_buf[4 * 8 + 8 + 8]);
@ -3879,8 +3879,9 @@ void lowbd_inv_txfm2d_add_4x8_neon(const int32_t *input, uint8_t *output,
}
}
void lowbd_inv_txfm2d_add_8x4_neon(const int32_t *input, uint8_t *output,
int stride, TX_TYPE tx_type, int eob) {
static void lowbd_inv_txfm2d_add_8x4_neon(const int32_t *input, uint8_t *output,
int stride, TX_TYPE tx_type,
int eob) {
(void)eob;
TX_SIZE tx_size = TX_8X4;
DECLARE_ALIGNED(32, int, txfm_buf[8 * 4 + 8 + 8]);
@ -3944,8 +3945,9 @@ void lowbd_inv_txfm2d_add_8x4_neon(const int32_t *input, uint8_t *output,
}
}
void lowbd_inv_txfm2d_add_4x16_neon(const int32_t *input, uint8_t *output,
int stride, TX_TYPE tx_type, int eob) {
static void lowbd_inv_txfm2d_add_4x16_neon(const int32_t *input,
uint8_t *output, int stride,
TX_TYPE tx_type, int eob) {
(void)eob;
TX_SIZE tx_size = TX_4X16;
DECLARE_ALIGNED(32, int, txfm_buf[4 * 16 + 16 + 16]);
@ -4008,8 +4010,9 @@ void lowbd_inv_txfm2d_add_4x16_neon(const int32_t *input, uint8_t *output,
}
}
void lowbd_inv_txfm2d_add_16x4_neon(const int32_t *input, uint8_t *output,
int stride, TX_TYPE tx_type, int eob) {
static void lowbd_inv_txfm2d_add_16x4_neon(const int32_t *input,
uint8_t *output, int stride,
TX_TYPE tx_type, int eob) {
(void)eob;
TX_SIZE tx_size = TX_16X4;
DECLARE_ALIGNED(32, int, txfm_buf[16 * 4 + 16 + 16]);
@ -4112,7 +4115,7 @@ static INLINE void lowbd_inv_txfm2d_add_no_identity_neon(
round_shift_for_rect(cur_a, cur_a, buf_size_nonzero_w);
}
row_txfm(cur_a, cur_a, INV_COS_BIT);
av1_round_shift_array_16_neon(cur_a, txfm_size_col, -shift[0]);
round_shift_array_16_neon(cur_a, txfm_size_col, -shift[0]);
if (lr_flip == 1) {
for (int j = 0; j < buf_size_w_div8; ++j) {
flip_buf_ud_neon(&cur_a[j * 8], 8);
@ -4130,8 +4133,7 @@ static INLINE void lowbd_inv_txfm2d_add_no_identity_neon(
}
for (int j = 0; j < buf_size_w_div8; ++j) {
col_txfm(&b[j * txfm_size_row], &b[j * txfm_size_row], INV_COS_BIT);
av1_round_shift_array_16_neon(&b[j * txfm_size_row], txfm_size_row,
-shift[1]);
round_shift_array_16_neon(&b[j * txfm_size_row], txfm_size_row, -shift[1]);
}
if (txfm_size_col >= 16) {

third_party/aom/av1/common/arm/convolve_neon.c (vendored)

@ -188,18 +188,95 @@ static INLINE void convolve_x_sr_12tap_neon(const uint8_t *src_ptr,
#endif // AOM_ARCH_AARCH64
}
static INLINE uint8x8_t convolve4_4_x(const int16x4_t s0, const int16x4_t s1,
const int16x4_t s2, const int16x4_t s3,
static INLINE uint8x8_t convolve4_8_x(const int16x8_t s0, const int16x8_t s1,
const int16x8_t s2, const int16x8_t s3,
const int16x4_t filter,
const int16x4_t horiz_const) {
int16x4_t sum = horiz_const;
sum = vmla_lane_s16(sum, s0, filter, 0);
sum = vmla_lane_s16(sum, s1, filter, 1);
sum = vmla_lane_s16(sum, s2, filter, 2);
sum = vmla_lane_s16(sum, s3, filter, 3);
int16x8_t horiz_const) {
int16x8_t sum = horiz_const;
sum = vmlaq_lane_s16(sum, s0, filter, 0);
sum = vmlaq_lane_s16(sum, s1, filter, 1);
sum = vmlaq_lane_s16(sum, s2, filter, 2);
sum = vmlaq_lane_s16(sum, s3, filter, 3);
// We halved the filter values so -1 from right shift.
return vqrshrun_n_s16(sum, FILTER_BITS - 1);
}
// We halved the convolution filter values so - 1 from the right shift.
return vqrshrun_n_s16(vcombine_s16(sum, vdup_n_s16(0)), FILTER_BITS - 1);
static INLINE void convolve_x_sr_4tap_neon(const uint8_t *src_ptr,
int src_stride, uint8_t *dst_ptr,
const int dst_stride, int w, int h,
const int16_t *x_filter_ptr) {
// All filter values are even, halve to reduce intermediate precision
// requirements.
const int16x4_t filter = vshr_n_s16(vld1_s16(x_filter_ptr + 2), 1);
// This shim of 1 << ((ROUND0_BITS - 1) - 1) enables us to use a single
// rounding right shift by FILTER_BITS - instead of a first rounding right
// shift by ROUND0_BITS, followed by second rounding right shift by
// FILTER_BITS - ROUND0_BITS.
// The outermost -1 is needed because we will halve the filter values.
const int16x8_t horiz_const = vdupq_n_s16(1 << ((ROUND0_BITS - 1) - 1));
if (w == 4) {
do {
uint8x8_t t01[4];
t01[0] = load_unaligned_u8(src_ptr + 0, src_stride);
t01[1] = load_unaligned_u8(src_ptr + 1, src_stride);
t01[2] = load_unaligned_u8(src_ptr + 2, src_stride);
t01[3] = load_unaligned_u8(src_ptr + 3, src_stride);
int16x8_t s01[4];
s01[0] = vreinterpretq_s16_u16(vmovl_u8(t01[0]));
s01[1] = vreinterpretq_s16_u16(vmovl_u8(t01[1]));
s01[2] = vreinterpretq_s16_u16(vmovl_u8(t01[2]));
s01[3] = vreinterpretq_s16_u16(vmovl_u8(t01[3]));
uint8x8_t d01 =
convolve4_8_x(s01[0], s01[1], s01[2], s01[3], filter, horiz_const);
store_u8x4_strided_x2(dst_ptr + 0 * dst_stride, dst_stride, d01);
src_ptr += 2 * src_stride;
dst_ptr += 2 * dst_stride;
h -= 2;
} while (h != 0);
} else {
do {
int width = w;
const uint8_t *s = src_ptr;
uint8_t *d = dst_ptr;
do {
uint8x8_t t0[4], t1[4];
load_u8_8x4(s + 0 * src_stride, 1, &t0[0], &t0[1], &t0[2], &t0[3]);
load_u8_8x4(s + 1 * src_stride, 1, &t1[0], &t1[1], &t1[2], &t1[3]);
int16x8_t s0[4], s1[4];
s0[0] = vreinterpretq_s16_u16(vmovl_u8(t0[0]));
s0[1] = vreinterpretq_s16_u16(vmovl_u8(t0[1]));
s0[2] = vreinterpretq_s16_u16(vmovl_u8(t0[2]));
s0[3] = vreinterpretq_s16_u16(vmovl_u8(t0[3]));
s1[0] = vreinterpretq_s16_u16(vmovl_u8(t1[0]));
s1[1] = vreinterpretq_s16_u16(vmovl_u8(t1[1]));
s1[2] = vreinterpretq_s16_u16(vmovl_u8(t1[2]));
s1[3] = vreinterpretq_s16_u16(vmovl_u8(t1[3]));
uint8x8_t d0 =
convolve4_8_x(s0[0], s0[1], s0[2], s0[3], filter, horiz_const);
uint8x8_t d1 =
convolve4_8_x(s1[0], s1[1], s1[2], s1[3], filter, horiz_const);
store_u8_8x2(d, dst_stride, d0, d1);
s += 8;
d += 8;
width -= 8;
} while (width != 0);
src_ptr += 2 * src_stride;
dst_ptr += 2 * dst_stride;
h -= 2;
} while (h != 0);
}
}
static INLINE uint8x8_t convolve8_8_x(const int16x8_t s0, const int16x8_t s1,
@ -242,12 +319,20 @@ void av1_convolve_x_sr_neon(const uint8_t *src, int src_stride, uint8_t *dst,
const int16_t *x_filter_ptr = av1_get_interp_filter_subpel_kernel(
filter_params_x, subpel_x_qn & SUBPEL_MASK);
if (filter_params_x->taps > 8) {
int filter_taps = get_filter_tap(filter_params_x, subpel_x_qn & SUBPEL_MASK);
if (filter_taps > 8) {
convolve_x_sr_12tap_neon(src, src_stride, dst, dst_stride, w, h,
x_filter_ptr);
return;
}
if (filter_taps <= 4) {
convolve_x_sr_4tap_neon(src + 2, src_stride, dst, dst_stride, w, h,
x_filter_ptr);
return;
}
// This shim of 1 << ((ROUND0_BITS - 1) - 1) enables us to use a single
// rounding right shift by FILTER_BITS - instead of a first rounding right
// shift by ROUND0_BITS, followed by second rounding right shift by
@ -255,149 +340,220 @@ void av1_convolve_x_sr_neon(const uint8_t *src, int src_stride, uint8_t *dst,
// The outermost -1 is needed because we will halve the filter values.
const int16x8_t horiz_const = vdupq_n_s16(1 << ((ROUND0_BITS - 1) - 1));
if (w <= 4) {
// 4-tap filters are used for blocks having width <= 4.
// Filter values are even, so halve to reduce intermediate precision reqs.
const int16x4_t x_filter = vshr_n_s16(vld1_s16(x_filter_ptr + 2), 1);
src += 2;
do {
uint8x8_t t0 = vld1_u8(src); // a0 a1 a2 a3 a4 a5 a6 a7
int16x4_t s0 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
int16x4_t s4 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
int16x4_t s1 = vext_s16(s0, s4, 1); // a1 a2 a3 a4
int16x4_t s2 = vext_s16(s0, s4, 2); // a2 a3 a4 a5
int16x4_t s3 = vext_s16(s0, s4, 3); // a3 a4 a5 a6
uint8x8_t d0 =
convolve4_4_x(s0, s1, s2, s3, x_filter, vget_low_s16(horiz_const));
store_u8_4x1(dst, d0);
src += src_stride;
dst += dst_stride;
} while (--h != 0);
} else {
// Filter values are even so halve to reduce precision requirements.
const int16x8_t x_filter = vshrq_n_s16(vld1q_s16(x_filter_ptr), 1);
// Filter values are even so halve to reduce precision requirements.
const int16x8_t x_filter = vshrq_n_s16(vld1q_s16(x_filter_ptr), 1);
#if AOM_ARCH_AARCH64
while (h >= 8) {
uint8x8_t t0, t1, t2, t3, t4, t5, t6, t7;
load_u8_8x8(src, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
while (h >= 8) {
uint8x8_t t0, t1, t2, t3, t4, t5, t6, t7;
load_u8_8x8(src, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
transpose_elems_inplace_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
int16x8_t s0 = vreinterpretq_s16_u16(vmovl_u8(t0));
int16x8_t s1 = vreinterpretq_s16_u16(vmovl_u8(t1));
int16x8_t s2 = vreinterpretq_s16_u16(vmovl_u8(t2));
int16x8_t s3 = vreinterpretq_s16_u16(vmovl_u8(t3));
int16x8_t s4 = vreinterpretq_s16_u16(vmovl_u8(t4));
int16x8_t s5 = vreinterpretq_s16_u16(vmovl_u8(t5));
int16x8_t s6 = vreinterpretq_s16_u16(vmovl_u8(t6));
int width = w;
const uint8_t *s = src + 7;
uint8_t *d = dst;
__builtin_prefetch(d + 0 * dst_stride);
__builtin_prefetch(d + 1 * dst_stride);
__builtin_prefetch(d + 2 * dst_stride);
__builtin_prefetch(d + 3 * dst_stride);
__builtin_prefetch(d + 4 * dst_stride);
__builtin_prefetch(d + 5 * dst_stride);
__builtin_prefetch(d + 6 * dst_stride);
__builtin_prefetch(d + 7 * dst_stride);
do {
uint8x8_t t8, t9, t10, t11, t12, t13, t14;
load_u8_8x8(s, src_stride, &t7, &t8, &t9, &t10, &t11, &t12, &t13, &t14);
transpose_elems_inplace_u8_8x8(&t7, &t8, &t9, &t10, &t11, &t12, &t13,
&t14);
int16x8_t s7 = vreinterpretq_s16_u16(vmovl_u8(t7));
int16x8_t s8 = vreinterpretq_s16_u16(vmovl_u8(t8));
int16x8_t s9 = vreinterpretq_s16_u16(vmovl_u8(t9));
int16x8_t s10 = vreinterpretq_s16_u16(vmovl_u8(t10));
int16x8_t s11 = vreinterpretq_s16_u16(vmovl_u8(t11));
int16x8_t s12 = vreinterpretq_s16_u16(vmovl_u8(t12));
int16x8_t s13 = vreinterpretq_s16_u16(vmovl_u8(t13));
int16x8_t s14 = vreinterpretq_s16_u16(vmovl_u8(t14));
uint8x8_t d0 =
convolve8_8_x(s0, s1, s2, s3, s4, s5, s6, s7, x_filter, horiz_const);
uint8x8_t d1 =
convolve8_8_x(s1, s2, s3, s4, s5, s6, s7, s8, x_filter, horiz_const);
uint8x8_t d2 =
convolve8_8_x(s2, s3, s4, s5, s6, s7, s8, s9, x_filter, horiz_const);
uint8x8_t d3 =
convolve8_8_x(s3, s4, s5, s6, s7, s8, s9, s10, x_filter, horiz_const);
uint8x8_t d4 = convolve8_8_x(s4, s5, s6, s7, s8, s9, s10, s11, x_filter,
horiz_const);
uint8x8_t d5 = convolve8_8_x(s5, s6, s7, s8, s9, s10, s11, s12, x_filter,
horiz_const);
uint8x8_t d6 = convolve8_8_x(s6, s7, s8, s9, s10, s11, s12, s13, x_filter,
horiz_const);
uint8x8_t d7 = convolve8_8_x(s7, s8, s9, s10, s11, s12, s13, s14,
x_filter, horiz_const);
transpose_elems_inplace_u8_8x8(&d0, &d1, &d2, &d3, &d4, &d5, &d6, &d7);
store_u8_8x8(d, dst_stride, d0, d1, d2, d3, d4, d5, d6, d7);
s0 = s8;
s1 = s9;
s2 = s10;
s3 = s11;
s4 = s12;
s5 = s13;
s6 = s14;
s += 8;
d += 8;
width -= 8;
} while (width != 0);
src += 8 * src_stride;
dst += 8 * dst_stride;
h -= 8;
}
#endif // AOM_ARCH_AARCH64
while (h-- != 0) {
uint8x8_t t0 = vld1_u8(src); // a0 a1 a2 a3 a4 a5 a6 a7
int16x8_t s0 = vreinterpretq_s16_u16(vmovl_u8(t0));
int width = w;
const uint8_t *s = src + 8;
uint8_t *d = dst;
__builtin_prefetch(d);
do {
uint8x8_t t8 = vld1_u8(s); // a8 a9 a10 a11 a12 a13 a14 a15
int16x8_t s8 = vreinterpretq_s16_u16(vmovl_u8(t8));
int16x8_t s1 = vextq_s16(s0, s8, 1); // a1 a2 a3 a4 a5 a6 a7 a8
int16x8_t s2 = vextq_s16(s0, s8, 2); // a2 a3 a4 a5 a6 a7 a8 a9
int16x8_t s3 = vextq_s16(s0, s8, 3); // a3 a4 a5 a6 a7 a8 a9 a10
int16x8_t s4 = vextq_s16(s0, s8, 4); // a4 a5 a6 a7 a8 a9 a10 a11
int16x8_t s5 = vextq_s16(s0, s8, 5); // a5 a6 a7 a8 a9 a10 a11 a12
int16x8_t s6 = vextq_s16(s0, s8, 6); // a6 a7 a8 a9 a10 a11 a12 a13
int16x8_t s7 = vextq_s16(s0, s8, 7); // a7 a8 a9 a10 a11 a12 a13 a14
uint8x8_t d0 =
convolve8_8_x(s0, s1, s2, s3, s4, s5, s6, s7, x_filter, horiz_const);
vst1_u8(d, d0);
s0 = s8;
s += 8;
d += 8;
width -= 8;
} while (width != 0);
src += src_stride;
dst += dst_stride;
}
}
static INLINE uint8x8_t convolve4_8_y(const int16x8_t s0, const int16x8_t s1,
const int16x8_t s2, const int16x8_t s3,
const int16x4_t filter) {
int16x8_t sum = vmulq_lane_s16(s0, filter, 0);
sum = vmlaq_lane_s16(sum, s1, filter, 1);
sum = vmlaq_lane_s16(sum, s2, filter, 2);
sum = vmlaq_lane_s16(sum, s3, filter, 3);
// We halved the filter values so -1 from right shift.
return vqrshrun_n_s16(sum, FILTER_BITS - 1);
}
static INLINE void convolve_y_sr_4tap_neon(const uint8_t *src,
const int src_stride, uint8_t *dst,
const int dst_stride, int w, int h,
const int16_t *filter_y) {
// All filter values are even, halve to reduce intermediate precision
// requirements.
const int16x4_t filter = vshr_n_s16(vld1_s16(filter_y + 2), 1);
if (w == 4) {
uint8x8_t t01 = load_unaligned_u8(src + 0 * src_stride, src_stride);
uint8x8_t t12 = load_unaligned_u8(src + 1 * src_stride, src_stride);
int16x8_t s01 = vreinterpretq_s16_u16(vmovl_u8(t01));
int16x8_t s12 = vreinterpretq_s16_u16(vmovl_u8(t12));
src += 2 * src_stride;
do {
uint8x8_t t23 = load_unaligned_u8(src + 0 * src_stride, src_stride);
uint8x8_t t34 = load_unaligned_u8(src + 1 * src_stride, src_stride);
uint8x8_t t45 = load_unaligned_u8(src + 2 * src_stride, src_stride);
uint8x8_t t56 = load_unaligned_u8(src + 3 * src_stride, src_stride);
int16x8_t s23 = vreinterpretq_s16_u16(vmovl_u8(t23));
int16x8_t s34 = vreinterpretq_s16_u16(vmovl_u8(t34));
int16x8_t s45 = vreinterpretq_s16_u16(vmovl_u8(t45));
int16x8_t s56 = vreinterpretq_s16_u16(vmovl_u8(t56));
uint8x8_t d01 = convolve4_8_y(s01, s12, s23, s34, filter);
uint8x8_t d23 = convolve4_8_y(s23, s34, s45, s56, filter);
store_u8x4_strided_x2(dst + 0 * dst_stride, dst_stride, d01);
store_u8x4_strided_x2(dst + 2 * dst_stride, dst_stride, d23);
s01 = s45;
s12 = s56;
src += 4 * src_stride;
dst += 4 * dst_stride;
h -= 4;
} while (h != 0);
} else {
do {
uint8x8_t t0, t1, t2;
load_u8_8x3(src, src_stride, &t0, &t1, &t2);
transpose_elems_inplace_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
int16x8_t s0 = vreinterpretq_s16_u16(vmovl_u8(t0));
int16x8_t s1 = vreinterpretq_s16_u16(vmovl_u8(t1));
int16x8_t s2 = vreinterpretq_s16_u16(vmovl_u8(t2));
int16x8_t s3 = vreinterpretq_s16_u16(vmovl_u8(t3));
int16x8_t s4 = vreinterpretq_s16_u16(vmovl_u8(t4));
int16x8_t s5 = vreinterpretq_s16_u16(vmovl_u8(t5));
int16x8_t s6 = vreinterpretq_s16_u16(vmovl_u8(t6));
int width = w;
const uint8_t *s = src + 7;
int height = h;
const uint8_t *s = src + 3 * src_stride;
uint8_t *d = dst;
__builtin_prefetch(d + 0 * dst_stride);
__builtin_prefetch(d + 1 * dst_stride);
__builtin_prefetch(d + 2 * dst_stride);
__builtin_prefetch(d + 3 * dst_stride);
__builtin_prefetch(d + 4 * dst_stride);
__builtin_prefetch(d + 5 * dst_stride);
__builtin_prefetch(d + 6 * dst_stride);
__builtin_prefetch(d + 7 * dst_stride);
do {
uint8x8_t t8, t9, t10, t11, t12, t13, t14;
load_u8_8x8(s, src_stride, &t7, &t8, &t9, &t10, &t11, &t12, &t13, &t14);
uint8x8_t t3;
load_u8_8x4(s, src_stride, &t0, &t1, &t2, &t3);
transpose_elems_inplace_u8_8x8(&t7, &t8, &t9, &t10, &t11, &t12, &t13,
&t14);
int16x8_t s7 = vreinterpretq_s16_u16(vmovl_u8(t7));
int16x8_t s8 = vreinterpretq_s16_u16(vmovl_u8(t8));
int16x8_t s9 = vreinterpretq_s16_u16(vmovl_u8(t9));
int16x8_t s10 = vreinterpretq_s16_u16(vmovl_u8(t10));
int16x8_t s11 = vreinterpretq_s16_u16(vmovl_u8(t11));
int16x8_t s12 = vreinterpretq_s16_u16(vmovl_u8(t12));
int16x8_t s13 = vreinterpretq_s16_u16(vmovl_u8(t13));
int16x8_t s14 = vreinterpretq_s16_u16(vmovl_u8(t14));
int16x8_t s3 = vreinterpretq_s16_u16(vmovl_u8(t0));
int16x8_t s4 = vreinterpretq_s16_u16(vmovl_u8(t1));
int16x8_t s5 = vreinterpretq_s16_u16(vmovl_u8(t2));
int16x8_t s6 = vreinterpretq_s16_u16(vmovl_u8(t3));
uint8x8_t d0 = convolve8_8_x(s0, s1, s2, s3, s4, s5, s6, s7, x_filter,
horiz_const);
uint8x8_t d1 = convolve8_8_x(s1, s2, s3, s4, s5, s6, s7, s8, x_filter,
horiz_const);
uint8x8_t d2 = convolve8_8_x(s2, s3, s4, s5, s6, s7, s8, s9, x_filter,
horiz_const);
uint8x8_t d3 = convolve8_8_x(s3, s4, s5, s6, s7, s8, s9, s10, x_filter,
horiz_const);
uint8x8_t d4 = convolve8_8_x(s4, s5, s6, s7, s8, s9, s10, s11, x_filter,
horiz_const);
uint8x8_t d5 = convolve8_8_x(s5, s6, s7, s8, s9, s10, s11, s12,
x_filter, horiz_const);
uint8x8_t d6 = convolve8_8_x(s6, s7, s8, s9, s10, s11, s12, s13,
x_filter, horiz_const);
uint8x8_t d7 = convolve8_8_x(s7, s8, s9, s10, s11, s12, s13, s14,
x_filter, horiz_const);
uint8x8_t d0 = convolve4_8_y(s0, s1, s2, s3, filter);
uint8x8_t d1 = convolve4_8_y(s1, s2, s3, s4, filter);
uint8x8_t d2 = convolve4_8_y(s2, s3, s4, s5, filter);
uint8x8_t d3 = convolve4_8_y(s3, s4, s5, s6, filter);
transpose_elems_inplace_u8_8x8(&d0, &d1, &d2, &d3, &d4, &d5, &d6, &d7);
store_u8_8x4(d, dst_stride, d0, d1, d2, d3);
store_u8_8x8(d, dst_stride, d0, d1, d2, d3, d4, d5, d6, d7);
s0 = s4;
s1 = s5;
s2 = s6;
s0 = s8;
s1 = s9;
s2 = s10;
s3 = s11;
s4 = s12;
s5 = s13;
s6 = s14;
s += 8;
d += 8;
width -= 8;
} while (width != 0);
src += 8 * src_stride;
dst += 8 * dst_stride;
h -= 8;
}
#endif // AOM_ARCH_AARCH64
while (h-- != 0) {
uint8x8_t t0 = vld1_u8(src); // a0 a1 a2 a3 a4 a5 a6 a7
int16x8_t s0 = vreinterpretq_s16_u16(vmovl_u8(t0));
int width = w;
const uint8_t *s = src + 8;
uint8_t *d = dst;
__builtin_prefetch(d);
do {
uint8x8_t t8 = vld1_u8(s); // a8 a9 a10 a11 a12 a13 a14 a15
int16x8_t s8 = vreinterpretq_s16_u16(vmovl_u8(t8));
int16x8_t s1 = vextq_s16(s0, s8, 1); // a1 a2 a3 a4 a5 a6 a7 a8
int16x8_t s2 = vextq_s16(s0, s8, 2); // a2 a3 a4 a5 a6 a7 a8 a9
int16x8_t s3 = vextq_s16(s0, s8, 3); // a3 a4 a5 a6 a7 a8 a9 a10
int16x8_t s4 = vextq_s16(s0, s8, 4); // a4 a5 a6 a7 a8 a9 a10 a11
int16x8_t s5 = vextq_s16(s0, s8, 5); // a5 a6 a7 a8 a9 a10 a11 a12
int16x8_t s6 = vextq_s16(s0, s8, 6); // a6 a7 a8 a9 a10 a11 a12 a13
int16x8_t s7 = vextq_s16(s0, s8, 7); // a7 a8 a9 a10 a11 a12 a13 a14
uint8x8_t d0 = convolve8_8_x(s0, s1, s2, s3, s4, s5, s6, s7, x_filter,
horiz_const);
vst1_u8(d, d0);
s0 = s8;
s += 8;
d += 8;
width -= 8;
} while (width != 0);
src += src_stride;
dst += dst_stride;
}
s += 4 * src_stride;
d += 4 * dst_stride;
height -= 4;
} while (height != 0);
src += 8;
dst += 8;
w -= 8;
} while (w != 0);
}
}
@ -974,7 +1130,7 @@ void av1_convolve_y_sr_neon(const uint8_t *src, int src_stride, uint8_t *dst,
}
const int y_filter_taps = get_filter_tap(filter_params_y, subpel_y_qn);
const int clamped_y_taps = y_filter_taps < 6 ? 6 : y_filter_taps;
const int clamped_y_taps = y_filter_taps < 4 ? 4 : y_filter_taps;
const int vert_offset = clamped_y_taps / 2 - 1;
src -= vert_offset * src_stride;
@ -991,7 +1147,10 @@ void av1_convolve_y_sr_neon(const uint8_t *src, int src_stride, uint8_t *dst,
// Filter values are even so halve to reduce precision requirements.
const int16x8_t y_filter = vshrq_n_s16(vld1q_s16(y_filter_ptr), 1);
if (y_filter_taps < 8) {
if (y_filter_taps <= 4) {
convolve_y_sr_4tap_neon(src, src_stride, dst, dst_stride, w, h,
y_filter_ptr);
} else if (y_filter_taps == 6) {
convolve_y_sr_6tap_neon(src, src_stride, dst, dst_stride, w, h, y_filter);
} else {
convolve_y_sr_8tap_neon(src, src_stride, dst, dst_stride, w, h, y_filter);
@ -1148,18 +1307,122 @@ static INLINE void convolve_2d_sr_horiz_12tap_neon(
} while (--h != 0);
}
static INLINE int16x4_t convolve4_4_2d_h(const int16x4_t s0, const int16x4_t s1,
const int16x4_t s2, const int16x4_t s3,
static INLINE int16x8_t convolve4_8_2d_h(const int16x8_t s0, const int16x8_t s1,
const int16x8_t s2, const int16x8_t s3,
const int16x4_t filter,
const int16x4_t horiz_const) {
int16x4_t sum = horiz_const;
sum = vmla_lane_s16(sum, s0, filter, 0);
sum = vmla_lane_s16(sum, s1, filter, 1);
sum = vmla_lane_s16(sum, s2, filter, 2);
sum = vmla_lane_s16(sum, s3, filter, 3);
const int16x8_t horiz_const) {
int16x8_t sum = vmlaq_lane_s16(horiz_const, s0, filter, 0);
sum = vmlaq_lane_s16(sum, s1, filter, 1);
sum = vmlaq_lane_s16(sum, s2, filter, 2);
sum = vmlaq_lane_s16(sum, s3, filter, 3);
// We halved the filter values so -1 from right shift.
return vshrq_n_s16(sum, ROUND0_BITS - 1);
}
// We halved the convolution filter values so -1 from the right shift.
return vshr_n_s16(sum, ROUND0_BITS - 1);
static INLINE void convolve_2d_sr_horiz_4tap_neon(
const uint8_t *src, ptrdiff_t src_stride, int16_t *dst,
ptrdiff_t dst_stride, int w, int h, const int16_t *filter_x) {
const int bd = 8;
// All filter values are even, halve to reduce intermediate precision
// requirements.
const int16x4_t filter = vshr_n_s16(vld1_s16(filter_x + 2), 1);
// A shim of 1 << ((ROUND0_BITS - 1) - 1) enables us to use non-rounding
// shifts - which are generally faster than rounding shifts on modern CPUs.
// (The extra -1 is needed because we halved the filter values.)
const int16x8_t horiz_const = vdupq_n_s16((1 << (bd + FILTER_BITS - 2)) +
(1 << ((ROUND0_BITS - 1) - 1)));
if (w == 4) {
do {
uint8x8_t t01[4];
t01[0] = load_unaligned_u8(src + 0, (int)src_stride);
t01[1] = load_unaligned_u8(src + 1, (int)src_stride);
t01[2] = load_unaligned_u8(src + 2, (int)src_stride);
t01[3] = load_unaligned_u8(src + 3, (int)src_stride);
int16x8_t s01[4];
s01[0] = vreinterpretq_s16_u16(vmovl_u8(t01[0]));
s01[1] = vreinterpretq_s16_u16(vmovl_u8(t01[1]));
s01[2] = vreinterpretq_s16_u16(vmovl_u8(t01[2]));
s01[3] = vreinterpretq_s16_u16(vmovl_u8(t01[3]));
int16x8_t d01 =
convolve4_8_2d_h(s01[0], s01[1], s01[2], s01[3], filter, horiz_const);
store_s16x4_strided_x2(dst, (int)dst_stride, d01);
src += 2 * src_stride;
dst += 2 * dst_stride;
h -= 2;
} while (h > 0);
} else {
do {
int width = w;
const uint8_t *s = src;
int16_t *d = dst;
do {
uint8x8_t t0[4], t1[4];
load_u8_8x4(s + 0 * src_stride, 1, &t0[0], &t0[1], &t0[2], &t0[3]);
load_u8_8x4(s + 1 * src_stride, 1, &t1[0], &t1[1], &t1[2], &t1[3]);
int16x8_t s0[4];
s0[0] = vreinterpretq_s16_u16(vmovl_u8(t0[0]));
s0[1] = vreinterpretq_s16_u16(vmovl_u8(t0[1]));
s0[2] = vreinterpretq_s16_u16(vmovl_u8(t0[2]));
s0[3] = vreinterpretq_s16_u16(vmovl_u8(t0[3]));
int16x8_t s1[4];
s1[0] = vreinterpretq_s16_u16(vmovl_u8(t1[0]));
s1[1] = vreinterpretq_s16_u16(vmovl_u8(t1[1]));
s1[2] = vreinterpretq_s16_u16(vmovl_u8(t1[2]));
s1[3] = vreinterpretq_s16_u16(vmovl_u8(t1[3]));
int16x8_t d0 =
convolve4_8_2d_h(s0[0], s0[1], s0[2], s0[3], filter, horiz_const);
int16x8_t d1 =
convolve4_8_2d_h(s1[0], s1[1], s1[2], s1[3], filter, horiz_const);
store_s16_8x2(d, dst_stride, d0, d1);
s += 8;
d += 8;
width -= 8;
} while (width != 0);
src += 2 * src_stride;
dst += 2 * dst_stride;
h -= 2;
} while (h > 2);
do {
const uint8_t *s = src;
int16_t *d = dst;
int width = w;
do {
uint8x8_t t0[4];
load_u8_8x4(s, 1, &t0[0], &t0[1], &t0[2], &t0[3]);
int16x8_t s0[4];
s0[0] = vreinterpretq_s16_u16(vmovl_u8(t0[0]));
s0[1] = vreinterpretq_s16_u16(vmovl_u8(t0[1]));
s0[2] = vreinterpretq_s16_u16(vmovl_u8(t0[2]));
s0[3] = vreinterpretq_s16_u16(vmovl_u8(t0[3]));
int16x8_t d0 =
convolve4_8_2d_h(s0[0], s0[1], s0[2], s0[3], filter, horiz_const);
vst1q_s16(d, d0);
s += 8;
d += 8;
width -= 8;
} while (width != 0);
src += src_stride;
dst += dst_stride;
} while (--h != 0);
}
}
static INLINE int16x8_t convolve8_8_2d_h(const int16x8_t s0, const int16x8_t s1,
@ -1185,10 +1448,9 @@ static INLINE int16x8_t convolve8_8_2d_h(const int16x8_t s0, const int16x8_t s1,
return vshrq_n_s16(sum, ROUND0_BITS - 1);
}
static INLINE void convolve_2d_sr_horiz_neon(const uint8_t *src, int src_stride,
int16_t *im_block, int im_stride,
int w, int im_h,
const int16_t *x_filter_ptr) {
static INLINE void convolve_2d_sr_horiz_8tap_neon(
const uint8_t *src, int src_stride, int16_t *im_block, int im_stride, int w,
int im_h, const int16_t *x_filter_ptr) {
const int bd = 8;
const uint8_t *src_ptr = src;
@ -1196,149 +1458,119 @@ static INLINE void convolve_2d_sr_horiz_neon(const uint8_t *src, int src_stride,
int dst_stride = im_stride;
int height = im_h;
if (w <= 4) {
// A shim of 1 << ((ROUND0_BITS - 1) - 1) enables us to use non-rounding
// shifts - which are generally faster than rounding shifts on modern CPUs.
// (The extra -1 is needed because we halved the filter values.)
const int16x4_t horiz_const = vdup_n_s16((1 << (bd + FILTER_BITS - 2)) +
(1 << ((ROUND0_BITS - 1) - 1)));
// 4-tap filters are used for blocks having width <= 4.
// Filter values are even, so halve to reduce intermediate precision reqs.
const int16x4_t x_filter = vshr_n_s16(vld1_s16(x_filter_ptr + 2), 1);
src_ptr += 2;
do {
uint8x8_t t0 = vld1_u8(src_ptr); // a0 a1 a2 a3 a4 a5 a6 a7
int16x4_t s0 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
int16x4_t s4 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
int16x4_t s1 = vext_s16(s0, s4, 1); // a1 a2 a3 a4
int16x4_t s2 = vext_s16(s0, s4, 2); // a2 a3 a4 a5
int16x4_t s3 = vext_s16(s0, s4, 3); // a3 a4 a5 a6
int16x4_t d0 = convolve4_4_2d_h(s0, s1, s2, s3, x_filter, horiz_const);
vst1_s16(dst_ptr, d0);
src_ptr += src_stride;
dst_ptr += dst_stride;
} while (--height != 0);
} else {
// A shim of 1 << ((ROUND0_BITS - 1) - 1) enables us to use non-rounding
// shifts - which are generally faster than rounding shifts on modern CPUs.
// (The extra -1 is needed because we halved the filter values.)
const int16x8_t horiz_const = vdupq_n_s16((1 << (bd + FILTER_BITS - 2)) +
(1 << ((ROUND0_BITS - 1) - 1)));
// Filter values are even, so halve to reduce intermediate precision reqs.
const int16x8_t x_filter = vshrq_n_s16(vld1q_s16(x_filter_ptr), 1);
// A shim of 1 << ((ROUND0_BITS - 1) - 1) enables us to use non-rounding
// shifts - which are generally faster than rounding shifts on modern CPUs.
// (The extra -1 is needed because we halved the filter values.)
const int16x8_t horiz_const = vdupq_n_s16((1 << (bd + FILTER_BITS - 2)) +
(1 << ((ROUND0_BITS - 1) - 1)));
// Filter values are even, so halve to reduce intermediate precision reqs.
const int16x8_t x_filter = vshrq_n_s16(vld1q_s16(x_filter_ptr), 1);
#if AOM_ARCH_AARCH64
while (height > 8) {
const uint8_t *s = src_ptr;
int16_t *d = dst_ptr;
int width = w;
while (height > 8) {
const uint8_t *s = src_ptr;
int16_t *d = dst_ptr;
int width = w;
uint8x8_t t0, t1, t2, t3, t4, t5, t6, t7;
load_u8_8x8(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
transpose_elems_inplace_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
uint8x8_t t0, t1, t2, t3, t4, t5, t6, t7;
load_u8_8x8(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
transpose_elems_inplace_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
int16x8_t s0 = vreinterpretq_s16_u16(vmovl_u8(t0));
int16x8_t s1 = vreinterpretq_s16_u16(vmovl_u8(t1));
int16x8_t s2 = vreinterpretq_s16_u16(vmovl_u8(t2));
int16x8_t s3 = vreinterpretq_s16_u16(vmovl_u8(t3));
int16x8_t s4 = vreinterpretq_s16_u16(vmovl_u8(t4));
int16x8_t s5 = vreinterpretq_s16_u16(vmovl_u8(t5));
int16x8_t s6 = vreinterpretq_s16_u16(vmovl_u8(t6));
int16x8_t s0 = vreinterpretq_s16_u16(vmovl_u8(t0));
int16x8_t s1 = vreinterpretq_s16_u16(vmovl_u8(t1));
int16x8_t s2 = vreinterpretq_s16_u16(vmovl_u8(t2));
int16x8_t s3 = vreinterpretq_s16_u16(vmovl_u8(t3));
int16x8_t s4 = vreinterpretq_s16_u16(vmovl_u8(t4));
int16x8_t s5 = vreinterpretq_s16_u16(vmovl_u8(t5));
int16x8_t s6 = vreinterpretq_s16_u16(vmovl_u8(t6));
s += 7;
do {
load_u8_8x8(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
transpose_elems_inplace_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
int16x8_t s7 = vreinterpretq_s16_u16(vmovl_u8(t0));
int16x8_t s8 = vreinterpretq_s16_u16(vmovl_u8(t1));
int16x8_t s9 = vreinterpretq_s16_u16(vmovl_u8(t2));
int16x8_t s10 = vreinterpretq_s16_u16(vmovl_u8(t3));
int16x8_t s11 = vreinterpretq_s16_u16(vmovl_u8(t4));
int16x8_t s12 = vreinterpretq_s16_u16(vmovl_u8(t5));
int16x8_t s13 = vreinterpretq_s16_u16(vmovl_u8(t6));
int16x8_t s14 = vreinterpretq_s16_u16(vmovl_u8(t7));
int16x8_t d0 = convolve8_8_2d_h(s0, s1, s2, s3, s4, s5, s6, s7,
x_filter, horiz_const);
int16x8_t d1 = convolve8_8_2d_h(s1, s2, s3, s4, s5, s6, s7, s8,
x_filter, horiz_const);
int16x8_t d2 = convolve8_8_2d_h(s2, s3, s4, s5, s6, s7, s8, s9,
x_filter, horiz_const);
int16x8_t d3 = convolve8_8_2d_h(s3, s4, s5, s6, s7, s8, s9, s10,
x_filter, horiz_const);
int16x8_t d4 = convolve8_8_2d_h(s4, s5, s6, s7, s8, s9, s10, s11,
x_filter, horiz_const);
int16x8_t d5 = convolve8_8_2d_h(s5, s6, s7, s8, s9, s10, s11, s12,
x_filter, horiz_const);
int16x8_t d6 = convolve8_8_2d_h(s6, s7, s8, s9, s10, s11, s12, s13,
x_filter, horiz_const);
int16x8_t d7 = convolve8_8_2d_h(s7, s8, s9, s10, s11, s12, s13, s14,
x_filter, horiz_const);
transpose_elems_inplace_s16_8x8(&d0, &d1, &d2, &d3, &d4, &d5, &d6, &d7);
store_s16_8x8(d, dst_stride, d0, d1, d2, d3, d4, d5, d6, d7);
s0 = s8;
s1 = s9;
s2 = s10;
s3 = s11;
s4 = s12;
s5 = s13;
s6 = s14;
s += 8;
d += 8;
width -= 8;
} while (width != 0);
src_ptr += 8 * src_stride;
dst_ptr += 8 * dst_stride;
height -= 8;
}
#endif // AOM_ARCH_AARCH64
s += 7;
do {
const uint8_t *s = src_ptr;
int16_t *d = dst_ptr;
int width = w;
load_u8_8x8(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
uint8x8_t t0 = vld1_u8(s); // a0 a1 a2 a3 a4 a5 a6 a7
int16x8_t s0 = vreinterpretq_s16_u16(vmovl_u8(t0));
transpose_elems_inplace_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
do {
uint8x8_t t1 = vld1_u8(s + 8); // a8 a9 a10 a11 a12 a13 a14 a15
int16x8_t s8 = vreinterpretq_s16_u16(vmovl_u8(t1));
int16x8_t s7 = vreinterpretq_s16_u16(vmovl_u8(t0));
int16x8_t s8 = vreinterpretq_s16_u16(vmovl_u8(t1));
int16x8_t s9 = vreinterpretq_s16_u16(vmovl_u8(t2));
int16x8_t s10 = vreinterpretq_s16_u16(vmovl_u8(t3));
int16x8_t s11 = vreinterpretq_s16_u16(vmovl_u8(t4));
int16x8_t s12 = vreinterpretq_s16_u16(vmovl_u8(t5));
int16x8_t s13 = vreinterpretq_s16_u16(vmovl_u8(t6));
int16x8_t s14 = vreinterpretq_s16_u16(vmovl_u8(t7));
int16x8_t s1 = vextq_s16(s0, s8, 1); // a1 a2 a3 a4 a5 a6 a7 a8
int16x8_t s2 = vextq_s16(s0, s8, 2); // a2 a3 a4 a5 a6 a7 a8 a9
int16x8_t s3 = vextq_s16(s0, s8, 3); // a3 a4 a5 a6 a7 a8 a9 a10
int16x8_t s4 = vextq_s16(s0, s8, 4); // a4 a5 a6 a7 a8 a9 a10 a11
int16x8_t s5 = vextq_s16(s0, s8, 5); // a5 a6 a7 a8 a9 a10 a11 a12
int16x8_t s6 = vextq_s16(s0, s8, 6); // a6 a7 a8 a9 a10 a11 a12 a13
int16x8_t s7 = vextq_s16(s0, s8, 7); // a7 a8 a9 a10 a11 a12 a13 a14
int16x8_t d0 = convolve8_8_2d_h(s0, s1, s2, s3, s4, s5, s6, s7, x_filter,
horiz_const);
int16x8_t d1 = convolve8_8_2d_h(s1, s2, s3, s4, s5, s6, s7, s8, x_filter,
horiz_const);
int16x8_t d2 = convolve8_8_2d_h(s2, s3, s4, s5, s6, s7, s8, s9, x_filter,
horiz_const);
int16x8_t d3 = convolve8_8_2d_h(s3, s4, s5, s6, s7, s8, s9, s10, x_filter,
horiz_const);
int16x8_t d4 = convolve8_8_2d_h(s4, s5, s6, s7, s8, s9, s10, s11,
x_filter, horiz_const);
int16x8_t d5 = convolve8_8_2d_h(s5, s6, s7, s8, s9, s10, s11, s12,
x_filter, horiz_const);
int16x8_t d6 = convolve8_8_2d_h(s6, s7, s8, s9, s10, s11, s12, s13,
x_filter, horiz_const);
int16x8_t d7 = convolve8_8_2d_h(s7, s8, s9, s10, s11, s12, s13, s14,
x_filter, horiz_const);
int16x8_t d0 = convolve8_8_2d_h(s0, s1, s2, s3, s4, s5, s6, s7,
x_filter, horiz_const);
transpose_elems_inplace_s16_8x8(&d0, &d1, &d2, &d3, &d4, &d5, &d6, &d7);
vst1q_s16(d, d0);
store_s16_8x8(d, dst_stride, d0, d1, d2, d3, d4, d5, d6, d7);
s0 = s8;
s += 8;
d += 8;
width -= 8;
} while (width != 0);
src_ptr += src_stride;
dst_ptr += dst_stride;
} while (--height != 0);
s0 = s8;
s1 = s9;
s2 = s10;
s3 = s11;
s4 = s12;
s5 = s13;
s6 = s14;
s += 8;
d += 8;
width -= 8;
} while (width != 0);
src_ptr += 8 * src_stride;
dst_ptr += 8 * dst_stride;
height -= 8;
}
#endif // AOM_ARCH_AARCH64
do {
const uint8_t *s = src_ptr;
int16_t *d = dst_ptr;
int width = w;
uint8x8_t t0 = vld1_u8(s); // a0 a1 a2 a3 a4 a5 a6 a7
int16x8_t s0 = vreinterpretq_s16_u16(vmovl_u8(t0));
do {
uint8x8_t t1 = vld1_u8(s + 8); // a8 a9 a10 a11 a12 a13 a14 a15
int16x8_t s8 = vreinterpretq_s16_u16(vmovl_u8(t1));
int16x8_t s1 = vextq_s16(s0, s8, 1); // a1 a2 a3 a4 a5 a6 a7 a8
int16x8_t s2 = vextq_s16(s0, s8, 2); // a2 a3 a4 a5 a6 a7 a8 a9
int16x8_t s3 = vextq_s16(s0, s8, 3); // a3 a4 a5 a6 a7 a8 a9 a10
int16x8_t s4 = vextq_s16(s0, s8, 4); // a4 a5 a6 a7 a8 a9 a10 a11
int16x8_t s5 = vextq_s16(s0, s8, 5); // a5 a6 a7 a8 a9 a10 a11 a12
int16x8_t s6 = vextq_s16(s0, s8, 6); // a6 a7 a8 a9 a10 a11 a12 a13
int16x8_t s7 = vextq_s16(s0, s8, 7); // a7 a8 a9 a10 a11 a12 a13 a14
int16x8_t d0 = convolve8_8_2d_h(s0, s1, s2, s3, s4, s5, s6, s7, x_filter,
horiz_const);
vst1q_s16(d, d0);
s0 = s8;
s += 8;
d += 8;
width -= 8;
} while (width != 0);
src_ptr += src_stride;
dst_ptr += dst_stride;
} while (--height != 0);
}
void av1_convolve_2d_sr_neon(const uint8_t *src, int src_stride, uint8_t *dst,
@ -1355,7 +1587,8 @@ void av1_convolve_2d_sr_neon(const uint8_t *src, int src_stride, uint8_t *dst,
}
const int y_filter_taps = get_filter_tap(filter_params_y, subpel_y_qn);
const int clamped_y_taps = y_filter_taps < 6 ? 6 : y_filter_taps;
const int x_filter_taps = get_filter_tap(filter_params_x, subpel_x_qn);
const int clamped_y_taps = y_filter_taps < 4 ? 4 : y_filter_taps;
const int im_h = h + clamped_y_taps - 1;
const int im_stride = MAX_SB_SIZE;
const int vert_offset = clamped_y_taps / 2 - 1;
@ -1385,12 +1618,20 @@ void av1_convolve_2d_sr_neon(const uint8_t *src, int src_stride, uint8_t *dst,
DECLARE_ALIGNED(16, int16_t,
im_block[(MAX_SB_SIZE + SUBPEL_TAPS - 1) * MAX_SB_SIZE]);
convolve_2d_sr_horiz_neon(src_ptr, src_stride, im_block, im_stride, w, im_h,
x_filter_ptr);
if (x_filter_taps <= 4) {
convolve_2d_sr_horiz_4tap_neon(src_ptr + 2, src_stride, im_block,
im_stride, w, im_h, x_filter_ptr);
} else {
convolve_2d_sr_horiz_8tap_neon(src_ptr, src_stride, im_block, im_stride,
w, im_h, x_filter_ptr);
}
const int16x8_t y_filter = vld1q_s16(y_filter_ptr);
if (clamped_y_taps <= 6) {
if (clamped_y_taps <= 4) {
convolve_2d_sr_vert_4tap_neon(im_block, im_stride, dst, dst_stride, w, h,
y_filter_ptr);
} else if (clamped_y_taps == 6) {
convolve_2d_sr_vert_6tap_neon(im_block, im_stride, dst, dst_stride, w, h,
y_filter);
} else {

third_party/aom/av1/common/arm/convolve_neon.h (vendored)

@ -535,4 +535,112 @@ static INLINE void convolve_2d_sr_vert_6tap_neon(int16_t *src_ptr,
}
}
static INLINE int16x4_t convolve4_4_2d_v(const int16x4_t s0, const int16x4_t s1,
const int16x4_t s2, const int16x4_t s3,
const int16x4_t y_filter) {
int32x4_t sum = vmull_lane_s16(s0, y_filter, 0);
sum = vmlal_lane_s16(sum, s1, y_filter, 1);
sum = vmlal_lane_s16(sum, s2, y_filter, 2);
sum = vmlal_lane_s16(sum, s3, y_filter, 3);
return vqrshrn_n_s32(sum, 2 * FILTER_BITS - ROUND0_BITS);
}
static INLINE uint8x8_t convolve4_8_2d_v(const int16x8_t s0, const int16x8_t s1,
const int16x8_t s2, const int16x8_t s3,
const int16x4_t y_filter,
const int16x8_t sub_const) {
int32x4_t sum0 = vmull_lane_s16(vget_low_s16(s0), y_filter, 0);
sum0 = vmlal_lane_s16(sum0, vget_low_s16(s1), y_filter, 1);
sum0 = vmlal_lane_s16(sum0, vget_low_s16(s2), y_filter, 2);
sum0 = vmlal_lane_s16(sum0, vget_low_s16(s3), y_filter, 3);
int32x4_t sum1 = vmull_lane_s16(vget_high_s16(s0), y_filter, 0);
sum1 = vmlal_lane_s16(sum1, vget_high_s16(s1), y_filter, 1);
sum1 = vmlal_lane_s16(sum1, vget_high_s16(s2), y_filter, 2);
sum1 = vmlal_lane_s16(sum1, vget_high_s16(s3), y_filter, 3);
int16x8_t res =
vcombine_s16(vqrshrn_n_s32(sum0, 2 * FILTER_BITS - ROUND0_BITS),
vqrshrn_n_s32(sum1, 2 * FILTER_BITS - ROUND0_BITS));
res = vsubq_s16(res, sub_const);
return vqmovun_s16(res);
}
static INLINE void convolve_2d_sr_vert_4tap_neon(int16_t *src_ptr,
int src_stride,
uint8_t *dst_ptr,
int dst_stride, int w, int h,
const int16_t *y_filter) {
const int bd = 8;
const int16x8_t sub_const = vdupq_n_s16(1 << (bd - 1));
const int16x4_t filter = vld1_s16(y_filter + 2);
if (w == 4) {
int16x4_t s0, s1, s2;
load_s16_4x3(src_ptr, src_stride, &s0, &s1, &s2);
src_ptr += 3 * src_stride;
do {
int16x4_t s3, s4, s5, s6;
load_s16_4x4(src_ptr, src_stride, &s3, &s4, &s5, &s6);
int16x4_t d0 = convolve4_4_2d_v(s0, s1, s2, s3, filter);
int16x4_t d1 = convolve4_4_2d_v(s1, s2, s3, s4, filter);
int16x4_t d2 = convolve4_4_2d_v(s2, s3, s4, s5, filter);
int16x4_t d3 = convolve4_4_2d_v(s3, s4, s5, s6, filter);
uint8x8_t d01 = vqmovun_s16(vsubq_s16(vcombine_s16(d0, d1), sub_const));
uint8x8_t d23 = vqmovun_s16(vsubq_s16(vcombine_s16(d2, d3), sub_const));
store_u8x4_strided_x2(dst_ptr + 0 * dst_stride, dst_stride, d01);
store_u8x4_strided_x2(dst_ptr + 2 * dst_stride, dst_stride, d23);
s0 = s4;
s1 = s5;
s2 = s6;
src_ptr += 4 * src_stride;
dst_ptr += 4 * dst_stride;
h -= 4;
} while (h != 0);
} else {
// Width is a multiple of 8 and height is a multiple of 4.
do {
int height = h;
int16_t *s = src_ptr;
uint8_t *d = dst_ptr;
int16x8_t s0, s1, s2;
load_s16_8x3(s, src_stride, &s0, &s1, &s2);
s += 3 * src_stride;
do {
int16x8_t s3, s4, s5, s6;
load_s16_8x4(s, src_stride, &s3, &s4, &s5, &s6);
uint8x8_t d0 = convolve4_8_2d_v(s0, s1, s2, s3, filter, sub_const);
uint8x8_t d1 = convolve4_8_2d_v(s1, s2, s3, s4, filter, sub_const);
uint8x8_t d2 = convolve4_8_2d_v(s2, s3, s4, s5, filter, sub_const);
uint8x8_t d3 = convolve4_8_2d_v(s3, s4, s5, s6, filter, sub_const);
store_u8_8x4(d, dst_stride, d0, d1, d2, d3);
s0 = s4;
s1 = s5;
s2 = s6;
s += 4 * src_stride;
d += 4 * dst_stride;
height -= 4;
} while (height != 0);
src_ptr += 8;
dst_ptr += 8;
w -= 8;
} while (w != 0);
}
}
#endif // AOM_AV1_COMMON_ARM_CONVOLVE_NEON_H_
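
The sub_const of 1 << (bd - 1) used by the 2D vertical helpers above undoes the DC offset that the horizontal pass injected. A scalar sketch of that round trip for a flat image, using the libaom 8-bit constants (FILTER_BITS = 7, ROUND0_BITS = 3) and ignoring the small rounding shims:

```
#include <assert.h>
#include <stdint.h>

enum { kBd = 8, kFilterBits = 7, kRound0Bits = 3 };  /* libaom 8-bit values */

int main(void) {
  for (int x = 0; x <= 255; ++x) {
    /* Horizontal pass: halved kernel (sums to 1 << (FILTER_BITS - 1)) plus
     * the offset 1 << (bd + FILTER_BITS - 2), shifted by ROUND0_BITS - 1. */
    int32_t horiz = ((1 << (kFilterBits - 1)) * x + (1 << (kBd + kFilterBits - 2)))
                    >> (kRound0Bits - 1);
    /* Vertical pass: full kernel (sums to 1 << FILTER_BITS), shifted by
     * 2 * FILTER_BITS - ROUND0_BITS. */
    int32_t vert = ((1 << kFilterBits) * horiz) >> (2 * kFilterBits - kRound0Bits);
    /* The leftover DC term is exactly 1 << (bd - 1), i.e. sub_const. */
    assert(vert - (1 << (kBd - 1)) == x);
  }
  return 0;
}
```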

(Some file diffs are not shown here because they are too large to display.)

third_party/aom/av1/common/arm/convolve_neon_i8mm.h (vendored, new file)

@ -0,0 +1,183 @@
/*
* Copyright (c) 2024, Alliance for Open Media. All rights reserved
*
* This source code is subject to the terms of the BSD 2 Clause License and
* the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
* was not distributed with this source code in the LICENSE file, you can
* obtain it at www.aomedia.org/license/software. If the Alliance for Open
* Media Patent License 1.0 was not distributed with this source code in the
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
#ifndef AOM_AV1_COMMON_ARM_CONVOLVE_NEON_I8MM_H_
#define AOM_AV1_COMMON_ARM_CONVOLVE_NEON_I8MM_H_
#include <arm_neon.h>
#include <assert.h>
#include "config/aom_config.h"
#include "config/av1_rtcd.h"
#include "aom/aom_integer.h"
#include "aom_dsp/aom_dsp_common.h"
#include "aom_dsp/arm/mem_neon.h"
#include "aom_ports/mem.h"
DECLARE_ALIGNED(16, static const uint8_t, kDotProdPermuteTbl[48]) = {
0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6,
4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10,
8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14
};
static INLINE int16x4_t convolve12_4_2d_h(uint8x16_t samples,
const int8x16_t filters,
const uint8x16x3_t permute_tbl,
int32x4_t horiz_const) {
// Permute samples ready for dot product.
// { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 }
// { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 }
// { 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 }
uint8x16_t perm_samples[3] = { vqtbl1q_u8(samples, permute_tbl.val[0]),
vqtbl1q_u8(samples, permute_tbl.val[1]),
vqtbl1q_u8(samples, permute_tbl.val[2]) };
int32x4_t sum = vusdotq_laneq_s32(horiz_const, perm_samples[0], filters, 0);
sum = vusdotq_laneq_s32(sum, perm_samples[1], filters, 1);
sum = vusdotq_laneq_s32(sum, perm_samples[2], filters, 2);
// Narrow and re-pack.
return vshrn_n_s32(sum, ROUND0_BITS);
}
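
kDotProdPermuteTbl arranges the input into overlapping 4-byte windows so that each lane of a USDOT instruction accumulates four consecutive taps for one output pixel. The following scalar model of that indexing is an illustration only, not the library implementation:

```
#include <stdint.h>
#include <stdio.h>

/* Plain 12-tap reference. */
static int32_t convolve12_ref(const uint8_t *s, const int8_t *f) {
  int32_t sum = 0;
  for (int k = 0; k < 12; ++k) sum += (int32_t)s[k] * f[k];
  return sum;
}

int main(void) {
  uint8_t samples[16];
  int8_t filter[12];
  for (int i = 0; i < 16; ++i) samples[i] = (uint8_t)(37 * i + 11);
  for (int k = 0; k < 12; ++k) filter[k] = (int8_t)(k - 6);

  /* Model of the permute + vusdotq_laneq_s32 pattern: for output pixel i,
   * block b of the permute table supplies samples[i + 4b .. i + 4b + 3] and
   * lane group b of the filter supplies taps 4b .. 4b + 3. */
  for (int i = 0; i < 4; ++i) {
    int32_t sum = 0;
    for (int b = 0; b < 3; ++b)
      for (int j = 0; j < 4; ++j)
        sum += (int32_t)samples[i + 4 * b + j] * filter[4 * b + j];
    printf("pixel %d: permuted dot product %d, reference %d\n", i, sum,
           convolve12_ref(samples + i, filter));
  }
  return 0;
}
```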
static INLINE int16x8_t convolve12_8_2d_h(uint8x16_t samples[2],
const int8x16_t filters,
const uint8x16x3_t permute_tbl,
const int32x4_t horiz_const) {
// Permute samples ready for dot product.
// { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 }
// { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 }
// { 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 }
// {12, 13, 14, 15, 13, 14, 15, 16, 14, 15, 16, 17, 15, 16, 17, 18 }
uint8x16_t perm_samples[4] = { vqtbl1q_u8(samples[0], permute_tbl.val[0]),
vqtbl1q_u8(samples[0], permute_tbl.val[1]),
vqtbl1q_u8(samples[0], permute_tbl.val[2]),
vqtbl1q_u8(samples[1], permute_tbl.val[2]) };
int32x4_t sum0123 =
vusdotq_laneq_s32(horiz_const, perm_samples[0], filters, 0);
sum0123 = vusdotq_laneq_s32(sum0123, perm_samples[1], filters, 1);
sum0123 = vusdotq_laneq_s32(sum0123, perm_samples[2], filters, 2);
int32x4_t sum4567 =
vusdotq_laneq_s32(horiz_const, perm_samples[1], filters, 0);
sum4567 = vusdotq_laneq_s32(sum4567, perm_samples[2], filters, 1);
sum4567 = vusdotq_laneq_s32(sum4567, perm_samples[3], filters, 2);
// Narrow and re-pack.
return vcombine_s16(vshrn_n_s32(sum0123, ROUND0_BITS),
vshrn_n_s32(sum4567, ROUND0_BITS));
}
static INLINE void convolve_2d_sr_horiz_12tap_neon_i8mm(
const uint8_t *src_ptr, int src_stride, int16_t *dst_ptr,
const int dst_stride, int w, int h, const int16x8_t x_filter_0_7,
const int16x4_t x_filter_8_11) {
// The no-op filter should never be used here.
assert(vgetq_lane_s16(x_filter_0_7, 5) != 128);
const int bd = 8;
// Narrow filter values to 8-bit.
const int16x8x2_t x_filter_s16 = {
{ x_filter_0_7, vcombine_s16(x_filter_8_11, vdup_n_s16(0)) }
};
const int8x16_t x_filter = vcombine_s8(vmovn_s16(x_filter_s16.val[0]),
vmovn_s16(x_filter_s16.val[1]));
// This shim of 1 << (ROUND0_BITS - 1) enables us to use non-rounding shifts
// - which are generally faster than rounding shifts on modern CPUs.
const int32x4_t horiz_const =
vdupq_n_s32((1 << (bd + FILTER_BITS - 1)) + (1 << (ROUND0_BITS - 1)));
const uint8x16x3_t permute_tbl = vld1q_u8_x3(kDotProdPermuteTbl);
if (w <= 4) {
do {
uint8x16_t s0, s1, s2, s3;
load_u8_16x4(src_ptr, src_stride, &s0, &s1, &s2, &s3);
int16x4_t d0 = convolve12_4_2d_h(s0, x_filter, permute_tbl, horiz_const);
int16x4_t d1 = convolve12_4_2d_h(s1, x_filter, permute_tbl, horiz_const);
int16x4_t d2 = convolve12_4_2d_h(s2, x_filter, permute_tbl, horiz_const);
int16x4_t d3 = convolve12_4_2d_h(s3, x_filter, permute_tbl, horiz_const);
store_s16_4x4(dst_ptr, dst_stride, d0, d1, d2, d3);
src_ptr += 4 * src_stride;
dst_ptr += 4 * dst_stride;
h -= 4;
} while (h > 4);
do {
uint8x16_t s0 = vld1q_u8(src_ptr);
int16x4_t d0 = convolve12_4_2d_h(s0, x_filter, permute_tbl, horiz_const);
vst1_s16(dst_ptr, d0);
src_ptr += src_stride;
dst_ptr += dst_stride;
} while (--h != 0);
} else {
do {
const uint8_t *s = src_ptr;
int16_t *d = dst_ptr;
int width = w;
do {
uint8x16_t s0[2], s1[2], s2[2], s3[2];
load_u8_16x4(s, src_stride, &s0[0], &s1[0], &s2[0], &s3[0]);
load_u8_16x4(s + 4, src_stride, &s0[1], &s1[1], &s2[1], &s3[1]);
int16x8_t d0 =
convolve12_8_2d_h(s0, x_filter, permute_tbl, horiz_const);
int16x8_t d1 =
convolve12_8_2d_h(s1, x_filter, permute_tbl, horiz_const);
int16x8_t d2 =
convolve12_8_2d_h(s2, x_filter, permute_tbl, horiz_const);
int16x8_t d3 =
convolve12_8_2d_h(s3, x_filter, permute_tbl, horiz_const);
store_s16_8x4(d, dst_stride, d0, d1, d2, d3);
s += 8;
d += 8;
width -= 8;
} while (width != 0);
src_ptr += 4 * src_stride;
dst_ptr += 4 * dst_stride;
h -= 4;
} while (h > 4);
do {
const uint8_t *s = src_ptr;
int16_t *d = dst_ptr;
int width = w;
do {
uint8x16_t s0[2];
s0[0] = vld1q_u8(s);
s0[1] = vld1q_u8(s + 4);
int16x8_t d0 =
convolve12_8_2d_h(s0, x_filter, permute_tbl, horiz_const);
vst1q_s16(d, d0);
s += 8;
d += 8;
width -= 8;
} while (width != 0);
src_ptr += src_stride;
dst_ptr += dst_stride;
} while (--h != 0);
}
}
#endif // AOM_AV1_COMMON_ARM_CONVOLVE_NEON_I8MM_H_

third_party/aom/av1/common/arm/convolve_sve2.c (vendored, new file)

@ -0,0 +1,203 @@
/*
* Copyright (c) 2024, Alliance for Open Media. All rights reserved
*
* This source code is subject to the terms of the BSD 2 Clause License and
* the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
* was not distributed with this source code in the LICENSE file, you can
* obtain it at www.aomedia.org/license/software. If the Alliance for Open
* Media Patent License 1.0 was not distributed with this source code in the
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
#include <arm_neon.h>
#include <assert.h>
#include "config/aom_config.h"
#include "config/av1_rtcd.h"
#include "aom/aom_integer.h"
#include "aom_dsp/aom_dsp_common.h"
#include "aom_dsp/aom_filter.h"
#include "aom_dsp/arm/aom_filter.h"
#include "aom_dsp/arm/aom_neon_sve_bridge.h"
#include "aom_dsp/arm/mem_neon.h"
#include "aom_dsp/arm/transpose_neon.h"
#include "aom_ports/mem.h"
#include "av1/common/arm/highbd_convolve_sve2.h"
#include "av1/common/arm/convolve_neon_i8mm.h"
static INLINE int32x4_t highbd_convolve12_4_2d_v(int16x8_t s0[2],
int16x8_t s1[2],
int16x8_t s2[2],
int16x8_t filter_0_7,
int16x8_t filter_4_11) {
int64x2_t sum01 = aom_svdot_lane_s16(vdupq_n_s64(0), s0[0], filter_0_7, 0);
sum01 = aom_svdot_lane_s16(sum01, s1[0], filter_0_7, 1);
sum01 = aom_svdot_lane_s16(sum01, s2[0], filter_4_11, 1);
int64x2_t sum23 = aom_svdot_lane_s16(vdupq_n_s64(0), s0[1], filter_0_7, 0);
sum23 = aom_svdot_lane_s16(sum23, s1[1], filter_0_7, 1);
sum23 = aom_svdot_lane_s16(sum23, s2[1], filter_4_11, 1);
return vcombine_s32(vmovn_s64(sum01), vmovn_s64(sum23));
}
static INLINE void convolve_2d_sr_vert_12tap_sve2(
const int16_t *src_ptr, int src_stride, uint8_t *dst_ptr,
const int dst_stride, int w, int h, const int16x8_t y_filter_0_7,
const int16x8_t y_filter_4_11) {
// The no-op filter should never be used here.
assert(vgetq_lane_s16(y_filter_0_7, 5) != 128);
const int bd = 8;
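// The horizontal pass folded an offset of 1 << (bd - 1) into the intermediate
// values; subtract it again before packing back to 8 bits.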
const int16x8_t sub_const = vdupq_n_s16(1 << (bd - 1));
uint16x8x3_t merge_block_tbl = vld1q_u16_x3(kDotProdMergeBlockTbl);
// Scale indices by size of the true vector length to avoid reading from an
// 'undefined' portion of a vector on a system with SVE vectors > 128-bit.
uint16x8_t correction0 =
vreinterpretq_u16_u64(vdupq_n_u64(svcnth() * 0x0001000000000000ULL));
merge_block_tbl.val[0] = vaddq_u16(merge_block_tbl.val[0], correction0);
uint16x8_t correction1 =
vreinterpretq_u16_u64(vdupq_n_u64(svcnth() * 0x0001000100000000ULL));
merge_block_tbl.val[1] = vaddq_u16(merge_block_tbl.val[1], correction1);
uint16x8_t correction2 =
vreinterpretq_u16_u64(vdupq_n_u64(svcnth() * 0x0001000100010000ULL));
merge_block_tbl.val[2] = vaddq_u16(merge_block_tbl.val[2], correction2);
do {
int16_t *s = (int16_t *)src_ptr;
uint8_t *d = (uint8_t *)dst_ptr;
int height = h;
int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA;
load_s16_4x11(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7, &s8,
&s9, &sA);
s += 11 * src_stride;
int16x8_t s0123[2], s1234[2], s2345[2], s3456[2], s4567[2], s5678[2],
s6789[2], s789A[2];
// This operation combines a conventional transpose and the sample permute
// required before computing the dot product.
transpose_concat_4x4(s0, s1, s2, s3, s0123);
transpose_concat_4x4(s1, s2, s3, s4, s1234);
transpose_concat_4x4(s2, s3, s4, s5, s2345);
transpose_concat_4x4(s3, s4, s5, s6, s3456);
transpose_concat_4x4(s4, s5, s6, s7, s4567);
transpose_concat_4x4(s5, s6, s7, s8, s5678);
transpose_concat_4x4(s6, s7, s8, s9, s6789);
transpose_concat_4x4(s7, s8, s9, sA, s789A);
do {
int16x4_t sB, sC, sD, sE;
load_s16_4x4(s, src_stride, &sB, &sC, &sD, &sE);
int16x8_t s89AB[2], s9ABC[2], sABCD[2], sBCDE[2];
transpose_concat_4x4(sB, sC, sD, sE, sBCDE);
// Merge new data into block from previous iteration.
aom_tbl2x2_s16(s789A, sBCDE, merge_block_tbl.val[0], s89AB);
aom_tbl2x2_s16(s789A, sBCDE, merge_block_tbl.val[1], s9ABC);
aom_tbl2x2_s16(s789A, sBCDE, merge_block_tbl.val[2], sABCD);
int32x4_t d0 = highbd_convolve12_4_2d_v(s0123, s4567, s89AB, y_filter_0_7,
y_filter_4_11);
int32x4_t d1 = highbd_convolve12_4_2d_v(s1234, s5678, s9ABC, y_filter_0_7,
y_filter_4_11);
int32x4_t d2 = highbd_convolve12_4_2d_v(s2345, s6789, sABCD, y_filter_0_7,
y_filter_4_11);
int32x4_t d3 = highbd_convolve12_4_2d_v(s3456, s789A, sBCDE, y_filter_0_7,
y_filter_4_11);
int16x8_t dd01 =
vcombine_s16(vqrshrn_n_s32(d0, 2 * FILTER_BITS - ROUND0_BITS),
vqrshrn_n_s32(d1, 2 * FILTER_BITS - ROUND0_BITS));
int16x8_t dd23 =
vcombine_s16(vqrshrn_n_s32(d2, 2 * FILTER_BITS - ROUND0_BITS),
vqrshrn_n_s32(d3, 2 * FILTER_BITS - ROUND0_BITS));
dd01 = vsubq_s16(dd01, sub_const);
dd23 = vsubq_s16(dd23, sub_const);
uint8x8_t d01 = vqmovun_s16(dd01);
uint8x8_t d23 = vqmovun_s16(dd23);
store_u8x4_strided_x2(d + 0 * dst_stride, dst_stride, d01);
store_u8x4_strided_x2(d + 2 * dst_stride, dst_stride, d23);
// Prepare block for next iteration - re-using as much as possible.
// Shuffle everything up four rows.
s0123[0] = s4567[0];
s0123[1] = s4567[1];
s1234[0] = s5678[0];
s1234[1] = s5678[1];
s2345[0] = s6789[0];
s2345[1] = s6789[1];
s3456[0] = s789A[0];
s3456[1] = s789A[1];
s4567[0] = s89AB[0];
s4567[1] = s89AB[1];
s5678[0] = s9ABC[0];
s5678[1] = s9ABC[1];
s6789[0] = sABCD[0];
s6789[1] = sABCD[1];
s789A[0] = sBCDE[0];
s789A[1] = sBCDE[1];
s += 4 * src_stride;
d += 4 * dst_stride;
height -= 4;
} while (height != 0);
src_ptr += 4;
dst_ptr += 4;
w -= 4;
} while (w != 0);
}
void av1_convolve_2d_sr_sve2(const uint8_t *src, int src_stride, uint8_t *dst,
int dst_stride, int w, int h,
const InterpFilterParams *filter_params_x,
const InterpFilterParams *filter_params_y,
const int subpel_x_qn, const int subpel_y_qn,
ConvolveParams *conv_params) {
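// Blocks with a dimension of 2 are not handled by the SIMD paths below, so
// fall back to the C implementation.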
if (w == 2 || h == 2) {
av1_convolve_2d_sr_c(src, src_stride, dst, dst_stride, w, h,
filter_params_x, filter_params_y, subpel_x_qn,
subpel_y_qn, conv_params);
return;
}
if (filter_params_x->taps > 8) {
const int im_h = h + filter_params_y->taps - 1;
const int im_stride = MAX_SB_SIZE;
const int vert_offset = filter_params_x->taps / 2 - 1;
const int horiz_offset = filter_params_x->taps / 2 - 1;
const uint8_t *src_ptr = src - vert_offset * src_stride - horiz_offset;
const int16_t *x_filter_ptr = av1_get_interp_filter_subpel_kernel(
filter_params_x, subpel_x_qn & SUBPEL_MASK);
const int16_t *y_filter_ptr = av1_get_interp_filter_subpel_kernel(
filter_params_y, subpel_y_qn & SUBPEL_MASK);
DECLARE_ALIGNED(16, int16_t,
im_block[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * MAX_SB_SIZE]);
const int16x8_t x_filter_0_7 = vld1q_s16(x_filter_ptr);
const int16x4_t x_filter_8_11 = vld1_s16(x_filter_ptr + 8);
const int16x8_t y_filter_0_7 = vld1q_s16(y_filter_ptr);
const int16x8_t y_filter_4_11 = vld1q_s16(y_filter_ptr + 4);
convolve_2d_sr_horiz_12tap_neon_i8mm(src_ptr, src_stride, im_block,
im_stride, w, im_h, x_filter_0_7,
x_filter_8_11);
convolve_2d_sr_vert_12tap_sve2(im_block, im_stride, dst, dst_stride, w, h,
y_filter_0_7, y_filter_4_11);
} else {
av1_convolve_2d_sr_neon_i8mm(src, src_stride, dst, dst_stride, w, h,
filter_params_x, filter_params_y, subpel_x_qn,
subpel_y_qn, conv_params);
}
}


@ -562,11 +562,12 @@ static INLINE uint16x8_t highbd_convolve8_8_y(int16x8_t samples_lo[4],
return vminq_u16(res, max);
}
void highbd_convolve_y_sr_8tap_sve2(const uint16_t *src, ptrdiff_t src_stride,
uint16_t *dst, ptrdiff_t dst_stride,
int width, int height,
const int16_t *filter_y, int bd) {
assert(w >= 4 && h >= 4);
static void highbd_convolve_y_sr_8tap_sve2(const uint16_t *src,
ptrdiff_t src_stride, uint16_t *dst,
ptrdiff_t dst_stride, int width,
int height, const int16_t *filter_y,
int bd) {
assert(width >= 4 && height >= 4);
const int16x8_t y_filter = vld1q_s16(filter_y);
@ -731,11 +732,12 @@ static INLINE uint16x8_t highbd_convolve4_8_y(int16x8_t samples[4],
return vminq_u16(res, max);
}
void highbd_convolve_y_sr_4tap_sve2(const uint16_t *src, ptrdiff_t src_stride,
uint16_t *dst, ptrdiff_t dst_stride,
int width, int height,
const int16_t *filter_y, int bd) {
assert(w >= 4 && h >= 4);
static void highbd_convolve_y_sr_4tap_sve2(const uint16_t *src,
ptrdiff_t src_stride, uint16_t *dst,
ptrdiff_t dst_stride, int width,
int height, const int16_t *filter_y,
int bd) {
assert(width >= 4 && height >= 4);
const int16x8_t y_filter =
vcombine_s16(vld1_s16(filter_y + 2), vdup_n_s16(0));
@ -1346,13 +1348,11 @@ static INLINE uint16x8_t highbd_convolve8_8_2d_v(
return vminq_u16(res, max);
}
void highbd_convolve_2d_sr_vert_8tap_sve2(const uint16_t *src,
ptrdiff_t src_stride, uint16_t *dst,
ptrdiff_t dst_stride, int width,
int height, const int16_t *filter_y,
ConvolveParams *conv_params, int bd,
const int y_offset) {
assert(w >= 4 && h >= 4);
static void highbd_convolve_2d_sr_vert_8tap_sve2(
const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst,
ptrdiff_t dst_stride, int width, int height, const int16_t *filter_y,
ConvolveParams *conv_params, int bd, const int y_offset) {
assert(width >= 4 && height >= 4);
const int64x2_t offset = vdupq_n_s64(y_offset);
const int32x4_t shift = vdupq_n_s32(-conv_params->round_1);
const int16x8_t y_filter = vld1q_s16(filter_y);
@ -1536,13 +1536,11 @@ static INLINE uint16x8_t highbd_convolve4_8_2d_v(int16x8_t samples[4],
return vminq_u16(res, max);
}
void highbd_convolve_2d_sr_vert_4tap_sve2(const uint16_t *src,
ptrdiff_t src_stride, uint16_t *dst,
ptrdiff_t dst_stride, int width,
int height, const int16_t *filter_y,
ConvolveParams *conv_params, int bd,
const int y_offset) {
assert(w >= 4 && h >= 4);
static void highbd_convolve_2d_sr_vert_4tap_sve2(
const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst,
ptrdiff_t dst_stride, int width, int height, const int16_t *filter_y,
ConvolveParams *conv_params, int bd, const int y_offset) {
assert(width >= 4 && height >= 4);
const int64x2_t offset = vdupq_n_s64(y_offset);
const int32x4_t shift = vdupq_n_s32(-conv_params->round_1);


@ -13,6 +13,7 @@
#include <assert.h>
#include "aom_dsp/arm/sum_neon.h"
#include "config/av1_rtcd.h"
#define MAX_UPSAMPLE_SZ 16


@ -13,6 +13,7 @@
#include <assert.h>
#include "config/aom_config.h"
#include "config/av1_rtcd.h"
#include "aom/aom_integer.h"
#include "aom_dsp/arm/mem_neon.h"


@ -16,6 +16,7 @@
#include "aom_dsp/arm/transpose_neon.h"
#include "av1/common/resize.h"
#include "config/av1_rtcd.h"
#include "config/aom_dsp_rtcd.h"
#include "config/aom_scale_rtcd.h"
static INLINE int16x4_t convolve8_4(const int16x4_t s0, const int16x4_t s1,


@ -1124,10 +1124,10 @@ static void final_filter_fast_internal(uint16_t *A, int32_t *B,
} while (h > 0);
}
void final_filter_internal(uint16_t *A, int32_t *B, const int buf_stride,
int16_t *src, const int src_stride, int32_t *dst,
const int dst_stride, const int width,
const int height) {
static void final_filter_internal(uint16_t *A, int32_t *B, const int buf_stride,
int16_t *src, const int src_stride,
int32_t *dst, const int dst_stride,
const int width, const int height) {
int16x8_t s0;
int32_t *B_tmp, *dst_ptr;
uint16_t *A_tmp;

17
third_party/aom/av1/common/av1_rtcd_defs.pl vendored

@ -470,7 +470,7 @@ if (aom_config("CONFIG_AV1_ENCODER") eq "yes") {
add_proto qw/int64_t av1_highbd_pixel_proj_error/, "const uint8_t *src8, int width, int height, int src_stride, const uint8_t *dat8, int dat_stride, int32_t *flt0, int flt0_stride, int32_t *flt1, int flt1_stride, int xq[2], const sgr_params_type *params";
specialize qw/av1_highbd_pixel_proj_error sse4_1 avx2 neon/;
add_proto qw/void av1_compute_stats_highbd/, "int wiener_win, const uint8_t *dgd8, const uint8_t *src8, int16_t *dgd_avg, int16_t *src_avg, int h_start, int h_end, int v_start, int v_end, int dgd_stride, int src_stride, int64_t *M, int64_t *H, aom_bit_depth_t bit_depth";
specialize qw/av1_compute_stats_highbd sse4_1 avx2 neon/;
specialize qw/av1_compute_stats_highbd sse4_1 avx2 neon sve/;
}
}
@ -554,8 +554,13 @@ if (aom_config("CONFIG_AV1_HIGHBITDEPTH") eq "yes") {
specialize qw/av1_highbd_warp_affine sse4_1 avx2 neon sve/;
}
add_proto qw/bool resize_vert_dir/, "uint8_t *intbuf, uint8_t *output, int out_stride, int height, int height2, int width2, int start_col";
specialize qw/resize_vert_dir avx2/;
add_proto qw/bool av1_resize_vert_dir/, "uint8_t *intbuf, uint8_t *output, int out_stride, int height, int height2, int width2, int start_col";
specialize qw/av1_resize_vert_dir sse2 avx2/;
add_proto qw/void av1_resize_horz_dir/, "const uint8_t *const input, int in_stride, uint8_t *intbuf, int height, int filteredlength, int width2";
# TODO(https://crbug.com/aomedia/3575): Restore sse2 after SSE2/AV1ResizeXTest
# passes under 32-bit valgrind.
specialize qw/av1_resize_horz_dir avx2/;
add_proto qw/void av1_warp_affine/, "const int32_t *mat, const uint8_t *ref, int width, int height, int stride, uint8_t *pred, int p_col, int p_row, int p_width, int p_height, int p_stride, int subsampling_x, int subsampling_y, ConvolveParams *conv_params, int16_t alpha, int16_t beta, int16_t gamma, int16_t delta";
specialize qw/av1_warp_affine sse4_1 avx2 neon neon_i8mm sve/;
@ -597,13 +602,13 @@ if(aom_config("CONFIG_AV1_HIGHBITDEPTH") eq "yes") {
add_proto qw/void av1_convolve_2d_scale/, "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int x_step_qn, const int subpel_y_qn, const int y_step_qn, ConvolveParams *conv_params";
specialize qw/av1_convolve_2d_sr sse2 avx2 neon neon_dotprod neon_i8mm/;
specialize qw/av1_convolve_2d_sr sse2 avx2 neon neon_dotprod neon_i8mm sve2/;
specialize qw/av1_convolve_2d_sr_intrabc neon/;
specialize qw/av1_convolve_x_sr sse2 avx2 neon neon_dotprod neon_i8mm/;
specialize qw/av1_convolve_x_sr_intrabc neon/;
specialize qw/av1_convolve_y_sr sse2 avx2 neon/;
specialize qw/av1_convolve_y_sr sse2 avx2 neon neon_dotprod neon_i8mm/;
specialize qw/av1_convolve_y_sr_intrabc neon/;
specialize qw/av1_convolve_2d_scale sse4_1/;
specialize qw/av1_convolve_2d_scale sse4_1 neon/;
specialize qw/av1_dist_wtd_convolve_2d ssse3 avx2 neon neon_dotprod neon_i8mm/;
specialize qw/av1_dist_wtd_convolve_2d_copy sse2 avx2 neon/;
specialize qw/av1_dist_wtd_convolve_x sse2 avx2 neon neon_dotprod neon_i8mm/;

5
third_party/aom/av1/common/cfl.c vendored

@ -159,8 +159,9 @@ static INLINE void cfl_predict_lbd_c(const int16_t *ac_buf_q3, uint8_t *dst,
CFL_PREDICT_FN(c, lbd)
#if CONFIG_AV1_HIGHBITDEPTH
void cfl_predict_hbd_c(const int16_t *ac_buf_q3, uint16_t *dst, int dst_stride,
int alpha_q3, int bit_depth, int width, int height) {
static INLINE void cfl_predict_hbd_c(const int16_t *ac_buf_q3, uint16_t *dst,
int dst_stride, int alpha_q3,
int bit_depth, int width, int height) {
for (int j = 0; j < height; j++) {
for (int i = 0; i < width; i++) {
dst[i] = clip_pixel_highbd(

27
third_party/aom/av1/common/cfl.h vendored

@ -95,6 +95,8 @@ void cfl_load_dc_pred(MACROBLOCKD *const xd, uint8_t *dst, int dst_stride,
// will be constant allowing for loop unrolling and other constant propagated
// goodness.
#define CFL_SUBSAMPLE(arch, sub, bd, width, height) \
void cfl_subsample_##bd##_##sub##_##width##x##height##_##arch( \
const CFL_##bd##_TYPE, int input_stride, uint16_t *output_q3); \
void cfl_subsample_##bd##_##sub##_##width##x##height##_##arch( \
const CFL_##bd##_TYPE, int input_stride, uint16_t *output_q3) { \
cfl_luma_subsampling_##sub##_##bd##_##arch(cfl_type, input_stride, \
@ -170,6 +172,8 @@ void cfl_load_dc_pred(MACROBLOCKD *const xd, uint8_t *dst, int dst_stride,
// will be constant allowing for loop unrolling and other constant propagated
// goodness.
#define CFL_SUB_AVG_X(arch, width, height, round_offset, num_pel_log2) \
void cfl_subtract_average_##width##x##height##_##arch(const uint16_t *src, \
int16_t *dst); \
void cfl_subtract_average_##width##x##height##_##arch(const uint16_t *src, \
int16_t *dst) { \
subtract_average_##arch(src, dst, width, height, round_offset, \
@ -220,22 +224,21 @@ void cfl_load_dc_pred(MACROBLOCKD *const xd, uint8_t *dst, int dst_stride,
return sub_avg[tx_size % TX_SIZES_ALL]; \
}
// For VSX SIMD optimization, the C versions of width == 4 subtract are
// faster than the VSX. As such, the VSX code calls the C versions.
void cfl_subtract_average_4x4_c(const uint16_t *src, int16_t *dst);
void cfl_subtract_average_4x8_c(const uint16_t *src, int16_t *dst);
void cfl_subtract_average_4x16_c(const uint16_t *src, int16_t *dst);
#define CFL_PREDICT_lbd(arch, width, height) \
void cfl_predict_lbd_##width##x##height##_##arch( \
const int16_t *pred_buf_q3, uint8_t *dst, int dst_stride, \
int alpha_q3) { \
cfl_predict_lbd_##arch(pred_buf_q3, dst, dst_stride, alpha_q3, width, \
height); \
#define CFL_PREDICT_lbd(arch, width, height) \
void cfl_predict_lbd_##width##x##height##_##arch( \
const int16_t *pred_buf_q3, uint8_t *dst, int dst_stride, int alpha_q3); \
void cfl_predict_lbd_##width##x##height##_##arch( \
const int16_t *pred_buf_q3, uint8_t *dst, int dst_stride, \
int alpha_q3) { \
cfl_predict_lbd_##arch(pred_buf_q3, dst, dst_stride, alpha_q3, width, \
height); \
}
#if CONFIG_AV1_HIGHBITDEPTH
#define CFL_PREDICT_hbd(arch, width, height) \
void cfl_predict_hbd_##width##x##height##_##arch( \
const int16_t *pred_buf_q3, uint16_t *dst, int dst_stride, int alpha_q3, \
int bd); \
void cfl_predict_hbd_##width##x##height##_##arch( \
const int16_t *pred_buf_q3, uint16_t *dst, int dst_stride, int alpha_q3, \
int bd) { \

5
third_party/aom/av1/common/debugmodes.c vendored

@ -9,17 +9,21 @@
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
#include "av1/common/debugmodes.h"
#include <stdio.h>
#include "av1/common/av1_common_int.h"
#include "av1/common/blockd.h"
#include "av1/common/enums.h"
#if 0
static void log_frame_info(AV1_COMMON *cm, const char *str, FILE *f) {
fprintf(f, "%s", str);
fprintf(f, "(Frame %u, Show:%d, Q:%d): \n", cm->current_frame.frame_number,
cm->show_frame, cm->quant_params.base_qindex);
}
/* This function dereferences a pointer to the mbmi structure
* and uses the passed in member offset to print out the value of an integer
* for each mbmi member value in the mi structure.
@ -87,6 +91,7 @@ void av1_print_modes_and_motion_vectors(AV1_COMMON *cm, const char *file) {
fclose(mvs);
}
#endif // 0
void av1_print_uncompressed_frame_header(const uint8_t *data, int size,
const char *filename) {

24
third_party/aom/av1/common/debugmodes.h vendored Normal file

@ -0,0 +1,24 @@
/*
* Copyright (c) 2024, Alliance for Open Media. All rights reserved
*
* This source code is subject to the terms of the BSD 2 Clause License and
* the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
* was not distributed with this source code in the LICENSE file, you can
* obtain it at www.aomedia.org/license/software. If the Alliance for Open
* Media Patent License 1.0 was not distributed with this source code in the
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
#ifndef AOM_AV1_COMMON_DEBUGMODES_H_
#define AOM_AV1_COMMON_DEBUGMODES_H_
#include "av1/common/av1_common_int.h"
#include "av1/common/blockd.h"
#include "av1/common/enums.h"
void av1_print_modes_and_motion_vectors(AV1_COMMON *cm, const char *file);
void av1_print_uncompressed_frame_header(const uint8_t *data, int size,
const char *filename);
void av1_print_frame_contexts(const FRAME_CONTEXT *fc, const char *filename);
#endif // AOM_AV1_COMMON_DEBUGMODES_H_

4
third_party/aom/av1/common/ppc/cfl_ppc.c vendored

@ -124,6 +124,10 @@ CFL_SUB_AVG_X(vsx, 32, 32, 512, 10)
// Based on observation, for small blocks VSX does not outperform C (no 64bit
// load and store intrinsics). So we call the C code for block widths 4.
extern void cfl_subtract_average_4x4_c(const uint16_t *src, int16_t *dst);
extern void cfl_subtract_average_4x8_c(const uint16_t *src, int16_t *dst);
extern void cfl_subtract_average_4x16_c(const uint16_t *src, int16_t *dst);
cfl_subtract_average_fn cfl_get_subtract_average_fn_vsx(TX_SIZE tx_size) {
static const cfl_subtract_average_fn sub_avg[TX_SIZES_ALL] = {
cfl_subtract_average_4x4_c, /* 4x4 */

30
third_party/aom/av1/common/resize.c vendored

@ -337,8 +337,8 @@ static int32_t get_upscale_convolve_x0(int in_length, int out_length,
return (int32_t)((uint32_t)x0 & RS_SCALE_SUBPEL_MASK);
}
static void down2_symeven(const uint8_t *const input, int length,
uint8_t *output) {
void down2_symeven(const uint8_t *const input, int length, uint8_t *output,
int start_offset) {
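// start_offset lets SIMD callers filter only the tail of a row that their
// vector code did not cover.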
// Actual filter len = 2 * filter_len_half.
const int16_t *filter = av1_down2_symeven_half_filter;
const int filter_len_half = sizeof(av1_down2_symeven_half_filter) / 2;
@ -350,7 +350,7 @@ static void down2_symeven(const uint8_t *const input, int length,
l2 += (l2 & 1);
if (l1 > l2) {
// Short input length.
for (i = 0; i < length; i += 2) {
for (i = start_offset; i < length; i += 2) {
int sum = (1 << (FILTER_BITS - 1));
for (j = 0; j < filter_len_half; ++j) {
sum +=
@ -362,7 +362,7 @@ static void down2_symeven(const uint8_t *const input, int length,
}
} else {
// Initial part.
for (i = 0; i < l1; i += 2) {
for (i = start_offset; i < l1; i += 2) {
int sum = (1 << (FILTER_BITS - 1));
for (j = 0; j < filter_len_half; ++j) {
sum += (input[AOMMAX(i - j, 0)] + input[i + 1 + j]) * filter[j];
@ -492,7 +492,7 @@ static void resize_multistep(const uint8_t *const input, int length,
if (filteredlength & 1)
down2_symodd(in, filteredlength, out);
else
down2_symeven(in, filteredlength, out);
down2_symeven(in, filteredlength, out, 0);
filteredlength = proj_filteredlength;
}
if (filteredlength != olength) {
@ -521,8 +521,8 @@ static void fill_arr_to_col(uint8_t *img, int stride, int len, uint8_t *arr) {
}
}
bool resize_vert_dir_c(uint8_t *intbuf, uint8_t *output, int out_stride,
int height, int height2, int width2, int start_col) {
bool av1_resize_vert_dir_c(uint8_t *intbuf, uint8_t *output, int out_stride,
int height, int height2, int width2, int start_col) {
bool mem_status = true;
uint8_t *arrbuf = (uint8_t *)aom_malloc(sizeof(*arrbuf) * height);
uint8_t *arrbuf2 = (uint8_t *)aom_malloc(sizeof(*arrbuf2) * height2);
@ -533,7 +533,7 @@ bool resize_vert_dir_c(uint8_t *intbuf, uint8_t *output, int out_stride,
for (int i = start_col; i < width2; ++i) {
fill_col_to_arr(intbuf + i, width2, height, arrbuf);
down2_symeven(arrbuf, height, arrbuf2);
down2_symeven(arrbuf, height, arrbuf2, 0);
fill_arr_to_col(output + i, out_stride, height2, arrbuf2);
}
@ -543,10 +543,12 @@ Error:
return mem_status;
}
void resize_horz_dir(const uint8_t *const input, int in_stride, uint8_t *intbuf,
int height, int filtered_length, int width2) {
void av1_resize_horz_dir_c(const uint8_t *const input, int in_stride,
uint8_t *intbuf, int height, int filtered_length,
int width2) {
for (int i = 0; i < height; ++i)
down2_symeven(input + in_stride * i, filtered_length, intbuf + width2 * i);
down2_symeven(input + in_stride * i, filtered_length, intbuf + width2 * i,
0);
}
bool av1_resize_plane_to_half(const uint8_t *const input, int height, int width,
@ -558,10 +560,10 @@ bool av1_resize_plane_to_half(const uint8_t *const input, int height, int width,
}
// Resize in the horizontal direction
resize_horz_dir(input, in_stride, intbuf, height, width, width2);
av1_resize_horz_dir(input, in_stride, intbuf, height, width, width2);
// Resize in the vertical direction
bool mem_status = resize_vert_dir(intbuf, output, out_stride, height, height2,
width2, 0 /*start_col*/);
bool mem_status = av1_resize_vert_dir(intbuf, output, out_stride, height,
height2, width2, 0 /*start_col*/);
aom_free(intbuf);
return mem_status;
}

3
third_party/aom/av1/common/resize.h vendored

@ -101,6 +101,9 @@ bool av1_resize_plane_to_half(const uint8_t *const input, int height, int width,
int in_stride, uint8_t *output, int height2,
int width2, int out_stride);
void down2_symeven(const uint8_t *const input, int length, uint8_t *output,
int start_offset);
bool should_resize_by_half(int height, int width, int height2, int width2);
// Returns 1 if a superres upscaled frame is scaled and 0 otherwise.


@ -16,6 +16,7 @@
#include "aom_dsp/aom_dsp_common.h"
#include "aom_dsp/aom_filter.h"
#include "aom_dsp/x86/convolve_common_intrin.h"
#include "aom_dsp/x86/synonyms.h"
#include "av1/common/convolve.h"
static INLINE void prepare_coeffs(const InterpFilterParams *const filter_params,
@ -200,31 +201,23 @@ void av1_convolve_y_sr_sse2(const uint8_t *src, int src_stride, uint8_t *dst,
if (w <= 4) {
__m128i s[8], src6, res, res_round, res16;
int res_int;
src6 = _mm_cvtsi32_si128(*(int *)(src_ptr + 6 * src_stride));
s[0] = _mm_unpacklo_epi8(
_mm_cvtsi32_si128(*(int *)(src_ptr + 0 * src_stride)),
_mm_cvtsi32_si128(*(int *)(src_ptr + 1 * src_stride)));
s[1] = _mm_unpacklo_epi8(
_mm_cvtsi32_si128(*(int *)(src_ptr + 1 * src_stride)),
_mm_cvtsi32_si128(*(int *)(src_ptr + 2 * src_stride)));
s[2] = _mm_unpacklo_epi8(
_mm_cvtsi32_si128(*(int *)(src_ptr + 2 * src_stride)),
_mm_cvtsi32_si128(*(int *)(src_ptr + 3 * src_stride)));
s[3] = _mm_unpacklo_epi8(
_mm_cvtsi32_si128(*(int *)(src_ptr + 3 * src_stride)),
_mm_cvtsi32_si128(*(int *)(src_ptr + 4 * src_stride)));
s[4] = _mm_unpacklo_epi8(
_mm_cvtsi32_si128(*(int *)(src_ptr + 4 * src_stride)),
_mm_cvtsi32_si128(*(int *)(src_ptr + 5 * src_stride)));
s[5] = _mm_unpacklo_epi8(
_mm_cvtsi32_si128(*(int *)(src_ptr + 5 * src_stride)), src6);
s[0] = _mm_unpacklo_epi8(xx_loadl_32(src_ptr + 0 * src_stride),
xx_loadl_32(src_ptr + 1 * src_stride));
s[1] = _mm_unpacklo_epi8(xx_loadl_32(src_ptr + 1 * src_stride),
xx_loadl_32(src_ptr + 2 * src_stride));
s[2] = _mm_unpacklo_epi8(xx_loadl_32(src_ptr + 2 * src_stride),
xx_loadl_32(src_ptr + 3 * src_stride));
s[3] = _mm_unpacklo_epi8(xx_loadl_32(src_ptr + 3 * src_stride),
xx_loadl_32(src_ptr + 4 * src_stride));
s[4] = _mm_unpacklo_epi8(xx_loadl_32(src_ptr + 4 * src_stride),
xx_loadl_32(src_ptr + 5 * src_stride));
src6 = xx_loadl_32(src_ptr + 6 * src_stride);
s[5] = _mm_unpacklo_epi8(xx_loadl_32(src_ptr + 5 * src_stride), src6);
do {
s[6] = _mm_unpacklo_epi8(
src6, _mm_cvtsi32_si128(*(int *)(src_ptr + 7 * src_stride)));
src6 = _mm_cvtsi32_si128(*(int *)(src_ptr + 8 * src_stride));
s[7] = _mm_unpacklo_epi8(
_mm_cvtsi32_si128(*(int *)(src_ptr + 7 * src_stride)), src6);
s[6] = _mm_unpacklo_epi8(src6, xx_loadl_32(src_ptr + 7 * src_stride));
src6 = xx_loadl_32(src_ptr + 8 * src_stride);
s[7] = _mm_unpacklo_epi8(xx_loadl_32(src_ptr + 7 * src_stride), src6);
res = convolve_lo_y(s + 0, coeffs);
res_round = _mm_sra_epi32(_mm_add_epi32(res, round_const), round_shift);


@ -15,6 +15,7 @@
#include "aom_dsp/aom_filter.h"
#include "aom_dsp/x86/convolve_sse2.h"
#include "aom_dsp/x86/synonyms.h"
void av1_dist_wtd_convolve_x_sse2(const uint8_t *src, int src_stride,
uint8_t *dst0, int dst_stride0, int w, int h,
@ -178,31 +179,23 @@ void av1_dist_wtd_convolve_y_sse2(const uint8_t *src, int src_stride,
if (w == 4) {
__m128i s[8], src6, res, res_shift;
src6 = _mm_cvtsi32_si128(*(int *)(src_ptr + 6 * src_stride));
s[0] = _mm_unpacklo_epi8(
_mm_cvtsi32_si128(*(int *)(src_ptr + 0 * src_stride)),
_mm_cvtsi32_si128(*(int *)(src_ptr + 1 * src_stride)));
s[1] = _mm_unpacklo_epi8(
_mm_cvtsi32_si128(*(int *)(src_ptr + 1 * src_stride)),
_mm_cvtsi32_si128(*(int *)(src_ptr + 2 * src_stride)));
s[2] = _mm_unpacklo_epi8(
_mm_cvtsi32_si128(*(int *)(src_ptr + 2 * src_stride)),
_mm_cvtsi32_si128(*(int *)(src_ptr + 3 * src_stride)));
s[3] = _mm_unpacklo_epi8(
_mm_cvtsi32_si128(*(int *)(src_ptr + 3 * src_stride)),
_mm_cvtsi32_si128(*(int *)(src_ptr + 4 * src_stride)));
s[4] = _mm_unpacklo_epi8(
_mm_cvtsi32_si128(*(int *)(src_ptr + 4 * src_stride)),
_mm_cvtsi32_si128(*(int *)(src_ptr + 5 * src_stride)));
s[5] = _mm_unpacklo_epi8(
_mm_cvtsi32_si128(*(int *)(src_ptr + 5 * src_stride)), src6);
s[0] = _mm_unpacklo_epi8(xx_loadl_32(src_ptr + 0 * src_stride),
xx_loadl_32(src_ptr + 1 * src_stride));
s[1] = _mm_unpacklo_epi8(xx_loadl_32(src_ptr + 1 * src_stride),
xx_loadl_32(src_ptr + 2 * src_stride));
s[2] = _mm_unpacklo_epi8(xx_loadl_32(src_ptr + 2 * src_stride),
xx_loadl_32(src_ptr + 3 * src_stride));
s[3] = _mm_unpacklo_epi8(xx_loadl_32(src_ptr + 3 * src_stride),
xx_loadl_32(src_ptr + 4 * src_stride));
s[4] = _mm_unpacklo_epi8(xx_loadl_32(src_ptr + 4 * src_stride),
xx_loadl_32(src_ptr + 5 * src_stride));
src6 = xx_loadl_32(src_ptr + 6 * src_stride);
s[5] = _mm_unpacklo_epi8(xx_loadl_32(src_ptr + 5 * src_stride), src6);
do {
s[6] = _mm_unpacklo_epi8(
src6, _mm_cvtsi32_si128(*(int *)(src_ptr + 7 * src_stride)));
src6 = _mm_cvtsi32_si128(*(int *)(src_ptr + 8 * src_stride));
s[7] = _mm_unpacklo_epi8(
_mm_cvtsi32_si128(*(int *)(src_ptr + 7 * src_stride)), src6);
s[6] = _mm_unpacklo_epi8(src6, xx_loadl_32(src_ptr + 7 * src_stride));
src6 = xx_loadl_32(src_ptr + 8 * src_stride);
s[7] = _mm_unpacklo_epi8(xx_loadl_32(src_ptr + 7 * src_stride), src6);
res = convolve_lo_y(s + 0, coeffs);
res_shift = _mm_sll_epi32(res, left_shift);


@ -576,7 +576,7 @@ void av1_build_compound_diffwtd_mask_highbd_avx2(
}
}
} else {
const __m128i xshift = xx_set1_64_from_32i(bd - 8 + DIFF_FACTOR_LOG2);
const __m128i xshift = _mm_set1_epi64x(bd - 8 + DIFF_FACTOR_LOG2);
if (mask_type == DIFFWTD_38_INV) {
for (int i = 0; i < h; ++i) {
for (int j = 0; j < w; j += 16) {


@ -76,7 +76,7 @@ void av1_build_compound_diffwtd_mask_highbd_ssse3(
}
}
} else {
const __m128i xshift = xx_set1_64_from_32i(bd - 8 + DIFF_FACTOR_LOG2);
const __m128i xshift = _mm_set1_epi64x(bd - 8 + DIFF_FACTOR_LOG2);
if (mask_type == DIFFWTD_38_INV) {
for (int i = 0; i < h; ++i) {
for (int j = 0; j < w; j += 8) {

329
third_party/aom/av1/common/x86/resize_avx2.c vendored

@ -41,7 +41,7 @@
s[8] = _mm256_unpackhi_epi8(s68, s79); \
\
__m256i res_out[2] = { 0 }; \
resize_y_convolve(s, coeffs_y, res_out); \
resize_convolve(s, coeffs_y, res_out); \
\
/* r00... r07 */ \
__m256i res_a_round_1 = _mm256_add_epi32(res_out[0], round_const_bits); \
@ -52,7 +52,7 @@
res_a_round_2 = _mm256_sra_epi32(res_a_round_2, round_shift_bits); \
\
__m256i res_out_b[2] = { 0 }; \
resize_y_convolve(s + 5, coeffs_y, res_out_b); \
resize_convolve(s + 5, coeffs_y, res_out_b); \
\
/* r08... r015 */ \
__m256i res_b_round_1 = _mm256_add_epi32(res_out_b[0], round_const_bits); \
@ -91,7 +91,7 @@
s[3] = _mm256_permute2x128_si256(CAST_HI(s67), CAST_HI(s89), 0x20); \
\
__m256i res_out[2] = { 0 }; \
resize_y_convolve(s, coeffs_y, res_out); \
resize_convolve(s, coeffs_y, res_out); \
\
/* r00... r07 */ \
__m256i res_a_round_1 = _mm256_add_epi32(res_out[0], round_const_bits); \
@ -108,9 +108,107 @@
res_a_round_1 = _mm256_min_epu8(res_a_round_1, clip_pixel); \
res_a_round_1 = _mm256_max_epu8(res_a_round_1, zero);
static INLINE void resize_y_convolve(const __m256i *const s,
const __m256i *const coeffs,
__m256i *res_out) {
#define PROCESS_RESIZE_X_WD32 \
/* a0 a1 ..... a30 a31 */ \
__m256i row0 = _mm256_loadu_si256( \
(__m256i *)&input[i * in_stride + j - filter_offset]); \
/* b0 b1 ..... b30 b31 */ \
__m256i row1 = _mm256_loadu_si256( \
(__m256i *)&input[(i + 1) * in_stride + j - filter_offset]); \
/* a0 .... a15 || b0.... b15 */ \
__m256i r0 = _mm256_permute2x128_si256(row0, row1, 0x20); \
/* a16 .... a31 || b16 .... b31 */ \
__m256i r1 = _mm256_permute2x128_si256(row0, row1, 0x31); \
filter_offset = 3; \
\
/* Pad start pixels to the left, while processing the first pixels in the \
row. */ \
if (j == 0) { \
/* a0 a0 a0 a0 .... a12 || b0 b0 b0 b0 .... b12 */ \
row0 = _mm256_shuffle_epi8(r0, wd32_start_pad_mask); \
/* a13 a14 a15 a16.....a28 || b13 b14 b15 b16.....b28 */ \
row1 = _mm256_alignr_epi8(r1, r0, 13); \
r0 = row0; \
r1 = row1; \
} \
\
/* a29 a30 a31 a32 a33 a34 a35 a36 0 0 ....*/ \
__m128i row0_0 = _mm_loadl_epi64( \
(__m128i *)&input[i * in_stride + 32 + j - filter_offset]); \
/* b29 b30 b31 b32 b33 b34 b35 b36 0 0 .... */ \
__m128i row1_0 = _mm_loadl_epi64( \
(__m128i *)&input[(i + 1) * in_stride + 32 + j - filter_offset]); \
__m256i r2 = _mm256_permute2x128_si256( \
_mm256_castsi128_si256(row0_0), _mm256_castsi128_si256(row1_0), 0x20); \
\
/* Pad end pixels to the right, while processing the last pixels in the \
row. */ \
const int is_last_cols32 = (j + 32 == filtered_length); \
if (is_last_cols32) { \
r2 = _mm256_shuffle_epi8(r2, wd32_end_pad_mask); \
} \
\
/* Process even pixels of the first row */ \
/* a0 a0 a0 a0 a1 a2 .... a12 | b0 b0 b0 b0 b1 b2 .... b12 */ \
s0[0] = _mm256_alignr_epi8(r1, r0, 0); \
/* a0 a0 a1 a2 a3 a4 .... a14 | b0 b0 b1 b2 b3 b4 .... b14 */ \
s0[1] = _mm256_alignr_epi8(r1, r0, 2); \
/* a1 a2 a3 a4 a5 a6 .... a16 | b1 b2 b3 b4 b5 b6 .... b16 */ \
s0[2] = _mm256_alignr_epi8(r1, r0, 4); \
/* a3 a4 a5 a6 a7 a8 .... a18 | b3 b4 b5 b6 b7 b8 .... b18 */ \
s0[3] = _mm256_alignr_epi8(r1, r0, 6); \
\
/* Process even pixels of the second row */ \
/* a13 a14 a15 a16 ..... a28 | b13 b14 b15 b16 ..... b28 */ \
s1[0] = _mm256_alignr_epi8(r2, r1, 0); \
/* a15 a16 a17 a18 ..... a30 | b15 b16 b17 b18 ..... b30 */ \
s1[1] = _mm256_alignr_epi8(r2, r1, 2); \
/* a17 a18 a19 a20 ..... a32 | b17 b18 b19 b20 ..... b32 */ \
s1[2] = _mm256_alignr_epi8(r2, r1, 4); \
/* a19 a20 a21 a22 ..... a34 | b19 b20 b21 b22 ..... b34 */ \
s1[3] = _mm256_alignr_epi8(r2, r1, 6); \
\
/* The register res_out_0 stores the result of start-16 pixels corresponding \
to the first and second rows whereas res_out_1 stores the end-16 pixels. */ \
__m256i res_out_0[2], res_out_1[2]; \
res_out_1[0] = res_out_1[1] = zero; \
res_out_0[0] = res_out_0[1] = zero; \
resize_convolve(s0, coeffs_x, res_out_0); \
resize_convolve(s1, coeffs_x, res_out_1); \
\
/* Result of 32 pixels of row0 (a0 to a32) */ \
res_out_0[0] = _mm256_sra_epi32( \
_mm256_add_epi32(res_out_0[0], round_const_bits), round_shift_bits); \
res_out_1[0] = _mm256_sra_epi32( \
_mm256_add_epi32(res_out_1[0], round_const_bits), round_shift_bits); \
/* r00-r03 r08-r011 | r04-r07 r012-r015 */ \
__m256i res_out_r0 = _mm256_packus_epi32(res_out_0[0], res_out_1[0]); \
\
/* result of 32 pixels of row1 (b0 to b32) */ \
res_out_0[1] = _mm256_sra_epi32( \
_mm256_add_epi32(res_out_0[1], round_const_bits), round_shift_bits); \
res_out_1[1] = _mm256_sra_epi32( \
_mm256_add_epi32(res_out_1[1], round_const_bits), round_shift_bits); \
/* r10-r13 r18-r111 | r14-r17 r112-r115 */ \
__m256i res_out_r1 = _mm256_packus_epi32(res_out_0[1], res_out_1[1]); \
\
/* Convert the result from 16bit to 8bit */ \
/* r00-r03 r08-r011 r10-r13 r18-r111 | r04-r07 r012-r015 r14-r17 r112-r115 \
*/ \
__m256i res_out_r01 = _mm256_packus_epi16(res_out_r0, res_out_r1); \
__m256i res_out_row01 = _mm256_min_epu8(res_out_r01, clip_pixel); \
res_out_row01 = _mm256_max_epu8(res_out_r01, zero); \
__m128i low_128 = CAST_LOW(res_out_row01); \
__m128i high_128 = _mm256_extracti128_si256(res_out_row01, 1); \
\
_mm_storeu_si128((__m128i *)&intbuf[i * dst_stride + j / 2], \
_mm_unpacklo_epi32(low_128, high_128)); \
_mm_storeu_si128((__m128i *)&intbuf[(i + 1) * dst_stride + j / 2], \
_mm_unpackhi_epi32(low_128, high_128));
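// Multiply four interleaved source vectors by their paired 8-bit filter taps
// and accumulate the widened partial sums into res_out.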
static INLINE void resize_convolve(const __m256i *const s,
const __m256i *const coeffs,
__m256i *res_out) {
const __m256i res_0 = _mm256_maddubs_epi16(s[0], coeffs[0]);
const __m256i res_1 = _mm256_maddubs_epi16(s[1], coeffs[1]);
const __m256i res_2 = _mm256_maddubs_epi16(s[2], coeffs[2]);
@ -152,8 +250,9 @@ static INLINE void prepare_filter_coeffs(const int16_t *filter,
coeffs[1] = _mm256_broadcastw_epi16(_mm_bsrli_si128(filter_8bit, 4));
}
bool resize_vert_dir_avx2(uint8_t *intbuf, uint8_t *output, int out_stride,
int height, int height2, int stride, int start_col) {
bool av1_resize_vert_dir_avx2(uint8_t *intbuf, uint8_t *output, int out_stride,
int height, int height2, int stride,
int start_col) {
assert(start_col <= stride);
// For the GM tool, the input layer height or width is assured to be an even
// number. Hence the function 'down2_symodd()' is not invoked and SIMD
@ -164,8 +263,8 @@ bool resize_vert_dir_avx2(uint8_t *intbuf, uint8_t *output, int out_stride,
// eliminate the need for conditional statements within the subsequent SIMD
// code to manage these cases.
if (height & 1 || height < 8) {
return resize_vert_dir_c(intbuf, output, out_stride, height, height2,
stride, start_col);
return av1_resize_vert_dir_c(intbuf, output, out_stride, height, height2,
stride, start_col);
}
__m256i s[10], coeffs_y[4];
@ -174,7 +273,7 @@ bool resize_vert_dir_avx2(uint8_t *intbuf, uint8_t *output, int out_stride,
const __m128i round_shift_bits = _mm_cvtsi32_si128(bits);
const __m256i round_const_bits = _mm256_set1_epi32((1 << bits) >> 1);
const uint8_t max_pixel = 255;
const __m256i clip_pixel = _mm256_set1_epi8(max_pixel);
const __m256i clip_pixel = _mm256_set1_epi8((char)max_pixel);
const __m256i zero = _mm256_setzero_si256();
prepare_filter_coeffs(av1_down2_symeven_half_filter, coeffs_y);
@ -404,8 +503,212 @@ bool resize_vert_dir_avx2(uint8_t *intbuf, uint8_t *output, int out_stride,
}
if (remain_col)
return resize_vert_dir_c(intbuf, output, out_stride, height, height2,
stride, stride - remain_col);
return av1_resize_vert_dir_c(intbuf, output, out_stride, height, height2,
stride, stride - remain_col);
return true;
}
// Masks used for width 32 and 8 pixels, with left and right padding
// requirements
static const uint8_t wd32_left_padding_mask[32] = { 0, 0, 0, 0, 1, 2, 3, 4,
5, 6, 7, 8, 9, 10, 11, 12,
0, 0, 0, 0, 1, 2, 3, 4,
5, 6, 7, 8, 9, 10, 11, 12 };
static const uint8_t wd32_right_padding_mask[32] = { 0, 1, 2, 2, 2, 2, 2, 2,
2, 2, 2, 2, 2, 2, 2, 2,
0, 1, 2, 2, 2, 2, 2, 2,
2, 2, 2, 2, 2, 2, 2, 2 };
static const uint8_t wd8_right_padding_mask[32] = {
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 10, 10, 10, 10, 10,
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 10, 10, 10, 10, 10
};
void av1_resize_horz_dir_avx2(const uint8_t *const input, int in_stride,
uint8_t *intbuf, int height, int filtered_length,
int width2) {
assert(height % 2 == 0);
// Invoke C for width less than 32.
// TODO(https://crbug.com/aomedia/3575): Use sse2 after SSE2/AV1ResizeXTest
// passes under 32-bit valgrind.
if (filtered_length < 32) {
av1_resize_horz_dir_c(input, in_stride, intbuf, height, filtered_length,
width2);
return;
}
const int filt_length = sizeof(av1_down2_symeven_half_filter);
assert(filt_length % 2 == 0);
(void)filt_length;
__m256i s0[4], s1[4], coeffs_x[4];
const int bits = FILTER_BITS;
const int dst_stride = width2;
const __m128i round_shift_bits = _mm_cvtsi32_si128(bits);
const __m256i round_const_bits = _mm256_set1_epi32((1 << bits) >> 1);
const uint8_t max_pixel = 255;
const __m256i clip_pixel = _mm256_set1_epi8((char)max_pixel);
const __m256i zero = _mm256_setzero_si256();
const __m256i wd32_start_pad_mask =
_mm256_loadu_si256((__m256i *)wd32_left_padding_mask);
const __m256i wd32_end_pad_mask =
_mm256_loadu_si256((__m256i *)wd32_right_padding_mask);
const __m256i wd8_end_pad_mask =
_mm256_loadu_si256((__m256i *)wd8_right_padding_mask);
prepare_filter_coeffs(av1_down2_symeven_half_filter, coeffs_x);
// The core horizontal SIMD processes 32 input pixels of 2 rows simultaneously
// to generate output corresponding to 2 rows. To streamline the core loop and
// eliminate the need for conditional checks, the remaining columns (16 or 8)
// are processed separately.
if (filtered_length % 32 == 0) {
for (int i = 0; i < height; i += 2) {
int filter_offset = 0;
for (int j = 0; j < filtered_length; j += 32) {
PROCESS_RESIZE_X_WD32
}
}
} else {
for (int i = 0; i < height; i += 2) {
int filter_offset = 0;
int remain_col = filtered_length % 32;
for (int j = 0; j + 32 <= filtered_length; j += 32) {
PROCESS_RESIZE_X_WD32
}
int wd_processed = filtered_length - remain_col;
if (remain_col > 15) {
remain_col = filtered_length % 16;
const int in_idx = i * in_stride + wd_processed - filter_offset;
const int out_idx = (i * dst_stride) + wd_processed / 2;
// a0 a1 --- a15
__m128i row0 = _mm_loadu_si128((__m128i *)&input[in_idx]);
// b0 b1 --- b15
__m128i row1 = _mm_loadu_si128((__m128i *)&input[in_idx + in_stride]);
// a0 a1 --- a15 || b0 b1 --- b15
__m256i r0 =
_mm256_permute2x128_si256(CAST_HI(row0), CAST_HI(row1), 0x20);
// a16 a17 --- a23
row0 = _mm_loadl_epi64((__m128i *)&input[in_idx + 16]);
// b16 b17 --- b23
row1 = _mm_loadl_epi64((__m128i *)&input[in_idx + 16 + in_stride]);
// a16-a23 x x x x| b16-b23 x x x x
__m256i r1 =
_mm256_permute2x128_si256(CAST_HI(row0), CAST_HI(row1), 0x20);
// Pad end pixels to the right, while processing the last pixels in the
// row.
const int is_last_cols16 = wd_processed + 16 == filtered_length;
if (is_last_cols16) {
r1 = _mm256_shuffle_epi8(r1, wd32_end_pad_mask);
}
// a0 a1 --- a15 || b0 b1 --- b15
s0[0] = r0;
// a2 a3 --- a17 || b2 b3 --- b17
s0[1] = _mm256_alignr_epi8(r1, r0, 2);
// a4 a5 --- a19 || b4 b5 --- b19
s0[2] = _mm256_alignr_epi8(r1, r0, 4);
// a6 a7 --- a21 || b6 b7 --- b21
s0[3] = _mm256_alignr_epi8(r1, r0, 6);
// result for 16 pixels (a0 to a15) of row0 and row1
__m256i res_out_0[2];
res_out_0[0] = res_out_0[1] = zero;
resize_convolve(s0, coeffs_x, res_out_0);
// r00 -r07
res_out_0[0] = _mm256_sra_epi32(
_mm256_add_epi32(res_out_0[0], round_const_bits), round_shift_bits);
// r10-r17
res_out_0[1] = _mm256_sra_epi32(
_mm256_add_epi32(res_out_0[1], round_const_bits), round_shift_bits);
// r00-r03 r10-r13 r04-r07 r14-r17
__m256i res_out_row01 = _mm256_packus_epi32(res_out_0[0], res_out_0[1]);
// r00-r03 r10-r13 r00-r03 r10-r13 | r04-r07 r14-r17 r04-r07 r14-r17
res_out_row01 = _mm256_packus_epi16(res_out_row01, res_out_row01);
res_out_row01 = _mm256_min_epu8(res_out_row01, clip_pixel);
res_out_row01 = _mm256_max_epu8(res_out_row01, zero);
// r00-r03 r10-r13 r04-r07 r14-r17
__m128i low_result =
CAST_LOW(_mm256_permute4x64_epi64(res_out_row01, 0xd8));
// r00-r03 r04-r07 r10-r13 r14-r17
low_result = _mm_shuffle_epi32(low_result, 0xd8);
_mm_storel_epi64((__m128i *)&intbuf[out_idx], low_result);
_mm_storel_epi64((__m128i *)&intbuf[out_idx + dst_stride],
_mm_unpackhi_epi64(low_result, low_result));
}
wd_processed = filtered_length - remain_col;
if (remain_col > 7) {
remain_col = filtered_length % 8;
const int in_idx = i * in_stride + wd_processed - filter_offset;
const int out_idx = (i * dst_stride) + wd_processed / 2;
// a0 a1 --- a15
__m128i row0 = _mm_loadu_si128((__m128i *)&input[in_idx]);
// b0 b1 --- b15
__m128i row1 = _mm_loadu_si128((__m128i *)&input[in_idx + in_stride]);
// a0 a1 --- a15 || b0 b1 --- b15
__m256i r0 =
_mm256_permute2x128_si256(CAST_HI(row0), CAST_HI(row1), 0x20);
// Pad end pixels to the right, while processing the last pixels in the
// row.
const int is_last_cols_8 = wd_processed + 8 == filtered_length;
if (is_last_cols_8) r0 = _mm256_shuffle_epi8(r0, wd8_end_pad_mask);
// a0 a1 a2 a3 a4 a5 a6 a7 | b0 b1 b2 b3 b4 b5 b6 b7
s0[0] = r0;
// a2 a3 a4 a5 a6 a7 a8 a9 | b2 b3 b4 b5 b6 b7 b8 b9
s0[1] = _mm256_bsrli_epi128(r0, 2);
// a4 a5 a6 a7 a8 a9 a10 a10 | b4 b5 b6 b7 b8 b9 b10 b10
s0[2] = _mm256_bsrli_epi128(r0, 4);
// a6 a7 a8 a9 a10 a10 a10 a10 | b6 b7 b8 b9 b10 b10 b10 b10
s0[3] = _mm256_bsrli_epi128(r0, 6);
__m256i res_out_0[2];
res_out_0[0] = res_out_0[1] = zero;
resize_convolve(s0, coeffs_x, res_out_0);
// r00 - r03 | r10 - r13
__m256i res_out =
_mm256_permute2x128_si256(res_out_0[0], res_out_0[1], 0x20);
// r00 - r03 | r10 - r13
res_out = _mm256_sra_epi32(_mm256_add_epi32(res_out, round_const_bits),
round_shift_bits);
// r00-r03 r00-r03 r10-r13 r10-r13
__m256i res_out_row01 = _mm256_packus_epi32(res_out, res_out);
// r00-r03 r00-r03 r00-r03 r00-r03 r10-r13 r10-r13 r10-r13 r10-r13
res_out_row01 = _mm256_packus_epi16(res_out_row01, res_out_row01);
res_out_row01 = _mm256_min_epu8(res_out_row01, clip_pixel);
res_out_row01 = _mm256_max_epu8(res_out_row01, zero);
xx_storel_32(intbuf + out_idx, CAST_LOW(res_out_row01));
xx_storel_32(intbuf + out_idx + dst_stride,
_mm256_extracti128_si256(res_out_row01, 1));
}
wd_processed = filtered_length - remain_col;
// When the remaining width is 2, the above code would not have taken
// care of padding required for (filtered_length - 4)th pixel. Hence,
// process that pixel again with the C code.
wd_processed = (remain_col == 2) ? wd_processed - 2 : wd_processed;
if (remain_col) {
const int in_idx = (in_stride * i);
const int out_idx = (wd_processed / 2) + width2 * i;
down2_symeven(input + in_idx, filtered_length, intbuf + out_idx,
wd_processed);
down2_symeven(input + in_idx + in_stride, filtered_length,
intbuf + out_idx + width2, wd_processed);
}
}
}
}

333
third_party/aom/av1/common/x86/resize_sse2.c vendored Normal file

@ -0,0 +1,333 @@
/*
* Copyright (c) 2024, Alliance for Open Media. All rights reserved
*
* This source code is subject to the terms of the BSD 2 Clause License and
* the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
* was not distributed with this source code in the LICENSE file, you can
* obtain it at www.aomedia.org/license/software. If the Alliance for Open
* Media Patent License 1.0 was not distributed with this source code in the
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
#include <immintrin.h>
#include "config/av1_rtcd.h"
#include "av1/common/resize.h"
#include "aom_dsp/x86/synonyms.h"
#define PROCESS_RESIZE_Y_WD8 \
/* ah0 ah1 ... ah7 */ \
const __m128i AH = _mm_add_epi16(l0, l7); \
/* bg0 bg1 ... bg7 */ \
const __m128i BG = _mm_add_epi16(l1, l6); \
/* cf0 cf1 ... cf7 */ \
const __m128i CF = _mm_add_epi16(l2, l5); \
/* de0 de1 ... de7 */ \
const __m128i DE = _mm_add_epi16(l3, l4); \
\
/* ah0 bg0 ... ah3 bg3 */ \
const __m128i AHBG_low = _mm_unpacklo_epi16(AH, BG); \
/* cf0 de0 ... cf3 de3 */ \
const __m128i CFDE_low = _mm_unpacklo_epi16(CF, DE); \
\
/* ah4 bg4... ah7 bg7 */ \
const __m128i AHBG_hi = _mm_unpackhi_epi16(AH, BG); \
/* cf4 de4... cf7 de7 */ \
const __m128i CFDE_hi = _mm_unpackhi_epi16(CF, DE); \
\
/* r00 r01 r02 r03 */ \
const __m128i r00 = _mm_madd_epi16(AHBG_low, coeffs_y[0]); \
const __m128i r01 = _mm_madd_epi16(CFDE_low, coeffs_y[1]); \
__m128i r0 = _mm_add_epi32(r00, r01); \
/* r04 r05 r06 r07 */ \
const __m128i r10 = _mm_madd_epi16(AHBG_hi, coeffs_y[0]); \
const __m128i r11 = _mm_madd_epi16(CFDE_hi, coeffs_y[1]); \
__m128i r1 = _mm_add_epi32(r10, r11); \
\
r0 = _mm_add_epi32(r0, round_const_bits); \
r1 = _mm_add_epi32(r1, round_const_bits); \
r0 = _mm_sra_epi32(r0, round_shift_bits); \
r1 = _mm_sra_epi32(r1, round_shift_bits); \
\
/* r00 ... r07 (8 values of each 16bit) */ \
const __m128i res_16b = _mm_packs_epi32(r0, r1); \
/* r00 ... r07 | r00 ... r07 (16 values of each 8bit) */ \
const __m128i res_8b0 = _mm_packus_epi16(res_16b, res_16b); \
\
__m128i res = _mm_min_epu8(res_8b0, clip_pixel); \
res = _mm_max_epu8(res, zero); \
_mm_storel_epi64((__m128i *)&output[(i / 2) * out_stride + j], res); \
\
l0 = l2; \
l1 = l3; \
l2 = l4; \
l3 = l5; \
l4 = l6; \
l5 = l7; \
data += 2 * stride;
static INLINE void prepare_filter_coeffs(const int16_t *filter,
__m128i *const coeffs /* [2] */) {
// f0 f1 f2 f3 x x x x
const __m128i sym_even_filter = _mm_loadl_epi64((__m128i *)filter);
// f1 f0 f3 f2 x x x x
const __m128i tmp1 = _mm_shufflelo_epi16(sym_even_filter, 0xb1);
// f3 f2 f3 f2 ...
coeffs[0] = _mm_shuffle_epi32(tmp1, 0x55);
// f1 f0 f1 f0 ...
coeffs[1] = _mm_shuffle_epi32(tmp1, 0x00);
}
bool av1_resize_vert_dir_sse2(uint8_t *intbuf, uint8_t *output, int out_stride,
int height, int height2, int stride,
int start_col) {
// For the GM tool, the input layer height or width is assured to be an even
// number. Hence the function 'down2_symodd()' is not invoked and SIMD
// optimization of the same is not implemented.
// When the input height is less than 8 and even, the potential input
// heights are limited to 2, 4, or 6. These scenarios require separate
// handling due to padding requirements. Invoking the C function here will
// eliminate the need for conditional statements within the subsequent SIMD
// code to manage these cases.
if (height & 1 || height < 8) {
return av1_resize_vert_dir_c(intbuf, output, out_stride, height, height2,
stride, start_col);
}
__m128i coeffs_y[2];
const int bits = FILTER_BITS;
const __m128i round_const_bits = _mm_set1_epi32((1 << bits) >> 1);
const __m128i round_shift_bits = _mm_cvtsi32_si128(bits);
const uint8_t max_pixel = 255;
const __m128i clip_pixel = _mm_set1_epi8((char)max_pixel);
const __m128i zero = _mm_setzero_si128();
prepare_filter_coeffs(av1_down2_symeven_half_filter, coeffs_y);
const int remain_col = stride % 8;
for (int j = start_col; j < stride - remain_col; j += 8) {
uint8_t *data = &intbuf[j];
// d0 ... d7
const __m128i l8_3 = _mm_loadl_epi64((__m128i *)(data + 0 * stride));
// Padding top 3 rows with the last available row at the top.
// a0 ... a7
const __m128i l8_0 = l8_3;
// b0 ... b7
const __m128i l8_1 = l8_3;
// c0 ... c7
const __m128i l8_2 = l8_3;
// e0 ... e7
const __m128i l8_4 = _mm_loadl_epi64((__m128i *)(data + 1 * stride));
// f0 ... f7
const __m128i l8_5 = _mm_loadl_epi64((__m128i *)(data + 2 * stride));
// Convert to 16bit as addition of 2 source pixel crosses 8 bit.
__m128i l0 = _mm_unpacklo_epi8(l8_0, zero); // A(128bit) = a0 - a7(16 bit)
__m128i l1 = _mm_unpacklo_epi8(l8_1, zero); // B(128bit) = b0 - b7(16 bit)
__m128i l2 = _mm_unpacklo_epi8(l8_2, zero); // C(128bit) = c0 - c7(16 bit)
__m128i l3 = _mm_unpacklo_epi8(l8_3, zero); // D(128bit) = d0 - d7(16 bit)
__m128i l4 = _mm_unpacklo_epi8(l8_4, zero); // E(128bit) = e0 - e7(16 bit)
__m128i l5 = _mm_unpacklo_epi8(l8_5, zero); // F(128bit) = f0 - f7(16 bit)
// Increment the pointer such that the loading starts from row G.
data = data + 3 * stride;
// The core vertical SIMD processes 2 input rows simultaneously to generate
// output corresponding to 1 row. To streamline the core loop and eliminate
// the need for conditional checks, the remaining 4 rows are processed
// separately.
for (int i = 0; i < height - 4; i += 2) {
// g0 ... g7
__m128i l8_6 = _mm_loadl_epi64((__m128i *)(data));
// h0 ... h7
__m128i l8_7 = _mm_loadl_epi64((__m128i *)(data + stride));
__m128i l6 = _mm_unpacklo_epi8(l8_6, zero); // G(128bit):g0-g7(16b)
__m128i l7 = _mm_unpacklo_epi8(l8_7, zero); // H(128bit):h0-h7(16b)
PROCESS_RESIZE_Y_WD8
}
__m128i l8_6 = _mm_loadl_epi64((__m128i *)(data));
__m128i l6 = _mm_unpacklo_epi8(l8_6, zero);
// Process the last 4 input rows here.
for (int i = height - 4; i < height; i += 2) {
__m128i l7 = l6;
PROCESS_RESIZE_Y_WD8
}
}
if (remain_col)
return av1_resize_vert_dir_c(intbuf, output, out_stride, height, height2,
stride, stride - remain_col);
return true;
}
// Blends a and b using mask and returns the result.
static INLINE __m128i blend(__m128i a, __m128i b, __m128i mask) {
const __m128i masked_b = _mm_and_si128(mask, b);
const __m128i masked_a = _mm_andnot_si128(mask, a);
return (_mm_or_si128(masked_a, masked_b));
}
// Masks used for width 16 pixels, with left and right padding
// requirements.
static const uint8_t left_padding_mask[16] = {
255, 255, 255, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
};
static const uint8_t right_padding_mask[16] = { 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 255, 255,
255, 255, 255, 255 };
static const uint8_t mask_16[16] = {
255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0,
};
void av1_resize_horz_dir_sse2(const uint8_t *const input, int in_stride,
uint8_t *intbuf, int height, int filtered_length,
int width2) {
assert(height % 2 == 0);
// Invoke C for width less than 16.
if (filtered_length < 16) {
av1_resize_horz_dir_c(input, in_stride, intbuf, height, filtered_length,
width2);
return;
}
__m128i coeffs_x[2];
const int bits = FILTER_BITS;
const int dst_stride = width2;
const int remain_col = filtered_length % 16;
const __m128i round_const_bits = _mm_set1_epi32((1 << bits) >> 1);
const __m128i round_shift_bits = _mm_cvtsi32_si128(bits);
const uint8_t max_pixel = 255;
const __m128i clip_pixel = _mm_set1_epi8((char)max_pixel);
const __m128i zero = _mm_setzero_si128();
const __m128i start_pad_mask = _mm_loadu_si128((__m128i *)left_padding_mask);
const __m128i end_pad_mask = _mm_loadu_si128((__m128i *)right_padding_mask);
const __m128i mask_even = _mm_loadu_si128((__m128i *)mask_16);
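// mask_even keeps the even-indexed bytes of a row as 16-bit lanes; shifting
// right by 8 before masking extracts the odd-indexed bytes.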
prepare_filter_coeffs(av1_down2_symeven_half_filter, coeffs_x);
for (int i = 0; i < height; ++i) {
int filter_offset = 0;
for (int j = 0; j <= filtered_length - 16; j += 16) {
const int in_idx = i * in_stride + j - filter_offset;
const int out_idx = i * dst_stride + j / 2;
// a0 a1 a2 a3 .... a15
__m128i row00 = _mm_loadu_si128((__m128i *)&input[in_idx]);
// a8 a9 a10 a11 .... a23
__m128i row01 =
_mm_loadu_si128((__m128i *)&input[in_idx + 5 + filter_offset]);
filter_offset = 3;
// Pad start pixels to the left, while processing the first pixels in the
// row.
if (j == 0) {
const __m128i start_pixel_row0 =
_mm_set1_epi8((char)input[i * in_stride]);
row00 =
blend(_mm_slli_si128(row00, 3), start_pixel_row0, start_pad_mask);
}
// Pad end pixels to the right, while processing the last pixels in the
// row.
const int is_last_cols16 = (j == filtered_length - 16);
if (is_last_cols16) {
const __m128i end_pixel_row0 =
_mm_set1_epi8((char)input[i * in_stride + filtered_length - 1]);
row01 = blend(row01, end_pixel_row0, end_pad_mask);
}
// a2 a3 a4 a5 a6 a7 a8 a9 .... a17
const __m128i row0_1 = _mm_unpacklo_epi64(_mm_srli_si128(row00, 2),
_mm_srli_si128(row01, 2));
// a4 a5 a6 a7 a8 a9 a10 a11 .... a19
const __m128i row0_2 = _mm_unpacklo_epi64(_mm_srli_si128(row00, 4),
_mm_srli_si128(row01, 4));
// a6 a7 a8 a9 a10 a11 a12 a13 .... a21
const __m128i row0_3 = _mm_unpacklo_epi64(_mm_srli_si128(row00, 6),
_mm_srli_si128(row01, 6));
// a0 a2 a4 a6 a8 a10 a12 a14 (each 16 bit)
const __m128i s0 = _mm_and_si128(row00, mask_even);
// a1 a3 a5 a7 a9 a11 a13 a15
const __m128i s1 = _mm_and_si128(_mm_srli_epi16(row00, 8), mask_even);
// a2 a4 a6 a8 a10 a12 a14 a16
const __m128i s2 = _mm_and_si128(row0_1, mask_even);
// a3 a5 a7 a9 a11 a13 a15 a17
const __m128i s3 = _mm_and_si128(_mm_srli_epi16(row0_1, 8), mask_even);
// a4 a6 a8 a10 a12 a14 a16 a18
const __m128i s4 = _mm_and_si128(row0_2, mask_even);
// a5 a7 a9 a11 a13 a15 a17 a19
const __m128i s5 = _mm_and_si128(_mm_srli_epi16(row0_2, 8), mask_even);
// a6 a8 a10 a12 a14 a16 a18 a20
const __m128i s6 = _mm_and_si128(row0_3, mask_even);
// a7 a9 a11 a13 a15 a17 a19 a21
const __m128i s7 = _mm_and_si128(_mm_srli_epi16(row0_3, 8), mask_even);
// a0a7 a2a9 a4a11 .... a12a19 a14a21
const __m128i s07 = _mm_add_epi16(s0, s7);
// a1a6 a3a8 a5a10 .... a13a18 a15a20
const __m128i s16 = _mm_add_epi16(s1, s6);
// a2a5 a4a7 a6a9 .... a14a17 a16a19
const __m128i s25 = _mm_add_epi16(s2, s5);
// a3a4 a5a6 a7a8 .... a15a16 a17a18
const __m128i s34 = _mm_add_epi16(s3, s4);
// a0a7 a1a6 a2a9 a3a8 a4a11 a5a10 a6a13 a7a12
const __m128i s1607_low = _mm_unpacklo_epi16(s07, s16);
// a2a5 a3a4 a4a7 a5a6 a6a9 a7a8 a8a11 a9a10
const __m128i s3425_low = _mm_unpacklo_epi16(s25, s34);
// a8a15 a9a14 a10a17 a11a16 a12a19 a13a18 a14a21 a15a20
const __m128i s1607_high = _mm_unpackhi_epi16(s07, s16);
// a10a13 a11a12 a12a15 a13a14 a14a17 a15a16 a16a19 a17a18
const __m128i s3425_high = _mm_unpackhi_epi16(s25, s34);
const __m128i r01_0 = _mm_madd_epi16(s3425_low, coeffs_x[1]);
const __m128i r01_1 = _mm_madd_epi16(s1607_low, coeffs_x[0]);
const __m128i r01_2 = _mm_madd_epi16(s3425_high, coeffs_x[1]);
const __m128i r01_3 = _mm_madd_epi16(s1607_high, coeffs_x[0]);
// Result of first 8 pixels of row0 (a0 to a7).
// r0_0 r0_1 r0_2 r0_3
__m128i r00 = _mm_add_epi32(r01_0, r01_1);
r00 = _mm_add_epi32(r00, round_const_bits);
r00 = _mm_sra_epi32(r00, round_shift_bits);
// Result of next 8 pixels of row0 (a8 to 15).
// r0_4 r0_5 r0_6 r0_7
__m128i r01 = _mm_add_epi32(r01_2, r01_3);
r01 = _mm_add_epi32(r01, round_const_bits);
r01 = _mm_sra_epi32(r01, round_shift_bits);
// r0_0 r0_1 r0_2 r0_3 r0_4 r0_5 r0_6 r0_7
const __m128i res_16 = _mm_packs_epi32(r00, r01);
const __m128i res_8 = _mm_packus_epi16(res_16, res_16);
__m128i res = _mm_min_epu8(res_8, clip_pixel);
res = _mm_max_epu8(res, zero);
// r0_0 r0_1 r0_2 r0_3 r0_4 r0_5 r0_6 r0_7
_mm_storel_epi64((__m128i *)&intbuf[out_idx], res);
}
int wd_processed = filtered_length - remain_col;
// When the remaining width is 2, the above code would not have taken
// care of padding required for (filtered_length - 4)th pixel. Hence,
// process that pixel again with the C code.
wd_processed = (remain_col == 2) ? wd_processed - 2 : wd_processed;
if (remain_col) {
const int in_idx = (in_stride * i);
const int out_idx = (wd_processed / 2) + width2 * i;
down2_symeven(input + in_idx, filtered_length, intbuf + out_idx,
wd_processed);
}
}
}

6
third_party/aom/av1/decoder/decodeframe.c vendored

@ -2241,6 +2241,12 @@ static AOM_INLINE void get_ls_tile_buffer(
if (tile_copy_mode && (size >> (tile_size_bytes * 8 - 1)) == 1) {
// The remaining bits in the top byte signal the row offset
int offset = (size >> (tile_size_bytes - 1) * 8) & 0x7f;
if (offset > row) {
aom_internal_error(
error_info, AOM_CODEC_CORRUPT_FRAME,
"Invalid row offset in tile copy mode: row=%d offset=%d", row,
offset);
}
// Currently, only use tiles in same column as reference tiles.
copy_data = tile_buffers[row - offset][col].data;


@ -13,6 +13,7 @@
#include <assert.h>
#include "config/aom_config.h"
#include "config/av1_rtcd.h"
#include "aom_dsp/aom_dsp_common.h"
#include "aom_dsp/arm/mem_neon.h"
@ -63,7 +64,7 @@ int64_t av1_block_error_neon(const tran_low_t *coeff, const tran_low_t *dqcoeff,
}
int64_t av1_block_error_lp_neon(const int16_t *coeff, const int16_t *dqcoeff,
int block_size) {
intptr_t block_size) {
uint64x2_t err_u64 = vdupq_n_u64(0);
assert(block_size >= 16);


@ -12,6 +12,7 @@
#include <assert.h>
#include "config/aom_config.h"
#include "config/av1_rtcd.h"
#include "aom_dsp/aom_dsp_common.h"
#include "aom_dsp/arm/aom_neon_sve_bridge.h"
@ -49,7 +50,7 @@ int64_t av1_block_error_sve(const tran_low_t *coeff, const tran_low_t *dqcoeff,
}
int64_t av1_block_error_lp_sve(const int16_t *coeff, const int16_t *dqcoeff,
int block_size) {
intptr_t block_size) {
if (block_size % 32 == 0) {
int64x2_t error[4] = { vdupq_n_s64(0), vdupq_n_s64(0), vdupq_n_s64(0),
vdupq_n_s64(0) };


@ -12,7 +12,7 @@
#include "aom_dsp/arm/sum_neon.h"
#include "config/aom_config.h"
#include "config/aom_dsp_rtcd.h"
#include "config/av1_rtcd.h"
static int32x4_t k_means_multiply_add_neon(const int16x8_t a) {
const int32x4_t l = vmull_s16(vget_low_s16(a), vget_low_s16(a));


@ -19,6 +19,7 @@
#include <stdint.h>
#include "config/aom_config.h"
#include "config/av1_rtcd.h"
#define CRC_LOOP(op, crc, type, buf, len) \
while ((len) >= sizeof(type)) { \


@ -15,7 +15,7 @@
#include "aom_dsp/arm/mem_neon.h"
#include "aom_dsp/arm/sum_neon.h"
#include "av1/encoder/arm/neon/pickrst_neon.h"
#include "av1/encoder/arm/pickrst_neon.h"
#include "av1/encoder/pickrst.h"
static INLINE void highbd_calc_proj_params_r0_r1_neon(

third_party/aom/av1/encoder/arm/highbd_pickrst_sve.c (vendored, new file)

@ -0,0 +1,441 @@
/*
* Copyright (c) 2024, Alliance for Open Media. All rights reserved
*
* This source code is subject to the terms of the BSD 2 Clause License and
* the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
* was not distributed with this source code in the LICENSE file, you can
* obtain it at www.aomedia.org/license/software. If the Alliance for Open
* Media Patent License 1.0 was not distributed with this source code in the
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
#include <arm_neon.h>
#include <arm_sve.h>
#include <string.h>
#include "config/aom_config.h"
#include "config/av1_rtcd.h"
#include "aom_dsp/arm/aom_neon_sve_bridge.h"
#include "aom_dsp/arm/mem_neon.h"
#include "aom_dsp/arm/sum_neon.h"
#include "aom_dsp/arm/transpose_neon.h"
#include "av1/common/restoration.h"
#include "av1/encoder/pickrst.h"
#include "av1/encoder/arm/pickrst_sve.h"
static INLINE uint16_t find_average_sve(const uint16_t *src, int src_stride,
int width, int height) {
uint64x2_t avg_u64 = vdupq_n_u64(0);
uint16x8_t ones = vdupq_n_u16(1);
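// Dotting each group of u16 samples with a vector of ones widens the sums
// straight into 64-bit accumulator lanes, so the running total cannot
// overflow even for large planes.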
// Use a predicate to compute the last columns.
svbool_t pattern = svwhilelt_b16_u32(0, width % 8 == 0 ? 8 : width % 8);
int h = height;
do {
int j = width;
const uint16_t *src_ptr = src;
while (j > 8) {
uint16x8_t s = vld1q_u16(src_ptr);
avg_u64 = aom_udotq_u16(avg_u64, s, ones);
j -= 8;
src_ptr += 8;
}
uint16x8_t s_end = svget_neonq_u16(svld1_u16(pattern, src_ptr));
avg_u64 = aom_udotq_u16(avg_u64, s_end, ones);
src += src_stride;
} while (--h != 0);
return (uint16_t)(vaddvq_u64(avg_u64) / (width * height));
}
static INLINE void compute_sub_avg(const uint16_t *buf, int buf_stride,
int16_t avg, int16_t *buf_avg,
int buf_avg_stride, int width, int height) {
uint16x8_t avg_u16 = vdupq_n_u16(avg);
// Use a predicate to compute the last columns.
svbool_t pattern = svwhilelt_b16_u32(0, width % 8 == 0 ? 8 : width % 8);
uint16x8_t avg_end = svget_neonq_u16(svdup_n_u16_z(pattern, avg));
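// Inactive lanes of both the predicated load and avg_end are zero, so the
// final full-width store writes zeros past the last real column; the caller
// pads the destination stride so this stays inside the row.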
do {
int j = width;
const uint16_t *buf_ptr = buf;
int16_t *buf_avg_ptr = buf_avg;
while (j > 8) {
uint16x8_t d = vld1q_u16(buf_ptr);
vst1q_s16(buf_avg_ptr, vreinterpretq_s16_u16(vsubq_u16(d, avg_u16)));
j -= 8;
buf_ptr += 8;
buf_avg_ptr += 8;
}
uint16x8_t d_end = svget_neonq_u16(svld1_u16(pattern, buf_ptr));
vst1q_s16(buf_avg_ptr, vreinterpretq_s16_u16(vsubq_u16(d_end, avg_end)));
buf += buf_stride;
buf_avg += buf_avg_stride;
} while (--height > 0);
}
static INLINE void copy_upper_triangle(int64_t *H, int64_t *H_tmp,
const int wiener_win2,
const int divider) {
for (int i = 0; i < wiener_win2 - 2; i = i + 2) {
// Transpose the first 2x2 square. It needs a special case as the element
// of the bottom left is on the diagonal.
int64x2_t row0 = vld1q_s64(H_tmp + i * wiener_win2 + i + 1);
int64x2_t row1 = vld1q_s64(H_tmp + (i + 1) * wiener_win2 + i + 1);
int64x2_t tr_row = aom_vtrn2q_s64(row0, row1);
vst1_s64(H_tmp + (i + 1) * wiener_win2 + i, vget_low_s64(row0));
vst1q_s64(H_tmp + (i + 2) * wiener_win2 + i, tr_row);
// Transpose and store all the remaining 2x2 squares of the line.
for (int j = i + 3; j < wiener_win2; j = j + 2) {
row0 = vld1q_s64(H_tmp + i * wiener_win2 + j);
row1 = vld1q_s64(H_tmp + (i + 1) * wiener_win2 + j);
int64x2_t tr_row0 = aom_vtrn1q_s64(row0, row1);
int64x2_t tr_row1 = aom_vtrn2q_s64(row0, row1);
vst1q_s64(H_tmp + (j + 0) * wiener_win2 + i, tr_row0);
vst1q_s64(H_tmp + (j + 1) * wiener_win2 + i, tr_row1);
}
}
for (int i = 0; i < wiener_win2 * wiener_win2; i++) {
H[i] += H_tmp[i] / divider;
}
}
// Transpose the matrix that has just been computed and accumulate it in M.
static INLINE void acc_transpose_M(int64_t *M, const int64_t *M_trn,
const int wiener_win, const int divider) {
for (int i = 0; i < wiener_win; ++i) {
for (int j = 0; j < wiener_win; ++j) {
int tr_idx = j * wiener_win + i;
*M++ += (int64_t)(M_trn[tr_idx] / divider);
}
}
}
// This function computes two matrices: the cross-correlation between the src
// buffer and dgd buffer (M), and the auto-covariance of the dgd buffer (H).
//
// M is of size 7 * 7. It needs to be filled such that multiplying one element
// from src with each element of a row of the wiener window will fill one
// column of M. However this is not very convenient in terms of memory
// accesses, as it means we do contiguous loads of dgd but strided stores to M.
// As a result, we use an intermediate matrix M_trn which is instead filled
// such that one row of the wiener window gives one row of M_trn. Once fully
// computed, M_trn is then transposed to return M.
//
// H is of size 49 * 49. It is filled by multiplying every pair of elements of
// the wiener window together. Since it is a symmetric matrix, we only compute
// the upper triangle, and then copy it down to the lower one. Here we fill it
// by taking each different pair of columns, and multiplying all the elements of
// the first one with all the elements of the second one, with a special case
// when multiplying a column by itself.
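// Roughly, for each pixel of the block this accumulates
//   M_trn[row * wiener_win + col] += src_avg(x, y) * dgd_avg(x + col, y + row)
// and H accumulates the corresponding dgd_avg * dgd_avg product for every
// pair of positions in the wiener window.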
static INLINE void highbd_compute_stats_win7_sve(
int16_t *dgd_avg, int dgd_avg_stride, int16_t *src_avg, int src_avg_stride,
int width, int height, int64_t *M, int64_t *H, int bit_depth_divider) {
const int wiener_win = 7;
const int wiener_win2 = wiener_win * wiener_win;
// Use a predicate to compute the last columns of the block for H.
svbool_t pattern = svwhilelt_b16_u32(0, width % 8 == 0 ? 8 : width % 8);
// Use intermediate matrices for H and M to perform the computation, they
// will be accumulated into the original H and M at the end.
int64_t M_trn[49];
memset(M_trn, 0, sizeof(M_trn));
int64_t H_tmp[49 * 49];
memset(H_tmp, 0, sizeof(H_tmp));
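// Note: H_tmp is 49 * 49 * 8 bytes (about 19 kB) of stack.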
do {
// Cross-correlation (M).
for (int row = 0; row < wiener_win; row++) {
int j = 0;
while (j < width) {
int16x8_t dgd[7];
load_s16_8x7(dgd_avg + row * dgd_avg_stride + j, 1, &dgd[0], &dgd[1],
&dgd[2], &dgd[3], &dgd[4], &dgd[5], &dgd[6]);
int16x8_t s = vld1q_s16(src_avg + j);
// Compute all the elements of one row of M.
compute_M_one_row_win7(s, dgd, M_trn, row);
j += 8;
}
}
// Auto-covariance (H).
int j = 0;
while (j < width - 8) {
for (int col0 = 0; col0 < wiener_win; col0++) {
int16x8_t dgd0[7];
load_s16_8x7(dgd_avg + j + col0, dgd_avg_stride, &dgd0[0], &dgd0[1],
&dgd0[2], &dgd0[3], &dgd0[4], &dgd0[5], &dgd0[6]);
// Perform computation of the first column with itself (28 elements).
// For the first column this will fill the upper triangle of the 7x7
// matrix at the top left of the H matrix. For the next columns this
// will fill the upper triangle of the other 7x7 matrices around H's
// diagonal.
compute_H_one_col(dgd0, col0, H_tmp, wiener_win, wiener_win2);
// All computation next to the matrix diagonal has already been done.
for (int col1 = col0 + 1; col1 < wiener_win; col1++) {
// Load second column.
int16x8_t dgd1[7];
load_s16_8x7(dgd_avg + j + col1, dgd_avg_stride, &dgd1[0], &dgd1[1],
&dgd1[2], &dgd1[3], &dgd1[4], &dgd1[5], &dgd1[6]);
// Compute all elements from the combination of both columns (49
// elements).
compute_H_two_rows_win7(dgd0, dgd1, col0, col1, H_tmp);
}
}
j += 8;
}
// Process remaining columns using a predicate to discard excess elements.
for (int col0 = 0; col0 < wiener_win; col0++) {
// Load first column.
int16x8_t dgd0[7];
dgd0[0] = svget_neonq_s16(
svld1_s16(pattern, dgd_avg + 0 * dgd_avg_stride + j + col0));
dgd0[1] = svget_neonq_s16(
svld1_s16(pattern, dgd_avg + 1 * dgd_avg_stride + j + col0));
dgd0[2] = svget_neonq_s16(
svld1_s16(pattern, dgd_avg + 2 * dgd_avg_stride + j + col0));
dgd0[3] = svget_neonq_s16(
svld1_s16(pattern, dgd_avg + 3 * dgd_avg_stride + j + col0));
dgd0[4] = svget_neonq_s16(
svld1_s16(pattern, dgd_avg + 4 * dgd_avg_stride + j + col0));
dgd0[5] = svget_neonq_s16(
svld1_s16(pattern, dgd_avg + 5 * dgd_avg_stride + j + col0));
dgd0[6] = svget_neonq_s16(
svld1_s16(pattern, dgd_avg + 6 * dgd_avg_stride + j + col0));
// Perform computation of the first column with itself (28 elements).
// For the first column this will fill the upper triangle of the 7x7
// matrix at the top left of the H matrix. For the next columns this
// will fill the upper triangle of the other 7x7 matrices around H's
// diagonal.
compute_H_one_col(dgd0, col0, H_tmp, wiener_win, wiener_win2);
// All computation next to the matrix diagonal has already been done.
for (int col1 = col0 + 1; col1 < wiener_win; col1++) {
// Load second column.
int16x8_t dgd1[7];
load_s16_8x7(dgd_avg + j + col1, dgd_avg_stride, &dgd1[0], &dgd1[1],
&dgd1[2], &dgd1[3], &dgd1[4], &dgd1[5], &dgd1[6]);
// Compute all elements from the combination of both columns (49
// elements).
compute_H_two_rows_win7(dgd0, dgd1, col0, col1, H_tmp);
}
}
dgd_avg += dgd_avg_stride;
src_avg += src_avg_stride;
} while (--height != 0);
// Transpose M_trn.
acc_transpose_M(M, M_trn, 7, bit_depth_divider);
// Copy upper triangle of H in the lower one.
copy_upper_triangle(H, H_tmp, wiener_win2, bit_depth_divider);
}
// This function computes two matrices: the cross-correlation between the src
// buffer and dgd buffer (M), and the auto-covariance of the dgd buffer (H).
//
// M is of size 5 * 5. It needs to be filled such that multiplying one element
// from src with each element of a row of the wiener window will fill one
// column of M. However this is not very convenient in terms of memory
// accesses, as it means we do contiguous loads of dgd but strided stores to M.
// As a result, we use an intermediate matrix M_trn which is instead filled
// such that one row of the wiener window gives one row of M_trn. Once fully
// computed, M_trn is then transposed to return M.
//
// H is of size 25 * 25. It is filled by multiplying every pair of elements of
// the wiener window together. Since it is a symmetric matrix, we only compute
// the upper triangle, and then copy it down to the lower one. Here we fill it
// by taking each different pair of columns, and multiplying all the elements of
// the first one with all the elements of the second one, with a special case
// when multiplying a column by itself.
static INLINE void highbd_compute_stats_win5_sve(
int16_t *dgd_avg, int dgd_avg_stride, int16_t *src_avg, int src_avg_stride,
int width, int height, int64_t *M, int64_t *H, int bit_depth_divider) {
const int wiener_win = 5;
const int wiener_win2 = wiener_win * wiener_win;
// Use a predicate to compute the last columns of the block for H.
svbool_t pattern = svwhilelt_b16_u32(0, width % 8 == 0 ? 8 : width % 8);
// Use intermediate matrices for H and M to perform the computation, they
// will be accumulated into the original H and M at the end.
int64_t M_trn[25];
memset(M_trn, 0, sizeof(M_trn));
int64_t H_tmp[25 * 25];
memset(H_tmp, 0, sizeof(H_tmp));
do {
// Cross-correlation (M).
for (int row = 0; row < wiener_win; row++) {
int j = 0;
while (j < width) {
int16x8_t dgd[5];
load_s16_8x5(dgd_avg + row * dgd_avg_stride + j, 1, &dgd[0], &dgd[1],
&dgd[2], &dgd[3], &dgd[4]);
int16x8_t s = vld1q_s16(src_avg + j);
// Compute all the elements of one row of M.
compute_M_one_row_win5(s, dgd, M_trn, row);
j += 8;
}
}
// Auto-covariance (H).
int j = 0;
while (j < width - 8) {
for (int col0 = 0; col0 < wiener_win; col0++) {
// Load first column.
int16x8_t dgd0[5];
load_s16_8x5(dgd_avg + j + col0, dgd_avg_stride, &dgd0[0], &dgd0[1],
&dgd0[2], &dgd0[3], &dgd0[4]);
// Perform computation of the first column with itself (15 elements).
// For the first column this will fill the upper triangle of the 5x5
// matrix at the top left of the H matrix. For the next columns this
// will fill the upper triangle of the other 5x5 matrices around H's
// diagonal.
compute_H_one_col(dgd0, col0, H_tmp, wiener_win, wiener_win2);
// All computation next to the matrix diagonal has already been done.
for (int col1 = col0 + 1; col1 < wiener_win; col1++) {
// Load second column.
int16x8_t dgd1[5];
load_s16_8x5(dgd_avg + j + col1, dgd_avg_stride, &dgd1[0], &dgd1[1],
&dgd1[2], &dgd1[3], &dgd1[4]);
// Compute all elements from the combination of both columns (25
// elements).
compute_H_two_rows_win5(dgd0, dgd1, col0, col1, H_tmp);
}
}
j += 8;
}
// Process remaining columns using a predicate to discard excess elements.
for (int col0 = 0; col0 < wiener_win; col0++) {
int16x8_t dgd0[5];
dgd0[0] = svget_neonq_s16(
svld1_s16(pattern, dgd_avg + 0 * dgd_avg_stride + j + col0));
dgd0[1] = svget_neonq_s16(
svld1_s16(pattern, dgd_avg + 1 * dgd_avg_stride + j + col0));
dgd0[2] = svget_neonq_s16(
svld1_s16(pattern, dgd_avg + 2 * dgd_avg_stride + j + col0));
dgd0[3] = svget_neonq_s16(
svld1_s16(pattern, dgd_avg + 3 * dgd_avg_stride + j + col0));
dgd0[4] = svget_neonq_s16(
svld1_s16(pattern, dgd_avg + 4 * dgd_avg_stride + j + col0));
// Perform computation of the first column with itself (15 elements).
// For the first column this will fill the upper triangle of the 5x5
// matrix at the top left of the H matrix. For the next columns this
// will fill the upper triangle of the other 5x5 matrices around H's
// diagonal.
compute_H_one_col(dgd0, col0, H_tmp, wiener_win, wiener_win2);
// All computation next to the matrix diagonal has already been done.
for (int col1 = col0 + 1; col1 < wiener_win; col1++) {
// Load second column.
int16x8_t dgd1[5];
load_s16_8x5(dgd_avg + j + col1, dgd_avg_stride, &dgd1[0], &dgd1[1],
&dgd1[2], &dgd1[3], &dgd1[4]);
// Compute all elements from the combination of both columns (25
// elements).
compute_H_two_rows_win5(dgd0, dgd1, col0, col1, H_tmp);
}
}
dgd_avg += dgd_avg_stride;
src_avg += src_avg_stride;
} while (--height != 0);
// Transpose M_trn.
acc_transpose_M(M, M_trn, 5, bit_depth_divider);
// Copy upper triangle of H in the lower one.
copy_upper_triangle(H, H_tmp, wiener_win2, bit_depth_divider);
}
void av1_compute_stats_highbd_sve(int wiener_win, const uint8_t *dgd8,
const uint8_t *src8, int16_t *dgd_avg,
int16_t *src_avg, int h_start, int h_end,
int v_start, int v_end, int dgd_stride,
int src_stride, int64_t *M, int64_t *H,
aom_bit_depth_t bit_depth) {
assert(wiener_win == WIENER_WIN || wiener_win == WIENER_WIN_CHROMA);
const uint16_t *src = CONVERT_TO_SHORTPTR(src8);
const uint16_t *dgd = CONVERT_TO_SHORTPTR(dgd8);
const int wiener_win2 = wiener_win * wiener_win;
const int wiener_halfwin = wiener_win >> 1;
const int32_t width = h_end - h_start;
const int32_t height = v_end - v_start;
uint8_t bit_depth_divider = 1;
if (bit_depth == AOM_BITS_12)
bit_depth_divider = 16;
else if (bit_depth == AOM_BITS_10)
bit_depth_divider = 4;
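// The divider scales the accumulated statistics back down for 10- and
// 12-bit input when they are added into M and H (see acc_transpose_M and
// copy_upper_triangle above).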
const uint16_t *dgd_start = &dgd[v_start * dgd_stride + h_start];
memset(H, 0, sizeof(*H) * wiener_win2 * wiener_win2);
memset(M, 0, sizeof(*M) * wiener_win * wiener_win);
const uint16_t avg = find_average_sve(dgd_start, dgd_stride, width, height);
// dgd_avg and src_avg have been memset to zero before calling this function
// so round up the stride to the next multiple of 8 so that we don't have to
// worry about a tail loop when computing M.
const int dgd_avg_stride = ((width + 2 * wiener_halfwin) & ~7) + 8;
const int src_avg_stride = (width & ~7) + 8;
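// Rounding the stride up past the row width guarantees that the full 8-lane
// store in compute_sub_avg's tail iteration never writes outside the
// (zero-initialised) row.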
// Compute (dgd - avg) and store it in dgd_avg.
// The wiener window will slide along the dgd frame, centered on each pixel.
// For the top left pixel and all the pixels on the side of the frame this
// means half of the window will be outside of the frame. As such the actual
// buffer that we need to subtract the avg from will be 2 * wiener_halfwin
// wider and 2 * wiener_halfwin higher than the original dgd buffer.
const int vert_offset = v_start - wiener_halfwin;
const int horiz_offset = h_start - wiener_halfwin;
const uint16_t *dgd_win = dgd + horiz_offset + vert_offset * dgd_stride;
compute_sub_avg(dgd_win, dgd_stride, avg, dgd_avg, dgd_avg_stride,
width + 2 * wiener_halfwin, height + 2 * wiener_halfwin);
// Compute (src - avg), downsample if necessary and store in src-avg.
const uint16_t *src_start = src + h_start + v_start * src_stride;
compute_sub_avg(src_start, src_stride, avg, src_avg, src_avg_stride, width,
height);
if (wiener_win == WIENER_WIN) {
highbd_compute_stats_win7_sve(dgd_avg, dgd_avg_stride, src_avg,
src_avg_stride, width, height, M, H,
bit_depth_divider);
} else {
highbd_compute_stats_win5_sve(dgd_avg, dgd_avg_stride, src_avg,
src_avg_stride, width, height, M, H,
bit_depth_divider);
}
}


@ -12,6 +12,7 @@
#include <arm_neon.h>
#include "aom_dsp/txfm_common.h"
#include "config/av1_rtcd.h"
static void transpose4x4(int16x8_t in[2], int16x4_t out[4]) {
int32x4x2_t b0 =


Some files were not shown because too many files changed in this diff.