Bug 1899864 - Update libaom to a7ef80c44bfb34b08254194b1ab72d4e93ff4b07 r=media-playback-reviewers,alwu

This patch simply runs the command below
```
./mach vendor media/libaom/moz.yaml --patch-mode=none
```
to update the libaom source.

Differential Revision: https://phabricator.services.mozilla.com/D212162
Chun-Min Chang 2024-05-31 00:33:33 +00:00
Parent 26cb5d7483
Commit c3dcb83cf6
160 changed files with 8414 additions and 2782 deletions

View file

@ -532,6 +532,12 @@ void av1_quantize_lp_c(const int16_t *coeff_ptr, intptr_t n_coeffs, const int16_
void av1_resize_and_extend_frame_c(const YV12_BUFFER_CONFIG *src, YV12_BUFFER_CONFIG *dst, const InterpFilter filter, const int phase, const int num_planes);
#define av1_resize_and_extend_frame av1_resize_and_extend_frame_c
void av1_resize_horz_dir_c(const uint8_t *const input, int in_stride, uint8_t *intbuf, int height, int filteredlength, int width2);
#define av1_resize_horz_dir av1_resize_horz_dir_c
bool av1_resize_vert_dir_c(uint8_t *intbuf, uint8_t *output, int out_stride, int height, int height2, int width2, int start_col);
#define av1_resize_vert_dir av1_resize_vert_dir_c
void av1_round_shift_array_c(int32_t *arr, int size, int bit);
#define av1_round_shift_array av1_round_shift_array_c
@ -624,9 +630,6 @@ cfl_predict_lbd_fn cfl_get_predict_lbd_fn_c(TX_SIZE tx_size);
cfl_subtract_average_fn cfl_get_subtract_average_fn_c(TX_SIZE tx_size);
#define cfl_get_subtract_average_fn cfl_get_subtract_average_fn_c
bool resize_vert_dir_c(uint8_t *intbuf, uint8_t *output, int out_stride, int height, int height2, int width2, int start_col);
#define resize_vert_dir resize_vert_dir_c
void av1_rtcd(void);
#include "config/aom_config.h"

View file

@ -221,7 +221,8 @@ void av1_compute_stats_highbd_neon(int wiener_win, const uint8_t *dgd8, const ui
RTCD_EXTERN void (*av1_compute_stats_highbd)(int wiener_win, const uint8_t *dgd8, const uint8_t *src8, int16_t *dgd_avg, int16_t *src_avg, int h_start, int h_end, int v_start, int v_end, int dgd_stride, int src_stride, int64_t *M, int64_t *H, aom_bit_depth_t bit_depth);
void av1_convolve_2d_scale_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int x_step_qn, const int subpel_y_qn, const int y_step_qn, ConvolveParams *conv_params);
#define av1_convolve_2d_scale av1_convolve_2d_scale_c
void av1_convolve_2d_scale_neon(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int x_step_qn, const int subpel_y_qn, const int y_step_qn, ConvolveParams *conv_params);
RTCD_EXTERN void (*av1_convolve_2d_scale)(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int x_step_qn, const int subpel_y_qn, const int y_step_qn, ConvolveParams *conv_params);
void av1_convolve_2d_sr_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int subpel_y_qn, ConvolveParams *conv_params);
void av1_convolve_2d_sr_neon(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int subpel_y_qn, ConvolveParams *conv_params);
@ -687,6 +688,12 @@ void av1_resize_and_extend_frame_c(const YV12_BUFFER_CONFIG *src, YV12_BUFFER_CO
void av1_resize_and_extend_frame_neon(const YV12_BUFFER_CONFIG *src, YV12_BUFFER_CONFIG *dst, const InterpFilter filter, const int phase, const int num_planes);
RTCD_EXTERN void (*av1_resize_and_extend_frame)(const YV12_BUFFER_CONFIG *src, YV12_BUFFER_CONFIG *dst, const InterpFilter filter, const int phase, const int num_planes);
void av1_resize_horz_dir_c(const uint8_t *const input, int in_stride, uint8_t *intbuf, int height, int filteredlength, int width2);
#define av1_resize_horz_dir av1_resize_horz_dir_c
bool av1_resize_vert_dir_c(uint8_t *intbuf, uint8_t *output, int out_stride, int height, int height2, int width2, int start_col);
#define av1_resize_vert_dir av1_resize_vert_dir_c
void av1_round_shift_array_c(int32_t *arr, int size, int bit);
void av1_round_shift_array_neon(int32_t *arr, int size, int bit);
RTCD_EXTERN void (*av1_round_shift_array)(int32_t *arr, int size, int bit);
@ -813,9 +820,6 @@ cfl_subtract_average_fn cfl_get_subtract_average_fn_c(TX_SIZE tx_size);
cfl_subtract_average_fn cfl_get_subtract_average_fn_neon(TX_SIZE tx_size);
RTCD_EXTERN cfl_subtract_average_fn (*cfl_get_subtract_average_fn)(TX_SIZE tx_size);
bool resize_vert_dir_c(uint8_t *intbuf, uint8_t *output, int out_stride, int height, int height2, int width2, int start_col);
#define resize_vert_dir resize_vert_dir_c
void av1_rtcd(void);
#include "config/aom_config.h"
@ -870,6 +874,8 @@ static void setup_rtcd_internal(void)
if (flags & HAS_NEON) av1_compute_stats = av1_compute_stats_neon;
av1_compute_stats_highbd = av1_compute_stats_highbd_c;
if (flags & HAS_NEON) av1_compute_stats_highbd = av1_compute_stats_highbd_neon;
av1_convolve_2d_scale = av1_convolve_2d_scale_c;
if (flags & HAS_NEON) av1_convolve_2d_scale = av1_convolve_2d_scale_neon;
av1_convolve_2d_sr = av1_convolve_2d_sr_c;
if (flags & HAS_NEON) av1_convolve_2d_sr = av1_convolve_2d_sr_neon;
av1_convolve_2d_sr_intrabc = av1_convolve_2d_sr_intrabc_c;

View file

@ -711,6 +711,15 @@ void av1_resize_and_extend_frame_c(const YV12_BUFFER_CONFIG *src, YV12_BUFFER_CO
void av1_resize_and_extend_frame_ssse3(const YV12_BUFFER_CONFIG *src, YV12_BUFFER_CONFIG *dst, const InterpFilter filter, const int phase, const int num_planes);
RTCD_EXTERN void (*av1_resize_and_extend_frame)(const YV12_BUFFER_CONFIG *src, YV12_BUFFER_CONFIG *dst, const InterpFilter filter, const int phase, const int num_planes);
void av1_resize_horz_dir_c(const uint8_t *const input, int in_stride, uint8_t *intbuf, int height, int filteredlength, int width2);
void av1_resize_horz_dir_avx2(const uint8_t *const input, int in_stride, uint8_t *intbuf, int height, int filteredlength, int width2);
RTCD_EXTERN void (*av1_resize_horz_dir)(const uint8_t *const input, int in_stride, uint8_t *intbuf, int height, int filteredlength, int width2);
bool av1_resize_vert_dir_c(uint8_t *intbuf, uint8_t *output, int out_stride, int height, int height2, int width2, int start_col);
bool av1_resize_vert_dir_sse2(uint8_t *intbuf, uint8_t *output, int out_stride, int height, int height2, int width2, int start_col);
bool av1_resize_vert_dir_avx2(uint8_t *intbuf, uint8_t *output, int out_stride, int height, int height2, int width2, int start_col);
RTCD_EXTERN bool (*av1_resize_vert_dir)(uint8_t *intbuf, uint8_t *output, int out_stride, int height, int height2, int width2, int start_col);
void av1_round_shift_array_c(int32_t *arr, int size, int bit);
void av1_round_shift_array_sse4_1(int32_t *arr, int size, int bit);
RTCD_EXTERN void (*av1_round_shift_array)(int32_t *arr, int size, int bit);
@ -879,10 +888,6 @@ cfl_subtract_average_fn cfl_get_subtract_average_fn_sse2(TX_SIZE tx_size);
cfl_subtract_average_fn cfl_get_subtract_average_fn_avx2(TX_SIZE tx_size);
RTCD_EXTERN cfl_subtract_average_fn (*cfl_get_subtract_average_fn)(TX_SIZE tx_size);
bool resize_vert_dir_c(uint8_t *intbuf, uint8_t *output, int out_stride, int height, int height2, int width2, int start_col);
bool resize_vert_dir_avx2(uint8_t *intbuf, uint8_t *output, int out_stride, int height, int height2, int width2, int start_col);
RTCD_EXTERN bool (*resize_vert_dir)(uint8_t *intbuf, uint8_t *output, int out_stride, int height, int height2, int width2, int start_col);
void av1_rtcd(void);
#ifdef RTCD_C
@ -1140,6 +1145,11 @@ static void setup_rtcd_internal(void)
if (flags & HAS_AVX2) av1_quantize_lp = av1_quantize_lp_avx2;
av1_resize_and_extend_frame = av1_resize_and_extend_frame_c;
if (flags & HAS_SSSE3) av1_resize_and_extend_frame = av1_resize_and_extend_frame_ssse3;
av1_resize_horz_dir = av1_resize_horz_dir_c;
if (flags & HAS_AVX2) av1_resize_horz_dir = av1_resize_horz_dir_avx2;
av1_resize_vert_dir = av1_resize_vert_dir_c;
if (flags & HAS_SSE2) av1_resize_vert_dir = av1_resize_vert_dir_sse2;
if (flags & HAS_AVX2) av1_resize_vert_dir = av1_resize_vert_dir_avx2;
av1_round_shift_array = av1_round_shift_array_c;
if (flags & HAS_SSE4_1) av1_round_shift_array = av1_round_shift_array_sse4_1;
av1_selfguided_restoration = av1_selfguided_restoration_c;
@ -1240,8 +1250,6 @@ static void setup_rtcd_internal(void)
cfl_get_subtract_average_fn = cfl_get_subtract_average_fn_c;
if (flags & HAS_SSE2) cfl_get_subtract_average_fn = cfl_get_subtract_average_fn_sse2;
if (flags & HAS_AVX2) cfl_get_subtract_average_fn = cfl_get_subtract_average_fn_avx2;
resize_vert_dir = resize_vert_dir_c;
if (flags & HAS_AVX2) resize_vert_dir = resize_vert_dir_avx2;
}
#endif
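
The setup_rtcd_internal hunks above follow libaom's usual runtime-dispatch pattern: each RTCD_EXTERN function pointer is first assigned a baseline implementation and is then overridden whenever the CPU flags report a faster variant, so the last matching assignment wins. A minimal sketch of that pattern, reusing the HAS_* flag names from the hunks above but with hypothetical resize_* functions standing in for the real kernels:

```
/* Sketch of libaom-style flag-based runtime dispatch (hypothetical kernel names). */
#include <stdio.h>

#define HAS_SSE2 (1 << 0)
#define HAS_AVX2 (1 << 1)

static void resize_c(void)    { puts("C"); }
static void resize_sse2(void) { puts("SSE2"); }
static void resize_avx2(void) { puts("AVX2"); }

/* Analogous to an RTCD_EXTERN function pointer in the generated headers. */
static void (*resize)(void);

static void setup_dispatch(int flags) {
  resize = resize_c;                           /* baseline */
  if (flags & HAS_SSE2) resize = resize_sse2;  /* better */
  if (flags & HAS_AVX2) resize = resize_avx2;  /* best available wins */
}

int main(void) {
  setup_dispatch(HAS_SSE2 | HAS_AVX2);  /* pretend the CPU reports both */
  resize();                             /* calls the AVX2 stand-in */
  return 0;
}
```

Note that in some of the generated x86-64 headers later in this patch the new av1_resize_vert_dir pointer starts at the SSE2 variant rather than the C one, because SSE2 is assumed on those targets.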

View file

@ -701,6 +701,15 @@ void av1_resize_and_extend_frame_c(const YV12_BUFFER_CONFIG *src, YV12_BUFFER_CO
void av1_resize_and_extend_frame_ssse3(const YV12_BUFFER_CONFIG *src, YV12_BUFFER_CONFIG *dst, const InterpFilter filter, const int phase, const int num_planes);
RTCD_EXTERN void (*av1_resize_and_extend_frame)(const YV12_BUFFER_CONFIG *src, YV12_BUFFER_CONFIG *dst, const InterpFilter filter, const int phase, const int num_planes);
void av1_resize_horz_dir_c(const uint8_t *const input, int in_stride, uint8_t *intbuf, int height, int filteredlength, int width2);
void av1_resize_horz_dir_avx2(const uint8_t *const input, int in_stride, uint8_t *intbuf, int height, int filteredlength, int width2);
RTCD_EXTERN void (*av1_resize_horz_dir)(const uint8_t *const input, int in_stride, uint8_t *intbuf, int height, int filteredlength, int width2);
bool av1_resize_vert_dir_c(uint8_t *intbuf, uint8_t *output, int out_stride, int height, int height2, int width2, int start_col);
bool av1_resize_vert_dir_sse2(uint8_t *intbuf, uint8_t *output, int out_stride, int height, int height2, int width2, int start_col);
bool av1_resize_vert_dir_avx2(uint8_t *intbuf, uint8_t *output, int out_stride, int height, int height2, int width2, int start_col);
RTCD_EXTERN bool (*av1_resize_vert_dir)(uint8_t *intbuf, uint8_t *output, int out_stride, int height, int height2, int width2, int start_col);
void av1_round_shift_array_c(int32_t *arr, int size, int bit);
void av1_round_shift_array_sse4_1(int32_t *arr, int size, int bit);
RTCD_EXTERN void (*av1_round_shift_array)(int32_t *arr, int size, int bit);
@ -857,10 +866,6 @@ cfl_subtract_average_fn cfl_get_subtract_average_fn_sse2(TX_SIZE tx_size);
cfl_subtract_average_fn cfl_get_subtract_average_fn_avx2(TX_SIZE tx_size);
RTCD_EXTERN cfl_subtract_average_fn (*cfl_get_subtract_average_fn)(TX_SIZE tx_size);
bool resize_vert_dir_c(uint8_t *intbuf, uint8_t *output, int out_stride, int height, int height2, int width2, int start_col);
bool resize_vert_dir_avx2(uint8_t *intbuf, uint8_t *output, int out_stride, int height, int height2, int width2, int start_col);
RTCD_EXTERN bool (*resize_vert_dir)(uint8_t *intbuf, uint8_t *output, int out_stride, int height, int height2, int width2, int start_col);
void av1_rtcd(void);
#ifdef RTCD_C
@ -1090,6 +1095,10 @@ static void setup_rtcd_internal(void)
if (flags & HAS_AVX2) av1_quantize_lp = av1_quantize_lp_avx2;
av1_resize_and_extend_frame = av1_resize_and_extend_frame_c;
if (flags & HAS_SSSE3) av1_resize_and_extend_frame = av1_resize_and_extend_frame_ssse3;
av1_resize_horz_dir = av1_resize_horz_dir_c;
if (flags & HAS_AVX2) av1_resize_horz_dir = av1_resize_horz_dir_avx2;
av1_resize_vert_dir = av1_resize_vert_dir_sse2;
if (flags & HAS_AVX2) av1_resize_vert_dir = av1_resize_vert_dir_avx2;
av1_round_shift_array = av1_round_shift_array_c;
if (flags & HAS_SSE4_1) av1_round_shift_array = av1_round_shift_array_sse4_1;
av1_selfguided_restoration = av1_selfguided_restoration_c;
@ -1173,8 +1182,6 @@ static void setup_rtcd_internal(void)
if (flags & HAS_AVX2) cfl_get_predict_lbd_fn = cfl_get_predict_lbd_fn_avx2;
cfl_get_subtract_average_fn = cfl_get_subtract_average_fn_sse2;
if (flags & HAS_AVX2) cfl_get_subtract_average_fn = cfl_get_subtract_average_fn_avx2;
resize_vert_dir = resize_vert_dir_c;
if (flags & HAS_AVX2) resize_vert_dir = resize_vert_dir_avx2;
}
#endif

View file

@ -207,7 +207,8 @@ void av1_compute_stats_highbd_neon(int wiener_win, const uint8_t *dgd8, const ui
#define av1_compute_stats_highbd av1_compute_stats_highbd_neon
void av1_convolve_2d_scale_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int x_step_qn, const int subpel_y_qn, const int y_step_qn, ConvolveParams *conv_params);
#define av1_convolve_2d_scale av1_convolve_2d_scale_c
void av1_convolve_2d_scale_neon(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int x_step_qn, const int subpel_y_qn, const int y_step_qn, ConvolveParams *conv_params);
#define av1_convolve_2d_scale av1_convolve_2d_scale_neon
void av1_convolve_2d_sr_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int subpel_y_qn, ConvolveParams *conv_params);
void av1_convolve_2d_sr_neon(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int subpel_y_qn, ConvolveParams *conv_params);
@ -234,7 +235,9 @@ void av1_convolve_x_sr_intrabc_neon(const uint8_t *src, int src_stride, uint8_t
void av1_convolve_y_sr_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_y, const int subpel_y_qn);
void av1_convolve_y_sr_neon(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_y, const int subpel_y_qn);
#define av1_convolve_y_sr av1_convolve_y_sr_neon
void av1_convolve_y_sr_neon_dotprod(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_y, const int subpel_y_qn);
void av1_convolve_y_sr_neon_i8mm(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_y, const int subpel_y_qn);
RTCD_EXTERN void (*av1_convolve_y_sr)(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_y, const int subpel_y_qn);
void av1_convolve_y_sr_intrabc_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_y, const int subpel_y_qn);
void av1_convolve_y_sr_intrabc_neon(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_y, const int subpel_y_qn);
@ -682,6 +685,12 @@ void av1_resize_and_extend_frame_c(const YV12_BUFFER_CONFIG *src, YV12_BUFFER_CO
void av1_resize_and_extend_frame_neon(const YV12_BUFFER_CONFIG *src, YV12_BUFFER_CONFIG *dst, const InterpFilter filter, const int phase, const int num_planes);
#define av1_resize_and_extend_frame av1_resize_and_extend_frame_neon
void av1_resize_horz_dir_c(const uint8_t *const input, int in_stride, uint8_t *intbuf, int height, int filteredlength, int width2);
#define av1_resize_horz_dir av1_resize_horz_dir_c
bool av1_resize_vert_dir_c(uint8_t *intbuf, uint8_t *output, int out_stride, int height, int height2, int width2, int start_col);
#define av1_resize_vert_dir av1_resize_vert_dir_c
void av1_round_shift_array_c(int32_t *arr, int size, int bit);
void av1_round_shift_array_neon(int32_t *arr, int size, int bit);
#define av1_round_shift_array av1_round_shift_array_neon
@ -807,9 +816,6 @@ cfl_subtract_average_fn cfl_get_subtract_average_fn_c(TX_SIZE tx_size);
cfl_subtract_average_fn cfl_get_subtract_average_fn_neon(TX_SIZE tx_size);
#define cfl_get_subtract_average_fn cfl_get_subtract_average_fn_neon
bool resize_vert_dir_c(uint8_t *intbuf, uint8_t *output, int out_stride, int height, int height2, int width2, int start_col);
#define resize_vert_dir resize_vert_dir_c
void av1_rtcd(void);
#include "config/aom_config.h"
@ -830,6 +836,9 @@ static void setup_rtcd_internal(void)
av1_convolve_x_sr = av1_convolve_x_sr_neon;
if (flags & HAS_NEON_DOTPROD) av1_convolve_x_sr = av1_convolve_x_sr_neon_dotprod;
if (flags & HAS_NEON_I8MM) av1_convolve_x_sr = av1_convolve_x_sr_neon_i8mm;
av1_convolve_y_sr = av1_convolve_y_sr_neon;
if (flags & HAS_NEON_DOTPROD) av1_convolve_y_sr = av1_convolve_y_sr_neon_dotprod;
if (flags & HAS_NEON_I8MM) av1_convolve_y_sr = av1_convolve_y_sr_neon_i8mm;
av1_dist_wtd_convolve_2d = av1_dist_wtd_convolve_2d_neon;
if (flags & HAS_NEON_DOTPROD) av1_dist_wtd_convolve_2d = av1_dist_wtd_convolve_2d_neon_dotprod;
if (flags & HAS_NEON_I8MM) av1_dist_wtd_convolve_2d = av1_dist_wtd_convolve_2d_neon_i8mm;

View file

@ -701,6 +701,15 @@ void av1_resize_and_extend_frame_c(const YV12_BUFFER_CONFIG *src, YV12_BUFFER_CO
void av1_resize_and_extend_frame_ssse3(const YV12_BUFFER_CONFIG *src, YV12_BUFFER_CONFIG *dst, const InterpFilter filter, const int phase, const int num_planes);
RTCD_EXTERN void (*av1_resize_and_extend_frame)(const YV12_BUFFER_CONFIG *src, YV12_BUFFER_CONFIG *dst, const InterpFilter filter, const int phase, const int num_planes);
void av1_resize_horz_dir_c(const uint8_t *const input, int in_stride, uint8_t *intbuf, int height, int filteredlength, int width2);
void av1_resize_horz_dir_avx2(const uint8_t *const input, int in_stride, uint8_t *intbuf, int height, int filteredlength, int width2);
RTCD_EXTERN void (*av1_resize_horz_dir)(const uint8_t *const input, int in_stride, uint8_t *intbuf, int height, int filteredlength, int width2);
bool av1_resize_vert_dir_c(uint8_t *intbuf, uint8_t *output, int out_stride, int height, int height2, int width2, int start_col);
bool av1_resize_vert_dir_sse2(uint8_t *intbuf, uint8_t *output, int out_stride, int height, int height2, int width2, int start_col);
bool av1_resize_vert_dir_avx2(uint8_t *intbuf, uint8_t *output, int out_stride, int height, int height2, int width2, int start_col);
RTCD_EXTERN bool (*av1_resize_vert_dir)(uint8_t *intbuf, uint8_t *output, int out_stride, int height, int height2, int width2, int start_col);
void av1_round_shift_array_c(int32_t *arr, int size, int bit);
void av1_round_shift_array_sse4_1(int32_t *arr, int size, int bit);
RTCD_EXTERN void (*av1_round_shift_array)(int32_t *arr, int size, int bit);
@ -857,10 +866,6 @@ cfl_subtract_average_fn cfl_get_subtract_average_fn_sse2(TX_SIZE tx_size);
cfl_subtract_average_fn cfl_get_subtract_average_fn_avx2(TX_SIZE tx_size);
RTCD_EXTERN cfl_subtract_average_fn (*cfl_get_subtract_average_fn)(TX_SIZE tx_size);
bool resize_vert_dir_c(uint8_t *intbuf, uint8_t *output, int out_stride, int height, int height2, int width2, int start_col);
bool resize_vert_dir_avx2(uint8_t *intbuf, uint8_t *output, int out_stride, int height, int height2, int width2, int start_col);
RTCD_EXTERN bool (*resize_vert_dir)(uint8_t *intbuf, uint8_t *output, int out_stride, int height, int height2, int width2, int start_col);
void av1_rtcd(void);
#ifdef RTCD_C
@ -1090,6 +1095,10 @@ static void setup_rtcd_internal(void)
if (flags & HAS_AVX2) av1_quantize_lp = av1_quantize_lp_avx2;
av1_resize_and_extend_frame = av1_resize_and_extend_frame_c;
if (flags & HAS_SSSE3) av1_resize_and_extend_frame = av1_resize_and_extend_frame_ssse3;
av1_resize_horz_dir = av1_resize_horz_dir_c;
if (flags & HAS_AVX2) av1_resize_horz_dir = av1_resize_horz_dir_avx2;
av1_resize_vert_dir = av1_resize_vert_dir_sse2;
if (flags & HAS_AVX2) av1_resize_vert_dir = av1_resize_vert_dir_avx2;
av1_round_shift_array = av1_round_shift_array_c;
if (flags & HAS_SSE4_1) av1_round_shift_array = av1_round_shift_array_sse4_1;
av1_selfguided_restoration = av1_selfguided_restoration_c;
@ -1173,8 +1182,6 @@ static void setup_rtcd_internal(void)
if (flags & HAS_AVX2) cfl_get_predict_lbd_fn = cfl_get_predict_lbd_fn_avx2;
cfl_get_subtract_average_fn = cfl_get_subtract_average_fn_sse2;
if (flags & HAS_AVX2) cfl_get_subtract_average_fn = cfl_get_subtract_average_fn_avx2;
resize_vert_dir = resize_vert_dir_c;
if (flags & HAS_AVX2) resize_vert_dir = resize_vert_dir_avx2;
}
#endif

View file

@ -711,6 +711,15 @@ void av1_resize_and_extend_frame_c(const YV12_BUFFER_CONFIG *src, YV12_BUFFER_CO
void av1_resize_and_extend_frame_ssse3(const YV12_BUFFER_CONFIG *src, YV12_BUFFER_CONFIG *dst, const InterpFilter filter, const int phase, const int num_planes);
RTCD_EXTERN void (*av1_resize_and_extend_frame)(const YV12_BUFFER_CONFIG *src, YV12_BUFFER_CONFIG *dst, const InterpFilter filter, const int phase, const int num_planes);
void av1_resize_horz_dir_c(const uint8_t *const input, int in_stride, uint8_t *intbuf, int height, int filteredlength, int width2);
void av1_resize_horz_dir_avx2(const uint8_t *const input, int in_stride, uint8_t *intbuf, int height, int filteredlength, int width2);
RTCD_EXTERN void (*av1_resize_horz_dir)(const uint8_t *const input, int in_stride, uint8_t *intbuf, int height, int filteredlength, int width2);
bool av1_resize_vert_dir_c(uint8_t *intbuf, uint8_t *output, int out_stride, int height, int height2, int width2, int start_col);
bool av1_resize_vert_dir_sse2(uint8_t *intbuf, uint8_t *output, int out_stride, int height, int height2, int width2, int start_col);
bool av1_resize_vert_dir_avx2(uint8_t *intbuf, uint8_t *output, int out_stride, int height, int height2, int width2, int start_col);
RTCD_EXTERN bool (*av1_resize_vert_dir)(uint8_t *intbuf, uint8_t *output, int out_stride, int height, int height2, int width2, int start_col);
void av1_round_shift_array_c(int32_t *arr, int size, int bit);
void av1_round_shift_array_sse4_1(int32_t *arr, int size, int bit);
RTCD_EXTERN void (*av1_round_shift_array)(int32_t *arr, int size, int bit);
@ -879,10 +888,6 @@ cfl_subtract_average_fn cfl_get_subtract_average_fn_sse2(TX_SIZE tx_size);
cfl_subtract_average_fn cfl_get_subtract_average_fn_avx2(TX_SIZE tx_size);
RTCD_EXTERN cfl_subtract_average_fn (*cfl_get_subtract_average_fn)(TX_SIZE tx_size);
bool resize_vert_dir_c(uint8_t *intbuf, uint8_t *output, int out_stride, int height, int height2, int width2, int start_col);
bool resize_vert_dir_avx2(uint8_t *intbuf, uint8_t *output, int out_stride, int height, int height2, int width2, int start_col);
RTCD_EXTERN bool (*resize_vert_dir)(uint8_t *intbuf, uint8_t *output, int out_stride, int height, int height2, int width2, int start_col);
void av1_rtcd(void);
#ifdef RTCD_C
@ -1140,6 +1145,11 @@ static void setup_rtcd_internal(void)
if (flags & HAS_AVX2) av1_quantize_lp = av1_quantize_lp_avx2;
av1_resize_and_extend_frame = av1_resize_and_extend_frame_c;
if (flags & HAS_SSSE3) av1_resize_and_extend_frame = av1_resize_and_extend_frame_ssse3;
av1_resize_horz_dir = av1_resize_horz_dir_c;
if (flags & HAS_AVX2) av1_resize_horz_dir = av1_resize_horz_dir_avx2;
av1_resize_vert_dir = av1_resize_vert_dir_c;
if (flags & HAS_SSE2) av1_resize_vert_dir = av1_resize_vert_dir_sse2;
if (flags & HAS_AVX2) av1_resize_vert_dir = av1_resize_vert_dir_avx2;
av1_round_shift_array = av1_round_shift_array_c;
if (flags & HAS_SSE4_1) av1_round_shift_array = av1_round_shift_array_sse4_1;
av1_selfguided_restoration = av1_selfguided_restoration_c;
@ -1240,8 +1250,6 @@ static void setup_rtcd_internal(void)
cfl_get_subtract_average_fn = cfl_get_subtract_average_fn_c;
if (flags & HAS_SSE2) cfl_get_subtract_average_fn = cfl_get_subtract_average_fn_sse2;
if (flags & HAS_AVX2) cfl_get_subtract_average_fn = cfl_get_subtract_average_fn_avx2;
resize_vert_dir = resize_vert_dir_c;
if (flags & HAS_AVX2) resize_vert_dir = resize_vert_dir_avx2;
}
#endif

View file

@ -701,6 +701,15 @@ void av1_resize_and_extend_frame_c(const YV12_BUFFER_CONFIG *src, YV12_BUFFER_CO
void av1_resize_and_extend_frame_ssse3(const YV12_BUFFER_CONFIG *src, YV12_BUFFER_CONFIG *dst, const InterpFilter filter, const int phase, const int num_planes);
RTCD_EXTERN void (*av1_resize_and_extend_frame)(const YV12_BUFFER_CONFIG *src, YV12_BUFFER_CONFIG *dst, const InterpFilter filter, const int phase, const int num_planes);
void av1_resize_horz_dir_c(const uint8_t *const input, int in_stride, uint8_t *intbuf, int height, int filteredlength, int width2);
void av1_resize_horz_dir_avx2(const uint8_t *const input, int in_stride, uint8_t *intbuf, int height, int filteredlength, int width2);
RTCD_EXTERN void (*av1_resize_horz_dir)(const uint8_t *const input, int in_stride, uint8_t *intbuf, int height, int filteredlength, int width2);
bool av1_resize_vert_dir_c(uint8_t *intbuf, uint8_t *output, int out_stride, int height, int height2, int width2, int start_col);
bool av1_resize_vert_dir_sse2(uint8_t *intbuf, uint8_t *output, int out_stride, int height, int height2, int width2, int start_col);
bool av1_resize_vert_dir_avx2(uint8_t *intbuf, uint8_t *output, int out_stride, int height, int height2, int width2, int start_col);
RTCD_EXTERN bool (*av1_resize_vert_dir)(uint8_t *intbuf, uint8_t *output, int out_stride, int height, int height2, int width2, int start_col);
void av1_round_shift_array_c(int32_t *arr, int size, int bit);
void av1_round_shift_array_sse4_1(int32_t *arr, int size, int bit);
RTCD_EXTERN void (*av1_round_shift_array)(int32_t *arr, int size, int bit);
@ -857,10 +866,6 @@ cfl_subtract_average_fn cfl_get_subtract_average_fn_sse2(TX_SIZE tx_size);
cfl_subtract_average_fn cfl_get_subtract_average_fn_avx2(TX_SIZE tx_size);
RTCD_EXTERN cfl_subtract_average_fn (*cfl_get_subtract_average_fn)(TX_SIZE tx_size);
bool resize_vert_dir_c(uint8_t *intbuf, uint8_t *output, int out_stride, int height, int height2, int width2, int start_col);
bool resize_vert_dir_avx2(uint8_t *intbuf, uint8_t *output, int out_stride, int height, int height2, int width2, int start_col);
RTCD_EXTERN bool (*resize_vert_dir)(uint8_t *intbuf, uint8_t *output, int out_stride, int height, int height2, int width2, int start_col);
void av1_rtcd(void);
#ifdef RTCD_C
@ -1090,6 +1095,10 @@ static void setup_rtcd_internal(void)
if (flags & HAS_AVX2) av1_quantize_lp = av1_quantize_lp_avx2;
av1_resize_and_extend_frame = av1_resize_and_extend_frame_c;
if (flags & HAS_SSSE3) av1_resize_and_extend_frame = av1_resize_and_extend_frame_ssse3;
av1_resize_horz_dir = av1_resize_horz_dir_c;
if (flags & HAS_AVX2) av1_resize_horz_dir = av1_resize_horz_dir_avx2;
av1_resize_vert_dir = av1_resize_vert_dir_sse2;
if (flags & HAS_AVX2) av1_resize_vert_dir = av1_resize_vert_dir_avx2;
av1_round_shift_array = av1_round_shift_array_c;
if (flags & HAS_SSE4_1) av1_round_shift_array = av1_round_shift_array_sse4_1;
av1_selfguided_restoration = av1_selfguided_restoration_c;
@ -1173,8 +1182,6 @@ static void setup_rtcd_internal(void)
if (flags & HAS_AVX2) cfl_get_predict_lbd_fn = cfl_get_predict_lbd_fn_avx2;
cfl_get_subtract_average_fn = cfl_get_subtract_average_fn_sse2;
if (flags & HAS_AVX2) cfl_get_subtract_average_fn = cfl_get_subtract_average_fn_avx2;
resize_vert_dir = resize_vert_dir_c;
if (flags & HAS_AVX2) resize_vert_dir = resize_vert_dir_avx2;
}
#endif

View file

@ -20,11 +20,11 @@ origin:
# Human-readable identifier for this version/release
# Generally "version NNN", "tag SSS", "bookmark SSS"
release: 23c94347d84241c322f3b40daf120047ff4f8d56 (Wed Apr 17 11:05:14 2024 +0000).
release: a7ef80c44bfb34b08254194b1ab72d4e93ff4b07 (Wed May 29 23:21:38 2024 +0000).
# Revision to pull in
# Must be a long or short commit SHA (long preferred)
revision: 23c94347d84241c322f3b40daf120047ff4f8d56
revision: a7ef80c44bfb34b08254194b1ab72d4e93ff4b07
# The package's license, where possible using the mnemonic from
# https://spdx.org/licenses/

View file

@ -117,6 +117,7 @@ files = {
'../../third_party/aom/av1/av1_cx_iface.c',
'../../third_party/aom/av1/av1_dx_iface.c',
'../../third_party/aom/av1/common/alloccommon.c',
'../../third_party/aom/av1/common/arm/av1_convolve_scale_neon.c',
'../../third_party/aom/av1/common/arm/av1_inv_txfm_neon.c',
'../../third_party/aom/av1/common/arm/av1_txfm_neon.c',
'../../third_party/aom/av1/common/arm/blend_a64_hmask_neon.c',
@ -184,24 +185,24 @@ files = {
'../../third_party/aom/av1/encoder/aq_complexity.c',
'../../third_party/aom/av1/encoder/aq_cyclicrefresh.c',
'../../third_party/aom/av1/encoder/aq_variance.c',
'../../third_party/aom/av1/encoder/arm/neon/av1_error_neon.c',
'../../third_party/aom/av1/encoder/arm/neon/av1_fwd_txfm2d_neon.c',
'../../third_party/aom/av1/encoder/arm/neon/av1_highbd_quantize_neon.c',
'../../third_party/aom/av1/encoder/arm/neon/av1_k_means_neon.c',
'../../third_party/aom/av1/encoder/arm/neon/cnn_neon.c',
'../../third_party/aom/av1/encoder/arm/neon/encodetxb_neon.c',
'../../third_party/aom/av1/encoder/arm/neon/highbd_fwd_txfm_neon.c',
'../../third_party/aom/av1/encoder/arm/neon/highbd_pickrst_neon.c',
'../../third_party/aom/av1/encoder/arm/neon/highbd_rdopt_neon.c',
'../../third_party/aom/av1/encoder/arm/neon/highbd_temporal_filter_neon.c',
'../../third_party/aom/av1/encoder/arm/neon/hybrid_fwd_txfm_neon.c',
'../../third_party/aom/av1/encoder/arm/neon/ml_neon.c',
'../../third_party/aom/av1/encoder/arm/neon/pickrst_neon.c',
'../../third_party/aom/av1/encoder/arm/neon/quantize_neon.c',
'../../third_party/aom/av1/encoder/arm/neon/rdopt_neon.c',
'../../third_party/aom/av1/encoder/arm/neon/reconinter_enc_neon.c',
'../../third_party/aom/av1/encoder/arm/neon/temporal_filter_neon.c',
'../../third_party/aom/av1/encoder/arm/neon/wedge_utils_neon.c',
'../../third_party/aom/av1/encoder/arm/av1_error_neon.c',
'../../third_party/aom/av1/encoder/arm/av1_fwd_txfm2d_neon.c',
'../../third_party/aom/av1/encoder/arm/av1_highbd_quantize_neon.c',
'../../third_party/aom/av1/encoder/arm/av1_k_means_neon.c',
'../../third_party/aom/av1/encoder/arm/cnn_neon.c',
'../../third_party/aom/av1/encoder/arm/encodetxb_neon.c',
'../../third_party/aom/av1/encoder/arm/highbd_fwd_txfm_neon.c',
'../../third_party/aom/av1/encoder/arm/highbd_pickrst_neon.c',
'../../third_party/aom/av1/encoder/arm/highbd_rdopt_neon.c',
'../../third_party/aom/av1/encoder/arm/highbd_temporal_filter_neon.c',
'../../third_party/aom/av1/encoder/arm/hybrid_fwd_txfm_neon.c',
'../../third_party/aom/av1/encoder/arm/ml_neon.c',
'../../third_party/aom/av1/encoder/arm/pickrst_neon.c',
'../../third_party/aom/av1/encoder/arm/quantize_neon.c',
'../../third_party/aom/av1/encoder/arm/rdopt_neon.c',
'../../third_party/aom/av1/encoder/arm/reconinter_enc_neon.c',
'../../third_party/aom/av1/encoder/arm/temporal_filter_neon.c',
'../../third_party/aom/av1/encoder/arm/wedge_utils_neon.c',
'../../third_party/aom/av1/encoder/av1_fwd_txfm1d.c',
'../../third_party/aom/av1/encoder/av1_fwd_txfm2d.c',
'../../third_party/aom/av1/encoder/av1_noise_estimate.c',
@ -394,6 +395,7 @@ files = {
'../../third_party/aom/av1/av1_cx_iface.c',
'../../third_party/aom/av1/av1_dx_iface.c',
'../../third_party/aom/av1/common/alloccommon.c',
'../../third_party/aom/av1/common/arm/av1_convolve_scale_neon.c',
'../../third_party/aom/av1/common/arm/av1_inv_txfm_neon.c',
'../../third_party/aom/av1/common/arm/av1_txfm_neon.c',
'../../third_party/aom/av1/common/arm/blend_a64_hmask_neon.c',
@ -466,26 +468,26 @@ files = {
'../../third_party/aom/av1/encoder/aq_complexity.c',
'../../third_party/aom/av1/encoder/aq_cyclicrefresh.c',
'../../third_party/aom/av1/encoder/aq_variance.c',
'../../third_party/aom/av1/encoder/arm/crc32/hash_arm_crc32.c',
'../../third_party/aom/av1/encoder/arm/neon/av1_error_neon.c',
'../../third_party/aom/av1/encoder/arm/neon/av1_fwd_txfm2d_neon.c',
'../../third_party/aom/av1/encoder/arm/neon/av1_highbd_quantize_neon.c',
'../../third_party/aom/av1/encoder/arm/neon/av1_k_means_neon.c',
'../../third_party/aom/av1/encoder/arm/neon/cnn_neon.c',
'../../third_party/aom/av1/encoder/arm/neon/encodetxb_neon.c',
'../../third_party/aom/av1/encoder/arm/neon/highbd_fwd_txfm_neon.c',
'../../third_party/aom/av1/encoder/arm/neon/highbd_pickrst_neon.c',
'../../third_party/aom/av1/encoder/arm/neon/highbd_rdopt_neon.c',
'../../third_party/aom/av1/encoder/arm/neon/highbd_temporal_filter_neon.c',
'../../third_party/aom/av1/encoder/arm/neon/hybrid_fwd_txfm_neon.c',
'../../third_party/aom/av1/encoder/arm/neon/ml_neon.c',
'../../third_party/aom/av1/encoder/arm/neon/pickrst_neon.c',
'../../third_party/aom/av1/encoder/arm/neon/quantize_neon.c',
'../../third_party/aom/av1/encoder/arm/neon/rdopt_neon.c',
'../../third_party/aom/av1/encoder/arm/neon/reconinter_enc_neon.c',
'../../third_party/aom/av1/encoder/arm/neon/temporal_filter_neon.c',
'../../third_party/aom/av1/encoder/arm/neon/temporal_filter_neon_dotprod.c',
'../../third_party/aom/av1/encoder/arm/neon/wedge_utils_neon.c',
'../../third_party/aom/av1/encoder/arm/av1_error_neon.c',
'../../third_party/aom/av1/encoder/arm/av1_fwd_txfm2d_neon.c',
'../../third_party/aom/av1/encoder/arm/av1_highbd_quantize_neon.c',
'../../third_party/aom/av1/encoder/arm/av1_k_means_neon.c',
'../../third_party/aom/av1/encoder/arm/cnn_neon.c',
'../../third_party/aom/av1/encoder/arm/encodetxb_neon.c',
'../../third_party/aom/av1/encoder/arm/hash_arm_crc32.c',
'../../third_party/aom/av1/encoder/arm/highbd_fwd_txfm_neon.c',
'../../third_party/aom/av1/encoder/arm/highbd_pickrst_neon.c',
'../../third_party/aom/av1/encoder/arm/highbd_rdopt_neon.c',
'../../third_party/aom/av1/encoder/arm/highbd_temporal_filter_neon.c',
'../../third_party/aom/av1/encoder/arm/hybrid_fwd_txfm_neon.c',
'../../third_party/aom/av1/encoder/arm/ml_neon.c',
'../../third_party/aom/av1/encoder/arm/pickrst_neon.c',
'../../third_party/aom/av1/encoder/arm/quantize_neon.c',
'../../third_party/aom/av1/encoder/arm/rdopt_neon.c',
'../../third_party/aom/av1/encoder/arm/reconinter_enc_neon.c',
'../../third_party/aom/av1/encoder/arm/temporal_filter_neon.c',
'../../third_party/aom/av1/encoder/arm/temporal_filter_neon_dotprod.c',
'../../third_party/aom/av1/encoder/arm/wedge_utils_neon.c',
'../../third_party/aom/av1/encoder/av1_fwd_txfm1d.c',
'../../third_party/aom/av1/encoder/av1_fwd_txfm2d.c',
'../../third_party/aom/av1/encoder/av1_noise_estimate.c',
@ -811,7 +813,6 @@ files = {
'../../third_party/aom/aom_dsp/variance.c',
'../../third_party/aom/aom_dsp/x86/adaptive_quantize_avx2.c',
'../../third_party/aom/aom_dsp/x86/adaptive_quantize_sse2.c',
'../../third_party/aom/aom_dsp/x86/aom_asm_stubs.c',
'../../third_party/aom/aom_dsp/x86/aom_convolve_copy_avx2.c',
'../../third_party/aom/aom_dsp/x86/aom_convolve_copy_sse2.c',
'../../third_party/aom/aom_dsp/x86/aom_high_subpixel_8t_sse2.asm',
@ -969,6 +970,7 @@ files = {
'../../third_party/aom/av1/common/x86/reconinter_sse4.c',
'../../third_party/aom/av1/common/x86/reconinter_ssse3.c',
'../../third_party/aom/av1/common/x86/resize_avx2.c',
'../../third_party/aom/av1/common/x86/resize_sse2.c',
'../../third_party/aom/av1/common/x86/resize_ssse3.c',
'../../third_party/aom/av1/common/x86/selfguided_avx2.c',
'../../third_party/aom/av1/common/x86/selfguided_sse4.c',
@ -1162,7 +1164,6 @@ files = {
'../../third_party/aom/aom_dsp/variance.c',
'../../third_party/aom/aom_dsp/x86/adaptive_quantize_avx2.c',
'../../third_party/aom/aom_dsp/x86/adaptive_quantize_sse2.c',
'../../third_party/aom/aom_dsp/x86/aom_asm_stubs.c',
'../../third_party/aom/aom_dsp/x86/aom_convolve_copy_avx2.c',
'../../third_party/aom/aom_dsp/x86/aom_convolve_copy_sse2.c',
'../../third_party/aom/aom_dsp/x86/aom_high_subpixel_8t_sse2.asm',
@ -1322,6 +1323,7 @@ files = {
'../../third_party/aom/av1/common/x86/reconinter_sse4.c',
'../../third_party/aom/av1/common/x86/reconinter_ssse3.c',
'../../third_party/aom/av1/common/x86/resize_avx2.c',
'../../third_party/aom/av1/common/x86/resize_sse2.c',
'../../third_party/aom/av1/common/x86/resize_ssse3.c',
'../../third_party/aom/av1/common/x86/selfguided_avx2.c',
'../../third_party/aom/av1/common/x86/selfguided_sse4.c',

1
third_party/aom/.mailmap vendored
View file

@ -40,6 +40,7 @@ Iole Moccagatta <iole.moccagatta@gmail.com>
Jacky Chen <jackychen@google.com>
James Zern <jzern@google.com> <jzern@google.cOm>
Jean-Marc Valin <jmvalin@jmvalin.ca> <jmvalin@mozilla.com>
Jian Zhou <zhoujian@fb.com> <zhoujian@google.com>
Jim Bankoski <jimbankoski@google.com>
Johann Koenig <johannkoenig@google.com>
Johann Koenig <johannkoenig@google.com> <johann.koenig@duck.com>

5
third_party/aom/AUTHORS vendored
View file

@ -51,6 +51,7 @@ Cyril Concolato <cconcolato@netflix.com>
Dake He <dkhe@google.com>
Damon Shen <yjshen@google.com>
Dandan Ding <vickyddding@gmail.com>
Daniel Cheng <dcheng@chromium.org>
Daniele Castagna <dcastagna@chromium.org>
Daniel Kang <ddkang@google.com>
Daniel Max Valenzuela <daniel.vt@samsung.com>
@ -94,6 +95,7 @@ Guillermo Ballester Valor <gbvalor@gmail.com>
Hamsalekha S <hamsalekha.s@ittiam.com>
Hangyu Kuang <hkuang@google.com>
Hanno Böck <hanno@hboeck.de>
Hari Limaye <hari.limaye@arm.com>
Harish Mahendrakar <harish.mahendrakar@ittiam.com>
Henrik Lundin <hlundin@google.com>
Hien Ho <hienho@google.com>
@ -124,7 +126,7 @@ Jeff Muizelaar <jmuizelaar@mozilla.com>
Jeff Petkau <jpet@chromium.org>
Jerome Jiang <jianj@google.com>
Jia Jia <jia.jia@linaro.org>
Jian Zhou <zhoujian@google.com>
Jian Zhou <zhoujian@fb.com>
Jim Bankoski <jimbankoski@google.com>
Jingning Han <jingning@google.com>
Joe Young <joeyoung@google.com>
@ -216,6 +218,7 @@ Peter Boström <pbos@google.com>
Peter de Rivaz <peter.derivaz@gmail.com>
Peter Kasting <pkasting@chromium.org>
Philip Jägenstedt <philipj@opera.com>
Philippe Antoine <p.antoine@catenacyber.fr>
Priit Laes <plaes@plaes.org>
Qiu Jianlin <jianlin.qiu@intel.com>
Rachel Barker <rachelbarker@google.com>

88
third_party/aom/CHANGELOG vendored
View file

@ -1,3 +1,91 @@
2024-04-09 v3.9.0
This release includes new codec interfaces, compression efficiency and
perceptual improvements, speedup for RTC for both video and screen content,
and many bug fixes. This release is ABI compatible with the previous release.
- New Features
* New codec control
* AV1E_SET_SVC_FRAME_DROP_MODE is added to configure the SVC encoder to
only drop spatial layers or the whole superframe.
* Active Map is fixed and tested for RTC.
* CONFIG_QUANT_MATRIX is added to disable quantization matrices when aom
decoder is disabled with CONFIG_AV1_DECODER. Reduces ~10% binary size when
both are disabled.
* libwebm is updated to libwebm-1.0.0.31-1-gaffd7f4.
- Compression Efficiency Improvements
* RTC encoding improvements
* 1-2% BD-rate gain for screen content with temporal layers; 5% BD-rate
gain on scrolling content.
- Perceptual Quality Improvements
* For RTC screen content
* Reduced color artifacts for RTC screen content
* Visual quality improved for scene changes for SVC with quality layers.
* Removed visual artifacts for speed 11
- Speedups:
* RTC Speed 11: aggressive speedup setting added for video mode,
resolutions <= VGA: ~30% faster than speed 10.
* 5-9% speed up for high bit-depth encoding with good mode on Arm, half of
which comes from SVE/SVE2 optimizations.
- Other improvements
* Further improvements to global motion estimation.
* Documented minimum required SIMD support: SSE4.1 on x86, Neon on Arm.
* Remove unneeded SIMD functions, saving >100 KiB from binary size.
* Cleaned up and improved pattern_search.
* Added end-to-end c vs SIMD bit-exactness test.
* Added config flag to calc psnr using libvmaf peak: use a slightly
different peak value for PSNR (1020 and 2040 for 10- and 12-bit)
- Bug Fixes
* Fuzzing bug fixes
* b/329485898 Null-dereference WRITE in av1_cdef_frame_mt
* b/329810149 Null-dereference WRITE in av1_cdef_copy_sb8_16
* b/329813868 Ill in av1_cdef_frame_mt
* chromium:327882824 Null-dereference WRITE in av1_cdef_init_fb_row
* b/330014723 Null-dereference WRITE in
cdef_copy_rect8_16bit_to_16bit_avx2
* b/310455204 Null-dereference WRITE in prepare_enc_workers
* b/314858909 Heap-buffer-overflow in aom_variance64x64_avx2
* oss-fuzz:67132 av1_dec_fuzzer: ASSERT: (pbi->tile_count_minus_1 + 1) <=
(pbi->output_frame_width_in_tiles_minus_1 + 1)
* oss-fuzz:67058 av1_dec_fuzzer: ASSERT: i == 0 || tile_w == *w
* oss-fuzz:67161 av1_dec_fuzzer: ASSERT: i == 0 || tile_h == *h
* oss-fuzz:67059 av1_dec_fuzzer: Crash in mem_get_varsize
* oss-fuzz:67162 av1_dec_fuzzer: Use-of-uninitialized-value in
od_ec_decode_bool_q15
* oss-fuzz:67184 av1_dec_fuzzer: Heap-buffer-overflow in od_ec_dec_init
* oss-fuzz:67216 av1_dec_fuzzer: Heap-buffer-overflow in
od_ec_dec_normalize
* oss-fuzz:67055 av1_dec_fuzzer: Heap-buffer-overflow in
get_ls_tile_buffers
* libaom library
* aomedia:3510 Large value of duration could cause encoder overflow
* chromium:328105513 Fix build conflicts between Abseil and libaom/libvpx
in Win ARM64 builds
* aomedia:3544 AV1/SharpnessTestLarge.SharpnessPSNRTest failures after
59c592bb8
* aomedia:3531 Exception encountered with PSNR calculation
* aomedia:3541 Can not compile correctly by CYGWIN
* chromium:41482688 heap-buffer-overflow write in vpx_img_read()
(tools_common.c) with VPX_IMG_FMT_NV12
* aomedia:3521 Assertion failures on Arm in CNNTest.* in
av1_cnn_convolve_no_maxpool_padding_valid_2x2_neon and
av1_cnn_convolve_no_maxpool_padding_valid_5x5_neon
* aomedia:3486 C vs NEON mismatch in AV1 encoder
* aomedia:3536 Over write in highbd_dr_prediction_z3_upsample1_neon()
* aomedia:3276 Significant progress on ensuring all allocations are
checked
* aomedia:3491 heap-buffer-overflow encoding frames of size 256x256,
512x512 in good quality usage mode using 4 threads
* aomedia:3322 PSNR number discrepancy
* aomedia:3493 Cmake generates garbage symbols for libaom_srcs.gni
* aomedia:3478 GCC 12.2.0 emits a -Wstringop-overflow warning on
aom/av1/encoder/motion_search_facade.c
* aomedia:3484 C vs NEON mismatch in AV1 encoder for high-bitdepth case
2024-03-08 v3.8.2
This release includes several bug fixes. This release is ABI
compatible with the last release. See

6
third_party/aom/CMakeLists.txt vendored
View file

@ -58,9 +58,9 @@ endif()
# passed to libtool.
#
# We set SO_FILE_VERSION = [c-a].a.r
set(LT_CURRENT 11)
set(LT_REVISION 2)
set(LT_AGE 8)
set(LT_CURRENT 12)
set(LT_REVISION 0)
set(LT_AGE 9)
math(EXPR SO_VERSION "${LT_CURRENT} - ${LT_AGE}")
set(SO_FILE_VERSION "${SO_VERSION}.${LT_AGE}.${LT_REVISION}")
unset(LT_CURRENT)
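
With the bumped values, SO_VERSION evaluates to 12 - 9 = 3, so SO_FILE_VERSION for this release works out to 3.9.0 (the previous 11/2/8 values gave 3.8.2), matching the v3.9.0 entry added to the CHANGELOG above.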

1
third_party/aom/aom/aom_encoder.h vendored
View file

@ -637,6 +637,7 @@ typedef struct aom_codec_enc_cfg {
/*!\brief Target data rate
*
* Target bitrate to use for this stream, in kilobits per second.
* Max allowed value is 2000000
*/
unsigned int rc_target_bitrate;
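
For context, rc_target_bitrate is filled in on the encoder configuration before the codec is initialized. A minimal sketch using the standard libaom encoder API (the dimensions and bitrate are illustrative):

```
#include <aom/aom_encoder.h>
#include <aom/aomcx.h>

/* Sketch: request a target bitrate (in kilobits per second) for an AV1 encode. */
static int init_av1_encoder(aom_codec_ctx_t *ctx, unsigned int w, unsigned int h) {
  aom_codec_enc_cfg_t cfg;
  if (aom_codec_enc_config_default(aom_codec_av1_cx(), &cfg, AOM_USAGE_GOOD_QUALITY))
    return -1;
  cfg.g_w = w;
  cfg.g_h = h;
  cfg.rc_target_bitrate = 1000;  /* 1000 kbps; must stay within the documented maximum */
  return aom_codec_enc_init(ctx, aom_codec_av1_cx(), &cfg, 0) ? -1 : 0;
}
```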

4
third_party/aom/aom/src/aom_image.c vendored
View file

@ -182,7 +182,9 @@ static aom_image_t *img_alloc_helper(
/* Default viewport to entire image. (This aom_img_set_rect call always
* succeeds.) */
aom_img_set_rect(img, 0, 0, d_w, d_h, border);
int ret = aom_img_set_rect(img, 0, 0, d_w, d_h, border);
assert(ret == 0);
(void)ret;
return img;
fail:
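
The same return-value check matters for external callers; a minimal caller-side sketch against the public aom_image.h API (dimensions are illustrative):

```
#include <aom/aom_image.h>
#include <stdio.h>

int main(void) {
  aom_image_t img;
  if (!aom_img_alloc(&img, AOM_IMG_FMT_I420, 640, 480, 16)) return 1;
  /* Restrict the viewport to a 320x240 window at (8, 8). A nonzero return
   * means the rectangle does not fit inside the allocated image. */
  if (aom_img_set_rect(&img, 8, 8, 320, 240, 0)) {
    fprintf(stderr, "aom_img_set_rect failed\n");
  }
  aom_img_free(&img);
  return 0;
}
```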

1
third_party/aom/aom_dsp/aom_dsp.cmake vendored
View file

@ -58,7 +58,6 @@ list(APPEND AOM_DSP_COMMON_ASM_SSE2
list(APPEND AOM_DSP_COMMON_INTRIN_SSE2
"${AOM_ROOT}/aom_dsp/x86/aom_convolve_copy_sse2.c"
"${AOM_ROOT}/aom_dsp/x86/aom_asm_stubs.c"
"${AOM_ROOT}/aom_dsp/x86/convolve.h"
"${AOM_ROOT}/aom_dsp/x86/convolve_sse2.h"
"${AOM_ROOT}/aom_dsp/x86/fft_sse2.c"

View file

@ -20,6 +20,7 @@
#include "aom/aom_integer.h"
#include "aom_dsp/aom_dsp_common.h"
#include "aom_dsp/aom_filter.h"
#include "aom_dsp/arm/aom_convolve8_neon.h"
#include "aom_dsp/arm/aom_filter.h"
#include "aom_dsp/arm/mem_neon.h"
#include "aom_dsp/arm/transpose_neon.h"
@ -231,29 +232,6 @@ static INLINE void convolve8_horiz_8tap_neon(const uint8_t *src,
}
}
static INLINE int16x4_t convolve4_4(const int16x4_t s0, const int16x4_t s1,
const int16x4_t s2, const int16x4_t s3,
const int16x4_t filter) {
int16x4_t sum = vmul_lane_s16(s0, filter, 0);
sum = vmla_lane_s16(sum, s1, filter, 1);
sum = vmla_lane_s16(sum, s2, filter, 2);
sum = vmla_lane_s16(sum, s3, filter, 3);
return sum;
}
static INLINE uint8x8_t convolve4_8(const int16x8_t s0, const int16x8_t s1,
const int16x8_t s2, const int16x8_t s3,
const int16x4_t filter) {
int16x8_t sum = vmulq_lane_s16(s0, filter, 0);
sum = vmlaq_lane_s16(sum, s1, filter, 1);
sum = vmlaq_lane_s16(sum, s2, filter, 2);
sum = vmlaq_lane_s16(sum, s3, filter, 3);
// We halved the filter values so -1 from right shift.
return vqrshrun_n_s16(sum, FILTER_BITS - 1);
}
static INLINE void convolve8_horiz_4tap_neon(const uint8_t *src,
ptrdiff_t src_stride, uint8_t *dst,
ptrdiff_t dst_stride,
@ -265,26 +243,20 @@ static INLINE void convolve8_horiz_4tap_neon(const uint8_t *src,
if (w == 4) {
do {
int16x8_t t0 =
vreinterpretq_s16_u16(vmovl_u8(vld1_u8(src + 0 * src_stride)));
int16x8_t t1 =
vreinterpretq_s16_u16(vmovl_u8(vld1_u8(src + 1 * src_stride)));
uint8x8_t t01[4];
int16x4_t s0[4], s1[4];
s0[0] = vget_low_s16(t0);
s0[1] = vget_low_s16(vextq_s16(t0, t0, 1));
s0[2] = vget_low_s16(vextq_s16(t0, t0, 2));
s0[3] = vget_low_s16(vextq_s16(t0, t0, 3));
t01[0] = load_unaligned_u8(src + 0, (int)src_stride);
t01[1] = load_unaligned_u8(src + 1, (int)src_stride);
t01[2] = load_unaligned_u8(src + 2, (int)src_stride);
t01[3] = load_unaligned_u8(src + 3, (int)src_stride);
s1[0] = vget_low_s16(t1);
s1[1] = vget_low_s16(vextq_s16(t1, t1, 1));
s1[2] = vget_low_s16(vextq_s16(t1, t1, 2));
s1[3] = vget_low_s16(vextq_s16(t1, t1, 3));
int16x8_t s01[4];
s01[0] = vreinterpretq_s16_u16(vmovl_u8(t01[0]));
s01[1] = vreinterpretq_s16_u16(vmovl_u8(t01[1]));
s01[2] = vreinterpretq_s16_u16(vmovl_u8(t01[2]));
s01[3] = vreinterpretq_s16_u16(vmovl_u8(t01[3]));
int16x4_t d0 = convolve4_4(s0[0], s0[1], s0[2], s0[3], filter);
int16x4_t d1 = convolve4_4(s1[0], s1[1], s1[2], s1[3], filter);
// We halved the filter values so -1 from right shift.
uint8x8_t d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS - 1);
uint8x8_t d01 = convolve4_8(s01[0], s01[1], s01[2], s01[3], filter);
store_u8x4_strided_x2(dst + 0 * dst_stride, dst_stride, d01);
@ -298,37 +270,27 @@ static INLINE void convolve8_horiz_4tap_neon(const uint8_t *src,
const uint8_t *s = src;
uint8_t *d = dst;
int16x8_t t0 =
vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s + 0 * src_stride)));
int16x8_t t1 =
vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s + 1 * src_stride)));
s += 8;
do {
int16x8_t t2 =
vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s + 0 * src_stride)));
int16x8_t t3 =
vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s + 1 * src_stride)));
uint8x8_t t0[4], t1[4];
load_u8_8x4(s + 0 * src_stride, 1, &t0[0], &t0[1], &t0[2], &t0[3]);
load_u8_8x4(s + 1 * src_stride, 1, &t1[0], &t1[1], &t1[2], &t1[3]);
int16x8_t s0[4], s1[4];
s0[0] = t0;
s0[1] = vextq_s16(t0, t2, 1);
s0[2] = vextq_s16(t0, t2, 2);
s0[3] = vextq_s16(t0, t2, 3);
s0[0] = vreinterpretq_s16_u16(vmovl_u8(t0[0]));
s0[1] = vreinterpretq_s16_u16(vmovl_u8(t0[1]));
s0[2] = vreinterpretq_s16_u16(vmovl_u8(t0[2]));
s0[3] = vreinterpretq_s16_u16(vmovl_u8(t0[3]));
s1[0] = t1;
s1[1] = vextq_s16(t1, t3, 1);
s1[2] = vextq_s16(t1, t3, 2);
s1[3] = vextq_s16(t1, t3, 3);
s1[0] = vreinterpretq_s16_u16(vmovl_u8(t1[0]));
s1[1] = vreinterpretq_s16_u16(vmovl_u8(t1[1]));
s1[2] = vreinterpretq_s16_u16(vmovl_u8(t1[2]));
s1[3] = vreinterpretq_s16_u16(vmovl_u8(t1[3]));
uint8x8_t d0 = convolve4_8(s0[0], s0[1], s0[2], s0[3], filter);
uint8x8_t d1 = convolve4_8(s1[0], s1[1], s1[2], s1[3], filter);
store_u8_8x2(d, dst_stride, d0, d1);
t0 = t2;
t1 = t3;
s += 8;
d += 8;
width -= 8;
@ -354,7 +316,12 @@ void aom_convolve8_horiz_neon(const uint8_t *src, ptrdiff_t src_stride,
src -= ((SUBPEL_TAPS / 2) - 1);
if (get_filter_taps_convolve8(filter_x) <= 4) {
int filter_taps = get_filter_taps_convolve8(filter_x);
if (filter_taps == 2) {
convolve8_horiz_2tap_neon(src + 3, src_stride, dst, dst_stride, filter_x, w,
h);
} else if (filter_taps == 4) {
convolve8_horiz_4tap_neon(src + 2, src_stride, dst, dst_stride, filter_x, w,
h);
} else {
@ -362,22 +329,13 @@ void aom_convolve8_horiz_neon(const uint8_t *src, ptrdiff_t src_stride,
}
}
void aom_convolve8_vert_neon(const uint8_t *src, ptrdiff_t src_stride,
uint8_t *dst, ptrdiff_t dst_stride,
const int16_t *filter_x, int x_step_q4,
const int16_t *filter_y, int y_step_q4, int w,
int h) {
static INLINE void convolve8_vert_8tap_neon(const uint8_t *src,
ptrdiff_t src_stride, uint8_t *dst,
ptrdiff_t dst_stride,
const int16_t *filter_y, int w,
int h) {
const int16x8_t filter = vld1q_s16(filter_y);
assert((intptr_t)dst % 4 == 0);
assert(dst_stride % 4 == 0);
(void)filter_x;
(void)x_step_q4;
(void)y_step_q4;
src -= ((SUBPEL_TAPS / 2) - 1) * src_stride;
if (w == 4) {
uint8x8_t t0, t1, t2, t3, t4, t5, t6;
load_u8_8x7(src, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6);
@ -472,3 +430,30 @@ void aom_convolve8_vert_neon(const uint8_t *src, ptrdiff_t src_stride,
} while (w != 0);
}
}
void aom_convolve8_vert_neon(const uint8_t *src, ptrdiff_t src_stride,
uint8_t *dst, ptrdiff_t dst_stride,
const int16_t *filter_x, int x_step_q4,
const int16_t *filter_y, int y_step_q4, int w,
int h) {
assert((intptr_t)dst % 4 == 0);
assert(dst_stride % 4 == 0);
(void)filter_x;
(void)x_step_q4;
(void)y_step_q4;
src -= ((SUBPEL_TAPS / 2) - 1) * src_stride;
int filter_taps = get_filter_taps_convolve8(filter_y);
if (filter_taps == 2) {
convolve8_vert_2tap_neon(src + 3 * src_stride, src_stride, dst, dst_stride,
filter_y, w, h);
} else if (filter_taps == 4) {
convolve8_vert_4tap_neon(src + 2 * src_stride, src_stride, dst, dst_stride,
filter_y, w, h);
} else {
convolve8_vert_8tap_neon(src, src_stride, dst, dst_stride, filter_y, w, h);
}
}

285
third_party/aom/aom_dsp/arm/aom_convolve8_neon.h vendored Normal file
View file

@ -0,0 +1,285 @@
/*
* Copyright (c) 2024, Alliance for Open Media. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#ifndef AOM_AOM_DSP_ARM_AOM_CONVOLVE8_NEON_H_
#define AOM_AOM_DSP_ARM_AOM_CONVOLVE8_NEON_H_
#include <arm_neon.h>
#include "config/aom_config.h"
#include "aom_dsp/arm/mem_neon.h"
static INLINE void convolve8_horiz_2tap_neon(const uint8_t *src,
ptrdiff_t src_stride, uint8_t *dst,
ptrdiff_t dst_stride,
const int16_t *filter_x, int w,
int h) {
// Bilinear filter values are all positive.
const uint8x8_t f0 = vdup_n_u8((uint8_t)filter_x[3]);
const uint8x8_t f1 = vdup_n_u8((uint8_t)filter_x[4]);
if (w == 4) {
do {
uint8x8_t s0 =
load_unaligned_u8(src + 0 * src_stride + 0, (int)src_stride);
uint8x8_t s1 =
load_unaligned_u8(src + 0 * src_stride + 1, (int)src_stride);
uint8x8_t s2 =
load_unaligned_u8(src + 2 * src_stride + 0, (int)src_stride);
uint8x8_t s3 =
load_unaligned_u8(src + 2 * src_stride + 1, (int)src_stride);
uint16x8_t sum0 = vmull_u8(s0, f0);
sum0 = vmlal_u8(sum0, s1, f1);
uint16x8_t sum1 = vmull_u8(s2, f0);
sum1 = vmlal_u8(sum1, s3, f1);
uint8x8_t d0 = vqrshrn_n_u16(sum0, FILTER_BITS);
uint8x8_t d1 = vqrshrn_n_u16(sum1, FILTER_BITS);
store_u8x4_strided_x2(dst + 0 * dst_stride, dst_stride, d0);
store_u8x4_strided_x2(dst + 2 * dst_stride, dst_stride, d1);
src += 4 * src_stride;
dst += 4 * dst_stride;
h -= 4;
} while (h > 0);
} else if (w == 8) {
do {
uint8x8_t s0 = vld1_u8(src + 0 * src_stride + 0);
uint8x8_t s1 = vld1_u8(src + 0 * src_stride + 1);
uint8x8_t s2 = vld1_u8(src + 1 * src_stride + 0);
uint8x8_t s3 = vld1_u8(src + 1 * src_stride + 1);
uint16x8_t sum0 = vmull_u8(s0, f0);
sum0 = vmlal_u8(sum0, s1, f1);
uint16x8_t sum1 = vmull_u8(s2, f0);
sum1 = vmlal_u8(sum1, s3, f1);
uint8x8_t d0 = vqrshrn_n_u16(sum0, FILTER_BITS);
uint8x8_t d1 = vqrshrn_n_u16(sum1, FILTER_BITS);
vst1_u8(dst + 0 * dst_stride, d0);
vst1_u8(dst + 1 * dst_stride, d1);
src += 2 * src_stride;
dst += 2 * dst_stride;
h -= 2;
} while (h > 0);
} else {
do {
int width = w;
const uint8_t *s = src;
uint8_t *d = dst;
do {
uint8x16_t s0 = vld1q_u8(s + 0);
uint8x16_t s1 = vld1q_u8(s + 1);
uint16x8_t sum0 = vmull_u8(vget_low_u8(s0), f0);
sum0 = vmlal_u8(sum0, vget_low_u8(s1), f1);
uint16x8_t sum1 = vmull_u8(vget_high_u8(s0), f0);
sum1 = vmlal_u8(sum1, vget_high_u8(s1), f1);
uint8x8_t d0 = vqrshrn_n_u16(sum0, FILTER_BITS);
uint8x8_t d1 = vqrshrn_n_u16(sum1, FILTER_BITS);
vst1q_u8(d, vcombine_u8(d0, d1));
s += 16;
d += 16;
width -= 16;
} while (width != 0);
src += src_stride;
dst += dst_stride;
} while (--h > 0);
}
}
static INLINE uint8x8_t convolve4_8(const int16x8_t s0, const int16x8_t s1,
const int16x8_t s2, const int16x8_t s3,
const int16x4_t filter) {
int16x8_t sum = vmulq_lane_s16(s0, filter, 0);
sum = vmlaq_lane_s16(sum, s1, filter, 1);
sum = vmlaq_lane_s16(sum, s2, filter, 2);
sum = vmlaq_lane_s16(sum, s3, filter, 3);
// We halved the filter values so -1 from right shift.
return vqrshrun_n_s16(sum, FILTER_BITS - 1);
}
static INLINE void convolve8_vert_4tap_neon(const uint8_t *src,
ptrdiff_t src_stride, uint8_t *dst,
ptrdiff_t dst_stride,
const int16_t *filter_y, int w,
int h) {
// All filter values are even, halve to reduce intermediate precision
// requirements.
const int16x4_t filter = vshr_n_s16(vld1_s16(filter_y + 2), 1);
if (w == 4) {
uint8x8_t t01 = load_unaligned_u8(src + 0 * src_stride, (int)src_stride);
uint8x8_t t12 = load_unaligned_u8(src + 1 * src_stride, (int)src_stride);
int16x8_t s01 = vreinterpretq_s16_u16(vmovl_u8(t01));
int16x8_t s12 = vreinterpretq_s16_u16(vmovl_u8(t12));
src += 2 * src_stride;
do {
uint8x8_t t23 = load_unaligned_u8(src + 0 * src_stride, (int)src_stride);
uint8x8_t t34 = load_unaligned_u8(src + 1 * src_stride, (int)src_stride);
uint8x8_t t45 = load_unaligned_u8(src + 2 * src_stride, (int)src_stride);
uint8x8_t t56 = load_unaligned_u8(src + 3 * src_stride, (int)src_stride);
int16x8_t s23 = vreinterpretq_s16_u16(vmovl_u8(t23));
int16x8_t s34 = vreinterpretq_s16_u16(vmovl_u8(t34));
int16x8_t s45 = vreinterpretq_s16_u16(vmovl_u8(t45));
int16x8_t s56 = vreinterpretq_s16_u16(vmovl_u8(t56));
uint8x8_t d01 = convolve4_8(s01, s12, s23, s34, filter);
uint8x8_t d23 = convolve4_8(s23, s34, s45, s56, filter);
store_u8x4_strided_x2(dst + 0 * dst_stride, dst_stride, d01);
store_u8x4_strided_x2(dst + 2 * dst_stride, dst_stride, d23);
s01 = s45;
s12 = s56;
src += 4 * src_stride;
dst += 4 * dst_stride;
h -= 4;
} while (h != 0);
} else {
do {
uint8x8_t t0, t1, t2;
load_u8_8x3(src, src_stride, &t0, &t1, &t2);
int16x8_t s0 = vreinterpretq_s16_u16(vmovl_u8(t0));
int16x8_t s1 = vreinterpretq_s16_u16(vmovl_u8(t1));
int16x8_t s2 = vreinterpretq_s16_u16(vmovl_u8(t2));
int height = h;
const uint8_t *s = src + 3 * src_stride;
uint8_t *d = dst;
do {
uint8x8_t t3;
load_u8_8x4(s, src_stride, &t0, &t1, &t2, &t3);
int16x8_t s3 = vreinterpretq_s16_u16(vmovl_u8(t0));
int16x8_t s4 = vreinterpretq_s16_u16(vmovl_u8(t1));
int16x8_t s5 = vreinterpretq_s16_u16(vmovl_u8(t2));
int16x8_t s6 = vreinterpretq_s16_u16(vmovl_u8(t3));
uint8x8_t d0 = convolve4_8(s0, s1, s2, s3, filter);
uint8x8_t d1 = convolve4_8(s1, s2, s3, s4, filter);
uint8x8_t d2 = convolve4_8(s2, s3, s4, s5, filter);
uint8x8_t d3 = convolve4_8(s3, s4, s5, s6, filter);
store_u8_8x4(d, dst_stride, d0, d1, d2, d3);
s0 = s4;
s1 = s5;
s2 = s6;
s += 4 * src_stride;
d += 4 * dst_stride;
height -= 4;
} while (height != 0);
src += 8;
dst += 8;
w -= 8;
} while (w != 0);
}
}
static INLINE void convolve8_vert_2tap_neon(const uint8_t *src,
ptrdiff_t src_stride, uint8_t *dst,
ptrdiff_t dst_stride,
const int16_t *filter_y, int w,
int h) {
// Bilinear filter values are all positive.
uint8x8_t f0 = vdup_n_u8((uint8_t)filter_y[3]);
uint8x8_t f1 = vdup_n_u8((uint8_t)filter_y[4]);
if (w == 4) {
do {
uint8x8_t s0 = load_unaligned_u8(src + 0 * src_stride, (int)src_stride);
uint8x8_t s1 = load_unaligned_u8(src + 1 * src_stride, (int)src_stride);
uint8x8_t s2 = load_unaligned_u8(src + 2 * src_stride, (int)src_stride);
uint8x8_t s3 = load_unaligned_u8(src + 3 * src_stride, (int)src_stride);
uint16x8_t sum0 = vmull_u8(s0, f0);
sum0 = vmlal_u8(sum0, s1, f1);
uint16x8_t sum1 = vmull_u8(s2, f0);
sum1 = vmlal_u8(sum1, s3, f1);
uint8x8_t d0 = vqrshrn_n_u16(sum0, FILTER_BITS);
uint8x8_t d1 = vqrshrn_n_u16(sum1, FILTER_BITS);
store_u8x4_strided_x2(dst + 0 * dst_stride, dst_stride, d0);
store_u8x4_strided_x2(dst + 2 * dst_stride, dst_stride, d1);
src += 4 * src_stride;
dst += 4 * dst_stride;
h -= 4;
} while (h > 0);
} else if (w == 8) {
do {
uint8x8_t s0, s1, s2;
load_u8_8x3(src, src_stride, &s0, &s1, &s2);
uint16x8_t sum0 = vmull_u8(s0, f0);
sum0 = vmlal_u8(sum0, s1, f1);
uint16x8_t sum1 = vmull_u8(s1, f0);
sum1 = vmlal_u8(sum1, s2, f1);
uint8x8_t d0 = vqrshrn_n_u16(sum0, FILTER_BITS);
uint8x8_t d1 = vqrshrn_n_u16(sum1, FILTER_BITS);
vst1_u8(dst + 0 * dst_stride, d0);
vst1_u8(dst + 1 * dst_stride, d1);
src += 2 * src_stride;
dst += 2 * dst_stride;
h -= 2;
} while (h > 0);
} else {
do {
int width = w;
const uint8_t *s = src;
uint8_t *d = dst;
do {
uint8x16_t s0 = vld1q_u8(s + 0 * src_stride);
uint8x16_t s1 = vld1q_u8(s + 1 * src_stride);
uint16x8_t sum0 = vmull_u8(vget_low_u8(s0), f0);
sum0 = vmlal_u8(sum0, vget_low_u8(s1), f1);
uint16x8_t sum1 = vmull_u8(vget_high_u8(s0), f0);
sum1 = vmlal_u8(sum1, vget_high_u8(s1), f1);
uint8x8_t d0 = vqrshrn_n_u16(sum0, FILTER_BITS);
uint8x8_t d1 = vqrshrn_n_u16(sum1, FILTER_BITS);
vst1q_u8(d, vcombine_u8(d0, d1));
s += 16;
d += 16;
width -= 16;
} while (width != 0);
src += src_stride;
dst += dst_stride;
} while (--h > 0);
}
}
#endif // AOM_AOM_DSP_ARM_AOM_CONVOLVE8_NEON_H_
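
For the 2-tap (bilinear) case only taps 3 and 4 of the 8-tap array are non-zero, so each output pixel is a rounded blend of two neighbouring samples; because the taps sum to 128 the result never exceeds 255 and no clamp is needed. A scalar sketch of what the vmull_u8/vmlal_u8/vqrshrn_n_u16 sequence computes (bilinear_2tap is an illustrative name):

```
#include <stdint.h>

#define FILTER_BITS 7

// out = (filter[3] * s0 + filter[4] * s1 + 64) >> 7, with filter[3] + filter[4] == 128.
static uint8_t bilinear_2tap(uint8_t s0, uint8_t s1, const int16_t filter[8]) {
  uint32_t sum = (uint32_t)filter[3] * s0 + (uint32_t)filter[4] * s1;
  return (uint8_t)((sum + (1 << (FILTER_BITS - 1))) >> FILTER_BITS);
}
```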


@ -20,6 +20,8 @@
#include "aom/aom_integer.h"
#include "aom_dsp/aom_dsp_common.h"
#include "aom_dsp/aom_filter.h"
#include "aom_dsp/arm/aom_convolve8_neon.h"
#include "aom_dsp/arm/aom_filter.h"
#include "aom_dsp/arm/mem_neon.h"
#include "aom_dsp/arm/transpose_neon.h"
#include "aom_ports/mem.h"
@ -93,22 +95,11 @@ static INLINE uint8x8_t convolve8_8_h(const uint8x16_t samples,
return vqrshrun_n_s16(sum, FILTER_BITS);
}
void aom_convolve8_horiz_neon_dotprod(const uint8_t *src, ptrdiff_t src_stride,
uint8_t *dst, ptrdiff_t dst_stride,
const int16_t *filter_x, int x_step_q4,
const int16_t *filter_y, int y_step_q4,
int w, int h) {
static INLINE void convolve8_horiz_8tap_neon_dotprod(
const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
ptrdiff_t dst_stride, const int16_t *filter_x, int w, int h) {
const int8x8_t filter = vmovn_s16(vld1q_s16(filter_x));
assert((intptr_t)dst % 4 == 0);
assert(dst_stride % 4 == 0);
(void)x_step_q4;
(void)filter_y;
(void)y_step_q4;
src -= ((SUBPEL_TAPS / 2) - 1);
if (w == 4) {
const uint8x16x2_t perm_tbl = vld1q_u8_x2(kDotProdPermuteTbl);
do {
@ -158,6 +149,141 @@ void aom_convolve8_horiz_neon_dotprod(const uint8_t *src, ptrdiff_t src_stride,
}
}
static INLINE int16x4_t convolve4_4_h(const uint8x16_t samples,
const int8x8_t filters,
const uint8x16_t permute_tbl) {
// Transform sample range to [-128, 127] for 8-bit signed dot product.
int8x16_t samples_128 =
vreinterpretq_s8_u8(vsubq_u8(samples, vdupq_n_u8(128)));
// Permute samples ready for dot product.
// { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 }
int8x16_t perm_samples = vqtbl1q_s8(samples_128, permute_tbl);
// Accumulate into 128 * FILTER_WEIGHT to account for range transform.
// (Divide by 2 since we halved the filter values.)
int32x4_t acc = vdupq_n_s32(128 * FILTER_WEIGHT / 2);
int32x4_t sum = vdotq_lane_s32(acc, perm_samples, filters, 0);
// Further narrowing and packing is performed by the caller.
return vmovn_s32(sum);
}
static INLINE uint8x8_t convolve4_8_h(const uint8x16_t samples,
const int8x8_t filters,
const uint8x16x2_t permute_tbl) {
// Transform sample range to [-128, 127] for 8-bit signed dot product.
int8x16_t samples_128 =
vreinterpretq_s8_u8(vsubq_u8(samples, vdupq_n_u8(128)));
// Permute samples ready for dot product.
// { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 }
// { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 }
int8x16_t perm_samples[2] = { vqtbl1q_s8(samples_128, permute_tbl.val[0]),
vqtbl1q_s8(samples_128, permute_tbl.val[1]) };
// Accumulate into 128 * FILTER_WEIGHT to account for range transform.
// (Divide by 2 since we halved the filter values.)
int32x4_t acc = vdupq_n_s32(128 * FILTER_WEIGHT / 2);
// First 4 output values.
int32x4_t sum0 = vdotq_lane_s32(acc, perm_samples[0], filters, 0);
// Second 4 output values.
int32x4_t sum1 = vdotq_lane_s32(acc, perm_samples[1], filters, 0);
// Narrow and re-pack.
int16x8_t sum = vcombine_s16(vmovn_s32(sum0), vmovn_s32(sum1));
// We halved the filter values so -1 from right shift.
return vqrshrun_n_s16(sum, FILTER_BITS - 1);
}
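
The SDOT-based helpers above operate on signed 8-bit data, so the unsigned samples are shifted into [-128, 127] and the accumulator is pre-biased to cancel the shift. A scalar sketch of the identity, assuming FILTER_WEIGHT is the sum of the unhalved taps (128), which is why the bias above is 128 * FILTER_WEIGHT / 2 once the taps have been halved (dot4_signed_with_bias is an illustrative name):

```
#include <stdint.h>

// sum(f[k] * s[k]) == sum(f[k] * (s[k] - 128)) + 128 * sum(f[k])
static int32_t dot4_signed_with_bias(const uint8_t s[4], const int8_t f[4],
                                     int tap_sum) {
  int32_t acc = 128 * tap_sum;      // bias: 128 * (sum of the taps actually used)
  for (int k = 0; k < 4; ++k) {
    acc += f[k] * (s[k] - 128);     // samples moved into the signed 8-bit range
  }
  return acc;                       // equals the plain unsigned dot product
}
```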
static INLINE void convolve8_horiz_4tap_neon_dotprod(
const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
ptrdiff_t dst_stride, const int16_t *filter_x, int width, int height) {
const int16x4_t x_filter = vld1_s16(filter_x + 2);
// All 4-tap and bilinear filter values are even, so halve them to reduce
// intermediate precision requirements.
const int8x8_t filter = vshrn_n_s16(vcombine_s16(x_filter, vdup_n_s16(0)), 1);
if (width == 4) {
const uint8x16_t permute_tbl = vld1q_u8(kDotProdPermuteTbl);
do {
uint8x16_t s0, s1, s2, s3;
load_u8_16x4(src, src_stride, &s0, &s1, &s2, &s3);
int16x4_t t0 = convolve4_4_h(s0, filter, permute_tbl);
int16x4_t t1 = convolve4_4_h(s1, filter, permute_tbl);
int16x4_t t2 = convolve4_4_h(s2, filter, permute_tbl);
int16x4_t t3 = convolve4_4_h(s3, filter, permute_tbl);
// We halved the filter values so -1 from right shift.
uint8x8_t d01 = vqrshrun_n_s16(vcombine_s16(t0, t1), FILTER_BITS - 1);
uint8x8_t d23 = vqrshrun_n_s16(vcombine_s16(t2, t3), FILTER_BITS - 1);
store_u8x4_strided_x2(dst + 0 * dst_stride, dst_stride, d01);
store_u8x4_strided_x2(dst + 2 * dst_stride, dst_stride, d23);
src += 4 * src_stride;
dst += 4 * dst_stride;
height -= 4;
} while (height > 0);
} else {
const uint8x16x2_t permute_tbl = vld1q_u8_x2(kDotProdPermuteTbl);
do {
const uint8_t *s = src;
uint8_t *d = dst;
int w = width;
do {
uint8x16_t s0, s1, s2, s3;
load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3);
uint8x8_t d0 = convolve4_8_h(s0, filter, permute_tbl);
uint8x8_t d1 = convolve4_8_h(s1, filter, permute_tbl);
uint8x8_t d2 = convolve4_8_h(s2, filter, permute_tbl);
uint8x8_t d3 = convolve4_8_h(s3, filter, permute_tbl);
store_u8_8x4(d, dst_stride, d0, d1, d2, d3);
s += 8;
d += 8;
w -= 8;
} while (w != 0);
src += 4 * src_stride;
dst += 4 * dst_stride;
height -= 4;
} while (height > 0);
}
}
void aom_convolve8_horiz_neon_dotprod(const uint8_t *src, ptrdiff_t src_stride,
uint8_t *dst, ptrdiff_t dst_stride,
const int16_t *filter_x, int x_step_q4,
const int16_t *filter_y, int y_step_q4,
int w, int h) {
assert((intptr_t)dst % 4 == 0);
assert(dst_stride % 4 == 0);
(void)x_step_q4;
(void)filter_y;
(void)y_step_q4;
src -= ((SUBPEL_TAPS / 2) - 1);
int filter_taps = get_filter_taps_convolve8(filter_x);
if (filter_taps == 2) {
convolve8_horiz_2tap_neon(src + 3, src_stride, dst, dst_stride, filter_x, w,
h);
} else if (filter_taps == 4) {
convolve8_horiz_4tap_neon_dotprod(src + 2, src_stride, dst, dst_stride,
filter_x, w, h);
} else {
convolve8_horiz_8tap_neon_dotprod(src, src_stride, dst, dst_stride,
filter_x, w, h);
}
}
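
The new wrapper dispatches on the effective tap count and advances src so the reduced kernels start at the first non-zero tap (+2 for 4-tap, +3 for 2-tap). get_filter_taps_convolve8() comes from aom_dsp/arm/aom_filter.h and is not shown in this patch; the probe below is only a hedged illustration of how such a count could be derived from the zero outer taps:

```
#include <stdint.h>

// Illustrative only: an 8-tap kernel with zero outer taps degenerates to a
// 4-tap one, and one with only taps 3 and 4 non-zero is bilinear (2-tap).
static int count_effective_taps(const int16_t filter[8]) {
  if (filter[0] | filter[1] | filter[6] | filter[7]) return 8;
  if (filter[2] | filter[5]) return 4;
  return 2;
}
```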
static INLINE void transpose_concat_4x4(int8x8_t a0, int8x8_t a1, int8x8_t a2,
int8x8_t a3, int8x16_t *b) {
// Transpose 8-bit elements and concatenate result rows as follows:
@ -244,24 +370,13 @@ static INLINE uint8x8_t convolve8_8_v(const int8x16_t samples0_lo,
return vqrshrun_n_s16(sum, FILTER_BITS);
}
void aom_convolve8_vert_neon_dotprod(const uint8_t *src, ptrdiff_t src_stride,
uint8_t *dst, ptrdiff_t dst_stride,
const int16_t *filter_x, int x_step_q4,
const int16_t *filter_y, int y_step_q4,
int w, int h) {
static INLINE void convolve8_vert_8tap_neon_dotprod(
const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
ptrdiff_t dst_stride, const int16_t *filter_y, int w, int h) {
const int8x8_t filter = vmovn_s16(vld1q_s16(filter_y));
const uint8x16x3_t merge_block_tbl = vld1q_u8_x3(kDotProdMergeBlockTbl);
int8x16x2_t samples_LUT;
assert((intptr_t)dst % 4 == 0);
assert(dst_stride % 4 == 0);
(void)filter_x;
(void)x_step_q4;
(void)y_step_q4;
src -= ((SUBPEL_TAPS / 2) - 1) * src_stride;
if (w == 4) {
uint8x8_t t0, t1, t2, t3, t4, t5, t6;
load_u8_8x7(src, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6);
@ -410,3 +525,31 @@ void aom_convolve8_vert_neon_dotprod(const uint8_t *src, ptrdiff_t src_stride,
} while (w != 0);
}
}
void aom_convolve8_vert_neon_dotprod(const uint8_t *src, ptrdiff_t src_stride,
uint8_t *dst, ptrdiff_t dst_stride,
const int16_t *filter_x, int x_step_q4,
const int16_t *filter_y, int y_step_q4,
int w, int h) {
assert((intptr_t)dst % 4 == 0);
assert(dst_stride % 4 == 0);
(void)filter_x;
(void)x_step_q4;
(void)y_step_q4;
src -= ((SUBPEL_TAPS / 2) - 1) * src_stride;
int filter_taps = get_filter_taps_convolve8(filter_y);
if (filter_taps == 2) {
convolve8_vert_2tap_neon(src + 3 * src_stride, src_stride, dst, dst_stride,
filter_y, w, h);
} else if (filter_taps == 4) {
convolve8_vert_4tap_neon(src + 2 * src_stride, src_stride, dst, dst_stride,
filter_y, w, h);
} else {
convolve8_vert_8tap_neon_dotprod(src, src_stride, dst, dst_stride, filter_y,
w, h);
}
}


@ -19,6 +19,8 @@
#include "aom/aom_integer.h"
#include "aom_dsp/aom_dsp_common.h"
#include "aom_dsp/aom_filter.h"
#include "aom_dsp/arm/aom_convolve8_neon.h"
#include "aom_dsp/arm/aom_filter.h"
#include "aom_dsp/arm/mem_neon.h"
#include "aom_dsp/arm/transpose_neon.h"
#include "aom_ports/mem.h"
@ -80,22 +82,11 @@ static INLINE uint8x8_t convolve8_8_h(const uint8x16_t samples,
return vqrshrun_n_s16(sum, FILTER_BITS);
}
void aom_convolve8_horiz_neon_i8mm(const uint8_t *src, ptrdiff_t src_stride,
uint8_t *dst, ptrdiff_t dst_stride,
const int16_t *filter_x, int x_step_q4,
const int16_t *filter_y, int y_step_q4,
int w, int h) {
static INLINE void convolve8_horiz_8tap_neon_i8mm(
const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
ptrdiff_t dst_stride, const int16_t *filter_x, int w, int h) {
const int8x8_t filter = vmovn_s16(vld1q_s16(filter_x));
assert((intptr_t)dst % 4 == 0);
assert(dst_stride % 4 == 0);
(void)x_step_q4;
(void)filter_y;
(void)y_step_q4;
src -= ((SUBPEL_TAPS / 2) - 1);
if (w == 4) {
const uint8x16x2_t perm_tbl = vld1q_u8_x2(kDotProdPermuteTbl);
do {
@ -145,6 +136,128 @@ void aom_convolve8_horiz_neon_i8mm(const uint8_t *src, ptrdiff_t src_stride,
}
}
static INLINE int16x4_t convolve4_4_h(const uint8x16_t samples,
const int8x8_t filters,
const uint8x16_t permute_tbl) {
// Permute samples ready for dot product.
// { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 }
uint8x16_t permuted_samples = vqtbl1q_u8(samples, permute_tbl);
int32x4_t sum =
vusdotq_lane_s32(vdupq_n_s32(0), permuted_samples, filters, 0);
// Further narrowing and packing is performed by the caller.
return vmovn_s32(sum);
}
static INLINE uint8x8_t convolve4_8_h(const uint8x16_t samples,
const int8x8_t filters,
const uint8x16x2_t permute_tbl) {
// Permute samples ready for dot product.
// { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 }
// { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 }
uint8x16_t permuted_samples[2] = { vqtbl1q_u8(samples, permute_tbl.val[0]),
vqtbl1q_u8(samples, permute_tbl.val[1]) };
// First 4 output values.
int32x4_t sum0 =
vusdotq_lane_s32(vdupq_n_s32(0), permuted_samples[0], filters, 0);
// Second 4 output values.
int32x4_t sum1 =
vusdotq_lane_s32(vdupq_n_s32(0), permuted_samples[1], filters, 0);
// Narrow and re-pack.
int16x8_t sum = vcombine_s16(vmovn_s32(sum0), vmovn_s32(sum1));
// We halved the filter values so -1 from right shift.
return vqrshrun_n_s16(sum, FILTER_BITS - 1);
}
static INLINE void convolve8_horiz_4tap_neon_i8mm(
const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
ptrdiff_t dst_stride, const int16_t *filter_x, int width, int height) {
const int16x4_t x_filter = vld1_s16(filter_x + 2);
// All 4-tap and bilinear filter values are even, so halve them to reduce
// intermediate precision requirements.
const int8x8_t filter = vshrn_n_s16(vcombine_s16(x_filter, vdup_n_s16(0)), 1);
if (width == 4) {
const uint8x16_t perm_tbl = vld1q_u8(kDotProdPermuteTbl);
do {
uint8x16_t s0, s1, s2, s3;
load_u8_16x4(src, src_stride, &s0, &s1, &s2, &s3);
int16x4_t t0 = convolve4_4_h(s0, filter, perm_tbl);
int16x4_t t1 = convolve4_4_h(s1, filter, perm_tbl);
int16x4_t t2 = convolve4_4_h(s2, filter, perm_tbl);
int16x4_t t3 = convolve4_4_h(s3, filter, perm_tbl);
// We halved the filter values so -1 from right shift.
uint8x8_t d01 = vqrshrun_n_s16(vcombine_s16(t0, t1), FILTER_BITS - 1);
uint8x8_t d23 = vqrshrun_n_s16(vcombine_s16(t2, t3), FILTER_BITS - 1);
store_u8x4_strided_x2(dst + 0 * dst_stride, dst_stride, d01);
store_u8x4_strided_x2(dst + 2 * dst_stride, dst_stride, d23);
src += 4 * src_stride;
dst += 4 * dst_stride;
height -= 4;
} while (height > 0);
} else {
const uint8x16x2_t perm_tbl = vld1q_u8_x2(kDotProdPermuteTbl);
do {
int w = width;
const uint8_t *s = src;
uint8_t *d = dst;
do {
uint8x16_t s0, s1, s2, s3;
load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3);
uint8x8_t d0 = convolve4_8_h(s0, filter, perm_tbl);
uint8x8_t d1 = convolve4_8_h(s1, filter, perm_tbl);
uint8x8_t d2 = convolve4_8_h(s2, filter, perm_tbl);
uint8x8_t d3 = convolve4_8_h(s3, filter, perm_tbl);
store_u8_8x4(d, dst_stride, d0, d1, d2, d3);
s += 8;
d += 8;
w -= 8;
} while (w != 0);
src += 4 * src_stride;
dst += 4 * dst_stride;
height -= 4;
} while (height > 0);
}
}
void aom_convolve8_horiz_neon_i8mm(const uint8_t *src, ptrdiff_t src_stride,
uint8_t *dst, ptrdiff_t dst_stride,
const int16_t *filter_x, int x_step_q4,
const int16_t *filter_y, int y_step_q4,
int w, int h) {
assert((intptr_t)dst % 4 == 0);
assert(dst_stride % 4 == 0);
(void)x_step_q4;
(void)filter_y;
(void)y_step_q4;
src -= ((SUBPEL_TAPS / 2) - 1);
int filter_taps = get_filter_taps_convolve8(filter_x);
if (filter_taps == 2) {
convolve8_horiz_2tap_neon(src + 3, src_stride, dst, dst_stride, filter_x, w,
h);
} else if (filter_taps == 4) {
convolve8_horiz_4tap_neon_i8mm(src + 2, src_stride, dst, dst_stride,
filter_x, w, h);
} else {
convolve8_horiz_8tap_neon_i8mm(src, src_stride, dst, dst_stride, filter_x,
w, h);
}
}
static INLINE void transpose_concat_4x4(uint8x8_t a0, uint8x8_t a1,
uint8x8_t a2, uint8x8_t a3,
uint8x16_t *b) {
@ -227,24 +340,13 @@ static INLINE uint8x8_t convolve8_8_v(const uint8x16_t samples0_lo,
return vqrshrun_n_s16(sum, FILTER_BITS);
}
void aom_convolve8_vert_neon_i8mm(const uint8_t *src, ptrdiff_t src_stride,
uint8_t *dst, ptrdiff_t dst_stride,
const int16_t *filter_x, int x_step_q4,
const int16_t *filter_y, int y_step_q4, int w,
int h) {
static INLINE void convolve8_vert_8tap_neon_i8mm(
const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
ptrdiff_t dst_stride, const int16_t *filter_y, int w, int h) {
const int8x8_t filter = vmovn_s16(vld1q_s16(filter_y));
const uint8x16x3_t merge_block_tbl = vld1q_u8_x3(kDotProdMergeBlockTbl);
uint8x16x2_t samples_LUT;
assert((intptr_t)dst % 4 == 0);
assert(dst_stride % 4 == 0);
(void)filter_x;
(void)x_step_q4;
(void)y_step_q4;
src -= ((SUBPEL_TAPS / 2) - 1) * src_stride;
if (w == 4) {
uint8x8_t s0, s1, s2, s3, s4, s5, s6;
load_u8_8x7(src, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6);
@ -365,3 +467,31 @@ void aom_convolve8_vert_neon_i8mm(const uint8_t *src, ptrdiff_t src_stride,
} while (w != 0);
}
}
void aom_convolve8_vert_neon_i8mm(const uint8_t *src, ptrdiff_t src_stride,
uint8_t *dst, ptrdiff_t dst_stride,
const int16_t *filter_x, int x_step_q4,
const int16_t *filter_y, int y_step_q4, int w,
int h) {
assert((intptr_t)dst % 4 == 0);
assert(dst_stride % 4 == 0);
(void)filter_x;
(void)x_step_q4;
(void)y_step_q4;
src -= ((SUBPEL_TAPS / 2) - 1) * src_stride;
int filter_taps = get_filter_taps_convolve8(filter_y);
if (filter_taps == 2) {
convolve8_vert_2tap_neon(src + 3 * src_stride, src_stride, dst, dst_stride,
filter_y, w, h);
} else if (filter_taps == 4) {
convolve8_vert_4tap_neon(src + 2 * src_stride, src_stride, dst, dst_stride,
filter_y, w, h);
} else {
convolve8_vert_8tap_neon_i8mm(src, src_stride, dst, dst_stride, filter_y, w,
h);
}
}


@ -20,8 +20,9 @@
#include "aom_dsp/arm/mem_neon.h"
#include "aom_dsp/blend.h"
uint8x8_t alpha_blend_a64_d16_u16x8(uint16x8_t m, uint16x8_t a, uint16x8_t b,
uint16x8_t round_offset) {
static uint8x8_t alpha_blend_a64_d16_u16x8(uint16x8_t m, uint16x8_t a,
uint16x8_t b,
uint16x8_t round_offset) {
const uint16x8_t m_inv = vsubq_u16(vdupq_n_u16(AOM_BLEND_A64_MAX_ALPHA), m);
uint32x4_t blend_u32_lo = vmull_u16(vget_low_u16(m), vget_low_u16(a));

third_party/aom/aom_dsp/arm/fwd_txfm_neon.c (vendored, 3 lines changed)

@ -12,6 +12,7 @@
#include <arm_neon.h>
#include "config/aom_config.h"
#include "config/aom_dsp_rtcd.h"
#include "aom_dsp/txfm_common.h"
#include "aom_dsp/arm/mem_neon.h"
@ -115,6 +116,7 @@ void aom_fdct4x4_lp_neon(const int16_t *input, int16_t *final_output,
vst1q_s16(final_output + 1 * 8, out_23);
}
#if CONFIG_INTERNAL_STATS
void aom_fdct8x8_neon(const int16_t *input, int16_t *final_output, int stride) {
// stage 1
int16x8_t input_0 = vshlq_n_s16(vld1q_s16(&input[0 * stride]), 2);
@ -302,3 +304,4 @@ void aom_fdct8x8_neon(const int16_t *input, int16_t *final_output, int stride) {
vst1q_s16(&final_output[7 * 8], input_7);
}
}
#endif // CONFIG_INTERNAL_STATS


@ -19,199 +19,208 @@
#include "aom/aom_integer.h"
#include "aom_dsp/aom_dsp_common.h"
#include "aom_dsp/aom_filter.h"
#include "aom_dsp/arm/aom_filter.h"
#include "aom_dsp/arm/highbd_convolve8_neon.h"
#include "aom_dsp/arm/mem_neon.h"
#include "aom_dsp/arm/transpose_neon.h"
#include "aom_ports/mem.h"
static INLINE int32x4_t highbd_convolve8_4_s32(
const int16x4_t s0, const int16x4_t s1, const int16x4_t s2,
const int16x4_t s3, const int16x4_t s4, const int16x4_t s5,
const int16x4_t s6, const int16x4_t s7, const int16x8_t y_filter) {
const int16x4_t y_filter_lo = vget_low_s16(y_filter);
const int16x4_t y_filter_hi = vget_high_s16(y_filter);
static INLINE uint16x4_t
highbd_convolve8_4(const int16x4_t s0, const int16x4_t s1, const int16x4_t s2,
const int16x4_t s3, const int16x4_t s4, const int16x4_t s5,
const int16x4_t s6, const int16x4_t s7,
const int16x8_t filter, const uint16x4_t max) {
const int16x4_t filter_lo = vget_low_s16(filter);
const int16x4_t filter_hi = vget_high_s16(filter);
int32x4_t sum = vmull_lane_s16(s0, y_filter_lo, 0);
sum = vmlal_lane_s16(sum, s1, y_filter_lo, 1);
sum = vmlal_lane_s16(sum, s2, y_filter_lo, 2);
sum = vmlal_lane_s16(sum, s3, y_filter_lo, 3);
sum = vmlal_lane_s16(sum, s4, y_filter_hi, 0);
sum = vmlal_lane_s16(sum, s5, y_filter_hi, 1);
sum = vmlal_lane_s16(sum, s6, y_filter_hi, 2);
sum = vmlal_lane_s16(sum, s7, y_filter_hi, 3);
int32x4_t sum = vmull_lane_s16(s0, filter_lo, 0);
sum = vmlal_lane_s16(sum, s1, filter_lo, 1);
sum = vmlal_lane_s16(sum, s2, filter_lo, 2);
sum = vmlal_lane_s16(sum, s3, filter_lo, 3);
sum = vmlal_lane_s16(sum, s4, filter_hi, 0);
sum = vmlal_lane_s16(sum, s5, filter_hi, 1);
sum = vmlal_lane_s16(sum, s6, filter_hi, 2);
sum = vmlal_lane_s16(sum, s7, filter_hi, 3);
return sum;
uint16x4_t res = vqrshrun_n_s32(sum, FILTER_BITS);
return vmin_u16(res, max);
}
static INLINE uint16x4_t highbd_convolve8_4_s32_s16(
const int16x4_t s0, const int16x4_t s1, const int16x4_t s2,
const int16x4_t s3, const int16x4_t s4, const int16x4_t s5,
const int16x4_t s6, const int16x4_t s7, const int16x8_t y_filter) {
int32x4_t sum =
highbd_convolve8_4_s32(s0, s1, s2, s3, s4, s5, s6, s7, y_filter);
static INLINE uint16x8_t
highbd_convolve8_8(const int16x8_t s0, const int16x8_t s1, const int16x8_t s2,
const int16x8_t s3, const int16x8_t s4, const int16x8_t s5,
const int16x8_t s6, const int16x8_t s7,
const int16x8_t filter, const uint16x8_t max) {
const int16x4_t filter_lo = vget_low_s16(filter);
const int16x4_t filter_hi = vget_high_s16(filter);
return vqrshrun_n_s32(sum, FILTER_BITS);
int32x4_t sum0 = vmull_lane_s16(vget_low_s16(s0), filter_lo, 0);
sum0 = vmlal_lane_s16(sum0, vget_low_s16(s1), filter_lo, 1);
sum0 = vmlal_lane_s16(sum0, vget_low_s16(s2), filter_lo, 2);
sum0 = vmlal_lane_s16(sum0, vget_low_s16(s3), filter_lo, 3);
sum0 = vmlal_lane_s16(sum0, vget_low_s16(s4), filter_hi, 0);
sum0 = vmlal_lane_s16(sum0, vget_low_s16(s5), filter_hi, 1);
sum0 = vmlal_lane_s16(sum0, vget_low_s16(s6), filter_hi, 2);
sum0 = vmlal_lane_s16(sum0, vget_low_s16(s7), filter_hi, 3);
int32x4_t sum1 = vmull_lane_s16(vget_high_s16(s0), filter_lo, 0);
sum1 = vmlal_lane_s16(sum1, vget_high_s16(s1), filter_lo, 1);
sum1 = vmlal_lane_s16(sum1, vget_high_s16(s2), filter_lo, 2);
sum1 = vmlal_lane_s16(sum1, vget_high_s16(s3), filter_lo, 3);
sum1 = vmlal_lane_s16(sum1, vget_high_s16(s4), filter_hi, 0);
sum1 = vmlal_lane_s16(sum1, vget_high_s16(s5), filter_hi, 1);
sum1 = vmlal_lane_s16(sum1, vget_high_s16(s6), filter_hi, 2);
sum1 = vmlal_lane_s16(sum1, vget_high_s16(s7), filter_hi, 3);
uint16x8_t res = vcombine_u16(vqrshrun_n_s32(sum0, FILTER_BITS),
vqrshrun_n_s32(sum1, FILTER_BITS));
return vminq_u16(res, max);
}
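
The reworked highbd_convolve8_4()/highbd_convolve8_8() helpers fold the pixel-range clamp into the convolution instead of leaving it to the caller. A scalar reference for one output sample (highbd_convolve8_scalar is an illustrative name; FILTER_BITS assumed to be 7):

```
#include <stdint.h>

#define FILTER_BITS 7

static uint16_t highbd_convolve8_scalar(const int16_t s[8],
                                        const int16_t filter[8], int bd) {
  int32_t sum = 0;
  for (int k = 0; k < 8; ++k) sum += (int32_t)filter[k] * s[k];
  // Rounded shift, then clamp to the valid pixel range for this bit depth.
  int32_t res = (sum + (1 << (FILTER_BITS - 1))) >> FILTER_BITS;
  const int32_t max = (1 << bd) - 1;
  return (uint16_t)(res < 0 ? 0 : (res > max ? max : res));
}
```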
static INLINE int32x4_t highbd_convolve8_horiz4_s32(
const int16x8_t s0, const int16x8_t s1, const int16x8_t x_filter_0_7) {
const int16x8_t s2 = vextq_s16(s0, s1, 1);
const int16x8_t s3 = vextq_s16(s0, s1, 2);
const int16x8_t s4 = vextq_s16(s0, s1, 3);
const int16x4_t s0_lo = vget_low_s16(s0);
const int16x4_t s1_lo = vget_low_s16(s2);
const int16x4_t s2_lo = vget_low_s16(s3);
const int16x4_t s3_lo = vget_low_s16(s4);
const int16x4_t s4_lo = vget_high_s16(s0);
const int16x4_t s5_lo = vget_high_s16(s2);
const int16x4_t s6_lo = vget_high_s16(s3);
const int16x4_t s7_lo = vget_high_s16(s4);
return highbd_convolve8_4_s32(s0_lo, s1_lo, s2_lo, s3_lo, s4_lo, s5_lo, s6_lo,
s7_lo, x_filter_0_7);
}
static INLINE uint16x4_t highbd_convolve8_horiz4_s32_s16(
const int16x8_t s0, const int16x8_t s1, const int16x8_t x_filter_0_7) {
int32x4_t sum = highbd_convolve8_horiz4_s32(s0, s1, x_filter_0_7);
return vqrshrun_n_s32(sum, FILTER_BITS);
}
static INLINE void highbd_convolve8_8_s32(
const int16x8_t s0, const int16x8_t s1, const int16x8_t s2,
const int16x8_t s3, const int16x8_t s4, const int16x8_t s5,
const int16x8_t s6, const int16x8_t s7, const int16x8_t y_filter,
int32x4_t *sum0, int32x4_t *sum1) {
const int16x4_t y_filter_lo = vget_low_s16(y_filter);
const int16x4_t y_filter_hi = vget_high_s16(y_filter);
*sum0 = vmull_lane_s16(vget_low_s16(s0), y_filter_lo, 0);
*sum0 = vmlal_lane_s16(*sum0, vget_low_s16(s1), y_filter_lo, 1);
*sum0 = vmlal_lane_s16(*sum0, vget_low_s16(s2), y_filter_lo, 2);
*sum0 = vmlal_lane_s16(*sum0, vget_low_s16(s3), y_filter_lo, 3);
*sum0 = vmlal_lane_s16(*sum0, vget_low_s16(s4), y_filter_hi, 0);
*sum0 = vmlal_lane_s16(*sum0, vget_low_s16(s5), y_filter_hi, 1);
*sum0 = vmlal_lane_s16(*sum0, vget_low_s16(s6), y_filter_hi, 2);
*sum0 = vmlal_lane_s16(*sum0, vget_low_s16(s7), y_filter_hi, 3);
*sum1 = vmull_lane_s16(vget_high_s16(s0), y_filter_lo, 0);
*sum1 = vmlal_lane_s16(*sum1, vget_high_s16(s1), y_filter_lo, 1);
*sum1 = vmlal_lane_s16(*sum1, vget_high_s16(s2), y_filter_lo, 2);
*sum1 = vmlal_lane_s16(*sum1, vget_high_s16(s3), y_filter_lo, 3);
*sum1 = vmlal_lane_s16(*sum1, vget_high_s16(s4), y_filter_hi, 0);
*sum1 = vmlal_lane_s16(*sum1, vget_high_s16(s5), y_filter_hi, 1);
*sum1 = vmlal_lane_s16(*sum1, vget_high_s16(s6), y_filter_hi, 2);
*sum1 = vmlal_lane_s16(*sum1, vget_high_s16(s7), y_filter_hi, 3);
}
static INLINE void highbd_convolve8_horiz8_s32(const int16x8_t s0,
const int16x8_t s0_hi,
const int16x8_t x_filter_0_7,
int32x4_t *sum0,
int32x4_t *sum1) {
const int16x8_t s1 = vextq_s16(s0, s0_hi, 1);
const int16x8_t s2 = vextq_s16(s0, s0_hi, 2);
const int16x8_t s3 = vextq_s16(s0, s0_hi, 3);
const int16x8_t s4 = vextq_s16(s0, s0_hi, 4);
const int16x8_t s5 = vextq_s16(s0, s0_hi, 5);
const int16x8_t s6 = vextq_s16(s0, s0_hi, 6);
const int16x8_t s7 = vextq_s16(s0, s0_hi, 7);
highbd_convolve8_8_s32(s0, s1, s2, s3, s4, s5, s6, s7, x_filter_0_7, sum0,
sum1);
}
static INLINE uint16x8_t highbd_convolve8_horiz8_s32_s16(
const int16x8_t s0, const int16x8_t s1, const int16x8_t x_filter_0_7) {
int32x4_t sum0, sum1;
highbd_convolve8_horiz8_s32(s0, s1, x_filter_0_7, &sum0, &sum1);
return vcombine_u16(vqrshrun_n_s32(sum0, FILTER_BITS),
vqrshrun_n_s32(sum1, FILTER_BITS));
}
static INLINE uint16x8_t highbd_convolve8_8_s32_s16(
const int16x8_t s0, const int16x8_t s1, const int16x8_t s2,
const int16x8_t s3, const int16x8_t s4, const int16x8_t s5,
const int16x8_t s6, const int16x8_t s7, const int16x8_t y_filter) {
int32x4_t sum0;
int32x4_t sum1;
highbd_convolve8_8_s32(s0, s1, s2, s3, s4, s5, s6, s7, y_filter, &sum0,
&sum1);
return vcombine_u16(vqrshrun_n_s32(sum0, FILTER_BITS),
vqrshrun_n_s32(sum1, FILTER_BITS));
}
static void highbd_convolve_horiz_neon(const uint16_t *src_ptr,
ptrdiff_t src_stride, uint16_t *dst_ptr,
ptrdiff_t dst_stride,
const int16_t *x_filter_ptr,
int x_step_q4, int w, int h, int bd) {
static void highbd_convolve_horiz_8tap_neon(
const uint16_t *src_ptr, ptrdiff_t src_stride, uint16_t *dst_ptr,
ptrdiff_t dst_stride, const int16_t *x_filter_ptr, int w, int h, int bd) {
assert(w >= 4 && h >= 4);
const uint16x8_t max = vdupq_n_u16((1 << bd) - 1);
const int16x8_t x_filter = vld1q_s16(x_filter_ptr);
if (w == 4) {
const uint16x4_t max = vdup_n_u16((1 << bd) - 1);
const int16_t *s = (const int16_t *)src_ptr;
uint16_t *d = dst_ptr;
do {
int16x8_t s0, s1, s2, s3;
load_s16_8x2(s, src_stride, &s0, &s2);
load_s16_8x2(s + 8, src_stride, &s1, &s3);
int16x4_t s0[8], s1[8], s2[8], s3[8];
load_s16_4x8(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3],
&s0[4], &s0[5], &s0[6], &s0[7]);
load_s16_4x8(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3],
&s1[4], &s1[5], &s1[6], &s1[7]);
load_s16_4x8(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3],
&s2[4], &s2[5], &s2[6], &s2[7]);
load_s16_4x8(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3],
&s3[4], &s3[5], &s3[6], &s3[7]);
uint16x4_t d0 = highbd_convolve8_horiz4_s32_s16(s0, s1, x_filter);
uint16x4_t d1 = highbd_convolve8_horiz4_s32_s16(s2, s3, x_filter);
uint16x4_t d0 = highbd_convolve8_4(s0[0], s0[1], s0[2], s0[3], s0[4],
s0[5], s0[6], s0[7], x_filter, max);
uint16x4_t d1 = highbd_convolve8_4(s1[0], s1[1], s1[2], s1[3], s1[4],
s1[5], s1[6], s1[7], x_filter, max);
uint16x4_t d2 = highbd_convolve8_4(s2[0], s2[1], s2[2], s2[3], s2[4],
s2[5], s2[6], s2[7], x_filter, max);
uint16x4_t d3 = highbd_convolve8_4(s3[0], s3[1], s3[2], s3[3], s3[4],
s3[5], s3[6], s3[7], x_filter, max);
uint16x8_t d01 = vcombine_u16(d0, d1);
d01 = vminq_u16(d01, max);
store_u16_4x4(d, dst_stride, d0, d1, d2, d3);
vst1_u16(d + 0 * dst_stride, vget_low_u16(d01));
vst1_u16(d + 1 * dst_stride, vget_high_u16(d01));
s += 2 * src_stride;
d += 2 * dst_stride;
h -= 2;
s += 4 * src_stride;
d += 4 * dst_stride;
h -= 4;
} while (h > 0);
} else {
const uint16x8_t max = vdupq_n_u16((1 << bd) - 1);
int height = h;
do {
int width = w;
const int16_t *s = (const int16_t *)src_ptr;
uint16_t *d = dst_ptr;
int x_q4 = 0;
const int16_t *src_x = &s[x_q4 >> SUBPEL_BITS];
int16x8_t s0, s2, s4, s6;
load_s16_8x4(src_x, src_stride, &s0, &s2, &s4, &s6);
src_x += 8;
do {
int16x8_t s1, s3, s5, s7;
load_s16_8x4(src_x, src_stride, &s1, &s3, &s5, &s7);
int16x8_t s0[8], s1[8], s2[8], s3[8];
load_s16_8x8(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3],
&s0[4], &s0[5], &s0[6], &s0[7]);
load_s16_8x8(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3],
&s1[4], &s1[5], &s1[6], &s1[7]);
load_s16_8x8(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3],
&s2[4], &s2[5], &s2[6], &s2[7]);
load_s16_8x8(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3],
&s3[4], &s3[5], &s3[6], &s3[7]);
uint16x8_t d0 = highbd_convolve8_horiz8_s32_s16(s0, s1, x_filter);
uint16x8_t d1 = highbd_convolve8_horiz8_s32_s16(s2, s3, x_filter);
uint16x8_t d2 = highbd_convolve8_horiz8_s32_s16(s4, s5, x_filter);
uint16x8_t d3 = highbd_convolve8_horiz8_s32_s16(s6, s7, x_filter);
d0 = vminq_u16(d0, max);
d1 = vminq_u16(d1, max);
d2 = vminq_u16(d2, max);
d3 = vminq_u16(d3, max);
uint16x8_t d0 = highbd_convolve8_8(s0[0], s0[1], s0[2], s0[3], s0[4],
s0[5], s0[6], s0[7], x_filter, max);
uint16x8_t d1 = highbd_convolve8_8(s1[0], s1[1], s1[2], s1[3], s1[4],
s1[5], s1[6], s1[7], x_filter, max);
uint16x8_t d2 = highbd_convolve8_8(s2[0], s2[1], s2[2], s2[3], s2[4],
s2[5], s2[6], s2[7], x_filter, max);
uint16x8_t d3 = highbd_convolve8_8(s3[0], s3[1], s3[2], s3[3], s3[4],
s3[5], s3[6], s3[7], x_filter, max);
store_u16_8x4(d, dst_stride, d0, d1, d2, d3);
s0 = s1;
s2 = s3;
s4 = s5;
s6 = s7;
src_x += 8;
s += 8;
d += 8;
width -= 8;
} while (width > 0);
src_ptr += 4 * src_stride;
dst_ptr += 4 * dst_stride;
height -= 4;
} while (height > 0);
}
}
static void highbd_convolve_horiz_4tap_neon(
const uint16_t *src_ptr, ptrdiff_t src_stride, uint16_t *dst_ptr,
ptrdiff_t dst_stride, const int16_t *x_filter_ptr, int w, int h, int bd) {
assert(w >= 4 && h >= 4);
const int16x4_t x_filter = vld1_s16(x_filter_ptr + 2);
if (w == 4) {
const uint16x4_t max = vdup_n_u16((1 << bd) - 1);
const int16_t *s = (const int16_t *)src_ptr;
uint16_t *d = dst_ptr;
do {
int16x4_t s0[4], s1[4], s2[4], s3[4];
load_s16_4x4(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3]);
load_s16_4x4(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3]);
load_s16_4x4(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3]);
load_s16_4x4(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3]);
uint16x4_t d0 =
highbd_convolve4_4(s0[0], s0[1], s0[2], s0[3], x_filter, max);
uint16x4_t d1 =
highbd_convolve4_4(s1[0], s1[1], s1[2], s1[3], x_filter, max);
uint16x4_t d2 =
highbd_convolve4_4(s2[0], s2[1], s2[2], s2[3], x_filter, max);
uint16x4_t d3 =
highbd_convolve4_4(s3[0], s3[1], s3[2], s3[3], x_filter, max);
store_u16_4x4(d, dst_stride, d0, d1, d2, d3);
s += 4 * src_stride;
d += 4 * dst_stride;
h -= 4;
} while (h > 0);
} else {
const uint16x8_t max = vdupq_n_u16((1 << bd) - 1);
int height = h;
do {
int width = w;
const int16_t *s = (const int16_t *)src_ptr;
uint16_t *d = dst_ptr;
do {
int16x8_t s0[4], s1[4], s2[4], s3[4];
load_s16_8x4(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3]);
load_s16_8x4(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3]);
load_s16_8x4(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3]);
load_s16_8x4(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3]);
uint16x8_t d0 =
highbd_convolve4_8(s0[0], s0[1], s0[2], s0[3], x_filter, max);
uint16x8_t d1 =
highbd_convolve4_8(s1[0], s1[1], s1[2], s1[3], x_filter, max);
uint16x8_t d2 =
highbd_convolve4_8(s2[0], s2[1], s2[2], s2[3], x_filter, max);
uint16x8_t d3 =
highbd_convolve4_8(s3[0], s3[1], s3[2], s3[3], x_filter, max);
store_u16_8x4(d, dst_stride, d0, d1, d2, d3);
s += 8;
d += 8;
width -= 8;
x_q4 += 8 * x_step_q4;
} while (width > 0);
src_ptr += 4 * src_stride;
dst_ptr += 4 * dst_stride;
@ -236,21 +245,30 @@ void aom_highbd_convolve8_horiz_neon(const uint8_t *src8, ptrdiff_t src_stride,
uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
src -= SUBPEL_TAPS / 2 - 1;
highbd_convolve_horiz_neon(src, src_stride, dst, dst_stride, filter_x,
x_step_q4, w, h, bd);
const int filter_taps = get_filter_taps_convolve8(filter_x);
if (filter_taps == 2) {
highbd_convolve8_horiz_2tap_neon(src + 3, src_stride, dst, dst_stride,
filter_x, w, h, bd);
} else if (filter_taps == 4) {
highbd_convolve_horiz_4tap_neon(src + 2, src_stride, dst, dst_stride,
filter_x, w, h, bd);
} else {
highbd_convolve_horiz_8tap_neon(src, src_stride, dst, dst_stride,
filter_x, w, h, bd);
}
}
}
static void highbd_convolve_vert_neon(const uint16_t *src_ptr,
ptrdiff_t src_stride, uint16_t *dst_ptr,
ptrdiff_t dst_stride,
const int16_t *y_filter_ptr, int w, int h,
int bd) {
static void highbd_convolve_vert_8tap_neon(
const uint16_t *src_ptr, ptrdiff_t src_stride, uint16_t *dst_ptr,
ptrdiff_t dst_stride, const int16_t *y_filter_ptr, int w, int h, int bd) {
assert(w >= 4 && h >= 4);
const int16x8_t y_filter = vld1q_s16(y_filter_ptr);
const uint16x8_t max = vdupq_n_u16((1 << bd) - 1);
if (w == 4) {
const uint16x4_t max = vdup_n_u16((1 << bd) - 1);
const int16_t *s = (const int16_t *)src_ptr;
uint16_t *d = dst_ptr;
@ -263,24 +281,15 @@ static void highbd_convolve_vert_neon(const uint16_t *src_ptr,
load_s16_4x4(s, src_stride, &s7, &s8, &s9, &s10);
uint16x4_t d0 =
highbd_convolve8_4_s32_s16(s0, s1, s2, s3, s4, s5, s6, s7, y_filter);
highbd_convolve8_4(s0, s1, s2, s3, s4, s5, s6, s7, y_filter, max);
uint16x4_t d1 =
highbd_convolve8_4_s32_s16(s1, s2, s3, s4, s5, s6, s7, s8, y_filter);
highbd_convolve8_4(s1, s2, s3, s4, s5, s6, s7, s8, y_filter, max);
uint16x4_t d2 =
highbd_convolve8_4_s32_s16(s2, s3, s4, s5, s6, s7, s8, s9, y_filter);
highbd_convolve8_4(s2, s3, s4, s5, s6, s7, s8, s9, y_filter, max);
uint16x4_t d3 =
highbd_convolve8_4_s32_s16(s3, s4, s5, s6, s7, s8, s9, s10, y_filter);
highbd_convolve8_4(s3, s4, s5, s6, s7, s8, s9, s10, y_filter, max);
uint16x8_t d01 = vcombine_u16(d0, d1);
uint16x8_t d23 = vcombine_u16(d2, d3);
d01 = vminq_u16(d01, max);
d23 = vminq_u16(d23, max);
vst1_u16(d + 0 * dst_stride, vget_low_u16(d01));
vst1_u16(d + 1 * dst_stride, vget_high_u16(d01));
vst1_u16(d + 2 * dst_stride, vget_low_u16(d23));
vst1_u16(d + 3 * dst_stride, vget_high_u16(d23));
store_u16_4x4(d, dst_stride, d0, d1, d2, d3);
s0 = s4;
s1 = s5;
@ -289,11 +298,14 @@ static void highbd_convolve_vert_neon(const uint16_t *src_ptr,
s4 = s8;
s5 = s9;
s6 = s10;
s += 4 * src_stride;
d += 4 * dst_stride;
h -= 4;
} while (h > 0);
} else {
const uint16x8_t max = vdupq_n_u16((1 << bd) - 1);
do {
int height = h;
const int16_t *s = (const int16_t *)src_ptr;
@ -307,19 +319,14 @@ static void highbd_convolve_vert_neon(const uint16_t *src_ptr,
int16x8_t s7, s8, s9, s10;
load_s16_8x4(s, src_stride, &s7, &s8, &s9, &s10);
uint16x8_t d0 = highbd_convolve8_8_s32_s16(s0, s1, s2, s3, s4, s5, s6,
s7, y_filter);
uint16x8_t d1 = highbd_convolve8_8_s32_s16(s1, s2, s3, s4, s5, s6, s7,
s8, y_filter);
uint16x8_t d2 = highbd_convolve8_8_s32_s16(s2, s3, s4, s5, s6, s7, s8,
s9, y_filter);
uint16x8_t d3 = highbd_convolve8_8_s32_s16(s3, s4, s5, s6, s7, s8, s9,
s10, y_filter);
d0 = vminq_u16(d0, max);
d1 = vminq_u16(d1, max);
d2 = vminq_u16(d2, max);
d3 = vminq_u16(d3, max);
uint16x8_t d0 =
highbd_convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, y_filter, max);
uint16x8_t d1 =
highbd_convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, y_filter, max);
uint16x8_t d2 =
highbd_convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, y_filter, max);
uint16x8_t d3 =
highbd_convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, y_filter, max);
store_u16_8x4(d, dst_stride, d0, d1, d2, d3);
@ -330,6 +337,7 @@ static void highbd_convolve_vert_neon(const uint16_t *src_ptr,
s4 = s8;
s5 = s9;
s6 = s10;
s += 4 * src_stride;
d += 4 * dst_stride;
height -= 4;
@ -357,7 +365,18 @@ void aom_highbd_convolve8_vert_neon(const uint8_t *src8, ptrdiff_t src_stride,
uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
src -= (SUBPEL_TAPS / 2 - 1) * src_stride;
highbd_convolve_vert_neon(src, src_stride, dst, dst_stride, filter_y, w, h,
bd);
const int filter_taps = get_filter_taps_convolve8(filter_y);
if (filter_taps == 2) {
highbd_convolve8_vert_2tap_neon(src + 3 * src_stride, src_stride, dst,
dst_stride, filter_y, w, h, bd);
} else if (filter_taps == 4) {
highbd_convolve8_vert_4tap_neon(src + 2 * src_stride, src_stride, dst,
dst_stride, filter_y, w, h, bd);
} else {
highbd_convolve_vert_8tap_neon(src, src_stride, dst, dst_stride, filter_y,
w, h, bd);
}
}
}

third_party/aom/aom_dsp/arm/highbd_convolve8_neon.h (vendored, new file, 279 lines)

@ -0,0 +1,279 @@
/*
* Copyright (c) 2024, Alliance for Open Media. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#ifndef AOM_AOM_DSP_ARM_HIGHBD_CONVOLVE8_NEON_H_
#define AOM_AOM_DSP_ARM_HIGHBD_CONVOLVE8_NEON_H_
#include <arm_neon.h>
#include "config/aom_config.h"
#include "aom_dsp/arm/mem_neon.h"
static INLINE void highbd_convolve8_horiz_2tap_neon(
const uint16_t *src_ptr, ptrdiff_t src_stride, uint16_t *dst_ptr,
ptrdiff_t dst_stride, const int16_t *x_filter_ptr, int w, int h, int bd) {
// Bilinear filter values are all positive and multiples of 8. Divide by 8 to
// reduce intermediate precision requirements and allow the use of a
// non-widening multiply.
const uint16x8_t f0 = vdupq_n_u16((uint16_t)x_filter_ptr[3] / 8);
const uint16x8_t f1 = vdupq_n_u16((uint16_t)x_filter_ptr[4] / 8);
const uint16x8_t max = vdupq_n_u16((1 << bd) - 1);
if (w == 4) {
do {
uint16x8_t s0 =
load_unaligned_u16_4x2(src_ptr + 0 * src_stride + 0, (int)src_stride);
uint16x8_t s1 =
load_unaligned_u16_4x2(src_ptr + 0 * src_stride + 1, (int)src_stride);
uint16x8_t s2 =
load_unaligned_u16_4x2(src_ptr + 2 * src_stride + 0, (int)src_stride);
uint16x8_t s3 =
load_unaligned_u16_4x2(src_ptr + 2 * src_stride + 1, (int)src_stride);
uint16x8_t sum01 = vmulq_u16(s0, f0);
sum01 = vmlaq_u16(sum01, s1, f1);
uint16x8_t sum23 = vmulq_u16(s2, f0);
sum23 = vmlaq_u16(sum23, s3, f1);
// We divided filter taps by 8 so subtract 3 from right shift.
sum01 = vrshrq_n_u16(sum01, FILTER_BITS - 3);
sum23 = vrshrq_n_u16(sum23, FILTER_BITS - 3);
sum01 = vminq_u16(sum01, max);
sum23 = vminq_u16(sum23, max);
store_u16x4_strided_x2(dst_ptr + 0 * dst_stride, (int)dst_stride, sum01);
store_u16x4_strided_x2(dst_ptr + 2 * dst_stride, (int)dst_stride, sum23);
src_ptr += 4 * src_stride;
dst_ptr += 4 * dst_stride;
h -= 4;
} while (h > 0);
} else {
do {
int width = w;
const uint16_t *s = src_ptr;
uint16_t *d = dst_ptr;
do {
uint16x8_t s0 = vld1q_u16(s + 0 * src_stride + 0);
uint16x8_t s1 = vld1q_u16(s + 0 * src_stride + 1);
uint16x8_t s2 = vld1q_u16(s + 1 * src_stride + 0);
uint16x8_t s3 = vld1q_u16(s + 1 * src_stride + 1);
uint16x8_t sum01 = vmulq_u16(s0, f0);
sum01 = vmlaq_u16(sum01, s1, f1);
uint16x8_t sum23 = vmulq_u16(s2, f0);
sum23 = vmlaq_u16(sum23, s3, f1);
// We divided filter taps by 8 so subtract 3 from right shift.
sum01 = vrshrq_n_u16(sum01, FILTER_BITS - 3);
sum23 = vrshrq_n_u16(sum23, FILTER_BITS - 3);
sum01 = vminq_u16(sum01, max);
sum23 = vminq_u16(sum23, max);
vst1q_u16(d + 0 * dst_stride, sum01);
vst1q_u16(d + 1 * dst_stride, sum23);
s += 8;
d += 8;
width -= 8;
} while (width != 0);
src_ptr += 2 * src_stride;
dst_ptr += 2 * dst_stride;
h -= 2;
} while (h > 0);
}
}
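
Dividing the bilinear taps by 8 is what keeps every product of the 2-tap high-bitdepth paths within 16 bits (16 * 4095 < 65536 for 12-bit input), allowing the non-widening vmulq_u16/vmlaq_u16 pair; the final shift drops by three bits to compensate. A scalar sketch of the equivalence (highbd_bilinear_2tap is an illustrative name):

```
#include <stdint.h>

#define FILTER_BITS 7

static uint16_t highbd_bilinear_2tap(uint16_t s0, uint16_t s1,
                                     const int16_t filter[8], int bd) {
  const uint16_t f0 = (uint16_t)filter[3] / 8;  // taps are multiples of 8
  const uint16_t f1 = (uint16_t)filter[4] / 8;
  uint16_t sum = (uint16_t)(f0 * s0 + f1 * s1);  // fits in 16 bits for bd <= 12
  // Taps were divided by 8, so shift by three bits fewer and round accordingly.
  uint16_t res = (uint16_t)((sum + (1 << (FILTER_BITS - 4))) >> (FILTER_BITS - 3));
  const uint16_t max = (uint16_t)((1 << bd) - 1);
  return res > max ? max : res;
}
```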
static INLINE uint16x4_t highbd_convolve4_4(
const int16x4_t s0, const int16x4_t s1, const int16x4_t s2,
const int16x4_t s3, const int16x4_t filter, const uint16x4_t max) {
int32x4_t sum = vmull_lane_s16(s0, filter, 0);
sum = vmlal_lane_s16(sum, s1, filter, 1);
sum = vmlal_lane_s16(sum, s2, filter, 2);
sum = vmlal_lane_s16(sum, s3, filter, 3);
uint16x4_t res = vqrshrun_n_s32(sum, FILTER_BITS);
return vmin_u16(res, max);
}
static INLINE uint16x8_t highbd_convolve4_8(
const int16x8_t s0, const int16x8_t s1, const int16x8_t s2,
const int16x8_t s3, const int16x4_t filter, const uint16x8_t max) {
int32x4_t sum0 = vmull_lane_s16(vget_low_s16(s0), filter, 0);
sum0 = vmlal_lane_s16(sum0, vget_low_s16(s1), filter, 1);
sum0 = vmlal_lane_s16(sum0, vget_low_s16(s2), filter, 2);
sum0 = vmlal_lane_s16(sum0, vget_low_s16(s3), filter, 3);
int32x4_t sum1 = vmull_lane_s16(vget_high_s16(s0), filter, 0);
sum1 = vmlal_lane_s16(sum1, vget_high_s16(s1), filter, 1);
sum1 = vmlal_lane_s16(sum1, vget_high_s16(s2), filter, 2);
sum1 = vmlal_lane_s16(sum1, vget_high_s16(s3), filter, 3);
uint16x8_t res = vcombine_u16(vqrshrun_n_s32(sum0, FILTER_BITS),
vqrshrun_n_s32(sum1, FILTER_BITS));
return vminq_u16(res, max);
}
static INLINE void highbd_convolve8_vert_4tap_neon(
const uint16_t *src_ptr, ptrdiff_t src_stride, uint16_t *dst_ptr,
ptrdiff_t dst_stride, const int16_t *y_filter_ptr, int w, int h, int bd) {
assert(w >= 4 && h >= 4);
const int16x4_t y_filter = vld1_s16(y_filter_ptr + 2);
if (w == 4) {
const uint16x4_t max = vdup_n_u16((1 << bd) - 1);
const int16_t *s = (const int16_t *)src_ptr;
uint16_t *d = dst_ptr;
int16x4_t s0, s1, s2;
load_s16_4x3(s, src_stride, &s0, &s1, &s2);
s += 3 * src_stride;
do {
int16x4_t s3, s4, s5, s6;
load_s16_4x4(s, src_stride, &s3, &s4, &s5, &s6);
uint16x4_t d0 = highbd_convolve4_4(s0, s1, s2, s3, y_filter, max);
uint16x4_t d1 = highbd_convolve4_4(s1, s2, s3, s4, y_filter, max);
uint16x4_t d2 = highbd_convolve4_4(s2, s3, s4, s5, y_filter, max);
uint16x4_t d3 = highbd_convolve4_4(s3, s4, s5, s6, y_filter, max);
store_u16_4x4(d, dst_stride, d0, d1, d2, d3);
s0 = s4;
s1 = s5;
s2 = s6;
s += 4 * src_stride;
d += 4 * dst_stride;
h -= 4;
} while (h > 0);
} else {
const uint16x8_t max = vdupq_n_u16((1 << bd) - 1);
do {
int height = h;
const int16_t *s = (const int16_t *)src_ptr;
uint16_t *d = dst_ptr;
int16x8_t s0, s1, s2;
load_s16_8x3(s, src_stride, &s0, &s1, &s2);
s += 3 * src_stride;
do {
int16x8_t s3, s4, s5, s6;
load_s16_8x4(s, src_stride, &s3, &s4, &s5, &s6);
uint16x8_t d0 = highbd_convolve4_8(s0, s1, s2, s3, y_filter, max);
uint16x8_t d1 = highbd_convolve4_8(s1, s2, s3, s4, y_filter, max);
uint16x8_t d2 = highbd_convolve4_8(s2, s3, s4, s5, y_filter, max);
uint16x8_t d3 = highbd_convolve4_8(s3, s4, s5, s6, y_filter, max);
store_u16_8x4(d, dst_stride, d0, d1, d2, d3);
s0 = s4;
s1 = s5;
s2 = s6;
s += 4 * src_stride;
d += 4 * dst_stride;
height -= 4;
} while (height > 0);
src_ptr += 8;
dst_ptr += 8;
w -= 8;
} while (w > 0);
}
}
static INLINE void highbd_convolve8_vert_2tap_neon(
const uint16_t *src_ptr, ptrdiff_t src_stride, uint16_t *dst_ptr,
ptrdiff_t dst_stride, const int16_t *x_filter_ptr, int w, int h, int bd) {
// Bilinear filter values are all positive and multiples of 8. Divide by 8 to
// reduce intermediate precision requirements and allow the use of a
// non-widening multiply.
const uint16x8_t f0 = vdupq_n_u16((uint16_t)x_filter_ptr[3] / 8);
const uint16x8_t f1 = vdupq_n_u16((uint16_t)x_filter_ptr[4] / 8);
const uint16x8_t max = vdupq_n_u16((1 << bd) - 1);
if (w == 4) {
do {
uint16x8_t s0 =
load_unaligned_u16_4x2(src_ptr + 0 * src_stride, (int)src_stride);
uint16x8_t s1 =
load_unaligned_u16_4x2(src_ptr + 1 * src_stride, (int)src_stride);
uint16x8_t s2 =
load_unaligned_u16_4x2(src_ptr + 2 * src_stride, (int)src_stride);
uint16x8_t s3 =
load_unaligned_u16_4x2(src_ptr + 3 * src_stride, (int)src_stride);
uint16x8_t sum01 = vmulq_u16(s0, f0);
sum01 = vmlaq_u16(sum01, s1, f1);
uint16x8_t sum23 = vmulq_u16(s2, f0);
sum23 = vmlaq_u16(sum23, s3, f1);
// We divided filter taps by 8 so subtract 3 from right shift.
sum01 = vrshrq_n_u16(sum01, FILTER_BITS - 3);
sum23 = vrshrq_n_u16(sum23, FILTER_BITS - 3);
sum01 = vminq_u16(sum01, max);
sum23 = vminq_u16(sum23, max);
store_u16x4_strided_x2(dst_ptr + 0 * dst_stride, (int)dst_stride, sum01);
store_u16x4_strided_x2(dst_ptr + 2 * dst_stride, (int)dst_stride, sum23);
src_ptr += 4 * src_stride;
dst_ptr += 4 * dst_stride;
h -= 4;
} while (h > 0);
} else {
do {
int width = w;
const uint16_t *s = src_ptr;
uint16_t *d = dst_ptr;
do {
uint16x8_t s0, s1, s2;
load_u16_8x3(s, src_stride, &s0, &s1, &s2);
uint16x8_t sum01 = vmulq_u16(s0, f0);
sum01 = vmlaq_u16(sum01, s1, f1);
uint16x8_t sum23 = vmulq_u16(s1, f0);
sum23 = vmlaq_u16(sum23, s2, f1);
// We divided filter taps by 8 so subtract 3 from right shift.
sum01 = vrshrq_n_u16(sum01, FILTER_BITS - 3);
sum23 = vrshrq_n_u16(sum23, FILTER_BITS - 3);
sum01 = vminq_u16(sum01, max);
sum23 = vminq_u16(sum23, max);
vst1q_u16(d + 0 * dst_stride, sum01);
vst1q_u16(d + 1 * dst_stride, sum23);
s += 8;
d += 8;
width -= 8;
} while (width != 0);
src_ptr += 2 * src_stride;
dst_ptr += 2 * dst_stride;
h -= 2;
} while (h > 0);
}
}
#endif // AOM_AOM_DSP_ARM_HIGHBD_CONVOLVE8_NEON_H_


@ -18,6 +18,7 @@
#include "aom_dsp/arm/aom_neon_sve_bridge.h"
#include "aom_dsp/arm/aom_filter.h"
#include "aom_dsp/arm/highbd_convolve8_neon.h"
#include "aom_dsp/arm/mem_neon.h"
static INLINE uint16x4_t highbd_convolve8_4_h(int16x8_t s[4], int16x8_t filter,
@ -252,7 +253,12 @@ void aom_highbd_convolve8_horiz_sve(const uint8_t *src8, ptrdiff_t src_stride,
src -= SUBPEL_TAPS / 2 - 1;
if (get_filter_taps_convolve8(filter_x) <= 4) {
const int filter_taps = get_filter_taps_convolve8(filter_x);
if (filter_taps == 2) {
highbd_convolve8_horiz_2tap_neon(src + 3, src_stride, dst, dst_stride,
filter_x, width, height, bd);
} else if (filter_taps == 4) {
highbd_convolve8_horiz_4tap_sve(src + 2, src_stride, dst, dst_stride,
filter_x, width, height, bd);
} else {
@ -534,134 +540,13 @@ static INLINE void highbd_convolve8_vert_8tap_sve(
}
}
static INLINE uint16x4_t highbd_convolve4_4_v(int16x8_t s[2], int16x8_t filter,
uint16x4_t max) {
int64x2_t sum01 = aom_svdot_lane_s16(vdupq_n_s64(0), s[0], filter, 0);
int64x2_t sum23 = aom_svdot_lane_s16(vdupq_n_s64(0), s[1], filter, 0);
int32x4_t sum0123 = vcombine_s32(vmovn_s64(sum01), vmovn_s64(sum23));
uint16x4_t res = vqrshrun_n_s32(sum0123, FILTER_BITS);
return vmin_u16(res, max);
}
static INLINE uint16x8_t highbd_convolve4_8_v(int16x8_t s[4], int16x8_t filter,
uint16x8_t max) {
int64x2_t sum01 = aom_svdot_lane_s16(vdupq_n_s64(0), s[0], filter, 0);
int64x2_t sum23 = aom_svdot_lane_s16(vdupq_n_s64(0), s[1], filter, 0);
int64x2_t sum45 = aom_svdot_lane_s16(vdupq_n_s64(0), s[2], filter, 0);
int64x2_t sum67 = aom_svdot_lane_s16(vdupq_n_s64(0), s[3], filter, 0);
int32x4_t s0123 = vcombine_s32(vmovn_s64(sum01), vmovn_s64(sum23));
int32x4_t s4567 = vcombine_s32(vmovn_s64(sum45), vmovn_s64(sum67));
uint16x8_t res = vcombine_u16(vqrshrun_n_s32(s0123, FILTER_BITS),
vqrshrun_n_s32(s4567, FILTER_BITS));
return vminq_u16(res, max);
}
static INLINE void highbd_convolve8_vert_4tap_sve(
const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst,
ptrdiff_t dst_stride, const int16_t *filter_y, int width, int height,
int bd) {
const int16x8_t y_filter =
vcombine_s16(vld1_s16(filter_y + 2), vdup_n_s16(0));
uint8x16_t merge_block_tbl[3];
merge_block_tbl[0] = vld1q_u8(kDotProdMergeBlockTbl);
merge_block_tbl[1] = vld1q_u8(kDotProdMergeBlockTbl + 16);
merge_block_tbl[2] = vld1q_u8(kDotProdMergeBlockTbl + 32);
if (width == 4) {
const uint16x4_t max = vdup_n_u16((1 << bd) - 1);
int16_t *s = (int16_t *)src;
int16x4_t s0, s1, s2;
load_s16_4x3(s, src_stride, &s0, &s1, &s2);
s += 3 * src_stride;
do {
int16x4_t s3, s4, s5, s6;
load_s16_4x4(s, src_stride, &s3, &s4, &s5, &s6);
// This operation combines a conventional transpose and the sample permute
// required before computing the dot product.
int16x8_t s0123[2], s1234[2], s2345[2], s3456[2];
transpose_concat_4x4(s0, s1, s2, s3, s0123);
transpose_concat_4x4(s1, s2, s3, s4, s1234);
transpose_concat_4x4(s2, s3, s4, s5, s2345);
transpose_concat_4x4(s3, s4, s5, s6, s3456);
uint16x4_t d0 = highbd_convolve4_4_v(s0123, y_filter, max);
uint16x4_t d1 = highbd_convolve4_4_v(s1234, y_filter, max);
uint16x4_t d2 = highbd_convolve4_4_v(s2345, y_filter, max);
uint16x4_t d3 = highbd_convolve4_4_v(s3456, y_filter, max);
store_u16_4x4(dst, dst_stride, d0, d1, d2, d3);
// Shuffle everything up four rows.
s0 = s4;
s1 = s5;
s2 = s6;
s += 4 * src_stride;
dst += 4 * dst_stride;
height -= 4;
} while (height != 0);
} else {
const uint16x8_t max = vdupq_n_u16((1 << bd) - 1);
do {
int h = height;
int16_t *s = (int16_t *)src;
uint16_t *d = dst;
int16x8_t s0, s1, s2;
load_s16_8x3(s, src_stride, &s0, &s1, &s2);
s += 3 * src_stride;
do {
int16x8_t s3, s4, s5, s6;
load_s16_8x4(s, src_stride, &s3, &s4, &s5, &s6);
// This operation combines a conventional transpose and the sample
// permute required before computing the dot product.
int16x8_t s0123[4], s1234[4], s2345[4], s3456[4];
transpose_concat_8x4(s0, s1, s2, s3, s0123);
transpose_concat_8x4(s1, s2, s3, s4, s1234);
transpose_concat_8x4(s2, s3, s4, s5, s2345);
transpose_concat_8x4(s3, s4, s5, s6, s3456);
uint16x8_t d0 = highbd_convolve4_8_v(s0123, y_filter, max);
uint16x8_t d1 = highbd_convolve4_8_v(s1234, y_filter, max);
uint16x8_t d2 = highbd_convolve4_8_v(s2345, y_filter, max);
uint16x8_t d3 = highbd_convolve4_8_v(s3456, y_filter, max);
store_u16_8x4(d, dst_stride, d0, d1, d2, d3);
// Shuffle everything up four rows.
s0 = s4;
s1 = s5;
s2 = s6;
s += 4 * src_stride;
d += 4 * dst_stride;
h -= 4;
} while (h != 0);
src += 8;
dst += 8;
width -= 8;
} while (width != 0);
}
}
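
The transpose_concat_4x4()/transpose_concat_8x4() step used by these vertical dot-product kernels regroups four consecutive rows so that each output pixel's vertical neighbours sit contiguously, turning the vertical filter into an ordinary per-pixel dot product. A scalar picture of that regrouping, assuming the same element ordering as the 8-bit variant documented earlier in this patch:

```
#include <stdint.h>

// Gather column i of rows r0..r3 into four adjacent output elements so a
// 4-tap vertical filter becomes a contiguous 4-element dot product.
static void transpose_concat_4x4_scalar(const int16_t r0[4], const int16_t r1[4],
                                        const int16_t r2[4], const int16_t r3[4],
                                        int16_t out[16]) {
  for (int i = 0; i < 4; ++i) {
    out[4 * i + 0] = r0[i];
    out[4 * i + 1] = r1[i];
    out[4 * i + 2] = r2[i];
    out[4 * i + 3] = r3[i];
  }
}
```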
void aom_highbd_convolve8_vert_sve(const uint8_t *src8, ptrdiff_t src_stride,
uint8_t *dst8, ptrdiff_t dst_stride,
const int16_t *filter_x, int x_step_q4,
const int16_t *filter_y, int y_step_q4,
int width, int height, int bd) {
assert(y_step_q4 == 16);
assert(w >= 4 && h >= 4);
assert(width >= 4 && height >= 4);
(void)filter_x;
(void)y_step_q4;
(void)x_step_q4;
@ -671,9 +556,14 @@ void aom_highbd_convolve8_vert_sve(const uint8_t *src8, ptrdiff_t src_stride,
src -= (SUBPEL_TAPS / 2 - 1) * src_stride;
if (get_filter_taps_convolve8(filter_y) <= 4) {
highbd_convolve8_vert_4tap_sve(src + 2 * src_stride, src_stride, dst,
dst_stride, filter_y, width, height, bd);
const int filter_taps = get_filter_taps_convolve8(filter_y);
if (filter_taps == 2) {
highbd_convolve8_vert_2tap_neon(src + 3 * src_stride, src_stride, dst,
dst_stride, filter_y, width, height, bd);
} else if (filter_taps == 4) {
highbd_convolve8_vert_4tap_neon(src + 2 * src_stride, src_stride, dst,
dst_stride, filter_y, width, height, bd);
} else {
highbd_convolve8_vert_8tap_sve(src, src_stride, dst, dst_stride, filter_y,
width, height, bd);


@ -1201,7 +1201,7 @@ HIGHBD_SMOOTH_H_NXM(8, 32)
// For width 16 and above.
#define HIGHBD_SMOOTH_H_PREDICTOR(W) \
void highbd_smooth_h_##W##xh_neon( \
static void highbd_smooth_h_##W##xh_neon( \
uint16_t *dst, ptrdiff_t stride, const uint16_t *const top_row, \
const uint16_t *const left_column, const int height) { \
const uint16_t top_right = top_row[(W)-1]; \
@ -1293,6 +1293,33 @@ static AOM_FORCE_INLINE uint16x8_t highbd_dr_z1_apply_shift_x8(uint16x8_t a0,
highbd_dr_z1_apply_shift_x4(vget_high_u16(a0), vget_high_u16(a1), shift));
}
// clang-format off
static const uint8_t kLoadMaxShuffles[] = {
14, 15, 14, 15, 14, 15, 14, 15, 14, 15, 14, 15, 14, 15, 14, 15,
12, 13, 14, 15, 14, 15, 14, 15, 14, 15, 14, 15, 14, 15, 14, 15,
10, 11, 12, 13, 14, 15, 14, 15, 14, 15, 14, 15, 14, 15, 14, 15,
8, 9, 10, 11, 12, 13, 14, 15, 14, 15, 14, 15, 14, 15, 14, 15,
6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 14, 15, 14, 15, 14, 15,
4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 14, 15, 14, 15,
2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 14, 15,
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
};
// clang-format on
static INLINE uint16x8_t zn_load_masked_neon(const uint16_t *ptr,
int shuffle_idx) {
uint8x16_t shuffle = vld1q_u8(&kLoadMaxShuffles[16 * shuffle_idx]);
uint8x16_t src = vreinterpretq_u8_u16(vld1q_u16(ptr));
#if AOM_ARCH_AARCH64
return vreinterpretq_u16_u8(vqtbl1q_u8(src, shuffle));
#else
uint8x8x2_t src2 = { { vget_low_u8(src), vget_high_u8(src) } };
uint8x8_t lo = vtbl2_u8(src2, vget_low_u8(shuffle));
uint8x8_t hi = vtbl2_u8(src2, vget_high_u8(shuffle));
return vreinterpretq_u16_u8(vcombine_u8(lo, hi));
#endif
}
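
zn_load_masked_neon() replaces the earlier per-lane select against the clamped boundary value: when a load would run past the last valid reference sample, it loads the eight samples ending at that boundary and uses kLoadMaxShuffles to replicate the last one into the out-of-range lanes. A scalar model of the intended semantics (load_masked_scalar and last_valid are illustrative names):

```
#include <stdint.h>

// out[i] = buf[min(pos + i, last_valid)]: lanes past the boundary are filled
// with the last valid sample instead of reading out of bounds.
static void load_masked_scalar(const uint16_t *buf, int pos, int last_valid,
                               uint16_t out[8]) {
  for (int i = 0; i < 8; ++i) {
    const int p = pos + i;
    out[i] = buf[p < last_valid ? p : last_valid];
  }
}
```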
static void highbd_dr_prediction_z1_upsample0_neon(uint16_t *dst,
ptrdiff_t stride, int bw,
int bh,
@ -1336,13 +1363,26 @@ static void highbd_dr_prediction_z1_upsample0_neon(uint16_t *dst,
} else {
int c = 0;
do {
const uint16x8_t a0 = vld1q_u16(&above[base + c]);
const uint16x8_t a1 = vld1q_u16(&above[base + c + 1]);
const uint16x8_t val = highbd_dr_z1_apply_shift_x8(a0, a1, shift);
const uint16x8_t cmp =
vcgtq_s16(vdupq_n_s16(max_base_x - base - c), iota1x8);
const uint16x8_t res = vbslq_u16(cmp, val, vdupq_n_u16(above_max));
vst1q_u16(dst + c, res);
uint16x8_t a0;
uint16x8_t a1;
if (base + c >= max_base_x) {
a0 = a1 = vdupq_n_u16(above_max);
} else {
if (base + c + 7 >= max_base_x) {
int shuffle_idx = max_base_x - base - c;
a0 = zn_load_masked_neon(above + (max_base_x - 7), shuffle_idx);
} else {
a0 = vld1q_u16(above + base + c);
}
if (base + c + 8 >= max_base_x) {
int shuffle_idx = max_base_x - base - c - 1;
a1 = zn_load_masked_neon(above + (max_base_x - 7), shuffle_idx);
} else {
a1 = vld1q_u16(above + base + c + 1);
}
}
vst1q_u16(dst + c, highbd_dr_z1_apply_shift_x8(a0, a1, shift));
c += 8;
} while (c < bw);
}
@ -2456,13 +2496,29 @@ void av1_highbd_dr_prediction_z2_neon(uint16_t *dst, ptrdiff_t stride, int bw,
val_lo = vmlal_lane_u16(val_lo, vget_low_u16(in1), (s1), (lane)); \
uint32x4_t val_hi = vmull_lane_u16(vget_high_u16(in0), (s0), (lane)); \
val_hi = vmlal_lane_u16(val_hi, vget_high_u16(in1), (s1), (lane)); \
const uint16x8_t cmp = vaddq_u16((iota), vdupq_n_u16(base)); \
const uint16x8_t res = vcombine_u16(vrshrn_n_u32(val_lo, (shift)), \
vrshrn_n_u32(val_hi, (shift))); \
*(out) = vbslq_u16(vcltq_u16(cmp, vdupq_n_u16(max_base_y)), res, \
vdupq_n_u16(left_max)); \
*(out) = vcombine_u16(vrshrn_n_u32(val_lo, (shift)), \
vrshrn_n_u32(val_hi, (shift))); \
} while (0)
static INLINE uint16x8x2_t z3_load_left_neon(const uint16_t *left0, int ofs,
int max_ofs) {
uint16x8_t r0;
uint16x8_t r1;
if (ofs + 7 >= max_ofs) {
int shuffle_idx = max_ofs - ofs;
r0 = zn_load_masked_neon(left0 + (max_ofs - 7), shuffle_idx);
} else {
r0 = vld1q_u16(left0 + ofs);
}
if (ofs + 8 >= max_ofs) {
int shuffle_idx = max_ofs - ofs - 1;
r1 = zn_load_masked_neon(left0 + (max_ofs - 7), shuffle_idx);
} else {
r1 = vld1q_u16(left0 + ofs + 1);
}
return (uint16x8x2_t){ { r0, r1 } };
}
static void highbd_dr_prediction_z3_upsample0_neon(uint16_t *dst,
ptrdiff_t stride, int bw,
int bh, const uint16_t *left,
@ -2561,34 +2617,30 @@ static void highbd_dr_prediction_z3_upsample0_neon(uint16_t *dst,
if (base0 >= max_base_y) {
out[0] = vdupq_n_u16(left_max);
} else {
const uint16x8_t l00 = vld1q_u16(left + base0);
const uint16x8_t l01 = vld1q_u16(left1 + base0);
HIGHBD_DR_PREDICTOR_Z3_STEP_X8(&out[0], iota1x8, base0, l00, l01,
shifts0, shifts1, 0, 6);
const uint16x8x2_t l0 = z3_load_left_neon(left, base0, max_base_y);
HIGHBD_DR_PREDICTOR_Z3_STEP_X8(&out[0], iota1x8, base0, l0.val[0],
l0.val[1], shifts0, shifts1, 0, 6);
}
if (base1 >= max_base_y) {
out[1] = vdupq_n_u16(left_max);
} else {
const uint16x8_t l10 = vld1q_u16(left + base1);
const uint16x8_t l11 = vld1q_u16(left1 + base1);
HIGHBD_DR_PREDICTOR_Z3_STEP_X8(&out[1], iota1x8, base1, l10, l11,
shifts0, shifts1, 1, 6);
const uint16x8x2_t l1 = z3_load_left_neon(left, base1, max_base_y);
HIGHBD_DR_PREDICTOR_Z3_STEP_X8(&out[1], iota1x8, base1, l1.val[0],
l1.val[1], shifts0, shifts1, 1, 6);
}
if (base2 >= max_base_y) {
out[2] = vdupq_n_u16(left_max);
} else {
const uint16x8_t l20 = vld1q_u16(left + base2);
const uint16x8_t l21 = vld1q_u16(left1 + base2);
HIGHBD_DR_PREDICTOR_Z3_STEP_X8(&out[2], iota1x8, base2, l20, l21,
shifts0, shifts1, 2, 6);
const uint16x8x2_t l2 = z3_load_left_neon(left, base2, max_base_y);
HIGHBD_DR_PREDICTOR_Z3_STEP_X8(&out[2], iota1x8, base2, l2.val[0],
l2.val[1], shifts0, shifts1, 2, 6);
}
if (base3 >= max_base_y) {
out[3] = vdupq_n_u16(left_max);
} else {
const uint16x8_t l30 = vld1q_u16(left + base3);
const uint16x8_t l31 = vld1q_u16(left1 + base3);
HIGHBD_DR_PREDICTOR_Z3_STEP_X8(&out[3], iota1x8, base3, l30, l31,
shifts0, shifts1, 3, 6);
const uint16x8x2_t l3 = z3_load_left_neon(left, base3, max_base_y);
HIGHBD_DR_PREDICTOR_Z3_STEP_X8(&out[3], iota1x8, base3, l3.val[0],
l3.val[1], shifts0, shifts1, 3, 6);
}
transpose_array_inplace_u16_4x8(out);
for (int r2 = 0; r2 < 4; ++r2) {


@ -14,6 +14,7 @@
#include <string.h>
#include "config/aom_config.h"
#include "config/aom_dsp_rtcd.h"
#include "aom_dsp/quantize.h"

third_party/aom/aom_dsp/arm/intrapred_neon.c (vendored, 66 lines changed)

@ -15,6 +15,7 @@
#include "config/aom_config.h"
#include "config/aom_dsp_rtcd.h"
#include "config/av1_rtcd.h"
#include "aom/aom_integer.h"
#include "aom_dsp/arm/mem_neon.h"
@ -1356,6 +1357,41 @@ static void dr_prediction_z1_32xN_neon(int N, uint8_t *dst, ptrdiff_t stride,
}
}
// clang-format off
static const uint8_t kLoadMaxShuffles[] = {
15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
14, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
13, 14, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
12, 13, 14, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
11, 12, 13, 14, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
10, 11, 12, 13, 14, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
9, 10, 11, 12, 13, 14, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
8, 9, 10, 11, 12, 13, 14, 15, 15, 15, 15, 15, 15, 15, 15, 15,
7, 8, 9, 10, 11, 12, 13, 14, 15, 15, 15, 15, 15, 15, 15, 15,
6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 15, 15, 15, 15, 15, 15,
5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 15, 15, 15, 15, 15,
4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 15, 15, 15, 15,
3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 15, 15, 15,
2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 15, 15,
1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 15,
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
};
// clang-format on
static INLINE uint8x16_t z1_load_masked_neon(const uint8_t *ptr,
int shuffle_idx) {
uint8x16_t shuffle = vld1q_u8(&kLoadMaxShuffles[16 * shuffle_idx]);
uint8x16_t src = vld1q_u8(ptr);
#if AOM_ARCH_AARCH64
return vqtbl1q_u8(src, shuffle);
#else
uint8x8x2_t src2 = { { vget_low_u8(src), vget_high_u8(src) } };
uint8x8_t lo = vtbl2_u8(src2, vget_low_u8(shuffle));
uint8x8_t hi = vtbl2_u8(src2, vget_high_u8(shuffle));
return vcombine_u8(lo, hi);
#endif
}
static void dr_prediction_z1_64xN_neon(int N, uint8_t *dst, ptrdiff_t stride,
const uint8_t *above, int dx) {
const int frac_bits = 6;
@ -1369,7 +1405,6 @@ static void dr_prediction_z1_64xN_neon(int N, uint8_t *dst, ptrdiff_t stride,
// (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
const uint8x16_t a_mbase_x = vdupq_n_u8(above[max_base_x]);
const uint8x16_t max_base_x128 = vdupq_n_u8(max_base_x);
int x = dx;
for (int r = 0; r < N; r++, dst += stride) {
@ -1391,12 +1426,24 @@ static void dr_prediction_z1_64xN_neon(int N, uint8_t *dst, ptrdiff_t stride,
vcreate_u8(0x0F0E0D0C0B0A0908)));
for (int j = 0; j < 64; j += 16) {
int mdif = max_base_x - (base + j);
if (mdif <= 0) {
if (base + j >= max_base_x) {
vst1q_u8(dst + j, a_mbase_x);
} else {
uint8x16_t a0_128 = vld1q_u8(above + base + j);
uint8x16_t a1_128 = vld1q_u8(above + base + 1 + j);
uint8x16_t a0_128;
uint8x16_t a1_128;
if (base + j + 15 >= max_base_x) {
int shuffle_idx = max_base_x - base - j;
a0_128 = z1_load_masked_neon(above + (max_base_x - 15), shuffle_idx);
} else {
a0_128 = vld1q_u8(above + base + j);
}
if (base + j + 16 >= max_base_x) {
int shuffle_idx = max_base_x - base - j - 1;
a1_128 = z1_load_masked_neon(above + (max_base_x - 15), shuffle_idx);
} else {
a1_128 = vld1q_u8(above + base + j + 1);
}
uint16x8_t diff_lo = vsubl_u8(vget_low_u8(a1_128), vget_low_u8(a0_128));
uint16x8_t diff_hi =
vsubl_u8(vget_high_u8(a1_128), vget_high_u8(a0_128));
@ -1406,13 +1453,8 @@ static void dr_prediction_z1_64xN_neon(int N, uint8_t *dst, ptrdiff_t stride,
vmlal_u8(vdupq_n_u16(16), vget_high_u8(a0_128), vdup_n_u8(32));
uint16x8_t res_lo = vmlaq_u16(a32_lo, diff_lo, shift);
uint16x8_t res_hi = vmlaq_u16(a32_hi, diff_hi, shift);
uint8x16_t v_temp =
vcombine_u8(vshrn_n_u16(res_lo, 5), vshrn_n_u16(res_hi, 5));
uint8x16_t mask128 =
vcgtq_u8(vqsubq_u8(max_base_x128, base_inc128), vdupq_n_u8(0));
uint8x16_t res128 = vbslq_u8(mask128, v_temp, a_mbase_x);
vst1q_u8(dst + j, res128);
vst1q_u8(dst + j,
vcombine_u8(vshrn_n_u16(res_lo, 5), vshrn_n_u16(res_hi, 5)));
base_inc128 = vaddq_u8(base_inc128, vdupq_n_u8(16));
}

third_party/aom/aom_dsp/arm/mem_neon.h (vendored)

@ -174,6 +174,16 @@ static INLINE void load_u8_8x4(const uint8_t *s, const ptrdiff_t p,
*s3 = vld1_u8(s);
}
static INLINE void load_u8_8x3(const uint8_t *s, const ptrdiff_t p,
uint8x8_t *const s0, uint8x8_t *const s1,
uint8x8_t *const s2) {
*s0 = vld1_u8(s);
s += p;
*s1 = vld1_u8(s);
s += p;
*s2 = vld1_u8(s);
}
static INLINE void load_u16_4x4(const uint16_t *s, const ptrdiff_t p,
uint16x4_t *const s0, uint16x4_t *const s1,
uint16x4_t *const s2, uint16x4_t *const s3) {
@ -221,6 +231,16 @@ static INLINE void load_u16_8x2(const uint16_t *s, const ptrdiff_t p,
*s1 = vld1q_u16(s);
}
static INLINE void load_u16_8x3(const uint16_t *s, const ptrdiff_t p,
uint16x8_t *const s0, uint16x8_t *const s1,
uint16x8_t *const s2) {
*s0 = vld1q_u16(s);
s += p;
*s1 = vld1q_u16(s);
s += p;
*s2 = vld1q_u16(s);
}
static INLINE void load_u16_8x4(const uint16_t *s, const ptrdiff_t p,
uint16x8_t *const s0, uint16x8_t *const s1,
uint16x8_t *const s2, uint16x8_t *const s3) {
@ -634,6 +654,13 @@ static INLINE void store_s16_8x4(int16_t *s, ptrdiff_t dst_stride,
vst1q_s16(s, s3);
}
static INLINE void store_s16_8x2(int16_t *s, ptrdiff_t dst_stride,
const int16x8_t s0, const int16x8_t s1) {
vst1q_s16(s, s0);
s += dst_stride;
vst1q_s16(s, s1);
}
static INLINE void load_u8_8x11(const uint8_t *s, ptrdiff_t p,
uint8x8_t *const s0, uint8x8_t *const s1,
uint8x8_t *const s2, uint8x8_t *const s3,
@ -1026,6 +1053,21 @@ static INLINE void load_u8_16x8(const uint8_t *s, ptrdiff_t p,
*s7 = vld1q_u8(s);
}
static INLINE void load_u8_16x5(const uint8_t *s, ptrdiff_t p,
uint8x16_t *const s0, uint8x16_t *const s1,
uint8x16_t *const s2, uint8x16_t *const s3,
uint8x16_t *const s4) {
*s0 = vld1q_u8(s);
s += p;
*s1 = vld1q_u8(s);
s += p;
*s2 = vld1q_u8(s);
s += p;
*s3 = vld1q_u8(s);
s += p;
*s4 = vld1q_u8(s);
}
static INLINE void load_u8_16x4(const uint8_t *s, ptrdiff_t p,
uint8x16_t *const s0, uint8x16_t *const s1,
uint8x16_t *const s2, uint8x16_t *const s3) {
@ -1038,6 +1080,16 @@ static INLINE void load_u8_16x4(const uint8_t *s, ptrdiff_t p,
*s3 = vld1q_u8(s);
}
static INLINE void load_u8_16x3(const uint8_t *s, ptrdiff_t p,
uint8x16_t *const s0, uint8x16_t *const s1,
uint8x16_t *const s2) {
*s0 = vld1q_u8(s);
s += p;
*s1 = vld1q_u8(s);
s += p;
*s2 = vld1q_u8(s);
}
static INLINE void load_u16_8x8(const uint16_t *s, const ptrdiff_t p,
uint16x8_t *s0, uint16x8_t *s1, uint16x8_t *s2,
uint16x8_t *s3, uint16x8_t *s4, uint16x8_t *s5,
@ -1228,6 +1280,12 @@ static INLINE uint8x8_t load_u8_gather_s16_x8(const uint8_t *src,
memcpy(dst, &a, 8); \
} while (0)
#define store_s16_4x1_lane(dst, src, lane) \
do { \
int64_t a = vgetq_lane_s64(vreinterpretq_s64_s16(src), lane); \
memcpy(dst, &a, 8); \
} while (0)
// Store the low 16-bits from a single vector.
static INLINE void store_u8_2x1(uint8_t *dst, const uint8x8_t src) {
store_u8_2x1_lane(dst, src, 0);
@ -1287,9 +1345,18 @@ static INLINE void store_u16x4_strided_x2(uint16_t *dst, uint32_t dst_stride,
store_u16_4x1_lane(dst, src, 1);
}
// Store two blocks of 64-bits from a single vector.
static INLINE void store_s16x4_strided_x2(int16_t *dst, int32_t dst_stride,
int16x8_t src) {
store_s16_4x1_lane(dst, src, 0);
dst += dst_stride;
store_s16_4x1_lane(dst, src, 1);
}
#undef store_u8_2x1_lane
#undef store_u8_4x1_lane
#undef store_u16_2x1_lane
#undef store_u16_4x1_lane
#undef store_s16_4x1_lane
#endif // AOM_AOM_DSP_ARM_MEM_NEON_H_

third_party/aom/aom_dsp/arm/subtract_neon.c (vendored)

@ -12,6 +12,7 @@
#include <arm_neon.h>
#include "config/aom_config.h"
#include "config/aom_dsp_rtcd.h"
#include "aom/aom_integer.h"
#include "aom_ports/mem.h"

third_party/aom/aom_dsp/x86/aom_asm_stubs.c (vendored)

@ -1,61 +0,0 @@
/*
* Copyright (c) 2016, Alliance for Open Media. All rights reserved
*
* This source code is subject to the terms of the BSD 2 Clause License and
* the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
* was not distributed with this source code in the LICENSE file, you can
* obtain it at www.aomedia.org/license/software. If the Alliance for Open
* Media Patent License 1.0 was not distributed with this source code in the
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
#include "config/aom_config.h"
#include "config/aom_dsp_rtcd.h"
#include "aom_dsp/x86/convolve.h"
#if HAVE_SSE2
#if CONFIG_AV1_HIGHBITDEPTH
highbd_filter8_1dfunction aom_highbd_filter_block1d16_v8_sse2;
highbd_filter8_1dfunction aom_highbd_filter_block1d16_h8_sse2;
highbd_filter8_1dfunction aom_highbd_filter_block1d8_v8_sse2;
highbd_filter8_1dfunction aom_highbd_filter_block1d8_h8_sse2;
highbd_filter8_1dfunction aom_highbd_filter_block1d4_v8_sse2;
highbd_filter8_1dfunction aom_highbd_filter_block1d4_h8_sse2;
highbd_filter8_1dfunction aom_highbd_filter_block1d16_v4_sse2;
highbd_filter8_1dfunction aom_highbd_filter_block1d16_h4_sse2;
highbd_filter8_1dfunction aom_highbd_filter_block1d8_v4_sse2;
highbd_filter8_1dfunction aom_highbd_filter_block1d8_h4_sse2;
highbd_filter8_1dfunction aom_highbd_filter_block1d4_v4_sse2;
highbd_filter8_1dfunction aom_highbd_filter_block1d4_h4_sse2;
highbd_filter8_1dfunction aom_highbd_filter_block1d16_v2_sse2;
highbd_filter8_1dfunction aom_highbd_filter_block1d16_h2_sse2;
highbd_filter8_1dfunction aom_highbd_filter_block1d8_v2_sse2;
highbd_filter8_1dfunction aom_highbd_filter_block1d8_h2_sse2;
highbd_filter8_1dfunction aom_highbd_filter_block1d4_v2_sse2;
highbd_filter8_1dfunction aom_highbd_filter_block1d4_h2_sse2;
// void aom_highbd_convolve8_horiz_sse2(const uint8_t *src,
// ptrdiff_t src_stride,
// uint8_t *dst,
// ptrdiff_t dst_stride,
// const int16_t *filter_x,
// int x_step_q4,
// const int16_t *filter_y,
// int y_step_q4,
// int w, int h, int bd);
// void aom_highbd_convolve8_vert_sse2(const uint8_t *src,
// ptrdiff_t src_stride,
// uint8_t *dst,
// ptrdiff_t dst_stride,
// const int16_t *filter_x,
// int x_step_q4,
// const int16_t *filter_y,
// int y_step_q4,
// int w, int h, int bd);
HIGH_FUN_CONV_1D(horiz, x_step_q4, filter_x, h, src, , sse2)
HIGH_FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , sse2)
#endif
#endif // HAVE_SSE2


@ -202,14 +202,15 @@
SECTION .text
;void aom_filter_block1d4_v8_sse2
;void aom_highbd_filter_block1d4_v8_sse2
;(
; unsigned char *src_ptr,
; unsigned int src_pitch,
; unsigned char *output_ptr,
; unsigned int out_pitch,
; unsigned int output_height,
; short *filter
; const uint16_t *src_ptr,
; const ptrdiff_t src_pitch,
; uint16_t *output_ptr,
; ptrdiff_t out_pitch,
; unsigned int output_height,
; const int16_t *filter,
; int bd
;)
globalsym(aom_highbd_filter_block1d4_v8_sse2)
sym(aom_highbd_filter_block1d4_v8_sse2):
@ -272,14 +273,15 @@ sym(aom_highbd_filter_block1d4_v8_sse2):
pop rbp
ret
;void aom_filter_block1d8_v8_sse2
;void aom_highbd_filter_block1d8_v8_sse2
;(
; unsigned char *src_ptr,
; unsigned int src_pitch,
; unsigned char *output_ptr,
; unsigned int out_pitch,
; unsigned int output_height,
; short *filter
; const uint16_t *src_ptr,
; const ptrdiff_t src_pitch,
; uint16_t *output_ptr,
; ptrdiff_t out_pitch,
; unsigned int output_height,
; const int16_t *filter,
; int bd
;)
globalsym(aom_highbd_filter_block1d8_v8_sse2)
sym(aom_highbd_filter_block1d8_v8_sse2):
@ -331,14 +333,15 @@ sym(aom_highbd_filter_block1d8_v8_sse2):
pop rbp
ret
;void aom_filter_block1d16_v8_sse2
;void aom_highbd_filter_block1d16_v8_sse2
;(
; unsigned char *src_ptr,
; unsigned int src_pitch,
; unsigned char *output_ptr,
; unsigned int out_pitch,
; unsigned int output_height,
; short *filter
; const uint16_t *src_ptr,
; const ptrdiff_t src_pitch,
; uint16_t *output_ptr,
; ptrdiff_t out_pitch,
; unsigned int output_height,
; const int16_t *filter,
; int bd
;)
globalsym(aom_highbd_filter_block1d16_v8_sse2)
sym(aom_highbd_filter_block1d16_v8_sse2):
@ -394,14 +397,15 @@ sym(aom_highbd_filter_block1d16_v8_sse2):
pop rbp
ret
;void aom_filter_block1d4_h8_sse2
;void aom_highbd_filter_block1d4_h8_sse2
;(
; unsigned char *src_ptr,
; unsigned int src_pixels_per_line,
; unsigned char *output_ptr,
; unsigned int output_pitch,
; unsigned int output_height,
; short *filter
; const uint16_t *src_ptr,
; const ptrdiff_t src_pitch,
; uint16_t *output_ptr,
; ptrdiff_t out_pitch,
; unsigned int output_height,
; const int16_t *filter,
; int bd
;)
globalsym(aom_highbd_filter_block1d4_h8_sse2)
sym(aom_highbd_filter_block1d4_h8_sse2):
@ -469,14 +473,15 @@ sym(aom_highbd_filter_block1d4_h8_sse2):
pop rbp
ret
;void aom_filter_block1d8_h8_sse2
;void aom_highbd_filter_block1d8_h8_sse2
;(
; unsigned char *src_ptr,
; unsigned int src_pixels_per_line,
; unsigned char *output_ptr,
; unsigned int output_pitch,
; unsigned int output_height,
; short *filter
; const uint16_t *src_ptr,
; const ptrdiff_t src_pitch,
; uint16_t *output_ptr,
; ptrdiff_t out_pitch,
; unsigned int output_height,
; const int16_t *filter,
; int bd
;)
globalsym(aom_highbd_filter_block1d8_h8_sse2)
sym(aom_highbd_filter_block1d8_h8_sse2):
@ -535,14 +540,15 @@ sym(aom_highbd_filter_block1d8_h8_sse2):
pop rbp
ret
;void aom_filter_block1d16_h8_sse2
;void aom_highbd_filter_block1d16_h8_sse2
;(
; unsigned char *src_ptr,
; unsigned int src_pixels_per_line,
; unsigned char *output_ptr,
; unsigned int output_pitch,
; unsigned int output_height,
; short *filter
; const uint16_t *src_ptr,
; const ptrdiff_t src_pitch,
; uint16_t *output_ptr,
; ptrdiff_t out_pitch,
; unsigned int output_height,
; const int16_t *filter,
; int bd
;)
globalsym(aom_highbd_filter_block1d16_h8_sse2)
sym(aom_highbd_filter_block1d16_h8_sse2):


@ -15,6 +15,7 @@
#include "aom/aom_integer.h"
#include "aom_dsp/x86/bitdepth_conversion_sse2.h"
#include "aom_dsp/x86/mem_sse2.h"
#include "aom_dsp/x86/synonyms.h"
#include "aom_ports/mem.h"
static INLINE void sign_extend_16bit_to_32bit_sse2(__m128i in, __m128i zero,
@ -171,10 +172,8 @@ unsigned int aom_avg_4x4_sse2(const uint8_t *s, int p) {
__m128i s0, s1, u0;
unsigned int avg = 0;
u0 = _mm_setzero_si128();
s0 = _mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)(s)),
_mm_cvtsi32_si128(*(const int *)(s + p)));
s1 = _mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)(s + p * 2)),
_mm_cvtsi32_si128(*(const int *)(s + p * 3)));
s0 = _mm_unpacklo_epi32(xx_loadl_32(s), xx_loadl_32(s + p));
s1 = _mm_unpacklo_epi32(xx_loadl_32(s + p * 2), xx_loadl_32(s + p * 3));
s0 = _mm_sad_epu8(s0, u0);
s1 = _mm_sad_epu8(s1, u0);
s0 = _mm_add_epi16(s0, s1);


@ -15,10 +15,9 @@
// -----------------------------------------------------------------------------
void aom_highbd_filter_block1d4_v4_sse2(const uint16_t *src_ptr,
ptrdiff_t src_pitch, uint16_t *dst_ptr,
ptrdiff_t dst_pitch, uint32_t height,
const int16_t *filter, int bd) {
static void aom_highbd_filter_block1d4_v4_sse2(
const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
__m128i filtersReg;
__m128i srcReg2, srcReg3, srcReg4, srcReg5, srcReg6;
__m128i srcReg23_lo, srcReg34_lo;
@ -101,10 +100,9 @@ void aom_highbd_filter_block1d4_v4_sse2(const uint16_t *src_ptr,
}
}
void aom_highbd_filter_block1d4_h4_sse2(const uint16_t *src_ptr,
ptrdiff_t src_pitch, uint16_t *dst_ptr,
ptrdiff_t dst_pitch, uint32_t height,
const int16_t *filter, int bd) {
static void aom_highbd_filter_block1d4_h4_sse2(
const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
__m128i filtersReg;
__m128i addFilterReg64;
__m128i secondFilters, thirdFilters;
@ -153,10 +151,9 @@ void aom_highbd_filter_block1d4_h4_sse2(const uint16_t *src_ptr,
}
}
void aom_highbd_filter_block1d8_v4_sse2(const uint16_t *src_ptr,
ptrdiff_t src_pitch, uint16_t *dst_ptr,
ptrdiff_t dst_pitch, uint32_t height,
const int16_t *filter, int bd) {
static void aom_highbd_filter_block1d8_v4_sse2(
const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
__m128i filtersReg;
__m128i srcReg2, srcReg3, srcReg4, srcReg5, srcReg6;
__m128i srcReg23_lo, srcReg23_hi, srcReg34_lo, srcReg34_hi;
@ -262,10 +259,9 @@ void aom_highbd_filter_block1d8_v4_sse2(const uint16_t *src_ptr,
}
}
void aom_highbd_filter_block1d8_h4_sse2(const uint16_t *src_ptr,
ptrdiff_t src_pitch, uint16_t *dst_ptr,
ptrdiff_t dst_pitch, uint32_t height,
const int16_t *filter, int bd) {
static void aom_highbd_filter_block1d8_h4_sse2(
const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
__m128i filtersReg;
__m128i addFilterReg64;
__m128i secondFilters, thirdFilters;
@ -330,22 +326,57 @@ void aom_highbd_filter_block1d8_h4_sse2(const uint16_t *src_ptr,
}
}
void aom_highbd_filter_block1d16_v4_sse2(const uint16_t *src_ptr,
ptrdiff_t src_pitch, uint16_t *dst_ptr,
ptrdiff_t dst_pitch, uint32_t height,
const int16_t *filter, int bd) {
static void aom_highbd_filter_block1d16_v4_sse2(
const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
aom_highbd_filter_block1d8_v4_sse2(src_ptr, src_pitch, dst_ptr, dst_pitch,
height, filter, bd);
aom_highbd_filter_block1d8_v4_sse2((src_ptr + 8), src_pitch, (dst_ptr + 8),
dst_pitch, height, filter, bd);
}
void aom_highbd_filter_block1d16_h4_sse2(const uint16_t *src_ptr,
ptrdiff_t src_pitch, uint16_t *dst_ptr,
ptrdiff_t dst_pitch, uint32_t height,
const int16_t *filter, int bd) {
static void aom_highbd_filter_block1d16_h4_sse2(
const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
aom_highbd_filter_block1d8_h4_sse2(src_ptr, src_pitch, dst_ptr, dst_pitch,
height, filter, bd);
aom_highbd_filter_block1d8_h4_sse2((src_ptr + 8), src_pitch, (dst_ptr + 8),
dst_pitch, height, filter, bd);
}
// From aom_dsp/x86/aom_high_subpixel_8t_sse2.asm
highbd_filter8_1dfunction aom_highbd_filter_block1d16_v8_sse2;
highbd_filter8_1dfunction aom_highbd_filter_block1d16_h8_sse2;
highbd_filter8_1dfunction aom_highbd_filter_block1d8_v8_sse2;
highbd_filter8_1dfunction aom_highbd_filter_block1d8_h8_sse2;
highbd_filter8_1dfunction aom_highbd_filter_block1d4_v8_sse2;
highbd_filter8_1dfunction aom_highbd_filter_block1d4_h8_sse2;
// From aom_dsp/x86/aom_high_subpixel_bilinear_sse2.asm
highbd_filter8_1dfunction aom_highbd_filter_block1d16_v2_sse2;
highbd_filter8_1dfunction aom_highbd_filter_block1d16_h2_sse2;
highbd_filter8_1dfunction aom_highbd_filter_block1d8_v2_sse2;
highbd_filter8_1dfunction aom_highbd_filter_block1d8_h2_sse2;
highbd_filter8_1dfunction aom_highbd_filter_block1d4_v2_sse2;
highbd_filter8_1dfunction aom_highbd_filter_block1d4_h2_sse2;
// void aom_highbd_convolve8_horiz_sse2(const uint8_t *src,
// ptrdiff_t src_stride,
// uint8_t *dst,
// ptrdiff_t dst_stride,
// const int16_t *filter_x,
// int x_step_q4,
// const int16_t *filter_y,
// int y_step_q4,
// int w, int h, int bd);
// void aom_highbd_convolve8_vert_sse2(const uint8_t *src,
// ptrdiff_t src_stride,
// uint8_t *dst,
// ptrdiff_t dst_stride,
// const int16_t *filter_x,
// int x_step_q4,
// const int16_t *filter_y,
// int y_step_q4,
// int w, int h, int bd);
HIGH_FUN_CONV_1D(horiz, x_step_q4, filter_x, h, src, , sse2)
HIGH_FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , sse2)


@ -551,7 +551,7 @@ unsigned int aom_highbd_sad128x128_avg_avx2(const uint8_t *src, int src_stride,
static INLINE void get_4d_sad_from_mm256_epi32(const __m256i *v,
uint32_t *res) {
__m256i u0, u1, u2, u3;
const __m256i mask = yy_set1_64_from_32i(~0);
const __m256i mask = _mm256_set1_epi64x(~0u);
__m128i sad;
// 8 32-bit summation


@ -17,16 +17,7 @@
#include "config/aom_dsp_rtcd.h"
#include "aom_dsp/x86/synonyms.h"
void aom_var_filter_block2d_bil_first_pass_ssse3(
const uint8_t *a, uint16_t *b, unsigned int src_pixels_per_line,
unsigned int pixel_step, unsigned int output_height,
unsigned int output_width, const uint8_t *filter);
void aom_var_filter_block2d_bil_second_pass_ssse3(
const uint16_t *a, uint8_t *b, unsigned int src_pixels_per_line,
unsigned int pixel_step, unsigned int output_height,
unsigned int output_width, const uint8_t *filter);
#include "aom_dsp/x86/variance_impl_ssse3.h"
static INLINE void compute_dist_wtd_avg(__m128i *p0, __m128i *p1,
const __m128i *w, const __m128i *r,


@ -9,7 +9,7 @@
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
#include <tmmintrin.h>
#include <immintrin.h>
#include "config/aom_config.h"
#include "config/aom_dsp_rtcd.h"


@ -15,6 +15,7 @@
#include <smmintrin.h>
#include "aom_dsp/x86/obmc_intrinsic_ssse3.h"
#include "aom_dsp/x86/synonyms.h"
static INLINE void obmc_variance_w4(const uint8_t *pre, const int pre_stride,
const int32_t *wsrc, const int32_t *mask,
@ -28,7 +29,7 @@ static INLINE void obmc_variance_w4(const uint8_t *pre, const int pre_stride,
assert(IS_POWER_OF_TWO(h));
do {
const __m128i v_p_b = _mm_cvtsi32_si128(*(const int *)(pre + n));
const __m128i v_p_b = xx_loadl_32(pre + n);
const __m128i v_m_d = _mm_load_si128((const __m128i *)(mask + n));
const __m128i v_w_d = _mm_load_si128((const __m128i *)(wsrc + n));


@ -22,21 +22,12 @@
#include "aom_dsp/aom_filter.h"
#include "aom_dsp/x86/obmc_intrinsic_sse4.h"
#include "aom_dsp/x86/synonyms.h"
#include "aom_dsp/x86/variance_impl_ssse3.h"
////////////////////////////////////////////////////////////////////////////////
// 8 bit
////////////////////////////////////////////////////////////////////////////////
void aom_var_filter_block2d_bil_first_pass_ssse3(
const uint8_t *a, uint16_t *b, unsigned int src_pixels_per_line,
unsigned int pixel_step, unsigned int output_height,
unsigned int output_width, const uint8_t *filter);
void aom_var_filter_block2d_bil_second_pass_ssse3(
const uint16_t *a, uint8_t *b, unsigned int src_pixels_per_line,
unsigned int pixel_step, unsigned int output_height,
unsigned int output_width, const uint8_t *filter);
static INLINE void obmc_variance_w8n(const uint8_t *pre, const int pre_stride,
const int32_t *wsrc, const int32_t *mask,
unsigned int *const sse, int *const sum,


@ -21,7 +21,7 @@ static uint64_t aom_sum_squares_2d_i16_nxn_avx2(const int16_t *src, int stride,
int width, int height) {
uint64_t result;
__m256i v_acc_q = _mm256_setzero_si256();
const __m256i v_zext_mask_q = yy_set1_64_from_32i(~0);
const __m256i v_zext_mask_q = _mm256_set1_epi64x(~0u);
for (int col = 0; col < height; col += 4) {
__m256i v_acc_d = _mm256_setzero_si256();
for (int row = 0; row < width; row += 16) {


@ -84,7 +84,7 @@ uint64_t aom_sum_squares_2d_i16_4xn_sse2(const int16_t *src, int stride,
src += stride << 2;
r += 4;
} while (r < height);
const __m128i v_zext_mask_q = xx_set1_64_from_32i(~0);
const __m128i v_zext_mask_q = _mm_set1_epi64x(~0u);
__m128i v_acc_64 = _mm_add_epi64(_mm_srli_epi64(v_acc_q, 32),
_mm_and_si128(v_acc_q, v_zext_mask_q));
v_acc_64 = _mm_add_epi64(v_acc_64, _mm_srli_si128(v_acc_64, 8));
@ -116,7 +116,7 @@ aom_sum_squares_2d_i16_nxn_sse2(const int16_t *src, int stride, int width,
int height) {
int r = 0;
const __m128i v_zext_mask_q = xx_set1_64_from_32i(~0);
const __m128i v_zext_mask_q = _mm_set1_epi64x(~0u);
__m128i v_acc_q = _mm_setzero_si128();
do {
@ -254,7 +254,7 @@ uint64_t aom_sum_sse_2d_i16_sse2(const int16_t *src, int src_stride, int width,
//////////////////////////////////////////////////////////////////////////////
static uint64_t aom_sum_squares_i16_64n_sse2(const int16_t *src, uint32_t n) {
const __m128i v_zext_mask_q = xx_set1_64_from_32i(~0);
const __m128i v_zext_mask_q = _mm_set1_epi64x(~0u);
__m128i v_acc0_q = _mm_setzero_si128();
__m128i v_acc1_q = _mm_setzero_si128();

third_party/aom/aom_dsp/x86/synonyms.h (vendored)

@ -12,7 +12,7 @@
#ifndef AOM_AOM_DSP_X86_SYNONYMS_H_
#define AOM_AOM_DSP_X86_SYNONYMS_H_
#include <immintrin.h>
#include <emmintrin.h>
#include <string.h>
#include "config/aom_config.h"
@ -46,23 +46,13 @@ static INLINE __m128i xx_loadu_128(const void *a) {
return _mm_loadu_si128((const __m128i *)a);
}
// _mm_loadu_si64 has been introduced in GCC 9, reimplement the function
// manually on older compilers.
#if !defined(__clang__) && __GNUC_MAJOR__ < 9
static INLINE __m128i xx_loadu_2x64(const void *hi, const void *lo) {
__m64 hi_, lo_;
memcpy(&hi_, hi, sizeof(hi_));
memcpy(&lo_, lo, sizeof(lo_));
return _mm_set_epi64(hi_, lo_);
}
#else
// Load 64 bits from each of hi and low, and pack into an SSE register
// Since directly loading as `int64_t`s and using _mm_set_epi64 may violate
// the strict aliasing rule, this takes a different approach
static INLINE __m128i xx_loadu_2x64(const void *hi, const void *lo) {
return _mm_unpacklo_epi64(_mm_loadu_si64(lo), _mm_loadu_si64(hi));
return _mm_unpacklo_epi64(_mm_loadl_epi64((const __m128i *)lo),
_mm_loadl_epi64((const __m128i *)hi));
}
#endif
static INLINE void xx_storel_32(void *const a, const __m128i v) {
const int val = _mm_cvtsi128_si32(v);
@ -81,28 +71,6 @@ static INLINE void xx_storeu_128(void *const a, const __m128i v) {
_mm_storeu_si128((__m128i *)a, v);
}
// The _mm_set_epi64x() intrinsic is undefined for some Visual Studio
// compilers. The following function is equivalent to _mm_set_epi64x()
// acting on 32-bit integers.
static INLINE __m128i xx_set_64_from_32i(int32_t e1, int32_t e0) {
#if defined(_MSC_VER) && _MSC_VER < 1900
return _mm_set_epi32(0, e1, 0, e0);
#else
return _mm_set_epi64x((uint32_t)e1, (uint32_t)e0);
#endif
}
// The _mm_set1_epi64x() intrinsic is undefined for some Visual Studio
// compilers. The following function is equivalent to _mm_set1_epi64x()
// acting on a 32-bit integer.
static INLINE __m128i xx_set1_64_from_32i(int32_t a) {
#if defined(_MSC_VER) && _MSC_VER < 1900
return _mm_set_epi32(0, a, 0, a);
#else
return _mm_set1_epi64x((uint32_t)a);
#endif
}
// Fill an SSE register using an interleaved pair of values, ie. set the
// 8 channels to {a, b, a, b, a, b, a, b}, using the same channel ordering
// as when a register is stored to / loaded from memory.

third_party/aom/aom_dsp/x86/synonyms_avx2.h (vendored)

@ -53,17 +53,6 @@ static INLINE __m256i yy_set2_epi16(int16_t a, int16_t b) {
return _mm256_setr_epi16(a, b, a, b, a, b, a, b, a, b, a, b, a, b, a, b);
}
// The _mm256_set1_epi64x() intrinsic is undefined for some Visual Studio
// compilers. The following function is equivalent to _mm256_set1_epi64x()
// acting on a 32-bit integer.
static INLINE __m256i yy_set1_64_from_32i(int32_t a) {
#if defined(_MSC_VER) && defined(_M_IX86) && _MSC_VER < 1900
return _mm256_set_epi32(0, a, 0, a, 0, a, 0, a);
#else
return _mm256_set1_epi64x((uint32_t)a);
#endif
}
// Some compilers don't have _mm256_set_m128i defined in immintrin.h. We
// therefore define an equivalent function using a different intrinsic.
// ([ hi ], [ lo ]) -> [ hi ][ lo ]
@ -71,26 +60,11 @@ static INLINE __m256i yy_set_m128i(__m128i hi, __m128i lo) {
return _mm256_insertf128_si256(_mm256_castsi128_si256(lo), hi, 1);
}
#define GCC_VERSION (__GNUC__ * 10000 \
+ __GNUC_MINOR__ * 100 \
+ __GNUC_PATCHLEVEL__)
// _mm256_loadu2_m128i has been introduced in GCC 10.1
#if !defined(__clang__) && GCC_VERSION < 101000
static INLINE __m256i yy_loadu2_128(const void *hi, const void *lo) {
__m128i mhi = _mm_loadu_si128((const __m128i *)(hi));
__m128i mlo = _mm_loadu_si128((const __m128i *)(lo));
return _mm256_set_m128i(mhi, mlo);
}
#else
static INLINE __m256i yy_loadu2_128(const void *hi, const void *lo) {
__m128i mhi = _mm_loadu_si128((const __m128i *)(hi));
__m128i mlo = _mm_loadu_si128((const __m128i *)(lo));
return yy_set_m128i(mhi, mlo);
}
#endif
#undef GCC_VERSION
static INLINE void yy_storeu2_128(void *hi, void *lo, const __m256i a) {
_mm_storeu_si128((__m128i *)hi, _mm256_extracti128_si256(a, 1));


@ -15,6 +15,7 @@
#include "config/aom_dsp_rtcd.h"
#include "aom_dsp/x86/synonyms.h"
#include "aom_dsp/x86/variance_impl_ssse3.h"
void aom_var_filter_block2d_bil_first_pass_ssse3(
const uint8_t *a, uint16_t *b, unsigned int src_pixels_per_line,

third_party/aom/aom_dsp/x86/variance_impl_ssse3.h (vendored, new file)

@ -0,0 +1,27 @@
/*
* Copyright (c) 2024, Alliance for Open Media. All rights reserved
*
* This source code is subject to the terms of the BSD 2 Clause License and
* the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
* was not distributed with this source code in the LICENSE file, you can
* obtain it at www.aomedia.org/license/software. If the Alliance for Open
* Media Patent License 1.0 was not distributed with this source code in the
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
#ifndef AOM_AOM_DSP_X86_VARIANCE_IMPL_SSSE3_H_
#define AOM_AOM_DSP_X86_VARIANCE_IMPL_SSSE3_H_
#include <stdint.h>
void aom_var_filter_block2d_bil_first_pass_ssse3(
const uint8_t *a, uint16_t *b, unsigned int src_pixels_per_line,
unsigned int pixel_step, unsigned int output_height,
unsigned int output_width, const uint8_t *filter);
void aom_var_filter_block2d_bil_second_pass_ssse3(
const uint16_t *a, uint8_t *b, unsigned int src_pixels_per_line,
unsigned int pixel_step, unsigned int output_height,
unsigned int output_width, const uint8_t *filter);
#endif // AOM_AOM_DSP_X86_VARIANCE_IMPL_SSSE3_H_


@ -44,7 +44,7 @@ static int arm_get_cpu_caps(void) {
return flags;
}
#elif defined(ANDROID_USE_CPU_FEATURES_LIB)
#elif defined(AOM_USE_ANDROID_CPU_FEATURES)
static int arm_get_cpu_caps(void) {
int flags = 0;


@ -89,7 +89,7 @@ static int arm_get_cpu_caps(void) {
return flags;
}
#elif defined(ANDROID_USE_CPU_FEATURES_LIB)
#elif defined(AOM_USE_ANDROID_CPU_FEATURES)
static int arm_get_cpu_caps(void) {
int flags = 0;

third_party/aom/aom_ports/aom_ports.cmake (vendored)

@ -18,7 +18,7 @@ list(APPEND AOM_PORTS_INCLUDES "${AOM_ROOT}/aom_ports/aom_once.h"
"${AOM_ROOT}/aom_ports/emmintrin_compat.h"
"${AOM_ROOT}/aom_ports/mem.h" "${AOM_ROOT}/aom_ports/mem_ops.h"
"${AOM_ROOT}/aom_ports/mem_ops_aligned.h"
"${AOM_ROOT}/aom_ports/msvc.h" "${AOM_ROOT}/aom_ports/sanitizer.h")
"${AOM_ROOT}/aom_ports/sanitizer.h")
list(APPEND AOM_PORTS_ASM_X86 "${AOM_ROOT}/aom_ports/float.asm")

third_party/aom/aom_ports/arm_cpudetect.h (vendored)

@ -32,7 +32,7 @@
#endif
#if defined(__ANDROID__) && (__ANDROID_API__ < 18)
#define ANDROID_USE_CPU_FEATURES_LIB 1
#define AOM_USE_ANDROID_CPU_FEATURES 1
// Use getauxval() when targeting (64-bit) Android with API level >= 18.
// getauxval() is supported since Android API level 18 (Android 4.3.)
// First Android version with 64-bit support was Android 5.x (API level 21).

third_party/aom/aom_ports/bitops.h (vendored)

@ -15,7 +15,6 @@
#include <assert.h>
#include <stdint.h>
#include "aom_ports/msvc.h"
#include "config/aom_config.h"
#ifdef _MSC_VER

third_party/aom/aom_ports/msvc.h (vendored)

@ -1,75 +0,0 @@
/*
* Copyright (c) 2016, Alliance for Open Media. All rights reserved
*
* This source code is subject to the terms of the BSD 2 Clause License and
* the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
* was not distributed with this source code in the LICENSE file, you can
* obtain it at www.aomedia.org/license/software. If the Alliance for Open
* Media Patent License 1.0 was not distributed with this source code in the
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
#ifndef AOM_AOM_PORTS_MSVC_H_
#define AOM_AOM_PORTS_MSVC_H_
#ifdef _MSC_VER
#include "config/aom_config.h"
#if _MSC_VER < 1900 // VS2015 provides snprintf
#define snprintf _snprintf
#endif // _MSC_VER < 1900
#if _MSC_VER < 1800 // VS2013 provides round
#include <math.h>
static INLINE double round(double x) {
if (x < 0)
return ceil(x - 0.5);
else
return floor(x + 0.5);
}
static INLINE float roundf(float x) {
if (x < 0)
return (float)ceil(x - 0.5f);
else
return (float)floor(x + 0.5f);
}
static INLINE long lroundf(float x) {
if (x < 0)
return (long)(x - 0.5f);
else
return (long)(x + 0.5f);
}
#endif // _MSC_VER < 1800
#if HAVE_AVX
#include <immintrin.h>
// Note:
// _mm256_insert_epi16 intrinsics is available from vs2017.
// We define this macro for vs2015 and earlier. The
// intrinsics used here are in vs2015 document:
// https://msdn.microsoft.com/en-us/library/hh977022.aspx
// Input parameters:
// a: __m256i,
// d: int16_t,
// indx: imm8 (0 - 15)
#if _MSC_VER <= 1900
#define _mm256_insert_epi16(a, d, indx) \
_mm256_insertf128_si256( \
a, \
_mm_insert_epi16(_mm256_extractf128_si256(a, indx >> 3), d, indx % 8), \
indx >> 3)
static INLINE int _mm256_extract_epi32(__m256i a, const int i) {
return a.m256i_i32[i & 7];
}
static INLINE __m256i _mm256_insert_epi32(__m256i a, int b, const int i) {
__m256i c = a;
c.m256i_i32[i & 7] = b;
return c;
}
#endif // _MSC_VER <= 1900
#endif // HAVE_AVX
#endif // _MSC_VER
#endif // AOM_AOM_PORTS_MSVC_H_

third_party/aom/aom_util/aom_pthread.h (vendored)

@ -36,8 +36,6 @@ typedef HANDLE pthread_t;
typedef int pthread_attr_t;
typedef CRITICAL_SECTION pthread_mutex_t;
#include <errno.h>
#if _WIN32_WINNT < 0x0600
#error _WIN32_WINNT must target Windows Vista / Server 2008 or newer.
#endif
@ -74,6 +72,20 @@ static INLINE int pthread_attr_destroy(pthread_attr_t *attr) {
return 0;
}
static INLINE int pthread_attr_getstacksize(const pthread_attr_t *attr,
size_t *stacksize) {
(void)attr;
(void)stacksize;
return EINVAL;
}
static INLINE int pthread_attr_setstacksize(pthread_attr_t *attr,
size_t stacksize) {
(void)attr;
(void)stacksize;
return EINVAL;
}
static INLINE int pthread_create(pthread_t *const thread,
const pthread_attr_t *attr,
unsigned int(__stdcall *start)(void *),

third_party/aom/aom_util/aom_thread.c (vendored)

@ -156,16 +156,18 @@ static int reset(AVxWorker *const worker) {
// See: https://crbug.com/aomedia/3379
#if defined(AOM_ADDRESS_SANITIZER) && defined(__APPLE__) && AOM_ARCH_ARM && \
!defined(NDEBUG)
const size_t kMinStackSize = 1024 * 1024;
#else
const size_t kMinStackSize = 256 * 1024;
#endif
size_t stacksize;
if (!pthread_attr_getstacksize(&attr, &stacksize)) {
const size_t kMinStackSize = 1 << 20; // 1 MiB
if (stacksize < kMinStackSize &&
pthread_attr_setstacksize(&attr, kMinStackSize)) {
pthread_attr_destroy(&attr);
goto Error2;
}
}
#endif
pthread_mutex_lock(&worker->impl_->mutex_);
ok = !pthread_create(&worker->impl_->thread_, &attr, thread_loop, worker);
if (ok) worker->status_ = AVX_WORKER_STATUS_OK;

third_party/aom/av1/av1.cmake (vendored)

@ -266,6 +266,7 @@ list(APPEND AOM_AV1_COMMON_INTRIN_SSE2
"${AOM_ROOT}/av1/common/x86/convolve_2d_sse2.c"
"${AOM_ROOT}/av1/common/x86/convolve_sse2.c"
"${AOM_ROOT}/av1/common/x86/jnt_convolve_sse2.c"
"${AOM_ROOT}/av1/common/x86/resize_sse2.c"
"${AOM_ROOT}/av1/common/x86/wiener_convolve_sse2.c")
list(APPEND AOM_AV1_COMMON_INTRIN_SSSE3
@ -354,35 +355,36 @@ list(APPEND AOM_AV1_ENCODER_INTRIN_AVX2
"${AOM_ROOT}/av1/encoder/x86/ml_avx2.c")
list(APPEND AOM_AV1_ENCODER_INTRIN_NEON
"${AOM_ROOT}/av1/encoder/arm/neon/av1_error_neon.c"
"${AOM_ROOT}/av1/encoder/arm/neon/av1_fwd_txfm2d_neon.c"
"${AOM_ROOT}/av1/encoder/arm/neon/av1_highbd_quantize_neon.c"
"${AOM_ROOT}/av1/encoder/arm/neon/av1_k_means_neon.c"
"${AOM_ROOT}/av1/encoder/arm/neon/cnn_neon.c"
"${AOM_ROOT}/av1/encoder/arm/neon/encodetxb_neon.c"
"${AOM_ROOT}/av1/encoder/arm/neon/highbd_fwd_txfm_neon.c"
"${AOM_ROOT}/av1/encoder/arm/neon/hybrid_fwd_txfm_neon.c"
"${AOM_ROOT}/av1/encoder/arm/neon/ml_neon.c"
"${AOM_ROOT}/av1/encoder/arm/neon/pickrst_neon.c"
"${AOM_ROOT}/av1/encoder/arm/neon/pickrst_neon.h"
"${AOM_ROOT}/av1/encoder/arm/neon/quantize_neon.c"
"${AOM_ROOT}/av1/encoder/arm/neon/rdopt_neon.c"
"${AOM_ROOT}/av1/encoder/arm/neon/reconinter_enc_neon.c"
"${AOM_ROOT}/av1/encoder/arm/neon/temporal_filter_neon.c"
"${AOM_ROOT}/av1/encoder/arm/neon/wedge_utils_neon.c")
"${AOM_ROOT}/av1/encoder/arm/av1_error_neon.c"
"${AOM_ROOT}/av1/encoder/arm/av1_fwd_txfm2d_neon.c"
"${AOM_ROOT}/av1/encoder/arm/av1_highbd_quantize_neon.c"
"${AOM_ROOT}/av1/encoder/arm/av1_k_means_neon.c"
"${AOM_ROOT}/av1/encoder/arm/cnn_neon.c"
"${AOM_ROOT}/av1/encoder/arm/encodetxb_neon.c"
"${AOM_ROOT}/av1/encoder/arm/highbd_fwd_txfm_neon.c"
"${AOM_ROOT}/av1/encoder/arm/hybrid_fwd_txfm_neon.c"
"${AOM_ROOT}/av1/encoder/arm/ml_neon.c"
"${AOM_ROOT}/av1/encoder/arm/pickrst_neon.c"
"${AOM_ROOT}/av1/encoder/arm/pickrst_neon.h"
"${AOM_ROOT}/av1/encoder/arm/quantize_neon.c"
"${AOM_ROOT}/av1/encoder/arm/rdopt_neon.c"
"${AOM_ROOT}/av1/encoder/arm/reconinter_enc_neon.c"
"${AOM_ROOT}/av1/encoder/arm/temporal_filter_neon.c"
"${AOM_ROOT}/av1/encoder/arm/wedge_utils_neon.c")
list(APPEND AOM_AV1_ENCODER_INTRIN_NEON_DOTPROD
"${AOM_ROOT}/av1/encoder/arm/neon/temporal_filter_neon_dotprod.c")
"${AOM_ROOT}/av1/encoder/arm/temporal_filter_neon_dotprod.c")
list(APPEND AOM_AV1_ENCODER_INTRIN_SVE
"${AOM_ROOT}/av1/encoder/arm/neon/av1_error_sve.c"
"${AOM_ROOT}/av1/encoder/arm/neon/pickrst_sve.c"
"${AOM_ROOT}/av1/encoder/arm/neon/wedge_utils_sve.c")
"${AOM_ROOT}/av1/encoder/arm/av1_error_sve.c"
"${AOM_ROOT}/av1/encoder/arm/pickrst_sve.c"
"${AOM_ROOT}/av1/encoder/arm/wedge_utils_sve.c")
list(APPEND AOM_AV1_ENCODER_INTRIN_ARM_CRC32
"${AOM_ROOT}/av1/encoder/arm/crc32/hash_arm_crc32.c")
"${AOM_ROOT}/av1/encoder/arm/hash_arm_crc32.c")
list(APPEND AOM_AV1_COMMON_INTRIN_NEON
"${AOM_ROOT}/av1/common/arm/av1_convolve_scale_neon.c"
"${AOM_ROOT}/av1/common/arm/av1_inv_txfm_neon.c"
"${AOM_ROOT}/av1/common/arm/av1_inv_txfm_neon.h"
"${AOM_ROOT}/av1/common/arm/av1_txfm_neon.c"
@ -414,6 +416,9 @@ list(APPEND AOM_AV1_COMMON_INTRIN_SVE
"${AOM_ROOT}/av1/common/arm/highbd_warp_plane_sve.c"
"${AOM_ROOT}/av1/common/arm/warp_plane_sve.c")
list(APPEND AOM_AV1_COMMON_INTRIN_SVE2
"${AOM_ROOT}/av1/common/arm/convolve_sve2.c")
list(APPEND AOM_AV1_ENCODER_INTRIN_SSE4_2
"${AOM_ROOT}/av1/encoder/x86/hash_sse42.c")
@ -452,7 +457,7 @@ if(CONFIG_AV1_TEMPORAL_DENOISING)
"${AOM_ROOT}/av1/encoder/x86/av1_temporal_denoiser_sse2.c")
list(APPEND AOM_AV1_ENCODER_INTRIN_NEON
"${AOM_ROOT}/av1/encoder/arm/neon/av1_temporal_denoiser_neon.c")
"${AOM_ROOT}/av1/encoder/arm/av1_temporal_denoiser_neon.c")
endif()
if(CONFIG_AV1_HIGHBITDEPTH)
@ -499,9 +504,12 @@ if(CONFIG_AV1_HIGHBITDEPTH)
"${AOM_ROOT}/av1/encoder/x86/highbd_temporal_filter_avx2.c")
list(APPEND AOM_AV1_ENCODER_INTRIN_NEON
"${AOM_ROOT}/av1/encoder/arm/neon/highbd_pickrst_neon.c"
"${AOM_ROOT}/av1/encoder/arm/neon/highbd_rdopt_neon.c"
"${AOM_ROOT}/av1/encoder/arm/neon/highbd_temporal_filter_neon.c")
"${AOM_ROOT}/av1/encoder/arm/highbd_pickrst_neon.c"
"${AOM_ROOT}/av1/encoder/arm/highbd_rdopt_neon.c"
"${AOM_ROOT}/av1/encoder/arm/highbd_temporal_filter_neon.c")
list(APPEND AOM_AV1_ENCODER_INTRIN_SVE
"${AOM_ROOT}/av1/encoder/arm/highbd_pickrst_sve.c")
endif()
if(CONFIG_ACCOUNTING)
@ -527,7 +535,7 @@ if(CONFIG_REALTIME_ONLY)
"${AOM_ROOT}/av1/encoder/x86/cnn_avx2.c")
list(REMOVE_ITEM AOM_AV1_ENCODER_INTRIN_NEON
"${AOM_ROOT}/av1/encoder/arm/neon/cnn_neon.c")
"${AOM_ROOT}/av1/encoder/arm/cnn_neon.c")
list(REMOVE_ITEM AOM_AV1_ENCODER_SOURCES
"${AOM_ROOT}/av1/encoder/cnn.c"

third_party/aom/av1/av1_cx_iface.c (vendored)

@ -674,6 +674,7 @@ static aom_codec_err_t validate_config(aom_codec_alg_priv_t *ctx,
RANGE_CHECK(cfg, g_timebase.num, 1, cfg->g_timebase.den);
RANGE_CHECK_HI(cfg, g_profile, MAX_PROFILES - 1);
RANGE_CHECK_HI(cfg, rc_target_bitrate, 2000000);
RANGE_CHECK_HI(cfg, rc_max_quantizer, 63);
RANGE_CHECK_HI(cfg, rc_min_quantizer, cfg->rc_max_quantizer);
RANGE_CHECK_BOOL(extra_cfg, lossless);
@ -1034,39 +1035,22 @@ static void set_encoder_config(AV1EncoderConfig *oxcf,
}
TuneCfg *const tune_cfg = &oxcf->tune_cfg;
FrameDimensionCfg *const frm_dim_cfg = &oxcf->frm_dim_cfg;
TileConfig *const tile_cfg = &oxcf->tile_cfg;
ResizeCfg *const resize_cfg = &oxcf->resize_cfg;
GFConfig *const gf_cfg = &oxcf->gf_cfg;
PartitionCfg *const part_cfg = &oxcf->part_cfg;
IntraModeCfg *const intra_mode_cfg = &oxcf->intra_mode_cfg;
TxfmSizeTypeCfg *const txfm_cfg = &oxcf->txfm_cfg;
CompoundTypeCfg *const comp_type_cfg = &oxcf->comp_type_cfg;
SuperResCfg *const superres_cfg = &oxcf->superres_cfg;
KeyFrameCfg *const kf_cfg = &oxcf->kf_cfg;
DecoderModelCfg *const dec_model_cfg = &oxcf->dec_model_cfg;
RateControlCfg *const rc_cfg = &oxcf->rc_cfg;
QuantizationCfg *const q_cfg = &oxcf->q_cfg;
ColorCfg *const color_cfg = &oxcf->color_cfg;
InputCfg *const input_cfg = &oxcf->input_cfg;
AlgoCfg *const algo_cfg = &oxcf->algo_cfg;
ToolCfg *const tool_cfg = &oxcf->tool_cfg;
const int is_vbr = cfg->rc_end_usage == AOM_VBR;
@ -1610,37 +1594,42 @@ static aom_codec_err_t ctrl_get_baseline_gf_interval(aom_codec_alg_priv_t *ctx,
return AOM_CODEC_OK;
}
static aom_codec_err_t update_encoder_cfg(aom_codec_alg_priv_t *ctx) {
set_encoder_config(&ctx->oxcf, &ctx->cfg, &ctx->extra_cfg);
av1_check_fpmt_config(ctx->ppi, &ctx->oxcf);
bool is_sb_size_changed = false;
av1_change_config_seq(ctx->ppi, &ctx->oxcf, &is_sb_size_changed);
for (int i = 0; i < ctx->ppi->num_fp_contexts; i++) {
AV1_COMP *const cpi = ctx->ppi->parallel_cpi[i];
struct aom_internal_error_info *const error = cpi->common.error;
if (setjmp(error->jmp)) {
error->setjmp = 0;
return error->error_code;
}
error->setjmp = 1;
av1_change_config(cpi, &ctx->oxcf, is_sb_size_changed);
error->setjmp = 0;
}
if (ctx->ppi->cpi_lap != NULL) {
AV1_COMP *const cpi_lap = ctx->ppi->cpi_lap;
struct aom_internal_error_info *const error = cpi_lap->common.error;
if (setjmp(error->jmp)) {
error->setjmp = 0;
return error->error_code;
}
error->setjmp = 1;
av1_change_config(cpi_lap, &ctx->oxcf, is_sb_size_changed);
error->setjmp = 0;
}
return AOM_CODEC_OK;
}
static aom_codec_err_t update_extra_cfg(aom_codec_alg_priv_t *ctx,
const struct av1_extracfg *extra_cfg) {
const aom_codec_err_t res = validate_config(ctx, &ctx->cfg, extra_cfg);
if (res == AOM_CODEC_OK) {
ctx->extra_cfg = *extra_cfg;
set_encoder_config(&ctx->oxcf, &ctx->cfg, &ctx->extra_cfg);
av1_check_fpmt_config(ctx->ppi, &ctx->oxcf);
bool is_sb_size_changed = false;
av1_change_config_seq(ctx->ppi, &ctx->oxcf, &is_sb_size_changed);
for (int i = 0; i < ctx->ppi->num_fp_contexts; i++) {
AV1_COMP *const cpi = ctx->ppi->parallel_cpi[i];
struct aom_internal_error_info *const error = cpi->common.error;
if (setjmp(error->jmp)) {
error->setjmp = 0;
return error->error_code;
}
error->setjmp = 1;
av1_change_config(cpi, &ctx->oxcf, is_sb_size_changed);
error->setjmp = 0;
}
if (ctx->ppi->cpi_lap != NULL) {
AV1_COMP *const cpi_lap = ctx->ppi->cpi_lap;
struct aom_internal_error_info *const error = cpi_lap->common.error;
if (setjmp(error->jmp)) {
error->setjmp = 0;
return error->error_code;
}
error->setjmp = 1;
av1_change_config(cpi_lap, &ctx->oxcf, is_sb_size_changed);
error->setjmp = 0;
}
return update_encoder_cfg(ctx);
}
return res;
}
@ -3343,7 +3332,7 @@ static aom_codec_err_t encoder_encode(aom_codec_alg_priv_t *ctx,
if (ppi->cpi->oxcf.pass != 1) {
ppi->total_time_compress_data += cpi->time_compress_data;
ppi->total_recode_hits += cpi->frame_recode_hits;
ppi->total_bytes += cpi->bytes;
ppi->total_bytes += (uint64_t)cpi->bytes;
for (int i = 0; i < MAX_MODES; i++) {
ppi->total_mode_chosen_counts[i] += cpi->mode_chosen_counts[i];
}
@ -3611,11 +3600,23 @@ static aom_codec_err_t ctrl_set_scale_mode(aom_codec_alg_priv_t *ctx,
aom_scaling_mode_t *const mode = va_arg(args, aom_scaling_mode_t *);
if (mode) {
const int res = av1_set_internal_size(
&ctx->ppi->cpi->oxcf, &ctx->ppi->cpi->resize_pending_params,
mode->h_scaling_mode, mode->v_scaling_mode);
av1_check_fpmt_config(ctx->ppi, &ctx->ppi->cpi->oxcf);
return (res == 0) ? AOM_CODEC_OK : AOM_CODEC_INVALID_PARAM;
AV1EncoderConfig *const oxcf =
ctx->ppi->seq_params_locked ? &ctx->ppi->cpi->oxcf : &ctx->oxcf;
const int res =
av1_set_internal_size(oxcf, &ctx->ppi->cpi->resize_pending_params,
mode->h_scaling_mode, mode->v_scaling_mode);
if (res == 0) {
// update_encoder_cfg() is somewhat costly and this control may be called
// multiple times, so update_encoder_cfg() is only called to ensure frame
// and superblock sizes are updated before they're fixed by the first
// encode call.
if (ctx->ppi->seq_params_locked) {
av1_check_fpmt_config(ctx->ppi, &ctx->ppi->cpi->oxcf);
return AOM_CODEC_OK;
}
return update_encoder_cfg(ctx);
}
return AOM_CODEC_INVALID_PARAM;
} else {
return AOM_CODEC_INVALID_PARAM;
}
@ -3636,6 +3637,13 @@ static aom_codec_err_t ctrl_set_number_spatial_layers(aom_codec_alg_priv_t *ctx,
if (number_spatial_layers > MAX_NUM_SPATIAL_LAYERS)
return AOM_CODEC_INVALID_PARAM;
ctx->ppi->number_spatial_layers = number_spatial_layers;
// update_encoder_cfg() is somewhat costly and this control may be called
// multiple times, so update_encoder_cfg() is only called to ensure frame and
// superblock sizes are updated before they're fixed by the first encode
// call.
if (!ctx->ppi->seq_params_locked) {
return update_encoder_cfg(ctx);
}
return AOM_CODEC_OK;
}
@ -3653,8 +3661,6 @@ static aom_codec_err_t ctrl_set_svc_params(aom_codec_alg_priv_t *ctx,
va_list args) {
AV1_PRIMARY *const ppi = ctx->ppi;
AV1_COMP *const cpi = ppi->cpi;
AV1_COMMON *const cm = &cpi->common;
AV1EncoderConfig *oxcf = &cpi->oxcf;
aom_svc_params_t *const params = va_arg(args, aom_svc_params_t *);
int64_t target_bandwidth = 0;
ppi->number_spatial_layers = params->number_spatial_layers;
@ -3694,19 +3700,38 @@ static aom_codec_err_t ctrl_set_svc_params(aom_codec_alg_priv_t *ctx,
target_bandwidth += lc->layer_target_bitrate;
}
}
if (cm->current_frame.frame_number == 0) {
if (!cpi->ppi->seq_params_locked) {
SequenceHeader *const seq_params = &ppi->seq_params;
seq_params->operating_points_cnt_minus_1 =
ppi->number_spatial_layers * ppi->number_temporal_layers - 1;
av1_init_seq_coding_tools(ppi, &cpi->oxcf, 1);
}
if (ppi->seq_params_locked) {
AV1EncoderConfig *const oxcf = &cpi->oxcf;
// Keep ctx->oxcf in sync in case further codec controls are made prior
// to encoding.
ctx->oxcf.rc_cfg.target_bandwidth = oxcf->rc_cfg.target_bandwidth =
target_bandwidth;
set_primary_rc_buffer_sizes(oxcf, ppi);
av1_update_layer_context_change_config(cpi, target_bandwidth);
check_reset_rc_flag(cpi);
} else {
// Note av1_init_layer_context() relies on cpi->oxcf. The order of that
// call and the ones in the other half of this block (which
// update_encoder_cfg() transitively makes) is important. So we keep
// ctx->oxcf and cpi->oxcf in sync here as update_encoder_cfg() will
// overwrite cpi->oxcf with ctx->oxcf.
ctx->oxcf.rc_cfg.target_bandwidth = cpi->oxcf.rc_cfg.target_bandwidth =
target_bandwidth;
SequenceHeader *const seq_params = &ppi->seq_params;
seq_params->operating_points_cnt_minus_1 =
ppi->number_spatial_layers * ppi->number_temporal_layers - 1;
av1_init_layer_context(cpi);
// update_encoder_cfg() is somewhat costly and this control may be called
// multiple times, so update_encoder_cfg() is only called to ensure frame
// and superblock sizes are updated before they're fixed by the first
// encode call.
return update_encoder_cfg(ctx);
}
oxcf->rc_cfg.target_bandwidth = target_bandwidth;
set_primary_rc_buffer_sizes(oxcf, cpi->ppi);
av1_update_layer_context_change_config(cpi, target_bandwidth);
check_reset_rc_flag(cpi);
} else if (!ppi->seq_params_locked) {
// Ensure frame and superblock sizes are updated.
return update_encoder_cfg(ctx);
}
av1_check_fpmt_config(ctx->ppi, &ctx->ppi->cpi->oxcf);
return AOM_CODEC_OK;

third_party/aom/av1/common/arm/av1_convolve_scale_neon.c (vendored, new file)

@ -0,0 +1,702 @@
/*
* Copyright (c) 2024, Alliance for Open Media. All rights reserved
*
* This source code is subject to the terms of the BSD 2 Clause License and
* the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
* was not distributed with this source code in the LICENSE file, you can
* obtain it at www.aomedia.org/license/software. If the Alliance for Open
* Media Patent License 1.0 was not distributed with this source code in the
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
#include <arm_neon.h>
#include <assert.h>
#include "config/aom_config.h"
#include "config/av1_rtcd.h"
#include "aom_dsp/arm/mem_neon.h"
#include "aom_dsp/arm/transpose_neon.h"
static INLINE int16x4_t compound_convolve8_4_v(
const int16x4_t s0, const int16x4_t s1, const int16x4_t s2,
const int16x4_t s3, const int16x4_t s4, const int16x4_t s5,
const int16x4_t s6, const int16x4_t s7, const int16x8_t filter,
const int32x4_t offset_const) {
const int16x4_t filter_0_3 = vget_low_s16(filter);
const int16x4_t filter_4_7 = vget_high_s16(filter);
int32x4_t sum = offset_const;
sum = vmlal_lane_s16(sum, s0, filter_0_3, 0);
sum = vmlal_lane_s16(sum, s1, filter_0_3, 1);
sum = vmlal_lane_s16(sum, s2, filter_0_3, 2);
sum = vmlal_lane_s16(sum, s3, filter_0_3, 3);
sum = vmlal_lane_s16(sum, s4, filter_4_7, 0);
sum = vmlal_lane_s16(sum, s5, filter_4_7, 1);
sum = vmlal_lane_s16(sum, s6, filter_4_7, 2);
sum = vmlal_lane_s16(sum, s7, filter_4_7, 3);
return vshrn_n_s32(sum, COMPOUND_ROUND1_BITS);
}
static INLINE int16x8_t compound_convolve8_8_v(
const int16x8_t s0, const int16x8_t s1, const int16x8_t s2,
const int16x8_t s3, const int16x8_t s4, const int16x8_t s5,
const int16x8_t s6, const int16x8_t s7, const int16x8_t filter,
const int32x4_t offset_const) {
const int16x4_t filter_0_3 = vget_low_s16(filter);
const int16x4_t filter_4_7 = vget_high_s16(filter);
int32x4_t sum0 = offset_const;
sum0 = vmlal_lane_s16(sum0, vget_low_s16(s0), filter_0_3, 0);
sum0 = vmlal_lane_s16(sum0, vget_low_s16(s1), filter_0_3, 1);
sum0 = vmlal_lane_s16(sum0, vget_low_s16(s2), filter_0_3, 2);
sum0 = vmlal_lane_s16(sum0, vget_low_s16(s3), filter_0_3, 3);
sum0 = vmlal_lane_s16(sum0, vget_low_s16(s4), filter_4_7, 0);
sum0 = vmlal_lane_s16(sum0, vget_low_s16(s5), filter_4_7, 1);
sum0 = vmlal_lane_s16(sum0, vget_low_s16(s6), filter_4_7, 2);
sum0 = vmlal_lane_s16(sum0, vget_low_s16(s7), filter_4_7, 3);
int32x4_t sum1 = offset_const;
sum1 = vmlal_lane_s16(sum1, vget_high_s16(s0), filter_0_3, 0);
sum1 = vmlal_lane_s16(sum1, vget_high_s16(s1), filter_0_3, 1);
sum1 = vmlal_lane_s16(sum1, vget_high_s16(s2), filter_0_3, 2);
sum1 = vmlal_lane_s16(sum1, vget_high_s16(s3), filter_0_3, 3);
sum1 = vmlal_lane_s16(sum1, vget_high_s16(s4), filter_4_7, 0);
sum1 = vmlal_lane_s16(sum1, vget_high_s16(s5), filter_4_7, 1);
sum1 = vmlal_lane_s16(sum1, vget_high_s16(s6), filter_4_7, 2);
sum1 = vmlal_lane_s16(sum1, vget_high_s16(s7), filter_4_7, 3);
int16x4_t res0 = vshrn_n_s32(sum0, COMPOUND_ROUND1_BITS);
int16x4_t res1 = vshrn_n_s32(sum1, COMPOUND_ROUND1_BITS);
return vcombine_s16(res0, res1);
}
static INLINE void compound_convolve_vert_scale_neon(
const int16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w,
int h, const int16_t *y_filter, int subpel_y_qn, int y_step_qn) {
const int bd = 8;
const int offset_bits = bd + 2 * FILTER_BITS - ROUND0_BITS;
// A shim of 1 << (COMPOUND_ROUND1_BITS - 1) enables us to use
// non-rounding shifts - which are generally faster than rounding shifts on
// modern CPUs.
const int32x4_t vert_offset =
vdupq_n_s32((1 << offset_bits) + (1 << (COMPOUND_ROUND1_BITS - 1)));
int y_qn = subpel_y_qn;
if (w == 4) {
do {
const int16_t *s = &src[(y_qn >> SCALE_SUBPEL_BITS) * src_stride];
const ptrdiff_t filter_offset =
SUBPEL_TAPS * ((y_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS);
const int16x8_t filter = vld1q_s16(y_filter + filter_offset);
int16x4_t s0, s1, s2, s3, s4, s5, s6, s7;
load_s16_4x8(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7);
int16x4_t d0 = compound_convolve8_4_v(s0, s1, s2, s3, s4, s5, s6, s7,
filter, vert_offset);
vst1_u16(dst, vreinterpret_u16_s16(d0));
dst += dst_stride;
y_qn += y_step_qn;
} while (--h != 0);
} else {
do {
const int16_t *s = &src[(y_qn >> SCALE_SUBPEL_BITS) * src_stride];
const ptrdiff_t filter_offset =
SUBPEL_TAPS * ((y_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS);
const int16x8_t filter = vld1q_s16(y_filter + filter_offset);
int width = w;
uint16_t *d = dst;
do {
int16x8_t s0, s1, s2, s3, s4, s5, s6, s7;
load_s16_8x8(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7);
int16x8_t d0 = compound_convolve8_8_v(s0, s1, s2, s3, s4, s5, s6, s7,
filter, vert_offset);
vst1q_u16(d, vreinterpretq_u16_s16(d0));
s += 8;
d += 8;
width -= 8;
} while (width != 0);
dst += dst_stride;
y_qn += y_step_qn;
} while (--h != 0);
}
}
static INLINE void compound_avg_convolve_vert_scale_neon(
const int16_t *src, int src_stride, uint8_t *dst8, int dst8_stride,
uint16_t *dst16, int dst16_stride, int w, int h, const int16_t *y_filter,
int subpel_y_qn, int y_step_qn) {
const int bd = 8;
const int offset_bits = bd + 2 * FILTER_BITS - ROUND0_BITS;
// A shim of 1 << (COMPOUND_ROUND1_BITS - 1) enables us to use
// non-rounding shifts - which are generally faster than rounding shifts
// on modern CPUs.
const int32_t vert_offset_bits =
(1 << offset_bits) + (1 << (COMPOUND_ROUND1_BITS - 1));
// For the averaging code path, subtract the round offset and convolve round.
const int32_t avg_offset_bits = (1 << (offset_bits + 1)) + (1 << offset_bits);
const int32x4_t vert_offset = vdupq_n_s32(vert_offset_bits - avg_offset_bits);
int y_qn = subpel_y_qn;
if (w == 4) {
do {
const int16_t *s = &src[(y_qn >> SCALE_SUBPEL_BITS) * src_stride];
const ptrdiff_t filter_offset =
SUBPEL_TAPS * ((y_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS);
const int16x8_t filter = vld1q_s16(y_filter + filter_offset);
int16x4_t s0, s1, s2, s3, s4, s5, s6, s7;
load_s16_4x8(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7);
int16x4_t d0 = compound_convolve8_4_v(s0, s1, s2, s3, s4, s5, s6, s7,
filter, vert_offset);
int16x4_t dd0 = vreinterpret_s16_u16(vld1_u16(dst16));
int16x4_t avg = vhadd_s16(dd0, d0);
int16x8_t d0_s16 = vcombine_s16(avg, vdup_n_s16(0));
uint8x8_t d0_u8 = vqrshrun_n_s16(
d0_s16, (2 * FILTER_BITS - ROUND0_BITS - COMPOUND_ROUND1_BITS));
store_u8_4x1(dst8, d0_u8);
dst16 += dst16_stride;
dst8 += dst8_stride;
y_qn += y_step_qn;
} while (--h != 0);
} else {
do {
const int16_t *s = &src[(y_qn >> SCALE_SUBPEL_BITS) * src_stride];
const ptrdiff_t filter_offset =
SUBPEL_TAPS * ((y_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS);
const int16x8_t filter = vld1q_s16(y_filter + filter_offset);
int width = w;
uint8_t *dst8_ptr = dst8;
uint16_t *dst16_ptr = dst16;
do {
int16x8_t s0, s1, s2, s3, s4, s5, s6, s7;
load_s16_8x8(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7);
int16x8_t d0 = compound_convolve8_8_v(s0, s1, s2, s3, s4, s5, s6, s7,
filter, vert_offset);
int16x8_t dd0 = vreinterpretq_s16_u16(vld1q_u16(dst16_ptr));
int16x8_t avg = vhaddq_s16(dd0, d0);
uint8x8_t d0_u8 = vqrshrun_n_s16(
avg, (2 * FILTER_BITS - ROUND0_BITS - COMPOUND_ROUND1_BITS));
vst1_u8(dst8_ptr, d0_u8);
s += 8;
dst8_ptr += 8;
dst16_ptr += 8;
width -= 8;
} while (width != 0);
dst16 += dst16_stride;
dst8 += dst8_stride;
y_qn += y_step_qn;
} while (--h != 0);
}
}
static INLINE void compound_dist_wtd_convolve_vert_scale_neon(
const int16_t *src, int src_stride, uint8_t *dst8, int dst8_stride,
uint16_t *dst16, int dst16_stride, int w, int h, const int16_t *y_filter,
ConvolveParams *conv_params, int subpel_y_qn, int y_step_qn) {
const int bd = 8;
const int offset_bits = bd + 2 * FILTER_BITS - ROUND0_BITS;
int y_qn = subpel_y_qn;
// A shim of 1 << (COMPOUND_ROUND1_BITS - 1) enables us to use
// non-rounding shifts - which are generally faster than rounding shifts on
// modern CPUs.
const int32x4_t vert_offset =
vdupq_n_s32((1 << offset_bits) + (1 << (COMPOUND_ROUND1_BITS - 1)));
// For the weighted averaging code path we have to subtract the round offset and
// convolve round. The shim of 1 << (2 * FILTER_BITS - ROUND0_BITS -
// COMPOUND_ROUND1_BITS - 1) enables us to use non-rounding shifts. The
// additional shift by DIST_PRECISION_BITS is needed in order to merge two
// shift calculations into one.
const int32x4_t dist_wtd_offset = vdupq_n_s32(
(1 << (2 * FILTER_BITS - ROUND0_BITS - COMPOUND_ROUND1_BITS - 1 +
DIST_PRECISION_BITS)) -
(1 << (offset_bits - COMPOUND_ROUND1_BITS + DIST_PRECISION_BITS)) -
(1 << (offset_bits - COMPOUND_ROUND1_BITS - 1 + DIST_PRECISION_BITS)));
const int16x4_t bck_offset = vdup_n_s16(conv_params->bck_offset);
const int16x4_t fwd_offset = vdup_n_s16(conv_params->fwd_offset);
if (w == 4) {
do {
const int16_t *s = &src[(y_qn >> SCALE_SUBPEL_BITS) * src_stride];
const ptrdiff_t filter_offset =
SUBPEL_TAPS * ((y_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS);
const int16x8_t filter = vld1q_s16(y_filter + filter_offset);
int16x4_t s0, s1, s2, s3, s4, s5, s6, s7;
load_s16_4x8(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7);
int16x4_t d0 = compound_convolve8_4_v(s0, s1, s2, s3, s4, s5, s6, s7,
filter, vert_offset);
int16x4_t dd0 = vreinterpret_s16_u16(vld1_u16(dst16));
int32x4_t dst_wtd_avg = vmlal_s16(dist_wtd_offset, bck_offset, d0);
dst_wtd_avg = vmlal_s16(dst_wtd_avg, fwd_offset, dd0);
int16x4_t d0_s16 = vshrn_n_s32(
dst_wtd_avg, 2 * FILTER_BITS - ROUND0_BITS - COMPOUND_ROUND1_BITS +
DIST_PRECISION_BITS);
uint8x8_t d0_u8 = vqmovun_s16(vcombine_s16(d0_s16, vdup_n_s16(0)));
store_u8_4x1(dst8, d0_u8);
dst16 += dst16_stride;
dst8 += dst8_stride;
y_qn += y_step_qn;
} while (--h != 0);
} else {
do {
const int16_t *s = &src[(y_qn >> SCALE_SUBPEL_BITS) * src_stride];
const ptrdiff_t filter_offset =
SUBPEL_TAPS * ((y_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS);
const int16x8_t filter = vld1q_s16(y_filter + filter_offset);
int width = w;
uint8_t *dst8_ptr = dst8;
uint16_t *dst16_ptr = dst16;
do {
int16x8_t s0, s1, s2, s3, s4, s5, s6, s7;
load_s16_8x8(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7);
int16x8_t d0 = compound_convolve8_8_v(s0, s1, s2, s3, s4, s5, s6, s7,
filter, vert_offset);
int16x8_t dd0 = vreinterpretq_s16_u16(vld1q_u16(dst16_ptr));
int32x4_t dst_wtd_avg0 =
vmlal_s16(dist_wtd_offset, bck_offset, vget_low_s16(d0));
int32x4_t dst_wtd_avg1 =
vmlal_s16(dist_wtd_offset, bck_offset, vget_high_s16(d0));
dst_wtd_avg0 = vmlal_s16(dst_wtd_avg0, fwd_offset, vget_low_s16(dd0));
dst_wtd_avg1 = vmlal_s16(dst_wtd_avg1, fwd_offset, vget_high_s16(dd0));
int16x4_t d0_s16_0 = vshrn_n_s32(
dst_wtd_avg0, 2 * FILTER_BITS - ROUND0_BITS - COMPOUND_ROUND1_BITS +
DIST_PRECISION_BITS);
int16x4_t d0_s16_1 = vshrn_n_s32(
dst_wtd_avg1, 2 * FILTER_BITS - ROUND0_BITS - COMPOUND_ROUND1_BITS +
DIST_PRECISION_BITS);
uint8x8_t d0_u8 = vqmovun_s16(vcombine_s16(d0_s16_0, d0_s16_1));
vst1_u8(dst8_ptr, d0_u8);
s += 8;
dst8_ptr += 8;
dst16_ptr += 8;
width -= 8;
} while (width != 0);
dst16 += dst16_stride;
dst8 += dst8_stride;
y_qn += y_step_qn;
} while (--h != 0);
}
}
static INLINE uint8x8_t convolve8_4_v(const int16x4_t s0, const int16x4_t s1,
const int16x4_t s2, const int16x4_t s3,
const int16x4_t s4, const int16x4_t s5,
const int16x4_t s6, const int16x4_t s7,
const int16x8_t filter,
const int32x4_t offset_const) {
const int16x4_t filter_0_3 = vget_low_s16(filter);
const int16x4_t filter_4_7 = vget_high_s16(filter);
int32x4_t sum = offset_const;
sum = vmlal_lane_s16(sum, s0, filter_0_3, 0);
sum = vmlal_lane_s16(sum, s1, filter_0_3, 1);
sum = vmlal_lane_s16(sum, s2, filter_0_3, 2);
sum = vmlal_lane_s16(sum, s3, filter_0_3, 3);
sum = vmlal_lane_s16(sum, s4, filter_4_7, 0);
sum = vmlal_lane_s16(sum, s5, filter_4_7, 1);
sum = vmlal_lane_s16(sum, s6, filter_4_7, 2);
sum = vmlal_lane_s16(sum, s7, filter_4_7, 3);
int16x4_t res = vshrn_n_s32(sum, 2 * FILTER_BITS - ROUND0_BITS);
return vqmovun_s16(vcombine_s16(res, vdup_n_s16(0)));
}
static INLINE uint8x8_t convolve8_8_v(const int16x8_t s0, const int16x8_t s1,
const int16x8_t s2, const int16x8_t s3,
const int16x8_t s4, const int16x8_t s5,
const int16x8_t s6, const int16x8_t s7,
const int16x8_t filter,
const int32x4_t offset_const) {
const int16x4_t filter_0_3 = vget_low_s16(filter);
const int16x4_t filter_4_7 = vget_high_s16(filter);
int32x4_t sum0 = offset_const;
sum0 = vmlal_lane_s16(sum0, vget_low_s16(s0), filter_0_3, 0);
sum0 = vmlal_lane_s16(sum0, vget_low_s16(s1), filter_0_3, 1);
sum0 = vmlal_lane_s16(sum0, vget_low_s16(s2), filter_0_3, 2);
sum0 = vmlal_lane_s16(sum0, vget_low_s16(s3), filter_0_3, 3);
sum0 = vmlal_lane_s16(sum0, vget_low_s16(s4), filter_4_7, 0);
sum0 = vmlal_lane_s16(sum0, vget_low_s16(s5), filter_4_7, 1);
sum0 = vmlal_lane_s16(sum0, vget_low_s16(s6), filter_4_7, 2);
sum0 = vmlal_lane_s16(sum0, vget_low_s16(s7), filter_4_7, 3);
int32x4_t sum1 = offset_const;
sum1 = vmlal_lane_s16(sum1, vget_high_s16(s0), filter_0_3, 0);
sum1 = vmlal_lane_s16(sum1, vget_high_s16(s1), filter_0_3, 1);
sum1 = vmlal_lane_s16(sum1, vget_high_s16(s2), filter_0_3, 2);
sum1 = vmlal_lane_s16(sum1, vget_high_s16(s3), filter_0_3, 3);
sum1 = vmlal_lane_s16(sum1, vget_high_s16(s4), filter_4_7, 0);
sum1 = vmlal_lane_s16(sum1, vget_high_s16(s5), filter_4_7, 1);
sum1 = vmlal_lane_s16(sum1, vget_high_s16(s6), filter_4_7, 2);
sum1 = vmlal_lane_s16(sum1, vget_high_s16(s7), filter_4_7, 3);
int16x4_t res0 = vshrn_n_s32(sum0, 2 * FILTER_BITS - ROUND0_BITS);
int16x4_t res1 = vshrn_n_s32(sum1, 2 * FILTER_BITS - ROUND0_BITS);
return vqmovun_s16(vcombine_s16(res0, res1));
}
static INLINE void convolve_vert_scale_neon(const int16_t *src, int src_stride,
uint8_t *dst, int dst_stride, int w,
int h, const int16_t *y_filter,
int subpel_y_qn, int y_step_qn) {
const int bd = 8;
const int offset_bits = bd + 2 * FILTER_BITS - ROUND0_BITS;
const int round_1 = 2 * FILTER_BITS - ROUND0_BITS;
// The shim of 1 << (round_1 - 1) enables us to use non-rounding shifts.
int32x4_t vert_offset =
vdupq_n_s32((1 << (round_1 - 1)) - (1 << (offset_bits - 1)));
int y_qn = subpel_y_qn;
if (w == 4) {
do {
const int16_t *s = &src[(y_qn >> SCALE_SUBPEL_BITS) * src_stride];
const ptrdiff_t filter_offset =
SUBPEL_TAPS * ((y_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS);
const int16x8_t filter = vld1q_s16(y_filter + filter_offset);
int16x4_t s0, s1, s2, s3, s4, s5, s6, s7;
load_s16_4x8(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7);
uint8x8_t d =
convolve8_4_v(s0, s1, s2, s3, s4, s5, s6, s7, filter, vert_offset);
store_u8_4x1(dst, d);
dst += dst_stride;
y_qn += y_step_qn;
} while (--h != 0);
} else if (w == 8) {
do {
const int16_t *s = &src[(y_qn >> SCALE_SUBPEL_BITS) * src_stride];
const ptrdiff_t filter_offset =
SUBPEL_TAPS * ((y_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS);
const int16x8_t filter = vld1q_s16(y_filter + filter_offset);
int16x8_t s0, s1, s2, s3, s4, s5, s6, s7;
load_s16_8x8(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7);
uint8x8_t d =
convolve8_8_v(s0, s1, s2, s3, s4, s5, s6, s7, filter, vert_offset);
vst1_u8(dst, d);
dst += dst_stride;
y_qn += y_step_qn;
} while (--h != 0);
} else {
do {
const int16_t *s = &src[(y_qn >> SCALE_SUBPEL_BITS) * src_stride];
uint8_t *d = dst;
int width = w;
const ptrdiff_t filter_offset =
SUBPEL_TAPS * ((y_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS);
const int16x8_t filter = vld1q_s16(y_filter + filter_offset);
do {
int16x8_t s0[2], s1[2], s2[2], s3[2], s4[2], s5[2], s6[2], s7[2];
load_s16_8x8(s, src_stride, &s0[0], &s1[0], &s2[0], &s3[0], &s4[0],
&s5[0], &s6[0], &s7[0]);
load_s16_8x8(s + 8, src_stride, &s0[1], &s1[1], &s2[1], &s3[1], &s4[1],
&s5[1], &s6[1], &s7[1]);
uint8x8_t d0 = convolve8_8_v(s0[0], s1[0], s2[0], s3[0], s4[0], s5[0],
s6[0], s7[0], filter, vert_offset);
uint8x8_t d1 = convolve8_8_v(s0[1], s1[1], s2[1], s3[1], s4[1], s5[1],
s6[1], s7[1], filter, vert_offset);
vst1q_u8(d, vcombine_u8(d0, d1));
s += 16;
d += 16;
width -= 16;
} while (width != 0);
dst += dst_stride;
y_qn += y_step_qn;
} while (--h != 0);
}
}
static INLINE int16x4_t convolve8_4_h(const int16x4_t s0, const int16x4_t s1,
const int16x4_t s2, const int16x4_t s3,
const int16x4_t s4, const int16x4_t s5,
const int16x4_t s6, const int16x4_t s7,
const int16x8_t filter,
const int32x4_t horiz_const) {
int16x4_t filter_lo = vget_low_s16(filter);
int16x4_t filter_hi = vget_high_s16(filter);
int32x4_t sum = horiz_const;
sum = vmlal_lane_s16(sum, s0, filter_lo, 0);
sum = vmlal_lane_s16(sum, s1, filter_lo, 1);
sum = vmlal_lane_s16(sum, s2, filter_lo, 2);
sum = vmlal_lane_s16(sum, s3, filter_lo, 3);
sum = vmlal_lane_s16(sum, s4, filter_hi, 0);
sum = vmlal_lane_s16(sum, s5, filter_hi, 1);
sum = vmlal_lane_s16(sum, s6, filter_hi, 2);
sum = vmlal_lane_s16(sum, s7, filter_hi, 3);
return vshrn_n_s32(sum, ROUND0_BITS);
}
static INLINE int16x8_t convolve8_8_h(const int16x8_t s0, const int16x8_t s1,
const int16x8_t s2, const int16x8_t s3,
const int16x8_t s4, const int16x8_t s5,
const int16x8_t s6, const int16x8_t s7,
const int16x8_t filter,
const int16x8_t horiz_const) {
int16x4_t filter_lo = vget_low_s16(filter);
int16x4_t filter_hi = vget_high_s16(filter);
int16x8_t sum = horiz_const;
sum = vmlaq_lane_s16(sum, s0, filter_lo, 0);
sum = vmlaq_lane_s16(sum, s1, filter_lo, 1);
sum = vmlaq_lane_s16(sum, s2, filter_lo, 2);
sum = vmlaq_lane_s16(sum, s3, filter_lo, 3);
sum = vmlaq_lane_s16(sum, s4, filter_hi, 0);
sum = vmlaq_lane_s16(sum, s5, filter_hi, 1);
sum = vmlaq_lane_s16(sum, s6, filter_hi, 2);
sum = vmlaq_lane_s16(sum, s7, filter_hi, 3);
return vshrq_n_s16(sum, ROUND0_BITS - 1);
}
static INLINE void convolve_horiz_scale_neon(const uint8_t *src, int src_stride,
int16_t *dst, int dst_stride,
int w, int h,
const int16_t *x_filter,
const int subpel_x_qn,
const int x_step_qn) {
DECLARE_ALIGNED(16, int16_t, temp[8 * 8]);
const int bd = 8;
if (w == 4) {
// The shim of 1 << (ROUND0_BITS - 1) enables us to use non-rounding shifts.
const int32x4_t horiz_offset =
vdupq_n_s32((1 << (bd + FILTER_BITS - 1)) + (1 << (ROUND0_BITS - 1)));
do {
int x_qn = subpel_x_qn;
// Process a 4x4 tile.
for (int r = 0; r < 4; ++r) {
const uint8_t *const s = &src[x_qn >> SCALE_SUBPEL_BITS];
const ptrdiff_t filter_offset =
SUBPEL_TAPS * ((x_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS);
const int16x8_t filter = vld1q_s16(x_filter + filter_offset);
uint8x8_t t0, t1, t2, t3;
load_u8_8x4(s, src_stride, &t0, &t1, &t2, &t3);
transpose_elems_inplace_u8_8x4(&t0, &t1, &t2, &t3);
int16x4_t s0 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
int16x4_t s1 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
int16x4_t s2 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));
int16x4_t s3 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t3)));
int16x4_t s4 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
int16x4_t s5 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
int16x4_t s6 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));
int16x4_t s7 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t3)));
int16x4_t d0 =
convolve8_4_h(s0, s1, s2, s3, s4, s5, s6, s7, filter, horiz_offset);
vst1_s16(&temp[r * 4], d0);
x_qn += x_step_qn;
}
// Transpose the 4x4 result tile and store.
int16x4_t d0, d1, d2, d3;
load_s16_4x4(temp, 4, &d0, &d1, &d2, &d3);
transpose_elems_inplace_s16_4x4(&d0, &d1, &d2, &d3);
store_s16_4x4(dst, dst_stride, d0, d1, d2, d3);
dst += 4 * dst_stride;
src += 4 * src_stride;
h -= 4;
} while (h > 0);
} else {
// The shim of 1 << (ROUND0_BITS - 1) enables us to use non-rounding shifts.
// The additional -1 is needed because we are halving the filter values.
const int16x8_t horiz_offset =
vdupq_n_s16((1 << (bd + FILTER_BITS - 2)) + (1 << (ROUND0_BITS - 2)));
do {
int x_qn = subpel_x_qn;
int16_t *d = dst;
int width = w;
do {
// Process an 8x8 tile.
for (int r = 0; r < 8; ++r) {
const uint8_t *const s = &src[(x_qn >> SCALE_SUBPEL_BITS)];
const ptrdiff_t filter_offset =
SUBPEL_TAPS * ((x_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS);
int16x8_t filter = vld1q_s16(x_filter + filter_offset);
// Filter values are all even so halve them to allow convolution
// kernel computations to stay in 16-bit element types.
filter = vshrq_n_s16(filter, 1);
uint8x8_t t0, t1, t2, t3, t4, t5, t6, t7;
load_u8_8x8(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
transpose_elems_u8_8x8(t0, t1, t2, t3, t4, t5, t6, t7, &t0, &t1, &t2,
&t3, &t4, &t5, &t6, &t7);
int16x8_t s0 = vreinterpretq_s16_u16(vmovl_u8(t0));
int16x8_t s1 = vreinterpretq_s16_u16(vmovl_u8(t1));
int16x8_t s2 = vreinterpretq_s16_u16(vmovl_u8(t2));
int16x8_t s3 = vreinterpretq_s16_u16(vmovl_u8(t3));
int16x8_t s4 = vreinterpretq_s16_u16(vmovl_u8(t4));
int16x8_t s5 = vreinterpretq_s16_u16(vmovl_u8(t5));
int16x8_t s6 = vreinterpretq_s16_u16(vmovl_u8(t6));
int16x8_t s7 = vreinterpretq_s16_u16(vmovl_u8(t7));
int16x8_t d0 = convolve8_8_h(s0, s1, s2, s3, s4, s5, s6, s7, filter,
horiz_offset);
vst1q_s16(&temp[r * 8], d0);
x_qn += x_step_qn;
}
// Transpose the 8x8 result tile and store.
int16x8_t d0, d1, d2, d3, d4, d5, d6, d7;
load_s16_8x8(temp, 8, &d0, &d1, &d2, &d3, &d4, &d5, &d6, &d7);
transpose_elems_inplace_s16_8x8(&d0, &d1, &d2, &d3, &d4, &d5, &d6, &d7);
store_s16_8x8(d, dst_stride, d0, d1, d2, d3, d4, d5, d6, d7);
d += 8;
width -= 8;
} while (width != 0);
dst += 8 * dst_stride;
src += 8 * src_stride;
h -= 8;
} while (h > 0);
}
}
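
Several comments in this file note that the filter coefficients are all even and are therefore halved, with the final right shift reduced by one. A small scalar check of why that is exact, assuming a hypothetical even 4-tap kernel (FILTER_BITS is 7 in libaom):

```
#include <assert.h>
#include <stdint.h>

enum { kFilterBits = 7 };  /* FILTER_BITS in libaom */

int main(void) {
  /* Hypothetical even kernel summing to 1 << kFilterBits. */
  const int16_t taps[4] = { -4, 36, 100, -4 };
  const uint8_t px[4] = { 17, 200, 33, 250 };
  int32_t full = 0, half = 0;
  for (int k = 0; k < 4; ++k) {
    full += taps[k] * px[k];
    half += (taps[k] / 2) * px[k];  /* halved coefficients, still exact */
  }
  /* Halving every (even) tap lets the final shift drop by one bit. */
  assert((full >> kFilterBits) == (half >> (kFilterBits - 1)));
  /* The halved partial sums also gain one bit of headroom, which is what
   * lets the wider kernels stay in 16-bit lanes. */
  return 0;
}
```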
void av1_convolve_2d_scale_neon(const uint8_t *src, int src_stride,
uint8_t *dst, int dst_stride, int w, int h,
const InterpFilterParams *filter_params_x,
const InterpFilterParams *filter_params_y,
const int subpel_x_qn, const int x_step_qn,
const int subpel_y_qn, const int y_step_qn,
ConvolveParams *conv_params) {
if (w < 4 || h < 4) {
av1_convolve_2d_scale_c(src, src_stride, dst, dst_stride, w, h,
filter_params_x, filter_params_y, subpel_x_qn,
x_step_qn, subpel_y_qn, y_step_qn, conv_params);
return;
}
// For the interpolation 8-tap filters are used.
assert(filter_params_y->taps <= 8 && filter_params_x->taps <= 8);
DECLARE_ALIGNED(32, int16_t,
im_block[(2 * MAX_SB_SIZE + MAX_FILTER_TAP) * MAX_SB_SIZE]);
int im_h = (((h - 1) * y_step_qn + subpel_y_qn) >> SCALE_SUBPEL_BITS) +
filter_params_y->taps;
int im_stride = MAX_SB_SIZE;
CONV_BUF_TYPE *dst16 = conv_params->dst;
const int dst16_stride = conv_params->dst_stride;
// Account for needing filter_taps / 2 - 1 lines prior and filter_taps / 2
// lines post both horizontally and vertically.
const ptrdiff_t horiz_offset = filter_params_x->taps / 2 - 1;
const ptrdiff_t vert_offset = (filter_params_y->taps / 2 - 1) * src_stride;
// Horizontal filter
convolve_horiz_scale_neon(
src - horiz_offset - vert_offset, src_stride, im_block, im_stride, w,
im_h, filter_params_x->filter_ptr, subpel_x_qn, x_step_qn);
// Vertical filter
if (UNLIKELY(conv_params->is_compound)) {
if (conv_params->do_average) {
if (conv_params->use_dist_wtd_comp_avg) {
compound_dist_wtd_convolve_vert_scale_neon(
im_block, im_stride, dst, dst_stride, dst16, dst16_stride, w, h,
filter_params_y->filter_ptr, conv_params, subpel_y_qn, y_step_qn);
} else {
compound_avg_convolve_vert_scale_neon(
im_block, im_stride, dst, dst_stride, dst16, dst16_stride, w, h,
filter_params_y->filter_ptr, subpel_y_qn, y_step_qn);
}
} else {
compound_convolve_vert_scale_neon(
im_block, im_stride, dst16, dst16_stride, w, h,
filter_params_y->filter_ptr, subpel_y_qn, y_step_qn);
}
} else {
convolve_vert_scale_neon(im_block, im_stride, dst, dst_stride, w, h,
filter_params_y->filter_ptr, subpel_y_qn,
y_step_qn);
}
}
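
A small standalone sketch, not part of the patch, of the buffer bookkeeping done just above: the intermediate block must hold every source row the scaled vertical filter can touch, and the source pointer is backed up by taps / 2 - 1 in each direction (SCALE_SUBPEL_BITS is 10 in libaom, so a step of 1 << 10 advances one full source pixel):

```
#include <stdio.h>

enum { kScaleSubpelBits = 10 };  /* SCALE_SUBPEL_BITS in libaom */

int main(void) {
  /* Hypothetical 2:1 vertical downscale of a 32-row block with 8-tap filters. */
  const int h = 32, taps = 8;
  const int y_step_qn = 2 << kScaleSubpelBits;  /* two source rows per output row */
  const int subpel_y_qn = 0;

  /* Rows of horizontally filtered data the vertical pass will read. */
  const int im_h = (((h - 1) * y_step_qn + subpel_y_qn) >> kScaleSubpelBits) + taps;

  /* The source pointer is moved back so the filter is centred on the pixel:
   * taps / 2 - 1 columns to the left and the same number of rows above. */
  const int horiz_back = taps / 2 - 1;
  const int vert_back = taps / 2 - 1;

  printf("im_h = %d, back up %d columns and %d rows\n", im_h, horiz_back, vert_back);
  return 0;
}
```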


@ -447,7 +447,7 @@ static INLINE void idct8_low1_neon(int16x8_t *in, int16x8_t *out,
out[7] = step1;
}
void av1_round_shift_array_16_neon(int16x8_t *arr, int size, int bit) {
static void round_shift_array_16_neon(int16x8_t *arr, int size, int bit) {
assert(!(size % 4));
if (!bit) return;
const int16x8_t dup_bits_n_16x8 = vdupq_n_s16((int16_t)(-bit));
@ -3661,7 +3661,7 @@ static INLINE void lowbd_inv_txfm2d_add_v_identity_neon(
round_shift_for_rect(cur_a, cur_a, buf_size_nonzero_w);
}
row_txfm(cur_a, cur_a, INV_COS_BIT);
av1_round_shift_array_16_neon(cur_a, txfm_size_col, -shift[0]);
round_shift_array_16_neon(cur_a, txfm_size_col, -shift[0]);
if (lr_flip == 1) {
for (int j = 0; j < buf_size_w_div8; ++j) {
flip_buf_ud_neon(&cur_a[j * 8], 8);
@ -3736,8 +3736,7 @@ static INLINE void lowbd_inv_txfm2d_add_h_identity_neon(
}
for (int j = 0; j < buf_size_w_div8; ++j) {
col_txfm(&b[j * txfm_size_row], &b[j * txfm_size_row], INV_COS_BIT);
av1_round_shift_array_16_neon(&b[j * txfm_size_row], txfm_size_row,
-shift[1]);
round_shift_array_16_neon(&b[j * txfm_size_row], txfm_size_row, -shift[1]);
}
if (txfm_size_col >= 16) {
for (int i = 0; i < (txfm_size_col >> 4); i++) {
@ -3814,8 +3813,9 @@ static INLINE void lowbd_inv_txfm2d_add_4x4_neon(const int32_t *input,
}
}
void lowbd_inv_txfm2d_add_4x8_neon(const int32_t *input, uint8_t *output,
int stride, TX_TYPE tx_type, int eob) {
static void lowbd_inv_txfm2d_add_4x8_neon(const int32_t *input, uint8_t *output,
int stride, TX_TYPE tx_type,
int eob) {
(void)eob;
TX_SIZE tx_size = TX_4X8;
DECLARE_ALIGNED(32, int, txfm_buf[4 * 8 + 8 + 8]);
@ -3879,8 +3879,9 @@ void lowbd_inv_txfm2d_add_4x8_neon(const int32_t *input, uint8_t *output,
}
}
void lowbd_inv_txfm2d_add_8x4_neon(const int32_t *input, uint8_t *output,
int stride, TX_TYPE tx_type, int eob) {
static void lowbd_inv_txfm2d_add_8x4_neon(const int32_t *input, uint8_t *output,
int stride, TX_TYPE tx_type,
int eob) {
(void)eob;
TX_SIZE tx_size = TX_8X4;
DECLARE_ALIGNED(32, int, txfm_buf[8 * 4 + 8 + 8]);
@ -3944,8 +3945,9 @@ void lowbd_inv_txfm2d_add_8x4_neon(const int32_t *input, uint8_t *output,
}
}
void lowbd_inv_txfm2d_add_4x16_neon(const int32_t *input, uint8_t *output,
int stride, TX_TYPE tx_type, int eob) {
static void lowbd_inv_txfm2d_add_4x16_neon(const int32_t *input,
uint8_t *output, int stride,
TX_TYPE tx_type, int eob) {
(void)eob;
TX_SIZE tx_size = TX_4X16;
DECLARE_ALIGNED(32, int, txfm_buf[4 * 16 + 16 + 16]);
@ -4008,8 +4010,9 @@ void lowbd_inv_txfm2d_add_4x16_neon(const int32_t *input, uint8_t *output,
}
}
void lowbd_inv_txfm2d_add_16x4_neon(const int32_t *input, uint8_t *output,
int stride, TX_TYPE tx_type, int eob) {
static void lowbd_inv_txfm2d_add_16x4_neon(const int32_t *input,
uint8_t *output, int stride,
TX_TYPE tx_type, int eob) {
(void)eob;
TX_SIZE tx_size = TX_16X4;
DECLARE_ALIGNED(32, int, txfm_buf[16 * 4 + 16 + 16]);
@ -4112,7 +4115,7 @@ static INLINE void lowbd_inv_txfm2d_add_no_identity_neon(
round_shift_for_rect(cur_a, cur_a, buf_size_nonzero_w);
}
row_txfm(cur_a, cur_a, INV_COS_BIT);
av1_round_shift_array_16_neon(cur_a, txfm_size_col, -shift[0]);
round_shift_array_16_neon(cur_a, txfm_size_col, -shift[0]);
if (lr_flip == 1) {
for (int j = 0; j < buf_size_w_div8; ++j) {
flip_buf_ud_neon(&cur_a[j * 8], 8);
@ -4130,8 +4133,7 @@ static INLINE void lowbd_inv_txfm2d_add_no_identity_neon(
}
for (int j = 0; j < buf_size_w_div8; ++j) {
col_txfm(&b[j * txfm_size_row], &b[j * txfm_size_row], INV_COS_BIT);
av1_round_shift_array_16_neon(&b[j * txfm_size_row], txfm_size_row,
-shift[1]);
round_shift_array_16_neon(&b[j * txfm_size_row], txfm_size_row, -shift[1]);
}
if (txfm_size_col >= 16) {

third_party/aom/av1/common/arm/convolve_neon.c (vendored)

@ -188,18 +188,95 @@ static INLINE void convolve_x_sr_12tap_neon(const uint8_t *src_ptr,
#endif // AOM_ARCH_AARCH64
}
static INLINE uint8x8_t convolve4_4_x(const int16x4_t s0, const int16x4_t s1,
const int16x4_t s2, const int16x4_t s3,
static INLINE uint8x8_t convolve4_8_x(const int16x8_t s0, const int16x8_t s1,
const int16x8_t s2, const int16x8_t s3,
const int16x4_t filter,
const int16x4_t horiz_const) {
int16x4_t sum = horiz_const;
sum = vmla_lane_s16(sum, s0, filter, 0);
sum = vmla_lane_s16(sum, s1, filter, 1);
sum = vmla_lane_s16(sum, s2, filter, 2);
sum = vmla_lane_s16(sum, s3, filter, 3);
int16x8_t horiz_const) {
int16x8_t sum = horiz_const;
sum = vmlaq_lane_s16(sum, s0, filter, 0);
sum = vmlaq_lane_s16(sum, s1, filter, 1);
sum = vmlaq_lane_s16(sum, s2, filter, 2);
sum = vmlaq_lane_s16(sum, s3, filter, 3);
// We halved the filter values so -1 from right shift.
return vqrshrun_n_s16(sum, FILTER_BITS - 1);
}
// We halved the convolution filter values so - 1 from the right shift.
return vqrshrun_n_s16(vcombine_s16(sum, vdup_n_s16(0)), FILTER_BITS - 1);
static INLINE void convolve_x_sr_4tap_neon(const uint8_t *src_ptr,
int src_stride, uint8_t *dst_ptr,
const int dst_stride, int w, int h,
const int16_t *x_filter_ptr) {
// All filter values are even, halve to reduce intermediate precision
// requirements.
const int16x4_t filter = vshr_n_s16(vld1_s16(x_filter_ptr + 2), 1);
// This shim of 1 << ((ROUND0_BITS - 1) - 1) enables us to use a single
// rounding right shift by FILTER_BITS - instead of a first rounding right
// shift by ROUND0_BITS, followed by second rounding right shift by
// FILTER_BITS - ROUND0_BITS.
// The outermost -1 is needed because we will halve the filter values.
const int16x8_t horiz_const = vdupq_n_s16(1 << ((ROUND0_BITS - 1) - 1));
if (w == 4) {
do {
uint8x8_t t01[4];
t01[0] = load_unaligned_u8(src_ptr + 0, src_stride);
t01[1] = load_unaligned_u8(src_ptr + 1, src_stride);
t01[2] = load_unaligned_u8(src_ptr + 2, src_stride);
t01[3] = load_unaligned_u8(src_ptr + 3, src_stride);
int16x8_t s01[4];
s01[0] = vreinterpretq_s16_u16(vmovl_u8(t01[0]));
s01[1] = vreinterpretq_s16_u16(vmovl_u8(t01[1]));
s01[2] = vreinterpretq_s16_u16(vmovl_u8(t01[2]));
s01[3] = vreinterpretq_s16_u16(vmovl_u8(t01[3]));
uint8x8_t d01 =
convolve4_8_x(s01[0], s01[1], s01[2], s01[3], filter, horiz_const);
store_u8x4_strided_x2(dst_ptr + 0 * dst_stride, dst_stride, d01);
src_ptr += 2 * src_stride;
dst_ptr += 2 * dst_stride;
h -= 2;
} while (h != 0);
} else {
do {
int width = w;
const uint8_t *s = src_ptr;
uint8_t *d = dst_ptr;
do {
uint8x8_t t0[4], t1[4];
load_u8_8x4(s + 0 * src_stride, 1, &t0[0], &t0[1], &t0[2], &t0[3]);
load_u8_8x4(s + 1 * src_stride, 1, &t1[0], &t1[1], &t1[2], &t1[3]);
int16x8_t s0[4], s1[4];
s0[0] = vreinterpretq_s16_u16(vmovl_u8(t0[0]));
s0[1] = vreinterpretq_s16_u16(vmovl_u8(t0[1]));
s0[2] = vreinterpretq_s16_u16(vmovl_u8(t0[2]));
s0[3] = vreinterpretq_s16_u16(vmovl_u8(t0[3]));
s1[0] = vreinterpretq_s16_u16(vmovl_u8(t1[0]));
s1[1] = vreinterpretq_s16_u16(vmovl_u8(t1[1]));
s1[2] = vreinterpretq_s16_u16(vmovl_u8(t1[2]));
s1[3] = vreinterpretq_s16_u16(vmovl_u8(t1[3]));
uint8x8_t d0 =
convolve4_8_x(s0[0], s0[1], s0[2], s0[3], filter, horiz_const);
uint8x8_t d1 =
convolve4_8_x(s1[0], s1[1], s1[2], s1[3], filter, horiz_const);
store_u8_8x2(d, dst_stride, d0, d1);
s += 8;
d += 8;
width -= 8;
} while (width != 0);
src_ptr += 2 * src_stride;
dst_ptr += 2 * dst_stride;
h -= 2;
} while (h != 0);
}
}
static INLINE uint8x8_t convolve8_8_x(const int16x8_t s0, const int16x8_t s1,
@ -242,12 +319,20 @@ void av1_convolve_x_sr_neon(const uint8_t *src, int src_stride, uint8_t *dst,
const int16_t *x_filter_ptr = av1_get_interp_filter_subpel_kernel(
filter_params_x, subpel_x_qn & SUBPEL_MASK);
if (filter_params_x->taps > 8) {
int filter_taps = get_filter_tap(filter_params_x, subpel_x_qn & SUBPEL_MASK);
if (filter_taps > 8) {
convolve_x_sr_12tap_neon(src, src_stride, dst, dst_stride, w, h,
x_filter_ptr);
return;
}
if (filter_taps <= 4) {
convolve_x_sr_4tap_neon(src + 2, src_stride, dst, dst_stride, w, h,
x_filter_ptr);
return;
}
// This shim of 1 << ((ROUND0_BITS - 1) - 1) enables us to use a single
// rounding right shift by FILTER_BITS - instead of a first rounding right
// shift by ROUND0_BITS, followed by second rounding right shift by
@ -255,149 +340,220 @@ void av1_convolve_x_sr_neon(const uint8_t *src, int src_stride, uint8_t *dst,
// The outermost -1 is needed because we will halve the filter values.
const int16x8_t horiz_const = vdupq_n_s16(1 << ((ROUND0_BITS - 1) - 1));
if (w <= 4) {
// 4-tap filters are used for blocks having width <= 4.
// Filter values are even, so halve to reduce intermediate precision reqs.
const int16x4_t x_filter = vshr_n_s16(vld1_s16(x_filter_ptr + 2), 1);
src += 2;
do {
uint8x8_t t0 = vld1_u8(src); // a0 a1 a2 a3 a4 a5 a6 a7
int16x4_t s0 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
int16x4_t s4 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
int16x4_t s1 = vext_s16(s0, s4, 1); // a1 a2 a3 a4
int16x4_t s2 = vext_s16(s0, s4, 2); // a2 a3 a4 a5
int16x4_t s3 = vext_s16(s0, s4, 3); // a3 a4 a5 a6
uint8x8_t d0 =
convolve4_4_x(s0, s1, s2, s3, x_filter, vget_low_s16(horiz_const));
store_u8_4x1(dst, d0);
src += src_stride;
dst += dst_stride;
} while (--h != 0);
} else {
// Filter values are even so halve to reduce precision requirements.
const int16x8_t x_filter = vshrq_n_s16(vld1q_s16(x_filter_ptr), 1);
// Filter values are even so halve to reduce precision requirements.
const int16x8_t x_filter = vshrq_n_s16(vld1q_s16(x_filter_ptr), 1);
#if AOM_ARCH_AARCH64
while (h >= 8) {
uint8x8_t t0, t1, t2, t3, t4, t5, t6, t7;
load_u8_8x8(src, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
while (h >= 8) {
uint8x8_t t0, t1, t2, t3, t4, t5, t6, t7;
load_u8_8x8(src, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
transpose_elems_inplace_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
int16x8_t s0 = vreinterpretq_s16_u16(vmovl_u8(t0));
int16x8_t s1 = vreinterpretq_s16_u16(vmovl_u8(t1));
int16x8_t s2 = vreinterpretq_s16_u16(vmovl_u8(t2));
int16x8_t s3 = vreinterpretq_s16_u16(vmovl_u8(t3));
int16x8_t s4 = vreinterpretq_s16_u16(vmovl_u8(t4));
int16x8_t s5 = vreinterpretq_s16_u16(vmovl_u8(t5));
int16x8_t s6 = vreinterpretq_s16_u16(vmovl_u8(t6));
int width = w;
const uint8_t *s = src + 7;
uint8_t *d = dst;
__builtin_prefetch(d + 0 * dst_stride);
__builtin_prefetch(d + 1 * dst_stride);
__builtin_prefetch(d + 2 * dst_stride);
__builtin_prefetch(d + 3 * dst_stride);
__builtin_prefetch(d + 4 * dst_stride);
__builtin_prefetch(d + 5 * dst_stride);
__builtin_prefetch(d + 6 * dst_stride);
__builtin_prefetch(d + 7 * dst_stride);
do {
uint8x8_t t8, t9, t10, t11, t12, t13, t14;
load_u8_8x8(s, src_stride, &t7, &t8, &t9, &t10, &t11, &t12, &t13, &t14);
transpose_elems_inplace_u8_8x8(&t7, &t8, &t9, &t10, &t11, &t12, &t13,
&t14);
int16x8_t s7 = vreinterpretq_s16_u16(vmovl_u8(t7));
int16x8_t s8 = vreinterpretq_s16_u16(vmovl_u8(t8));
int16x8_t s9 = vreinterpretq_s16_u16(vmovl_u8(t9));
int16x8_t s10 = vreinterpretq_s16_u16(vmovl_u8(t10));
int16x8_t s11 = vreinterpretq_s16_u16(vmovl_u8(t11));
int16x8_t s12 = vreinterpretq_s16_u16(vmovl_u8(t12));
int16x8_t s13 = vreinterpretq_s16_u16(vmovl_u8(t13));
int16x8_t s14 = vreinterpretq_s16_u16(vmovl_u8(t14));
uint8x8_t d0 =
convolve8_8_x(s0, s1, s2, s3, s4, s5, s6, s7, x_filter, horiz_const);
uint8x8_t d1 =
convolve8_8_x(s1, s2, s3, s4, s5, s6, s7, s8, x_filter, horiz_const);
uint8x8_t d2 =
convolve8_8_x(s2, s3, s4, s5, s6, s7, s8, s9, x_filter, horiz_const);
uint8x8_t d3 =
convolve8_8_x(s3, s4, s5, s6, s7, s8, s9, s10, x_filter, horiz_const);
uint8x8_t d4 = convolve8_8_x(s4, s5, s6, s7, s8, s9, s10, s11, x_filter,
horiz_const);
uint8x8_t d5 = convolve8_8_x(s5, s6, s7, s8, s9, s10, s11, s12, x_filter,
horiz_const);
uint8x8_t d6 = convolve8_8_x(s6, s7, s8, s9, s10, s11, s12, s13, x_filter,
horiz_const);
uint8x8_t d7 = convolve8_8_x(s7, s8, s9, s10, s11, s12, s13, s14,
x_filter, horiz_const);
transpose_elems_inplace_u8_8x8(&d0, &d1, &d2, &d3, &d4, &d5, &d6, &d7);
store_u8_8x8(d, dst_stride, d0, d1, d2, d3, d4, d5, d6, d7);
s0 = s8;
s1 = s9;
s2 = s10;
s3 = s11;
s4 = s12;
s5 = s13;
s6 = s14;
s += 8;
d += 8;
width -= 8;
} while (width != 0);
src += 8 * src_stride;
dst += 8 * dst_stride;
h -= 8;
}
#endif // AOM_ARCH_AARCH64
while (h-- != 0) {
uint8x8_t t0 = vld1_u8(src); // a0 a1 a2 a3 a4 a5 a6 a7
int16x8_t s0 = vreinterpretq_s16_u16(vmovl_u8(t0));
int width = w;
const uint8_t *s = src + 8;
uint8_t *d = dst;
__builtin_prefetch(d);
do {
uint8x8_t t8 = vld1_u8(s); // a8 a9 a10 a11 a12 a13 a14 a15
int16x8_t s8 = vreinterpretq_s16_u16(vmovl_u8(t8));
int16x8_t s1 = vextq_s16(s0, s8, 1); // a1 a2 a3 a4 a5 a6 a7 a8
int16x8_t s2 = vextq_s16(s0, s8, 2); // a2 a3 a4 a5 a6 a7 a8 a9
int16x8_t s3 = vextq_s16(s0, s8, 3); // a3 a4 a5 a6 a7 a8 a9 a10
int16x8_t s4 = vextq_s16(s0, s8, 4); // a4 a5 a6 a7 a8 a9 a10 a11
int16x8_t s5 = vextq_s16(s0, s8, 5); // a5 a6 a7 a8 a9 a10 a11 a12
int16x8_t s6 = vextq_s16(s0, s8, 6); // a6 a7 a8 a9 a10 a11 a12 a13
int16x8_t s7 = vextq_s16(s0, s8, 7); // a7 a8 a9 a10 a11 a12 a13 a14
uint8x8_t d0 =
convolve8_8_x(s0, s1, s2, s3, s4, s5, s6, s7, x_filter, horiz_const);
vst1_u8(d, d0);
s0 = s8;
s += 8;
d += 8;
width -= 8;
} while (width != 0);
src += src_stride;
dst += dst_stride;
}
}
static INLINE uint8x8_t convolve4_8_y(const int16x8_t s0, const int16x8_t s1,
const int16x8_t s2, const int16x8_t s3,
const int16x4_t filter) {
int16x8_t sum = vmulq_lane_s16(s0, filter, 0);
sum = vmlaq_lane_s16(sum, s1, filter, 1);
sum = vmlaq_lane_s16(sum, s2, filter, 2);
sum = vmlaq_lane_s16(sum, s3, filter, 3);
// We halved the filter values so -1 from right shift.
return vqrshrun_n_s16(sum, FILTER_BITS - 1);
}
static INLINE void convolve_y_sr_4tap_neon(const uint8_t *src,
const int src_stride, uint8_t *dst,
const int dst_stride, int w, int h,
const int16_t *filter_y) {
// All filter values are even, halve to reduce intermediate precision
// requirements.
const int16x4_t filter = vshr_n_s16(vld1_s16(filter_y + 2), 1);
if (w == 4) {
uint8x8_t t01 = load_unaligned_u8(src + 0 * src_stride, src_stride);
uint8x8_t t12 = load_unaligned_u8(src + 1 * src_stride, src_stride);
int16x8_t s01 = vreinterpretq_s16_u16(vmovl_u8(t01));
int16x8_t s12 = vreinterpretq_s16_u16(vmovl_u8(t12));
src += 2 * src_stride;
do {
uint8x8_t t23 = load_unaligned_u8(src + 0 * src_stride, src_stride);
uint8x8_t t34 = load_unaligned_u8(src + 1 * src_stride, src_stride);
uint8x8_t t45 = load_unaligned_u8(src + 2 * src_stride, src_stride);
uint8x8_t t56 = load_unaligned_u8(src + 3 * src_stride, src_stride);
int16x8_t s23 = vreinterpretq_s16_u16(vmovl_u8(t23));
int16x8_t s34 = vreinterpretq_s16_u16(vmovl_u8(t34));
int16x8_t s45 = vreinterpretq_s16_u16(vmovl_u8(t45));
int16x8_t s56 = vreinterpretq_s16_u16(vmovl_u8(t56));
uint8x8_t d01 = convolve4_8_y(s01, s12, s23, s34, filter);
uint8x8_t d23 = convolve4_8_y(s23, s34, s45, s56, filter);
store_u8x4_strided_x2(dst + 0 * dst_stride, dst_stride, d01);
store_u8x4_strided_x2(dst + 2 * dst_stride, dst_stride, d23);
s01 = s45;
s12 = s56;
src += 4 * src_stride;
dst += 4 * dst_stride;
h -= 4;
} while (h != 0);
} else {
do {
uint8x8_t t0, t1, t2;
load_u8_8x3(src, src_stride, &t0, &t1, &t2);
transpose_elems_inplace_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
int16x8_t s0 = vreinterpretq_s16_u16(vmovl_u8(t0));
int16x8_t s1 = vreinterpretq_s16_u16(vmovl_u8(t1));
int16x8_t s2 = vreinterpretq_s16_u16(vmovl_u8(t2));
int16x8_t s3 = vreinterpretq_s16_u16(vmovl_u8(t3));
int16x8_t s4 = vreinterpretq_s16_u16(vmovl_u8(t4));
int16x8_t s5 = vreinterpretq_s16_u16(vmovl_u8(t5));
int16x8_t s6 = vreinterpretq_s16_u16(vmovl_u8(t6));
int width = w;
const uint8_t *s = src + 7;
int height = h;
const uint8_t *s = src + 3 * src_stride;
uint8_t *d = dst;
__builtin_prefetch(d + 0 * dst_stride);
__builtin_prefetch(d + 1 * dst_stride);
__builtin_prefetch(d + 2 * dst_stride);
__builtin_prefetch(d + 3 * dst_stride);
__builtin_prefetch(d + 4 * dst_stride);
__builtin_prefetch(d + 5 * dst_stride);
__builtin_prefetch(d + 6 * dst_stride);
__builtin_prefetch(d + 7 * dst_stride);
do {
uint8x8_t t8, t9, t10, t11, t12, t13, t14;
load_u8_8x8(s, src_stride, &t7, &t8, &t9, &t10, &t11, &t12, &t13, &t14);
uint8x8_t t3;
load_u8_8x4(s, src_stride, &t0, &t1, &t2, &t3);
transpose_elems_inplace_u8_8x8(&t7, &t8, &t9, &t10, &t11, &t12, &t13,
&t14);
int16x8_t s7 = vreinterpretq_s16_u16(vmovl_u8(t7));
int16x8_t s8 = vreinterpretq_s16_u16(vmovl_u8(t8));
int16x8_t s9 = vreinterpretq_s16_u16(vmovl_u8(t9));
int16x8_t s10 = vreinterpretq_s16_u16(vmovl_u8(t10));
int16x8_t s11 = vreinterpretq_s16_u16(vmovl_u8(t11));
int16x8_t s12 = vreinterpretq_s16_u16(vmovl_u8(t12));
int16x8_t s13 = vreinterpretq_s16_u16(vmovl_u8(t13));
int16x8_t s14 = vreinterpretq_s16_u16(vmovl_u8(t14));
int16x8_t s3 = vreinterpretq_s16_u16(vmovl_u8(t0));
int16x8_t s4 = vreinterpretq_s16_u16(vmovl_u8(t1));
int16x8_t s5 = vreinterpretq_s16_u16(vmovl_u8(t2));
int16x8_t s6 = vreinterpretq_s16_u16(vmovl_u8(t3));
uint8x8_t d0 = convolve8_8_x(s0, s1, s2, s3, s4, s5, s6, s7, x_filter,
horiz_const);
uint8x8_t d1 = convolve8_8_x(s1, s2, s3, s4, s5, s6, s7, s8, x_filter,
horiz_const);
uint8x8_t d2 = convolve8_8_x(s2, s3, s4, s5, s6, s7, s8, s9, x_filter,
horiz_const);
uint8x8_t d3 = convolve8_8_x(s3, s4, s5, s6, s7, s8, s9, s10, x_filter,
horiz_const);
uint8x8_t d4 = convolve8_8_x(s4, s5, s6, s7, s8, s9, s10, s11, x_filter,
horiz_const);
uint8x8_t d5 = convolve8_8_x(s5, s6, s7, s8, s9, s10, s11, s12,
x_filter, horiz_const);
uint8x8_t d6 = convolve8_8_x(s6, s7, s8, s9, s10, s11, s12, s13,
x_filter, horiz_const);
uint8x8_t d7 = convolve8_8_x(s7, s8, s9, s10, s11, s12, s13, s14,
x_filter, horiz_const);
uint8x8_t d0 = convolve4_8_y(s0, s1, s2, s3, filter);
uint8x8_t d1 = convolve4_8_y(s1, s2, s3, s4, filter);
uint8x8_t d2 = convolve4_8_y(s2, s3, s4, s5, filter);
uint8x8_t d3 = convolve4_8_y(s3, s4, s5, s6, filter);
transpose_elems_inplace_u8_8x8(&d0, &d1, &d2, &d3, &d4, &d5, &d6, &d7);
store_u8_8x4(d, dst_stride, d0, d1, d2, d3);
store_u8_8x8(d, dst_stride, d0, d1, d2, d3, d4, d5, d6, d7);
s0 = s4;
s1 = s5;
s2 = s6;
s0 = s8;
s1 = s9;
s2 = s10;
s3 = s11;
s4 = s12;
s5 = s13;
s6 = s14;
s += 8;
d += 8;
width -= 8;
} while (width != 0);
src += 8 * src_stride;
dst += 8 * dst_stride;
h -= 8;
}
#endif // AOM_ARCH_AARCH64
while (h-- != 0) {
uint8x8_t t0 = vld1_u8(src); // a0 a1 a2 a3 a4 a5 a6 a7
int16x8_t s0 = vreinterpretq_s16_u16(vmovl_u8(t0));
int width = w;
const uint8_t *s = src + 8;
uint8_t *d = dst;
__builtin_prefetch(d);
do {
uint8x8_t t8 = vld1_u8(s); // a8 a9 a10 a11 a12 a13 a14 a15
int16x8_t s8 = vreinterpretq_s16_u16(vmovl_u8(t8));
int16x8_t s1 = vextq_s16(s0, s8, 1); // a1 a2 a3 a4 a5 a6 a7 a8
int16x8_t s2 = vextq_s16(s0, s8, 2); // a2 a3 a4 a5 a6 a7 a8 a9
int16x8_t s3 = vextq_s16(s0, s8, 3); // a3 a4 a5 a6 a7 a8 a9 a10
int16x8_t s4 = vextq_s16(s0, s8, 4); // a4 a5 a6 a7 a8 a9 a10 a11
int16x8_t s5 = vextq_s16(s0, s8, 5); // a5 a6 a7 a8 a9 a10 a11 a12
int16x8_t s6 = vextq_s16(s0, s8, 6); // a6 a7 a8 a9 a10 a11 a12 a13
int16x8_t s7 = vextq_s16(s0, s8, 7); // a7 a8 a9 a10 a11 a12 a13 a14
uint8x8_t d0 = convolve8_8_x(s0, s1, s2, s3, s4, s5, s6, s7, x_filter,
horiz_const);
vst1_u8(d, d0);
s0 = s8;
s += 8;
d += 8;
width -= 8;
} while (width != 0);
src += src_stride;
dst += dst_stride;
}
s += 4 * src_stride;
d += 4 * dst_stride;
height -= 4;
} while (height != 0);
src += 8;
dst += 8;
w -= 8;
} while (w != 0);
}
}
@ -974,7 +1130,7 @@ void av1_convolve_y_sr_neon(const uint8_t *src, int src_stride, uint8_t *dst,
}
const int y_filter_taps = get_filter_tap(filter_params_y, subpel_y_qn);
const int clamped_y_taps = y_filter_taps < 6 ? 6 : y_filter_taps;
const int clamped_y_taps = y_filter_taps < 4 ? 4 : y_filter_taps;
const int vert_offset = clamped_y_taps / 2 - 1;
src -= vert_offset * src_stride;
@ -991,7 +1147,10 @@ void av1_convolve_y_sr_neon(const uint8_t *src, int src_stride, uint8_t *dst,
// Filter values are even so halve to reduce precision requirements.
const int16x8_t y_filter = vshrq_n_s16(vld1q_s16(y_filter_ptr), 1);
if (y_filter_taps < 8) {
if (y_filter_taps <= 4) {
convolve_y_sr_4tap_neon(src, src_stride, dst, dst_stride, w, h,
y_filter_ptr);
} else if (y_filter_taps == 6) {
convolve_y_sr_6tap_neon(src, src_stride, dst, dst_stride, w, h, y_filter);
} else {
convolve_y_sr_8tap_neon(src, src_stride, dst, dst_stride, w, h, y_filter);
@ -1148,18 +1307,122 @@ static INLINE void convolve_2d_sr_horiz_12tap_neon(
} while (--h != 0);
}
static INLINE int16x4_t convolve4_4_2d_h(const int16x4_t s0, const int16x4_t s1,
const int16x4_t s2, const int16x4_t s3,
static INLINE int16x8_t convolve4_8_2d_h(const int16x8_t s0, const int16x8_t s1,
const int16x8_t s2, const int16x8_t s3,
const int16x4_t filter,
const int16x4_t horiz_const) {
int16x4_t sum = horiz_const;
sum = vmla_lane_s16(sum, s0, filter, 0);
sum = vmla_lane_s16(sum, s1, filter, 1);
sum = vmla_lane_s16(sum, s2, filter, 2);
sum = vmla_lane_s16(sum, s3, filter, 3);
const int16x8_t horiz_const) {
int16x8_t sum = vmlaq_lane_s16(horiz_const, s0, filter, 0);
sum = vmlaq_lane_s16(sum, s1, filter, 1);
sum = vmlaq_lane_s16(sum, s2, filter, 2);
sum = vmlaq_lane_s16(sum, s3, filter, 3);
// We halved the filter values so -1 from right shift.
return vshrq_n_s16(sum, ROUND0_BITS - 1);
}
// We halved the convolution filter values so -1 from the right shift.
return vshr_n_s16(sum, ROUND0_BITS - 1);
static INLINE void convolve_2d_sr_horiz_4tap_neon(
const uint8_t *src, ptrdiff_t src_stride, int16_t *dst,
ptrdiff_t dst_stride, int w, int h, const int16_t *filter_x) {
const int bd = 8;
// All filter values are even, halve to reduce intermediate precision
// requirements.
const int16x4_t filter = vshr_n_s16(vld1_s16(filter_x + 2), 1);
// A shim of 1 << ((ROUND0_BITS - 1) - 1) enables us to use non-rounding
// shifts - which are generally faster than rounding shifts on modern CPUs.
// (The extra -1 is needed because we halved the filter values.)
const int16x8_t horiz_const = vdupq_n_s16((1 << (bd + FILTER_BITS - 2)) +
(1 << ((ROUND0_BITS - 1) - 1)));
if (w == 4) {
do {
uint8x8_t t01[4];
t01[0] = load_unaligned_u8(src + 0, (int)src_stride);
t01[1] = load_unaligned_u8(src + 1, (int)src_stride);
t01[2] = load_unaligned_u8(src + 2, (int)src_stride);
t01[3] = load_unaligned_u8(src + 3, (int)src_stride);
int16x8_t s01[4];
s01[0] = vreinterpretq_s16_u16(vmovl_u8(t01[0]));
s01[1] = vreinterpretq_s16_u16(vmovl_u8(t01[1]));
s01[2] = vreinterpretq_s16_u16(vmovl_u8(t01[2]));
s01[3] = vreinterpretq_s16_u16(vmovl_u8(t01[3]));
int16x8_t d01 =
convolve4_8_2d_h(s01[0], s01[1], s01[2], s01[3], filter, horiz_const);
store_s16x4_strided_x2(dst, (int)dst_stride, d01);
src += 2 * src_stride;
dst += 2 * dst_stride;
h -= 2;
} while (h > 0);
} else {
do {
int width = w;
const uint8_t *s = src;
int16_t *d = dst;
do {
uint8x8_t t0[4], t1[4];
load_u8_8x4(s + 0 * src_stride, 1, &t0[0], &t0[1], &t0[2], &t0[3]);
load_u8_8x4(s + 1 * src_stride, 1, &t1[0], &t1[1], &t1[2], &t1[3]);
int16x8_t s0[4];
s0[0] = vreinterpretq_s16_u16(vmovl_u8(t0[0]));
s0[1] = vreinterpretq_s16_u16(vmovl_u8(t0[1]));
s0[2] = vreinterpretq_s16_u16(vmovl_u8(t0[2]));
s0[3] = vreinterpretq_s16_u16(vmovl_u8(t0[3]));
int16x8_t s1[4];
s1[0] = vreinterpretq_s16_u16(vmovl_u8(t1[0]));
s1[1] = vreinterpretq_s16_u16(vmovl_u8(t1[1]));
s1[2] = vreinterpretq_s16_u16(vmovl_u8(t1[2]));
s1[3] = vreinterpretq_s16_u16(vmovl_u8(t1[3]));
int16x8_t d0 =
convolve4_8_2d_h(s0[0], s0[1], s0[2], s0[3], filter, horiz_const);
int16x8_t d1 =
convolve4_8_2d_h(s1[0], s1[1], s1[2], s1[3], filter, horiz_const);
store_s16_8x2(d, dst_stride, d0, d1);
s += 8;
d += 8;
width -= 8;
} while (width != 0);
src += 2 * src_stride;
dst += 2 * dst_stride;
h -= 2;
} while (h > 2);
do {
const uint8_t *s = src;
int16_t *d = dst;
int width = w;
do {
uint8x8_t t0[4];
load_u8_8x4(s, 1, &t0[0], &t0[1], &t0[2], &t0[3]);
int16x8_t s0[4];
s0[0] = vreinterpretq_s16_u16(vmovl_u8(t0[0]));
s0[1] = vreinterpretq_s16_u16(vmovl_u8(t0[1]));
s0[2] = vreinterpretq_s16_u16(vmovl_u8(t0[2]));
s0[3] = vreinterpretq_s16_u16(vmovl_u8(t0[3]));
int16x8_t d0 =
convolve4_8_2d_h(s0[0], s0[1], s0[2], s0[3], filter, horiz_const);
vst1q_s16(d, d0);
s += 8;
d += 8;
width -= 8;
} while (width != 0);
src += src_stride;
dst += dst_stride;
} while (--h != 0);
}
}
static INLINE int16x8_t convolve8_8_2d_h(const int16x8_t s0, const int16x8_t s1,
@ -1185,10 +1448,9 @@ static INLINE int16x8_t convolve8_8_2d_h(const int16x8_t s0, const int16x8_t s1,
return vshrq_n_s16(sum, ROUND0_BITS - 1);
}
static INLINE void convolve_2d_sr_horiz_neon(const uint8_t *src, int src_stride,
int16_t *im_block, int im_stride,
int w, int im_h,
const int16_t *x_filter_ptr) {
static INLINE void convolve_2d_sr_horiz_8tap_neon(
const uint8_t *src, int src_stride, int16_t *im_block, int im_stride, int w,
int im_h, const int16_t *x_filter_ptr) {
const int bd = 8;
const uint8_t *src_ptr = src;
@ -1196,149 +1458,119 @@ static INLINE void convolve_2d_sr_horiz_neon(const uint8_t *src, int src_stride,
int dst_stride = im_stride;
int height = im_h;
if (w <= 4) {
// A shim of 1 << ((ROUND0_BITS - 1) - 1) enables us to use non-rounding
// shifts - which are generally faster than rounding shifts on modern CPUs.
// (The extra -1 is needed because we halved the filter values.)
const int16x4_t horiz_const = vdup_n_s16((1 << (bd + FILTER_BITS - 2)) +
(1 << ((ROUND0_BITS - 1) - 1)));
// 4-tap filters are used for blocks having width <= 4.
// Filter values are even, so halve to reduce intermediate precision reqs.
const int16x4_t x_filter = vshr_n_s16(vld1_s16(x_filter_ptr + 2), 1);
src_ptr += 2;
do {
uint8x8_t t0 = vld1_u8(src_ptr); // a0 a1 a2 a3 a4 a5 a6 a7
int16x4_t s0 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
int16x4_t s4 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
int16x4_t s1 = vext_s16(s0, s4, 1); // a1 a2 a3 a4
int16x4_t s2 = vext_s16(s0, s4, 2); // a2 a3 a4 a5
int16x4_t s3 = vext_s16(s0, s4, 3); // a3 a4 a5 a6
int16x4_t d0 = convolve4_4_2d_h(s0, s1, s2, s3, x_filter, horiz_const);
vst1_s16(dst_ptr, d0);
src_ptr += src_stride;
dst_ptr += dst_stride;
} while (--height != 0);
} else {
// A shim of 1 << ((ROUND0_BITS - 1) - 1) enables us to use non-rounding
// shifts - which are generally faster than rounding shifts on modern CPUs.
// (The extra -1 is needed because we halved the filter values.)
const int16x8_t horiz_const = vdupq_n_s16((1 << (bd + FILTER_BITS - 2)) +
(1 << ((ROUND0_BITS - 1) - 1)));
// Filter values are even, so halve to reduce intermediate precision reqs.
const int16x8_t x_filter = vshrq_n_s16(vld1q_s16(x_filter_ptr), 1);
// A shim of 1 << ((ROUND0_BITS - 1) - 1) enables us to use non-rounding
// shifts - which are generally faster than rounding shifts on modern CPUs.
// (The extra -1 is needed because we halved the filter values.)
const int16x8_t horiz_const = vdupq_n_s16((1 << (bd + FILTER_BITS - 2)) +
(1 << ((ROUND0_BITS - 1) - 1)));
// Filter values are even, so halve to reduce intermediate precision reqs.
const int16x8_t x_filter = vshrq_n_s16(vld1q_s16(x_filter_ptr), 1);
#if AOM_ARCH_AARCH64
while (height > 8) {
const uint8_t *s = src_ptr;
int16_t *d = dst_ptr;
int width = w;
while (height > 8) {
const uint8_t *s = src_ptr;
int16_t *d = dst_ptr;
int width = w;
uint8x8_t t0, t1, t2, t3, t4, t5, t6, t7;
load_u8_8x8(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
transpose_elems_inplace_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
uint8x8_t t0, t1, t2, t3, t4, t5, t6, t7;
load_u8_8x8(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
transpose_elems_inplace_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
int16x8_t s0 = vreinterpretq_s16_u16(vmovl_u8(t0));
int16x8_t s1 = vreinterpretq_s16_u16(vmovl_u8(t1));
int16x8_t s2 = vreinterpretq_s16_u16(vmovl_u8(t2));
int16x8_t s3 = vreinterpretq_s16_u16(vmovl_u8(t3));
int16x8_t s4 = vreinterpretq_s16_u16(vmovl_u8(t4));
int16x8_t s5 = vreinterpretq_s16_u16(vmovl_u8(t5));
int16x8_t s6 = vreinterpretq_s16_u16(vmovl_u8(t6));
int16x8_t s0 = vreinterpretq_s16_u16(vmovl_u8(t0));
int16x8_t s1 = vreinterpretq_s16_u16(vmovl_u8(t1));
int16x8_t s2 = vreinterpretq_s16_u16(vmovl_u8(t2));
int16x8_t s3 = vreinterpretq_s16_u16(vmovl_u8(t3));
int16x8_t s4 = vreinterpretq_s16_u16(vmovl_u8(t4));
int16x8_t s5 = vreinterpretq_s16_u16(vmovl_u8(t5));
int16x8_t s6 = vreinterpretq_s16_u16(vmovl_u8(t6));
s += 7;
do {
load_u8_8x8(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
transpose_elems_inplace_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
int16x8_t s7 = vreinterpretq_s16_u16(vmovl_u8(t0));
int16x8_t s8 = vreinterpretq_s16_u16(vmovl_u8(t1));
int16x8_t s9 = vreinterpretq_s16_u16(vmovl_u8(t2));
int16x8_t s10 = vreinterpretq_s16_u16(vmovl_u8(t3));
int16x8_t s11 = vreinterpretq_s16_u16(vmovl_u8(t4));
int16x8_t s12 = vreinterpretq_s16_u16(vmovl_u8(t5));
int16x8_t s13 = vreinterpretq_s16_u16(vmovl_u8(t6));
int16x8_t s14 = vreinterpretq_s16_u16(vmovl_u8(t7));
int16x8_t d0 = convolve8_8_2d_h(s0, s1, s2, s3, s4, s5, s6, s7,
x_filter, horiz_const);
int16x8_t d1 = convolve8_8_2d_h(s1, s2, s3, s4, s5, s6, s7, s8,
x_filter, horiz_const);
int16x8_t d2 = convolve8_8_2d_h(s2, s3, s4, s5, s6, s7, s8, s9,
x_filter, horiz_const);
int16x8_t d3 = convolve8_8_2d_h(s3, s4, s5, s6, s7, s8, s9, s10,
x_filter, horiz_const);
int16x8_t d4 = convolve8_8_2d_h(s4, s5, s6, s7, s8, s9, s10, s11,
x_filter, horiz_const);
int16x8_t d5 = convolve8_8_2d_h(s5, s6, s7, s8, s9, s10, s11, s12,
x_filter, horiz_const);
int16x8_t d6 = convolve8_8_2d_h(s6, s7, s8, s9, s10, s11, s12, s13,
x_filter, horiz_const);
int16x8_t d7 = convolve8_8_2d_h(s7, s8, s9, s10, s11, s12, s13, s14,
x_filter, horiz_const);
transpose_elems_inplace_s16_8x8(&d0, &d1, &d2, &d3, &d4, &d5, &d6, &d7);
store_s16_8x8(d, dst_stride, d0, d1, d2, d3, d4, d5, d6, d7);
s0 = s8;
s1 = s9;
s2 = s10;
s3 = s11;
s4 = s12;
s5 = s13;
s6 = s14;
s += 8;
d += 8;
width -= 8;
} while (width != 0);
src_ptr += 8 * src_stride;
dst_ptr += 8 * dst_stride;
height -= 8;
}
#endif // AOM_ARCH_AARCH64
s += 7;
do {
const uint8_t *s = src_ptr;
int16_t *d = dst_ptr;
int width = w;
load_u8_8x8(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
uint8x8_t t0 = vld1_u8(s); // a0 a1 a2 a3 a4 a5 a6 a7
int16x8_t s0 = vreinterpretq_s16_u16(vmovl_u8(t0));
transpose_elems_inplace_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
do {
uint8x8_t t1 = vld1_u8(s + 8); // a8 a9 a10 a11 a12 a13 a14 a15
int16x8_t s8 = vreinterpretq_s16_u16(vmovl_u8(t1));
int16x8_t s7 = vreinterpretq_s16_u16(vmovl_u8(t0));
int16x8_t s8 = vreinterpretq_s16_u16(vmovl_u8(t1));
int16x8_t s9 = vreinterpretq_s16_u16(vmovl_u8(t2));
int16x8_t s10 = vreinterpretq_s16_u16(vmovl_u8(t3));
int16x8_t s11 = vreinterpretq_s16_u16(vmovl_u8(t4));
int16x8_t s12 = vreinterpretq_s16_u16(vmovl_u8(t5));
int16x8_t s13 = vreinterpretq_s16_u16(vmovl_u8(t6));
int16x8_t s14 = vreinterpretq_s16_u16(vmovl_u8(t7));
int16x8_t s1 = vextq_s16(s0, s8, 1); // a1 a2 a3 a4 a5 a6 a7 a8
int16x8_t s2 = vextq_s16(s0, s8, 2); // a2 a3 a4 a5 a6 a7 a8 a9
int16x8_t s3 = vextq_s16(s0, s8, 3); // a3 a4 a5 a6 a7 a8 a9 a10
int16x8_t s4 = vextq_s16(s0, s8, 4); // a4 a5 a6 a7 a8 a9 a10 a11
int16x8_t s5 = vextq_s16(s0, s8, 5); // a5 a6 a7 a8 a9 a10 a11 a12
int16x8_t s6 = vextq_s16(s0, s8, 6); // a6 a7 a8 a9 a10 a11 a12 a13
int16x8_t s7 = vextq_s16(s0, s8, 7); // a7 a8 a9 a10 a11 a12 a13 a14
int16x8_t d0 = convolve8_8_2d_h(s0, s1, s2, s3, s4, s5, s6, s7, x_filter,
horiz_const);
int16x8_t d1 = convolve8_8_2d_h(s1, s2, s3, s4, s5, s6, s7, s8, x_filter,
horiz_const);
int16x8_t d2 = convolve8_8_2d_h(s2, s3, s4, s5, s6, s7, s8, s9, x_filter,
horiz_const);
int16x8_t d3 = convolve8_8_2d_h(s3, s4, s5, s6, s7, s8, s9, s10, x_filter,
horiz_const);
int16x8_t d4 = convolve8_8_2d_h(s4, s5, s6, s7, s8, s9, s10, s11,
x_filter, horiz_const);
int16x8_t d5 = convolve8_8_2d_h(s5, s6, s7, s8, s9, s10, s11, s12,
x_filter, horiz_const);
int16x8_t d6 = convolve8_8_2d_h(s6, s7, s8, s9, s10, s11, s12, s13,
x_filter, horiz_const);
int16x8_t d7 = convolve8_8_2d_h(s7, s8, s9, s10, s11, s12, s13, s14,
x_filter, horiz_const);
int16x8_t d0 = convolve8_8_2d_h(s0, s1, s2, s3, s4, s5, s6, s7,
x_filter, horiz_const);
transpose_elems_inplace_s16_8x8(&d0, &d1, &d2, &d3, &d4, &d5, &d6, &d7);
vst1q_s16(d, d0);
store_s16_8x8(d, dst_stride, d0, d1, d2, d3, d4, d5, d6, d7);
s0 = s8;
s += 8;
d += 8;
width -= 8;
} while (width != 0);
src_ptr += src_stride;
dst_ptr += dst_stride;
} while (--height != 0);
s0 = s8;
s1 = s9;
s2 = s10;
s3 = s11;
s4 = s12;
s5 = s13;
s6 = s14;
s += 8;
d += 8;
width -= 8;
} while (width != 0);
src_ptr += 8 * src_stride;
dst_ptr += 8 * dst_stride;
height -= 8;
}
#endif // AOM_ARCH_AARCH64
do {
const uint8_t *s = src_ptr;
int16_t *d = dst_ptr;
int width = w;
uint8x8_t t0 = vld1_u8(s); // a0 a1 a2 a3 a4 a5 a6 a7
int16x8_t s0 = vreinterpretq_s16_u16(vmovl_u8(t0));
do {
uint8x8_t t1 = vld1_u8(s + 8); // a8 a9 a10 a11 a12 a13 a14 a15
int16x8_t s8 = vreinterpretq_s16_u16(vmovl_u8(t1));
int16x8_t s1 = vextq_s16(s0, s8, 1); // a1 a2 a3 a4 a5 a6 a7 a8
int16x8_t s2 = vextq_s16(s0, s8, 2); // a2 a3 a4 a5 a6 a7 a8 a9
int16x8_t s3 = vextq_s16(s0, s8, 3); // a3 a4 a5 a6 a7 a8 a9 a10
int16x8_t s4 = vextq_s16(s0, s8, 4); // a4 a5 a6 a7 a8 a9 a10 a11
int16x8_t s5 = vextq_s16(s0, s8, 5); // a5 a6 a7 a8 a9 a10 a11 a12
int16x8_t s6 = vextq_s16(s0, s8, 6); // a6 a7 a8 a9 a10 a11 a12 a13
int16x8_t s7 = vextq_s16(s0, s8, 7); // a7 a8 a9 a10 a11 a12 a13 a14
int16x8_t d0 = convolve8_8_2d_h(s0, s1, s2, s3, s4, s5, s6, s7, x_filter,
horiz_const);
vst1q_s16(d, d0);
s0 = s8;
s += 8;
d += 8;
width -= 8;
} while (width != 0);
src_ptr += src_stride;
dst_ptr += dst_stride;
} while (--height != 0);
}
void av1_convolve_2d_sr_neon(const uint8_t *src, int src_stride, uint8_t *dst,
@ -1355,7 +1587,8 @@ void av1_convolve_2d_sr_neon(const uint8_t *src, int src_stride, uint8_t *dst,
}
const int y_filter_taps = get_filter_tap(filter_params_y, subpel_y_qn);
const int clamped_y_taps = y_filter_taps < 6 ? 6 : y_filter_taps;
const int x_filter_taps = get_filter_tap(filter_params_x, subpel_x_qn);
const int clamped_y_taps = y_filter_taps < 4 ? 4 : y_filter_taps;
const int im_h = h + clamped_y_taps - 1;
const int im_stride = MAX_SB_SIZE;
const int vert_offset = clamped_y_taps / 2 - 1;
@ -1385,12 +1618,20 @@ void av1_convolve_2d_sr_neon(const uint8_t *src, int src_stride, uint8_t *dst,
DECLARE_ALIGNED(16, int16_t,
im_block[(MAX_SB_SIZE + SUBPEL_TAPS - 1) * MAX_SB_SIZE]);
convolve_2d_sr_horiz_neon(src_ptr, src_stride, im_block, im_stride, w, im_h,
x_filter_ptr);
if (x_filter_taps <= 4) {
convolve_2d_sr_horiz_4tap_neon(src_ptr + 2, src_stride, im_block,
im_stride, w, im_h, x_filter_ptr);
} else {
convolve_2d_sr_horiz_8tap_neon(src_ptr, src_stride, im_block, im_stride,
w, im_h, x_filter_ptr);
}
const int16x8_t y_filter = vld1q_s16(y_filter_ptr);
if (clamped_y_taps <= 6) {
if (clamped_y_taps <= 4) {
convolve_2d_sr_vert_4tap_neon(im_block, im_stride, dst, dst_stride, w, h,
y_filter_ptr);
} else if (clamped_y_taps == 6) {
convolve_2d_sr_vert_6tap_neon(im_block, im_stride, dst, dst_stride, w, h,
y_filter);
} else {

third_party/aom/av1/common/arm/convolve_neon.h (vendored)

@ -535,4 +535,112 @@ static INLINE void convolve_2d_sr_vert_6tap_neon(int16_t *src_ptr,
}
}
static INLINE int16x4_t convolve4_4_2d_v(const int16x4_t s0, const int16x4_t s1,
const int16x4_t s2, const int16x4_t s3,
const int16x4_t y_filter) {
int32x4_t sum = vmull_lane_s16(s0, y_filter, 0);
sum = vmlal_lane_s16(sum, s1, y_filter, 1);
sum = vmlal_lane_s16(sum, s2, y_filter, 2);
sum = vmlal_lane_s16(sum, s3, y_filter, 3);
return vqrshrn_n_s32(sum, 2 * FILTER_BITS - ROUND0_BITS);
}
static INLINE uint8x8_t convolve4_8_2d_v(const int16x8_t s0, const int16x8_t s1,
const int16x8_t s2, const int16x8_t s3,
const int16x4_t y_filter,
const int16x8_t sub_const) {
int32x4_t sum0 = vmull_lane_s16(vget_low_s16(s0), y_filter, 0);
sum0 = vmlal_lane_s16(sum0, vget_low_s16(s1), y_filter, 1);
sum0 = vmlal_lane_s16(sum0, vget_low_s16(s2), y_filter, 2);
sum0 = vmlal_lane_s16(sum0, vget_low_s16(s3), y_filter, 3);
int32x4_t sum1 = vmull_lane_s16(vget_high_s16(s0), y_filter, 0);
sum1 = vmlal_lane_s16(sum1, vget_high_s16(s1), y_filter, 1);
sum1 = vmlal_lane_s16(sum1, vget_high_s16(s2), y_filter, 2);
sum1 = vmlal_lane_s16(sum1, vget_high_s16(s3), y_filter, 3);
int16x8_t res =
vcombine_s16(vqrshrn_n_s32(sum0, 2 * FILTER_BITS - ROUND0_BITS),
vqrshrn_n_s32(sum1, 2 * FILTER_BITS - ROUND0_BITS));
res = vsubq_s16(res, sub_const);
return vqmovun_s16(res);
}
static INLINE void convolve_2d_sr_vert_4tap_neon(int16_t *src_ptr,
int src_stride,
uint8_t *dst_ptr,
int dst_stride, int w, int h,
const int16_t *y_filter) {
const int bd = 8;
const int16x8_t sub_const = vdupq_n_s16(1 << (bd - 1));
const int16x4_t filter = vld1_s16(y_filter + 2);
if (w == 4) {
int16x4_t s0, s1, s2;
load_s16_4x3(src_ptr, src_stride, &s0, &s1, &s2);
src_ptr += 3 * src_stride;
do {
int16x4_t s3, s4, s5, s6;
load_s16_4x4(src_ptr, src_stride, &s3, &s4, &s5, &s6);
int16x4_t d0 = convolve4_4_2d_v(s0, s1, s2, s3, filter);
int16x4_t d1 = convolve4_4_2d_v(s1, s2, s3, s4, filter);
int16x4_t d2 = convolve4_4_2d_v(s2, s3, s4, s5, filter);
int16x4_t d3 = convolve4_4_2d_v(s3, s4, s5, s6, filter);
uint8x8_t d01 = vqmovun_s16(vsubq_s16(vcombine_s16(d0, d1), sub_const));
uint8x8_t d23 = vqmovun_s16(vsubq_s16(vcombine_s16(d2, d3), sub_const));
store_u8x4_strided_x2(dst_ptr + 0 * dst_stride, dst_stride, d01);
store_u8x4_strided_x2(dst_ptr + 2 * dst_stride, dst_stride, d23);
s0 = s4;
s1 = s5;
s2 = s6;
src_ptr += 4 * src_stride;
dst_ptr += 4 * dst_stride;
h -= 4;
} while (h != 0);
} else {
// Width is a multiple of 8 and height is a multiple of 4.
do {
int height = h;
int16_t *s = src_ptr;
uint8_t *d = dst_ptr;
int16x8_t s0, s1, s2;
load_s16_8x3(s, src_stride, &s0, &s1, &s2);
s += 3 * src_stride;
do {
int16x8_t s3, s4, s5, s6;
load_s16_8x4(s, src_stride, &s3, &s4, &s5, &s6);
uint8x8_t d0 = convolve4_8_2d_v(s0, s1, s2, s3, filter, sub_const);
uint8x8_t d1 = convolve4_8_2d_v(s1, s2, s3, s4, filter, sub_const);
uint8x8_t d2 = convolve4_8_2d_v(s2, s3, s4, s5, filter, sub_const);
uint8x8_t d3 = convolve4_8_2d_v(s3, s4, s5, s6, filter, sub_const);
store_u8_8x4(d, dst_stride, d0, d1, d2, d3);
s0 = s4;
s1 = s5;
s2 = s6;
s += 4 * src_stride;
d += 4 * dst_stride;
height -= 4;
} while (height != 0);
src_ptr += 8;
dst_ptr += 8;
w -= 8;
} while (w != 0);
}
}
#endif // AOM_AV1_COMMON_ARM_CONVOLVE_NEON_H_
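
The sub_const of 1 << (bd - 1) used by the 2D vertical helpers above undoes the DC offset that the horizontal pass injected. A scalar sketch of that round trip for a flat image, using the libaom 8-bit constants (FILTER_BITS = 7, ROUND0_BITS = 3) and ignoring the small rounding shims:

```
#include <assert.h>
#include <stdint.h>

enum { kBd = 8, kFilterBits = 7, kRound0Bits = 3 };  /* libaom 8-bit values */

int main(void) {
  for (int x = 0; x <= 255; ++x) {
    /* Horizontal pass: halved kernel (sums to 1 << (FILTER_BITS - 1)) plus
     * the offset 1 << (bd + FILTER_BITS - 2), shifted by ROUND0_BITS - 1. */
    int32_t horiz = ((1 << (kFilterBits - 1)) * x + (1 << (kBd + kFilterBits - 2)))
                    >> (kRound0Bits - 1);
    /* Vertical pass: full kernel (sums to 1 << FILTER_BITS), shifted by
     * 2 * FILTER_BITS - ROUND0_BITS. */
    int32_t vert = ((1 << kFilterBits) * horiz) >> (2 * kFilterBits - kRound0Bits);
    /* The leftover DC term is exactly 1 << (bd - 1), i.e. sub_const. */
    assert(vert - (1 << (kBd - 1)) == x);
  }
  return 0;
}
```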

(Some file diffs are not shown here because they are too large to display.)

third_party/aom/av1/common/arm/convolve_neon_i8mm.h (vendored, new file)

@ -0,0 +1,183 @@
/*
* Copyright (c) 2024, Alliance for Open Media. All rights reserved
*
* This source code is subject to the terms of the BSD 2 Clause License and
* the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
* was not distributed with this source code in the LICENSE file, you can
* obtain it at www.aomedia.org/license/software. If the Alliance for Open
* Media Patent License 1.0 was not distributed with this source code in the
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
#ifndef AOM_AV1_COMMON_ARM_CONVOLVE_NEON_I8MM_H_
#define AOM_AV1_COMMON_ARM_CONVOLVE_NEON_I8MM_H_
#include <arm_neon.h>
#include <assert.h>
#include "config/aom_config.h"
#include "config/av1_rtcd.h"
#include "aom/aom_integer.h"
#include "aom_dsp/aom_dsp_common.h"
#include "aom_dsp/arm/mem_neon.h"
#include "aom_ports/mem.h"
DECLARE_ALIGNED(16, static const uint8_t, kDotProdPermuteTbl[48]) = {
0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6,
4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10,
8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14
};
static INLINE int16x4_t convolve12_4_2d_h(uint8x16_t samples,
const int8x16_t filters,
const uint8x16x3_t permute_tbl,
int32x4_t horiz_const) {
// Permute samples ready for dot product.
// { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 }
// { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 }
// { 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 }
uint8x16_t perm_samples[3] = { vqtbl1q_u8(samples, permute_tbl.val[0]),
vqtbl1q_u8(samples, permute_tbl.val[1]),
vqtbl1q_u8(samples, permute_tbl.val[2]) };
int32x4_t sum = vusdotq_laneq_s32(horiz_const, perm_samples[0], filters, 0);
sum = vusdotq_laneq_s32(sum, perm_samples[1], filters, 1);
sum = vusdotq_laneq_s32(sum, perm_samples[2], filters, 2);
// Narrow and re-pack.
return vshrn_n_s32(sum, ROUND0_BITS);
}
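
kDotProdPermuteTbl arranges the input into overlapping 4-byte windows so that each lane of a USDOT instruction accumulates four consecutive taps for one output pixel. The following scalar model of that indexing is an illustration only, not the library implementation:

```
#include <stdint.h>
#include <stdio.h>

/* Plain 12-tap reference. */
static int32_t convolve12_ref(const uint8_t *s, const int8_t *f) {
  int32_t sum = 0;
  for (int k = 0; k < 12; ++k) sum += (int32_t)s[k] * f[k];
  return sum;
}

int main(void) {
  uint8_t samples[16];
  int8_t filter[12];
  for (int i = 0; i < 16; ++i) samples[i] = (uint8_t)(37 * i + 11);
  for (int k = 0; k < 12; ++k) filter[k] = (int8_t)(k - 6);

  /* Model of the permute + vusdotq_laneq_s32 pattern: for output pixel i,
   * block b of the permute table supplies samples[i + 4b .. i + 4b + 3] and
   * lane group b of the filter supplies taps 4b .. 4b + 3. */
  for (int i = 0; i < 4; ++i) {
    int32_t sum = 0;
    for (int b = 0; b < 3; ++b)
      for (int j = 0; j < 4; ++j)
        sum += (int32_t)samples[i + 4 * b + j] * filter[4 * b + j];
    printf("pixel %d: permuted dot product %d, reference %d\n", i, sum,
           convolve12_ref(samples + i, filter));
  }
  return 0;
}
```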
static INLINE int16x8_t convolve12_8_2d_h(uint8x16_t samples[2],
const int8x16_t filters,
const uint8x16x3_t permute_tbl,
const int32x4_t horiz_const) {
// Permute samples ready for dot product.
// { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 }
// { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 }
// { 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 }
// {12, 13, 14, 15, 13, 14, 15, 16, 14, 15, 16, 17, 15, 16, 17, 18 }
uint8x16_t perm_samples[4] = { vqtbl1q_u8(samples[0], permute_tbl.val[0]),
vqtbl1q_u8(samples[0], permute_tbl.val[1]),
vqtbl1q_u8(samples[0], permute_tbl.val[2]),
vqtbl1q_u8(samples[1], permute_tbl.val[2]) };
int32x4_t sum0123 =
vusdotq_laneq_s32(horiz_const, perm_samples[0], filters, 0);
sum0123 = vusdotq_laneq_s32(sum0123, perm_samples[1], filters, 1);
sum0123 = vusdotq_laneq_s32(sum0123, perm_samples[2], filters, 2);
int32x4_t sum4567 =
vusdotq_laneq_s32(horiz_const, perm_samples[1], filters, 0);
sum4567 = vusdotq_laneq_s32(sum4567, perm_samples[2], filters, 1);
sum4567 = vusdotq_laneq_s32(sum4567, perm_samples[3], filters, 2);
// Narrow and re-pack.
return vcombine_s16(vshrn_n_s32(sum0123, ROUND0_BITS),
vshrn_n_s32(sum4567, ROUND0_BITS));
}
static INLINE void convolve_2d_sr_horiz_12tap_neon_i8mm(
const uint8_t *src_ptr, int src_stride, int16_t *dst_ptr,
const int dst_stride, int w, int h, const int16x8_t x_filter_0_7,
const int16x4_t x_filter_8_11) {
// The no-op filter should never be used here.
assert(vgetq_lane_s16(x_filter_0_7, 5) != 128);
const int bd = 8;
// Narrow filter values to 8-bit.
const int16x8x2_t x_filter_s16 = {
{ x_filter_0_7, vcombine_s16(x_filter_8_11, vdup_n_s16(0)) }
};
const int8x16_t x_filter = vcombine_s8(vmovn_s16(x_filter_s16.val[0]),
vmovn_s16(x_filter_s16.val[1]));
// This shim of 1 << (ROUND0_BITS - 1) enables us to use non-rounding shifts
// - which are generally faster than rounding shifts on modern CPUs.
const int32x4_t horiz_const =
vdupq_n_s32((1 << (bd + FILTER_BITS - 1)) + (1 << (ROUND0_BITS - 1)));
const uint8x16x3_t permute_tbl = vld1q_u8_x3(kDotProdPermuteTbl);
if (w <= 4) {
do {
uint8x16_t s0, s1, s2, s3;
load_u8_16x4(src_ptr, src_stride, &s0, &s1, &s2, &s3);
int16x4_t d0 = convolve12_4_2d_h(s0, x_filter, permute_tbl, horiz_const);
int16x4_t d1 = convolve12_4_2d_h(s1, x_filter, permute_tbl, horiz_const);
int16x4_t d2 = convolve12_4_2d_h(s2, x_filter, permute_tbl, horiz_const);
int16x4_t d3 = convolve12_4_2d_h(s3, x_filter, permute_tbl, horiz_const);
store_s16_4x4(dst_ptr, dst_stride, d0, d1, d2, d3);
src_ptr += 4 * src_stride;
dst_ptr += 4 * dst_stride;
h -= 4;
} while (h > 4);
do {
uint8x16_t s0 = vld1q_u8(src_ptr);
int16x4_t d0 = convolve12_4_2d_h(s0, x_filter, permute_tbl, horiz_const);
vst1_s16(dst_ptr, d0);
src_ptr += src_stride;
dst_ptr += dst_stride;
} while (--h != 0);
} else {
do {
const uint8_t *s = src_ptr;
int16_t *d = dst_ptr;
int width = w;
do {
uint8x16_t s0[2], s1[2], s2[2], s3[2];
load_u8_16x4(s, src_stride, &s0[0], &s1[0], &s2[0], &s3[0]);
load_u8_16x4(s + 4, src_stride, &s0[1], &s1[1], &s2[1], &s3[1]);
int16x8_t d0 =
convolve12_8_2d_h(s0, x_filter, permute_tbl, horiz_const);
int16x8_t d1 =
convolve12_8_2d_h(s1, x_filter, permute_tbl, horiz_const);
int16x8_t d2 =
convolve12_8_2d_h(s2, x_filter, permute_tbl, horiz_const);
int16x8_t d3 =
convolve12_8_2d_h(s3, x_filter, permute_tbl, horiz_const);
store_s16_8x4(d, dst_stride, d0, d1, d2, d3);
s += 8;
d += 8;
width -= 8;
} while (width != 0);
src_ptr += 4 * src_stride;
dst_ptr += 4 * dst_stride;
h -= 4;
} while (h > 4);
do {
const uint8_t *s = src_ptr;
int16_t *d = dst_ptr;
int width = w;
do {
uint8x16_t s0[2];
s0[0] = vld1q_u8(s);
s0[1] = vld1q_u8(s + 4);
int16x8_t d0 =
convolve12_8_2d_h(s0, x_filter, permute_tbl, horiz_const);
vst1q_s16(d, d0);
s += 8;
d += 8;
width -= 8;
} while (width != 0);
src_ptr += src_stride;
dst_ptr += dst_stride;
} while (--h != 0);
}
}
#endif // AOM_AV1_COMMON_ARM_CONVOLVE_NEON_I8MM_H_

third_party/aom/av1/common/arm/convolve_sve2.c (vendored, new file)

@ -0,0 +1,203 @@
/*
* Copyright (c) 2024, Alliance for Open Media. All rights reserved
*
* This source code is subject to the terms of the BSD 2 Clause License and
* the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
* was not distributed with this source code in the LICENSE file, you can
* obtain it at www.aomedia.org/license/software. If the Alliance for Open
* Media Patent License 1.0 was not distributed with this source code in the
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
#include <arm_neon.h>
#include <assert.h>
#include "config/aom_config.h"
#include "config/av1_rtcd.h"
#include "aom/aom_integer.h"
#include "aom_dsp/aom_dsp_common.h"
#include "aom_dsp/aom_filter.h"
#include "aom_dsp/arm/aom_filter.h"
#include "aom_dsp/arm/aom_neon_sve_bridge.h"
#include "aom_dsp/arm/mem_neon.h"
#include "aom_dsp/arm/transpose_neon.h"
#include "aom_ports/mem.h"
#include "av1/common/arm/highbd_convolve_sve2.h"
#include "av1/common/arm/convolve_neon_i8mm.h"
static INLINE int32x4_t highbd_convolve12_4_2d_v(int16x8_t s0[2],
int16x8_t s1[2],
int16x8_t s2[2],
int16x8_t filter_0_7,
int16x8_t filter_4_11) {
int64x2_t sum01 = aom_svdot_lane_s16(vdupq_n_s64(0), s0[0], filter_0_7, 0);
sum01 = aom_svdot_lane_s16(sum01, s1[0], filter_0_7, 1);
sum01 = aom_svdot_lane_s16(sum01, s2[0], filter_4_11, 1);
int64x2_t sum23 = aom_svdot_lane_s16(vdupq_n_s64(0), s0[1], filter_0_7, 0);
sum23 = aom_svdot_lane_s16(sum23, s1[1], filter_0_7, 1);
sum23 = aom_svdot_lane_s16(sum23, s2[1], filter_4_11, 1);
return vcombine_s32(vmovn_s64(sum01), vmovn_s64(sum23));
}
static INLINE void convolve_2d_sr_vert_12tap_sve2(
const int16_t *src_ptr, int src_stride, uint8_t *dst_ptr,
const int dst_stride, int w, int h, const int16x8_t y_filter_0_7,
const int16x8_t y_filter_4_11) {
// The no-op filter should never be used here.
assert(vgetq_lane_s16(y_filter_0_7, 5) != 128);
const int bd = 8;
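// The horizontal pass folded an offset of 1 << (bd - 1) into the intermediate
// values; subtract it again before packing back to 8 bits.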
const int16x8_t sub_const = vdupq_n_s16(1 << (bd - 1));
uint16x8x3_t merge_block_tbl = vld1q_u16_x3(kDotProdMergeBlockTbl);
// Scale indices by size of the true vector length to avoid reading from an
// 'undefined' portion of a vector on a system with SVE vectors > 128-bit.
uint16x8_t correction0 =
vreinterpretq_u16_u64(vdupq_n_u64(svcnth() * 0x0001000000000000ULL));
merge_block_tbl.val[0] = vaddq_u16(merge_block_tbl.val[0], correction0);
uint16x8_t correction1 =
vreinterpretq_u16_u64(vdupq_n_u64(svcnth() * 0x0001000100000000ULL));
merge_block_tbl.val[1] = vaddq_u16(merge_block_tbl.val[1], correction1);
uint16x8_t correction2 =
vreinterpretq_u16_u64(vdupq_n_u64(svcnth() * 0x0001000100010000ULL));
merge_block_tbl.val[2] = vaddq_u16(merge_block_tbl.val[2], correction2);
do {
int16_t *s = (int16_t *)src_ptr;
uint8_t *d = (uint8_t *)dst_ptr;
int height = h;
int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA;
load_s16_4x11(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7, &s8,
&s9, &sA);
s += 11 * src_stride;
int16x8_t s0123[2], s1234[2], s2345[2], s3456[2], s4567[2], s5678[2],
s6789[2], s789A[2];
// This operation combines a conventional transpose and the sample permute
// required before computing the dot product.
transpose_concat_4x4(s0, s1, s2, s3, s0123);
transpose_concat_4x4(s1, s2, s3, s4, s1234);
transpose_concat_4x4(s2, s3, s4, s5, s2345);
transpose_concat_4x4(s3, s4, s5, s6, s3456);
transpose_concat_4x4(s4, s5, s6, s7, s4567);
transpose_concat_4x4(s5, s6, s7, s8, s5678);
transpose_concat_4x4(s6, s7, s8, s9, s6789);
transpose_concat_4x4(s7, s8, s9, sA, s789A);
do {
int16x4_t sB, sC, sD, sE;
load_s16_4x4(s, src_stride, &sB, &sC, &sD, &sE);
int16x8_t s89AB[2], s9ABC[2], sABCD[2], sBCDE[2];
transpose_concat_4x4(sB, sC, sD, sE, sBCDE);
// Merge new data into block from previous iteration.
aom_tbl2x2_s16(s789A, sBCDE, merge_block_tbl.val[0], s89AB);
aom_tbl2x2_s16(s789A, sBCDE, merge_block_tbl.val[1], s9ABC);
aom_tbl2x2_s16(s789A, sBCDE, merge_block_tbl.val[2], sABCD);
int32x4_t d0 = highbd_convolve12_4_2d_v(s0123, s4567, s89AB, y_filter_0_7,
y_filter_4_11);
int32x4_t d1 = highbd_convolve12_4_2d_v(s1234, s5678, s9ABC, y_filter_0_7,
y_filter_4_11);
int32x4_t d2 = highbd_convolve12_4_2d_v(s2345, s6789, sABCD, y_filter_0_7,
y_filter_4_11);
int32x4_t d3 = highbd_convolve12_4_2d_v(s3456, s789A, sBCDE, y_filter_0_7,
y_filter_4_11);
int16x8_t dd01 =
vcombine_s16(vqrshrn_n_s32(d0, 2 * FILTER_BITS - ROUND0_BITS),
vqrshrn_n_s32(d1, 2 * FILTER_BITS - ROUND0_BITS));
int16x8_t dd23 =
vcombine_s16(vqrshrn_n_s32(d2, 2 * FILTER_BITS - ROUND0_BITS),
vqrshrn_n_s32(d3, 2 * FILTER_BITS - ROUND0_BITS));
dd01 = vsubq_s16(dd01, sub_const);
dd23 = vsubq_s16(dd23, sub_const);
uint8x8_t d01 = vqmovun_s16(dd01);
uint8x8_t d23 = vqmovun_s16(dd23);
store_u8x4_strided_x2(d + 0 * dst_stride, dst_stride, d01);
store_u8x4_strided_x2(d + 2 * dst_stride, dst_stride, d23);
// Prepare block for next iteration - re-using as much as possible.
// Shuffle everything up four rows.
s0123[0] = s4567[0];
s0123[1] = s4567[1];
s1234[0] = s5678[0];
s1234[1] = s5678[1];
s2345[0] = s6789[0];
s2345[1] = s6789[1];
s3456[0] = s789A[0];
s3456[1] = s789A[1];
s4567[0] = s89AB[0];
s4567[1] = s89AB[1];
s5678[0] = s9ABC[0];
s5678[1] = s9ABC[1];
s6789[0] = sABCD[0];
s6789[1] = sABCD[1];
s789A[0] = sBCDE[0];
s789A[1] = sBCDE[1];
s += 4 * src_stride;
d += 4 * dst_stride;
height -= 4;
} while (height != 0);
src_ptr += 4;
dst_ptr += 4;
w -= 4;
} while (w != 0);
}
void av1_convolve_2d_sr_sve2(const uint8_t *src, int src_stride, uint8_t *dst,
int dst_stride, int w, int h,
const InterpFilterParams *filter_params_x,
const InterpFilterParams *filter_params_y,
const int subpel_x_qn, const int subpel_y_qn,
ConvolveParams *conv_params) {
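// Blocks with a dimension of 2 are not handled by the SIMD paths below, so
// fall back to the C implementation.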
if (w == 2 || h == 2) {
av1_convolve_2d_sr_c(src, src_stride, dst, dst_stride, w, h,
filter_params_x, filter_params_y, subpel_x_qn,
subpel_y_qn, conv_params);
return;
}
if (filter_params_x->taps > 8) {
const int im_h = h + filter_params_y->taps - 1;
const int im_stride = MAX_SB_SIZE;
const int vert_offset = filter_params_x->taps / 2 - 1;
const int horiz_offset = filter_params_x->taps / 2 - 1;
const uint8_t *src_ptr = src - vert_offset * src_stride - horiz_offset;
const int16_t *x_filter_ptr = av1_get_interp_filter_subpel_kernel(
filter_params_x, subpel_x_qn & SUBPEL_MASK);
const int16_t *y_filter_ptr = av1_get_interp_filter_subpel_kernel(
filter_params_y, subpel_y_qn & SUBPEL_MASK);
DECLARE_ALIGNED(16, int16_t,
im_block[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * MAX_SB_SIZE]);
const int16x8_t x_filter_0_7 = vld1q_s16(x_filter_ptr);
const int16x4_t x_filter_8_11 = vld1_s16(x_filter_ptr + 8);
const int16x8_t y_filter_0_7 = vld1q_s16(y_filter_ptr);
const int16x8_t y_filter_4_11 = vld1q_s16(y_filter_ptr + 4);
convolve_2d_sr_horiz_12tap_neon_i8mm(src_ptr, src_stride, im_block,
im_stride, w, im_h, x_filter_0_7,
x_filter_8_11);
convolve_2d_sr_vert_12tap_sve2(im_block, im_stride, dst, dst_stride, w, h,
y_filter_0_7, y_filter_4_11);
} else {
av1_convolve_2d_sr_neon_i8mm(src, src_stride, dst, dst_stride, w, h,
filter_params_x, filter_params_y, subpel_x_qn,
subpel_y_qn, conv_params);
}
}


@ -562,11 +562,12 @@ static INLINE uint16x8_t highbd_convolve8_8_y(int16x8_t samples_lo[4],
return vminq_u16(res, max);
}
void highbd_convolve_y_sr_8tap_sve2(const uint16_t *src, ptrdiff_t src_stride,
uint16_t *dst, ptrdiff_t dst_stride,
int width, int height,
const int16_t *filter_y, int bd) {
assert(w >= 4 && h >= 4);
static void highbd_convolve_y_sr_8tap_sve2(const uint16_t *src,
ptrdiff_t src_stride, uint16_t *dst,
ptrdiff_t dst_stride, int width,
int height, const int16_t *filter_y,
int bd) {
assert(width >= 4 && height >= 4);
const int16x8_t y_filter = vld1q_s16(filter_y);
@ -731,11 +732,12 @@ static INLINE uint16x8_t highbd_convolve4_8_y(int16x8_t samples[4],
return vminq_u16(res, max);
}
void highbd_convolve_y_sr_4tap_sve2(const uint16_t *src, ptrdiff_t src_stride,
uint16_t *dst, ptrdiff_t dst_stride,
int width, int height,
const int16_t *filter_y, int bd) {
assert(w >= 4 && h >= 4);
static void highbd_convolve_y_sr_4tap_sve2(const uint16_t *src,
ptrdiff_t src_stride, uint16_t *dst,
ptrdiff_t dst_stride, int width,
int height, const int16_t *filter_y,
int bd) {
assert(width >= 4 && height >= 4);
const int16x8_t y_filter =
vcombine_s16(vld1_s16(filter_y + 2), vdup_n_s16(0));
@ -1346,13 +1348,11 @@ static INLINE uint16x8_t highbd_convolve8_8_2d_v(
return vminq_u16(res, max);
}
void highbd_convolve_2d_sr_vert_8tap_sve2(const uint16_t *src,
ptrdiff_t src_stride, uint16_t *dst,
ptrdiff_t dst_stride, int width,
int height, const int16_t *filter_y,
ConvolveParams *conv_params, int bd,
const int y_offset) {
assert(w >= 4 && h >= 4);
static void highbd_convolve_2d_sr_vert_8tap_sve2(
const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst,
ptrdiff_t dst_stride, int width, int height, const int16_t *filter_y,
ConvolveParams *conv_params, int bd, const int y_offset) {
assert(width >= 4 && height >= 4);
const int64x2_t offset = vdupq_n_s64(y_offset);
const int32x4_t shift = vdupq_n_s32(-conv_params->round_1);
const int16x8_t y_filter = vld1q_s16(filter_y);
@ -1536,13 +1536,11 @@ static INLINE uint16x8_t highbd_convolve4_8_2d_v(int16x8_t samples[4],
return vminq_u16(res, max);
}
void highbd_convolve_2d_sr_vert_4tap_sve2(const uint16_t *src,
ptrdiff_t src_stride, uint16_t *dst,
ptrdiff_t dst_stride, int width,
int height, const int16_t *filter_y,
ConvolveParams *conv_params, int bd,
const int y_offset) {
assert(w >= 4 && h >= 4);
static void highbd_convolve_2d_sr_vert_4tap_sve2(
const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst,
ptrdiff_t dst_stride, int width, int height, const int16_t *filter_y,
ConvolveParams *conv_params, int bd, const int y_offset) {
assert(width >= 4 && height >= 4);
const int64x2_t offset = vdupq_n_s64(y_offset);
const int32x4_t shift = vdupq_n_s32(-conv_params->round_1);


@ -13,6 +13,7 @@
#include <assert.h>
#include "aom_dsp/arm/sum_neon.h"
#include "config/av1_rtcd.h"
#define MAX_UPSAMPLE_SZ 16


@ -13,6 +13,7 @@
#include <assert.h>
#include "config/aom_config.h"
#include "config/av1_rtcd.h"
#include "aom/aom_integer.h"
#include "aom_dsp/arm/mem_neon.h"


@ -16,6 +16,7 @@
#include "aom_dsp/arm/transpose_neon.h"
#include "av1/common/resize.h"
#include "config/av1_rtcd.h"
#include "config/aom_dsp_rtcd.h"
#include "config/aom_scale_rtcd.h"
static INLINE int16x4_t convolve8_4(const int16x4_t s0, const int16x4_t s1,


@ -1124,10 +1124,10 @@ static void final_filter_fast_internal(uint16_t *A, int32_t *B,
} while (h > 0);
}
void final_filter_internal(uint16_t *A, int32_t *B, const int buf_stride,
int16_t *src, const int src_stride, int32_t *dst,
const int dst_stride, const int width,
const int height) {
static void final_filter_internal(uint16_t *A, int32_t *B, const int buf_stride,
int16_t *src, const int src_stride,
int32_t *dst, const int dst_stride,
const int width, const int height) {
int16x8_t s0;
int32_t *B_tmp, *dst_ptr;
uint16_t *A_tmp;

17
third_party/aom/av1/common/av1_rtcd_defs.pl vendored

@ -470,7 +470,7 @@ if (aom_config("CONFIG_AV1_ENCODER") eq "yes") {
add_proto qw/int64_t av1_highbd_pixel_proj_error/, "const uint8_t *src8, int width, int height, int src_stride, const uint8_t *dat8, int dat_stride, int32_t *flt0, int flt0_stride, int32_t *flt1, int flt1_stride, int xq[2], const sgr_params_type *params";
specialize qw/av1_highbd_pixel_proj_error sse4_1 avx2 neon/;
add_proto qw/void av1_compute_stats_highbd/, "int wiener_win, const uint8_t *dgd8, const uint8_t *src8, int16_t *dgd_avg, int16_t *src_avg, int h_start, int h_end, int v_start, int v_end, int dgd_stride, int src_stride, int64_t *M, int64_t *H, aom_bit_depth_t bit_depth";
specialize qw/av1_compute_stats_highbd sse4_1 avx2 neon/;
specialize qw/av1_compute_stats_highbd sse4_1 avx2 neon sve/;
}
}
@ -554,8 +554,13 @@ if (aom_config("CONFIG_AV1_HIGHBITDEPTH") eq "yes") {
specialize qw/av1_highbd_warp_affine sse4_1 avx2 neon sve/;
}
add_proto qw/bool resize_vert_dir/, "uint8_t *intbuf, uint8_t *output, int out_stride, int height, int height2, int width2, int start_col";
specialize qw/resize_vert_dir avx2/;
add_proto qw/bool av1_resize_vert_dir/, "uint8_t *intbuf, uint8_t *output, int out_stride, int height, int height2, int width2, int start_col";
specialize qw/av1_resize_vert_dir sse2 avx2/;
add_proto qw/void av1_resize_horz_dir/, "const uint8_t *const input, int in_stride, uint8_t *intbuf, int height, int filteredlength, int width2";
# TODO(https://crbug.com/aomedia/3575): Restore sse2 after SSE2/AV1ResizeXTest
# passes under 32-bit valgrind.
specialize qw/av1_resize_horz_dir avx2/;
add_proto qw/void av1_warp_affine/, "const int32_t *mat, const uint8_t *ref, int width, int height, int stride, uint8_t *pred, int p_col, int p_row, int p_width, int p_height, int p_stride, int subsampling_x, int subsampling_y, ConvolveParams *conv_params, int16_t alpha, int16_t beta, int16_t gamma, int16_t delta";
specialize qw/av1_warp_affine sse4_1 avx2 neon neon_i8mm sve/;
@ -597,13 +602,13 @@ if(aom_config("CONFIG_AV1_HIGHBITDEPTH") eq "yes") {
add_proto qw/void av1_convolve_2d_scale/, "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int x_step_qn, const int subpel_y_qn, const int y_step_qn, ConvolveParams *conv_params";
specialize qw/av1_convolve_2d_sr sse2 avx2 neon neon_dotprod neon_i8mm/;
specialize qw/av1_convolve_2d_sr sse2 avx2 neon neon_dotprod neon_i8mm sve2/;
specialize qw/av1_convolve_2d_sr_intrabc neon/;
specialize qw/av1_convolve_x_sr sse2 avx2 neon neon_dotprod neon_i8mm/;
specialize qw/av1_convolve_x_sr_intrabc neon/;
specialize qw/av1_convolve_y_sr sse2 avx2 neon/;
specialize qw/av1_convolve_y_sr sse2 avx2 neon neon_dotprod neon_i8mm/;
specialize qw/av1_convolve_y_sr_intrabc neon/;
specialize qw/av1_convolve_2d_scale sse4_1/;
specialize qw/av1_convolve_2d_scale sse4_1 neon/;
specialize qw/av1_dist_wtd_convolve_2d ssse3 avx2 neon neon_dotprod neon_i8mm/;
specialize qw/av1_dist_wtd_convolve_2d_copy sse2 avx2 neon/;
specialize qw/av1_dist_wtd_convolve_x sse2 avx2 neon neon_dotprod neon_i8mm/;

5
third_party/aom/av1/common/cfl.c vendored

@ -159,8 +159,9 @@ static INLINE void cfl_predict_lbd_c(const int16_t *ac_buf_q3, uint8_t *dst,
CFL_PREDICT_FN(c, lbd)
#if CONFIG_AV1_HIGHBITDEPTH
void cfl_predict_hbd_c(const int16_t *ac_buf_q3, uint16_t *dst, int dst_stride,
int alpha_q3, int bit_depth, int width, int height) {
static INLINE void cfl_predict_hbd_c(const int16_t *ac_buf_q3, uint16_t *dst,
int dst_stride, int alpha_q3,
int bit_depth, int width, int height) {
for (int j = 0; j < height; j++) {
for (int i = 0; i < width; i++) {
dst[i] = clip_pixel_highbd(

27
third_party/aom/av1/common/cfl.h vendored

@ -95,6 +95,8 @@ void cfl_load_dc_pred(MACROBLOCKD *const xd, uint8_t *dst, int dst_stride,
// will be constant allowing for loop unrolling and other constant propagated
// goodness.
#define CFL_SUBSAMPLE(arch, sub, bd, width, height) \
void cfl_subsample_##bd##_##sub##_##width##x##height##_##arch( \
const CFL_##bd##_TYPE, int input_stride, uint16_t *output_q3); \
void cfl_subsample_##bd##_##sub##_##width##x##height##_##arch( \
const CFL_##bd##_TYPE, int input_stride, uint16_t *output_q3) { \
cfl_luma_subsampling_##sub##_##bd##_##arch(cfl_type, input_stride, \
@ -170,6 +172,8 @@ void cfl_load_dc_pred(MACROBLOCKD *const xd, uint8_t *dst, int dst_stride,
// will be constant allowing for loop unrolling and other constant propagated
// goodness.
#define CFL_SUB_AVG_X(arch, width, height, round_offset, num_pel_log2) \
void cfl_subtract_average_##width##x##height##_##arch(const uint16_t *src, \
int16_t *dst); \
void cfl_subtract_average_##width##x##height##_##arch(const uint16_t *src, \
int16_t *dst) { \
subtract_average_##arch(src, dst, width, height, round_offset, \
@ -220,22 +224,21 @@ void cfl_load_dc_pred(MACROBLOCKD *const xd, uint8_t *dst, int dst_stride,
return sub_avg[tx_size % TX_SIZES_ALL]; \
}
// For VSX SIMD optimization, the C versions of width == 4 subtract are
// faster than the VSX. As such, the VSX code calls the C versions.
void cfl_subtract_average_4x4_c(const uint16_t *src, int16_t *dst);
void cfl_subtract_average_4x8_c(const uint16_t *src, int16_t *dst);
void cfl_subtract_average_4x16_c(const uint16_t *src, int16_t *dst);
#define CFL_PREDICT_lbd(arch, width, height) \
void cfl_predict_lbd_##width##x##height##_##arch( \
const int16_t *pred_buf_q3, uint8_t *dst, int dst_stride, \
int alpha_q3) { \
cfl_predict_lbd_##arch(pred_buf_q3, dst, dst_stride, alpha_q3, width, \
height); \
#define CFL_PREDICT_lbd(arch, width, height) \
void cfl_predict_lbd_##width##x##height##_##arch( \
const int16_t *pred_buf_q3, uint8_t *dst, int dst_stride, int alpha_q3); \
void cfl_predict_lbd_##width##x##height##_##arch( \
const int16_t *pred_buf_q3, uint8_t *dst, int dst_stride, \
int alpha_q3) { \
cfl_predict_lbd_##arch(pred_buf_q3, dst, dst_stride, alpha_q3, width, \
height); \
}
#if CONFIG_AV1_HIGHBITDEPTH
#define CFL_PREDICT_hbd(arch, width, height) \
void cfl_predict_hbd_##width##x##height##_##arch( \
const int16_t *pred_buf_q3, uint16_t *dst, int dst_stride, int alpha_q3, \
int bd); \
void cfl_predict_hbd_##width##x##height##_##arch( \
const int16_t *pred_buf_q3, uint16_t *dst, int dst_stride, int alpha_q3, \
int bd) { \

5
third_party/aom/av1/common/debugmodes.c vendored

@ -9,17 +9,21 @@
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
#include "av1/common/debugmodes.h"
#include <stdio.h>
#include "av1/common/av1_common_int.h"
#include "av1/common/blockd.h"
#include "av1/common/enums.h"
#if 0
static void log_frame_info(AV1_COMMON *cm, const char *str, FILE *f) {
fprintf(f, "%s", str);
fprintf(f, "(Frame %u, Show:%d, Q:%d): \n", cm->current_frame.frame_number,
cm->show_frame, cm->quant_params.base_qindex);
}
/* This function dereferences a pointer to the mbmi structure
* and uses the passed in member offset to print out the value of an integer
* for each mbmi member value in the mi structure.
@ -87,6 +91,7 @@ void av1_print_modes_and_motion_vectors(AV1_COMMON *cm, const char *file) {
fclose(mvs);
}
#endif // 0
void av1_print_uncompressed_frame_header(const uint8_t *data, int size,
const char *filename) {

24
third_party/aom/av1/common/debugmodes.h vendored Normal file

@ -0,0 +1,24 @@
/*
* Copyright (c) 2024, Alliance for Open Media. All rights reserved
*
* This source code is subject to the terms of the BSD 2 Clause License and
* the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
* was not distributed with this source code in the LICENSE file, you can
* obtain it at www.aomedia.org/license/software. If the Alliance for Open
* Media Patent License 1.0 was not distributed with this source code in the
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
#ifndef AOM_AV1_COMMON_DEBUGMODES_H_
#define AOM_AV1_COMMON_DEBUGMODES_H_
#include "av1/common/av1_common_int.h"
#include "av1/common/blockd.h"
#include "av1/common/enums.h"
void av1_print_modes_and_motion_vectors(AV1_COMMON *cm, const char *file);
void av1_print_uncompressed_frame_header(const uint8_t *data, int size,
const char *filename);
void av1_print_frame_contexts(const FRAME_CONTEXT *fc, const char *filename);
#endif // AOM_AV1_COMMON_DEBUGMODES_H_

4
third_party/aom/av1/common/ppc/cfl_ppc.c vendored

@ -124,6 +124,10 @@ CFL_SUB_AVG_X(vsx, 32, 32, 512, 10)
// Based on observation, for small blocks VSX does not outperform C (no 64bit
// load and store intrinsics). So we call the C code for block widths 4.
extern void cfl_subtract_average_4x4_c(const uint16_t *src, int16_t *dst);
extern void cfl_subtract_average_4x8_c(const uint16_t *src, int16_t *dst);
extern void cfl_subtract_average_4x16_c(const uint16_t *src, int16_t *dst);
cfl_subtract_average_fn cfl_get_subtract_average_fn_vsx(TX_SIZE tx_size) {
static const cfl_subtract_average_fn sub_avg[TX_SIZES_ALL] = {
cfl_subtract_average_4x4_c, /* 4x4 */

30
third_party/aom/av1/common/resize.c vendored

@ -337,8 +337,8 @@ static int32_t get_upscale_convolve_x0(int in_length, int out_length,
return (int32_t)((uint32_t)x0 & RS_SCALE_SUBPEL_MASK);
}
static void down2_symeven(const uint8_t *const input, int length,
uint8_t *output) {
void down2_symeven(const uint8_t *const input, int length, uint8_t *output,
int start_offset) {
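// start_offset lets SIMD callers filter only the tail of a row that their
// vector code did not cover.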
// Actual filter len = 2 * filter_len_half.
const int16_t *filter = av1_down2_symeven_half_filter;
const int filter_len_half = sizeof(av1_down2_symeven_half_filter) / 2;
@ -350,7 +350,7 @@ static void down2_symeven(const uint8_t *const input, int length,
l2 += (l2 & 1);
if (l1 > l2) {
// Short input length.
for (i = 0; i < length; i += 2) {
for (i = start_offset; i < length; i += 2) {
int sum = (1 << (FILTER_BITS - 1));
for (j = 0; j < filter_len_half; ++j) {
sum +=
@ -362,7 +362,7 @@ static void down2_symeven(const uint8_t *const input, int length,
}
} else {
// Initial part.
for (i = 0; i < l1; i += 2) {
for (i = start_offset; i < l1; i += 2) {
int sum = (1 << (FILTER_BITS - 1));
for (j = 0; j < filter_len_half; ++j) {
sum += (input[AOMMAX(i - j, 0)] + input[i + 1 + j]) * filter[j];
@ -492,7 +492,7 @@ static void resize_multistep(const uint8_t *const input, int length,
if (filteredlength & 1)
down2_symodd(in, filteredlength, out);
else
down2_symeven(in, filteredlength, out);
down2_symeven(in, filteredlength, out, 0);
filteredlength = proj_filteredlength;
}
if (filteredlength != olength) {
@ -521,8 +521,8 @@ static void fill_arr_to_col(uint8_t *img, int stride, int len, uint8_t *arr) {
}
}
bool resize_vert_dir_c(uint8_t *intbuf, uint8_t *output, int out_stride,
int height, int height2, int width2, int start_col) {
bool av1_resize_vert_dir_c(uint8_t *intbuf, uint8_t *output, int out_stride,
int height, int height2, int width2, int start_col) {
bool mem_status = true;
uint8_t *arrbuf = (uint8_t *)aom_malloc(sizeof(*arrbuf) * height);
uint8_t *arrbuf2 = (uint8_t *)aom_malloc(sizeof(*arrbuf2) * height2);
@ -533,7 +533,7 @@ bool resize_vert_dir_c(uint8_t *intbuf, uint8_t *output, int out_stride,
for (int i = start_col; i < width2; ++i) {
fill_col_to_arr(intbuf + i, width2, height, arrbuf);
down2_symeven(arrbuf, height, arrbuf2);
down2_symeven(arrbuf, height, arrbuf2, 0);
fill_arr_to_col(output + i, out_stride, height2, arrbuf2);
}
@ -543,10 +543,12 @@ Error:
return mem_status;
}
void resize_horz_dir(const uint8_t *const input, int in_stride, uint8_t *intbuf,
int height, int filtered_length, int width2) {
void av1_resize_horz_dir_c(const uint8_t *const input, int in_stride,
uint8_t *intbuf, int height, int filtered_length,
int width2) {
for (int i = 0; i < height; ++i)
down2_symeven(input + in_stride * i, filtered_length, intbuf + width2 * i);
down2_symeven(input + in_stride * i, filtered_length, intbuf + width2 * i,
0);
}
bool av1_resize_plane_to_half(const uint8_t *const input, int height, int width,
@ -558,10 +560,10 @@ bool av1_resize_plane_to_half(const uint8_t *const input, int height, int width,
}
// Resize in the horizontal direction
resize_horz_dir(input, in_stride, intbuf, height, width, width2);
av1_resize_horz_dir(input, in_stride, intbuf, height, width, width2);
// Resize in the vertical direction
bool mem_status = resize_vert_dir(intbuf, output, out_stride, height, height2,
width2, 0 /*start_col*/);
bool mem_status = av1_resize_vert_dir(intbuf, output, out_stride, height,
height2, width2, 0 /*start_col*/);
aom_free(intbuf);
return mem_status;
}

3
third_party/aom/av1/common/resize.h vendored

@ -101,6 +101,9 @@ bool av1_resize_plane_to_half(const uint8_t *const input, int height, int width,
int in_stride, uint8_t *output, int height2,
int width2, int out_stride);
void down2_symeven(const uint8_t *const input, int length, uint8_t *output,
int start_offset);
bool should_resize_by_half(int height, int width, int height2, int width2);
// Returns 1 if a superres upscaled frame is scaled and 0 otherwise.


@ -16,6 +16,7 @@
#include "aom_dsp/aom_dsp_common.h"
#include "aom_dsp/aom_filter.h"
#include "aom_dsp/x86/convolve_common_intrin.h"
#include "aom_dsp/x86/synonyms.h"
#include "av1/common/convolve.h"
static INLINE void prepare_coeffs(const InterpFilterParams *const filter_params,
@ -200,31 +201,23 @@ void av1_convolve_y_sr_sse2(const uint8_t *src, int src_stride, uint8_t *dst,
if (w <= 4) {
__m128i s[8], src6, res, res_round, res16;
int res_int;
src6 = _mm_cvtsi32_si128(*(int *)(src_ptr + 6 * src_stride));
s[0] = _mm_unpacklo_epi8(
_mm_cvtsi32_si128(*(int *)(src_ptr + 0 * src_stride)),
_mm_cvtsi32_si128(*(int *)(src_ptr + 1 * src_stride)));
s[1] = _mm_unpacklo_epi8(
_mm_cvtsi32_si128(*(int *)(src_ptr + 1 * src_stride)),
_mm_cvtsi32_si128(*(int *)(src_ptr + 2 * src_stride)));
s[2] = _mm_unpacklo_epi8(
_mm_cvtsi32_si128(*(int *)(src_ptr + 2 * src_stride)),
_mm_cvtsi32_si128(*(int *)(src_ptr + 3 * src_stride)));
s[3] = _mm_unpacklo_epi8(
_mm_cvtsi32_si128(*(int *)(src_ptr + 3 * src_stride)),
_mm_cvtsi32_si128(*(int *)(src_ptr + 4 * src_stride)));
s[4] = _mm_unpacklo_epi8(
_mm_cvtsi32_si128(*(int *)(src_ptr + 4 * src_stride)),
_mm_cvtsi32_si128(*(int *)(src_ptr + 5 * src_stride)));
s[5] = _mm_unpacklo_epi8(
_mm_cvtsi32_si128(*(int *)(src_ptr + 5 * src_stride)), src6);
s[0] = _mm_unpacklo_epi8(xx_loadl_32(src_ptr + 0 * src_stride),
xx_loadl_32(src_ptr + 1 * src_stride));
s[1] = _mm_unpacklo_epi8(xx_loadl_32(src_ptr + 1 * src_stride),
xx_loadl_32(src_ptr + 2 * src_stride));
s[2] = _mm_unpacklo_epi8(xx_loadl_32(src_ptr + 2 * src_stride),
xx_loadl_32(src_ptr + 3 * src_stride));
s[3] = _mm_unpacklo_epi8(xx_loadl_32(src_ptr + 3 * src_stride),
xx_loadl_32(src_ptr + 4 * src_stride));
s[4] = _mm_unpacklo_epi8(xx_loadl_32(src_ptr + 4 * src_stride),
xx_loadl_32(src_ptr + 5 * src_stride));
src6 = xx_loadl_32(src_ptr + 6 * src_stride);
s[5] = _mm_unpacklo_epi8(xx_loadl_32(src_ptr + 5 * src_stride), src6);
do {
s[6] = _mm_unpacklo_epi8(
src6, _mm_cvtsi32_si128(*(int *)(src_ptr + 7 * src_stride)));
src6 = _mm_cvtsi32_si128(*(int *)(src_ptr + 8 * src_stride));
s[7] = _mm_unpacklo_epi8(
_mm_cvtsi32_si128(*(int *)(src_ptr + 7 * src_stride)), src6);
s[6] = _mm_unpacklo_epi8(src6, xx_loadl_32(src_ptr + 7 * src_stride));
src6 = xx_loadl_32(src_ptr + 8 * src_stride);
s[7] = _mm_unpacklo_epi8(xx_loadl_32(src_ptr + 7 * src_stride), src6);
res = convolve_lo_y(s + 0, coeffs);
res_round = _mm_sra_epi32(_mm_add_epi32(res, round_const), round_shift);


@ -15,6 +15,7 @@
#include "aom_dsp/aom_filter.h"
#include "aom_dsp/x86/convolve_sse2.h"
#include "aom_dsp/x86/synonyms.h"
void av1_dist_wtd_convolve_x_sse2(const uint8_t *src, int src_stride,
uint8_t *dst0, int dst_stride0, int w, int h,
@ -178,31 +179,23 @@ void av1_dist_wtd_convolve_y_sse2(const uint8_t *src, int src_stride,
if (w == 4) {
__m128i s[8], src6, res, res_shift;
src6 = _mm_cvtsi32_si128(*(int *)(src_ptr + 6 * src_stride));
s[0] = _mm_unpacklo_epi8(
_mm_cvtsi32_si128(*(int *)(src_ptr + 0 * src_stride)),
_mm_cvtsi32_si128(*(int *)(src_ptr + 1 * src_stride)));
s[1] = _mm_unpacklo_epi8(
_mm_cvtsi32_si128(*(int *)(src_ptr + 1 * src_stride)),
_mm_cvtsi32_si128(*(int *)(src_ptr + 2 * src_stride)));
s[2] = _mm_unpacklo_epi8(
_mm_cvtsi32_si128(*(int *)(src_ptr + 2 * src_stride)),
_mm_cvtsi32_si128(*(int *)(src_ptr + 3 * src_stride)));
s[3] = _mm_unpacklo_epi8(
_mm_cvtsi32_si128(*(int *)(src_ptr + 3 * src_stride)),
_mm_cvtsi32_si128(*(int *)(src_ptr + 4 * src_stride)));
s[4] = _mm_unpacklo_epi8(
_mm_cvtsi32_si128(*(int *)(src_ptr + 4 * src_stride)),
_mm_cvtsi32_si128(*(int *)(src_ptr + 5 * src_stride)));
s[5] = _mm_unpacklo_epi8(
_mm_cvtsi32_si128(*(int *)(src_ptr + 5 * src_stride)), src6);
s[0] = _mm_unpacklo_epi8(xx_loadl_32(src_ptr + 0 * src_stride),
xx_loadl_32(src_ptr + 1 * src_stride));
s[1] = _mm_unpacklo_epi8(xx_loadl_32(src_ptr + 1 * src_stride),
xx_loadl_32(src_ptr + 2 * src_stride));
s[2] = _mm_unpacklo_epi8(xx_loadl_32(src_ptr + 2 * src_stride),
xx_loadl_32(src_ptr + 3 * src_stride));
s[3] = _mm_unpacklo_epi8(xx_loadl_32(src_ptr + 3 * src_stride),
xx_loadl_32(src_ptr + 4 * src_stride));
s[4] = _mm_unpacklo_epi8(xx_loadl_32(src_ptr + 4 * src_stride),
xx_loadl_32(src_ptr + 5 * src_stride));
src6 = xx_loadl_32(src_ptr + 6 * src_stride);
s[5] = _mm_unpacklo_epi8(xx_loadl_32(src_ptr + 5 * src_stride), src6);
do {
s[6] = _mm_unpacklo_epi8(
src6, _mm_cvtsi32_si128(*(int *)(src_ptr + 7 * src_stride)));
src6 = _mm_cvtsi32_si128(*(int *)(src_ptr + 8 * src_stride));
s[7] = _mm_unpacklo_epi8(
_mm_cvtsi32_si128(*(int *)(src_ptr + 7 * src_stride)), src6);
s[6] = _mm_unpacklo_epi8(src6, xx_loadl_32(src_ptr + 7 * src_stride));
src6 = xx_loadl_32(src_ptr + 8 * src_stride);
s[7] = _mm_unpacklo_epi8(xx_loadl_32(src_ptr + 7 * src_stride), src6);
res = convolve_lo_y(s + 0, coeffs);
res_shift = _mm_sll_epi32(res, left_shift);


@ -576,7 +576,7 @@ void av1_build_compound_diffwtd_mask_highbd_avx2(
}
}
} else {
const __m128i xshift = xx_set1_64_from_32i(bd - 8 + DIFF_FACTOR_LOG2);
const __m128i xshift = _mm_set1_epi64x(bd - 8 + DIFF_FACTOR_LOG2);
if (mask_type == DIFFWTD_38_INV) {
for (int i = 0; i < h; ++i) {
for (int j = 0; j < w; j += 16) {


@ -76,7 +76,7 @@ void av1_build_compound_diffwtd_mask_highbd_ssse3(
}
}
} else {
const __m128i xshift = xx_set1_64_from_32i(bd - 8 + DIFF_FACTOR_LOG2);
const __m128i xshift = _mm_set1_epi64x(bd - 8 + DIFF_FACTOR_LOG2);
if (mask_type == DIFFWTD_38_INV) {
for (int i = 0; i < h; ++i) {
for (int j = 0; j < w; j += 8) {

329
third_party/aom/av1/common/x86/resize_avx2.c vendored

@ -41,7 +41,7 @@
s[8] = _mm256_unpackhi_epi8(s68, s79); \
\
__m256i res_out[2] = { 0 }; \
resize_y_convolve(s, coeffs_y, res_out); \
resize_convolve(s, coeffs_y, res_out); \
\
/* r00... r07 */ \
__m256i res_a_round_1 = _mm256_add_epi32(res_out[0], round_const_bits); \
@ -52,7 +52,7 @@
res_a_round_2 = _mm256_sra_epi32(res_a_round_2, round_shift_bits); \
\
__m256i res_out_b[2] = { 0 }; \
resize_y_convolve(s + 5, coeffs_y, res_out_b); \
resize_convolve(s + 5, coeffs_y, res_out_b); \
\
/* r08... r015 */ \
__m256i res_b_round_1 = _mm256_add_epi32(res_out_b[0], round_const_bits); \
@ -91,7 +91,7 @@
s[3] = _mm256_permute2x128_si256(CAST_HI(s67), CAST_HI(s89), 0x20); \
\
__m256i res_out[2] = { 0 }; \
resize_y_convolve(s, coeffs_y, res_out); \
resize_convolve(s, coeffs_y, res_out); \
\
/* r00... r07 */ \
__m256i res_a_round_1 = _mm256_add_epi32(res_out[0], round_const_bits); \
@ -108,9 +108,107 @@
res_a_round_1 = _mm256_min_epu8(res_a_round_1, clip_pixel); \
res_a_round_1 = _mm256_max_epu8(res_a_round_1, zero);
static INLINE void resize_y_convolve(const __m256i *const s,
const __m256i *const coeffs,
__m256i *res_out) {
#define PROCESS_RESIZE_X_WD32 \
/* a0 a1 ..... a30 a31 */ \
__m256i row0 = _mm256_loadu_si256( \
(__m256i *)&input[i * in_stride + j - filter_offset]); \
/* b0 b1 ..... b30 b31 */ \
__m256i row1 = _mm256_loadu_si256( \
(__m256i *)&input[(i + 1) * in_stride + j - filter_offset]); \
/* a0 .... a15 || b0.... b15 */ \
__m256i r0 = _mm256_permute2x128_si256(row0, row1, 0x20); \
/* a16 .... a31 || b16 .... b31 */ \
__m256i r1 = _mm256_permute2x128_si256(row0, row1, 0x31); \
filter_offset = 3; \
\
/* Pad start pixels to the left, while processing the first pixels in the \
row. */ \
if (j == 0) { \
/* a0 a0 a0 a0 .... a12 || b0 b0 b0 b0 .... b12 */ \
row0 = _mm256_shuffle_epi8(r0, wd32_start_pad_mask); \
/* a13 a14 a15 a16.....a28 || b13 b14 b15 b16.....b28 */ \
row1 = _mm256_alignr_epi8(r1, r0, 13); \
r0 = row0; \
r1 = row1; \
} \
\
/* a29 a30 a31 a32 a33 a34 a35 a36 0 0 ....*/ \
__m128i row0_0 = _mm_loadl_epi64( \
(__m128i *)&input[i * in_stride + 32 + j - filter_offset]); \
/* b29 b30 b31 b32 b33 b34 b35 b36 0 0 .... */ \
__m128i row1_0 = _mm_loadl_epi64( \
(__m128i *)&input[(i + 1) * in_stride + 32 + j - filter_offset]); \
__m256i r2 = _mm256_permute2x128_si256( \
_mm256_castsi128_si256(row0_0), _mm256_castsi128_si256(row1_0), 0x20); \
\
/* Pad end pixels to the right, while processing the last pixels in the \
row. */ \
const int is_last_cols32 = (j + 32 == filtered_length); \
if (is_last_cols32) { \
r2 = _mm256_shuffle_epi8(r2, wd32_end_pad_mask); \
} \
\
/* Process even pixels of the first row */ \
/* a0 a0 a0 a0 a1 a2 .... a12 | b0 b0 b0 b0 b1 b2 .... b12 */ \
s0[0] = _mm256_alignr_epi8(r1, r0, 0); \
/* a0 a0 a1 a2 a3 a4 .... a14 | b0 b0 b1 b2 b3 b4 .... b14 */ \
s0[1] = _mm256_alignr_epi8(r1, r0, 2); \
/* a1 a2 a3 a4 a5 a6 .... a16 | b1 b2 b3 b4 b5 b6 .... b16 */ \
s0[2] = _mm256_alignr_epi8(r1, r0, 4); \
/* a3 a4 a5 a6 a7 a8 .... a18 | b3 b4 b5 b6 b7 b8 .... b18 */ \
s0[3] = _mm256_alignr_epi8(r1, r0, 6); \
\
/* Process even pixels of the second row */ \
/* a13 a14 a15 a16 ..... a28 | b13 b14 b15 b16 ..... b28 */ \
s1[0] = _mm256_alignr_epi8(r2, r1, 0); \
/* a15 a16 a17 a18 ..... a30 | b15 b16 b17 b18 ..... b30 */ \
s1[1] = _mm256_alignr_epi8(r2, r1, 2); \
/* a17 a18 a19 a20 ..... a32 | b17 b18 b19 b20 ..... b32 */ \
s1[2] = _mm256_alignr_epi8(r2, r1, 4); \
/* a19 a20 a21 a22 ..... a34 | b19 b20 b21 b22 ..... b34 */ \
s1[3] = _mm256_alignr_epi8(r2, r1, 6); \
\
/* The register res_out_0 stores the result of start-16 pixels corresponding \
to the first and second rows whereas res_out_1 stores the end-16 pixels. */ \
__m256i res_out_0[2], res_out_1[2]; \
res_out_1[0] = res_out_1[1] = zero; \
res_out_0[0] = res_out_0[1] = zero; \
resize_convolve(s0, coeffs_x, res_out_0); \
resize_convolve(s1, coeffs_x, res_out_1); \
\
/* Result of 32 pixels of row0 (a0 to a32) */ \
res_out_0[0] = _mm256_sra_epi32( \
_mm256_add_epi32(res_out_0[0], round_const_bits), round_shift_bits); \
res_out_1[0] = _mm256_sra_epi32( \
_mm256_add_epi32(res_out_1[0], round_const_bits), round_shift_bits); \
/* r00-r03 r08-r011 | r04-r07 r012-r015 */ \
__m256i res_out_r0 = _mm256_packus_epi32(res_out_0[0], res_out_1[0]); \
\
/* result of 32 pixels of row1 (b0 to b32) */ \
res_out_0[1] = _mm256_sra_epi32( \
_mm256_add_epi32(res_out_0[1], round_const_bits), round_shift_bits); \
res_out_1[1] = _mm256_sra_epi32( \
_mm256_add_epi32(res_out_1[1], round_const_bits), round_shift_bits); \
/* r10-r13 r18-r111 | r14-r17 r112-r115 */ \
__m256i res_out_r1 = _mm256_packus_epi32(res_out_0[1], res_out_1[1]); \
\
/* Convert the result from 16bit to 8bit */ \
/* r00-r03 r08-r011 r10-r13 r18-r111 | r04-r07 r012-r015 r14-r17 r112-r115 \
*/ \
__m256i res_out_r01 = _mm256_packus_epi16(res_out_r0, res_out_r1); \
__m256i res_out_row01 = _mm256_min_epu8(res_out_r01, clip_pixel); \
res_out_row01 = _mm256_max_epu8(res_out_r01, zero); \
__m128i low_128 = CAST_LOW(res_out_row01); \
__m128i high_128 = _mm256_extracti128_si256(res_out_row01, 1); \
\
_mm_storeu_si128((__m128i *)&intbuf[i * dst_stride + j / 2], \
_mm_unpacklo_epi32(low_128, high_128)); \
_mm_storeu_si128((__m128i *)&intbuf[(i + 1) * dst_stride + j / 2], \
_mm_unpackhi_epi32(low_128, high_128));
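// Multiply four interleaved source vectors by their paired 8-bit filter taps
// and accumulate the widened partial sums into res_out.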
static INLINE void resize_convolve(const __m256i *const s,
const __m256i *const coeffs,
__m256i *res_out) {
const __m256i res_0 = _mm256_maddubs_epi16(s[0], coeffs[0]);
const __m256i res_1 = _mm256_maddubs_epi16(s[1], coeffs[1]);
const __m256i res_2 = _mm256_maddubs_epi16(s[2], coeffs[2]);
@ -152,8 +250,9 @@ static INLINE void prepare_filter_coeffs(const int16_t *filter,
coeffs[1] = _mm256_broadcastw_epi16(_mm_bsrli_si128(filter_8bit, 4));
}
bool resize_vert_dir_avx2(uint8_t *intbuf, uint8_t *output, int out_stride,
int height, int height2, int stride, int start_col) {
bool av1_resize_vert_dir_avx2(uint8_t *intbuf, uint8_t *output, int out_stride,
int height, int height2, int stride,
int start_col) {
assert(start_col <= stride);
// For the GM tool, the input layer height or width is assured to be an even
// number. Hence the function 'down2_symodd()' is not invoked and SIMD
@ -164,8 +263,8 @@ bool resize_vert_dir_avx2(uint8_t *intbuf, uint8_t *output, int out_stride,
// eliminate the need for conditional statements within the subsequent SIMD
// code to manage these cases.
if (height & 1 || height < 8) {
return resize_vert_dir_c(intbuf, output, out_stride, height, height2,
stride, start_col);
return av1_resize_vert_dir_c(intbuf, output, out_stride, height, height2,
stride, start_col);
}
__m256i s[10], coeffs_y[4];
@ -174,7 +273,7 @@ bool resize_vert_dir_avx2(uint8_t *intbuf, uint8_t *output, int out_stride,
const __m128i round_shift_bits = _mm_cvtsi32_si128(bits);
const __m256i round_const_bits = _mm256_set1_epi32((1 << bits) >> 1);
const uint8_t max_pixel = 255;
const __m256i clip_pixel = _mm256_set1_epi8(max_pixel);
const __m256i clip_pixel = _mm256_set1_epi8((char)max_pixel);
const __m256i zero = _mm256_setzero_si256();
prepare_filter_coeffs(av1_down2_symeven_half_filter, coeffs_y);
@ -404,8 +503,212 @@ bool resize_vert_dir_avx2(uint8_t *intbuf, uint8_t *output, int out_stride,
}
if (remain_col)
return resize_vert_dir_c(intbuf, output, out_stride, height, height2,
stride, stride - remain_col);
return av1_resize_vert_dir_c(intbuf, output, out_stride, height, height2,
stride, stride - remain_col);
return true;
}
// Masks used for width 32 and 8 pixels, with left and right padding
// requirements
static const uint8_t wd32_left_padding_mask[32] = { 0, 0, 0, 0, 1, 2, 3, 4,
5, 6, 7, 8, 9, 10, 11, 12,
0, 0, 0, 0, 1, 2, 3, 4,
5, 6, 7, 8, 9, 10, 11, 12 };
static const uint8_t wd32_right_padding_mask[32] = { 0, 1, 2, 2, 2, 2, 2, 2,
2, 2, 2, 2, 2, 2, 2, 2,
0, 1, 2, 2, 2, 2, 2, 2,
2, 2, 2, 2, 2, 2, 2, 2 };
static const uint8_t wd8_right_padding_mask[32] = {
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 10, 10, 10, 10, 10,
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 10, 10, 10, 10, 10
};
void av1_resize_horz_dir_avx2(const uint8_t *const input, int in_stride,
uint8_t *intbuf, int height, int filtered_length,
int width2) {
assert(height % 2 == 0);
// Invoke C for width less than 32.
// TODO(https://crbug.com/aomedia/3575): Use sse2 after SSE2/AV1ResizeXTest
// passes under 32-bit valgrind.
if (filtered_length < 32) {
av1_resize_horz_dir_c(input, in_stride, intbuf, height, filtered_length,
width2);
return;
}
const int filt_length = sizeof(av1_down2_symeven_half_filter);
assert(filt_length % 2 == 0);
(void)filt_length;
__m256i s0[4], s1[4], coeffs_x[4];
const int bits = FILTER_BITS;
const int dst_stride = width2;
const __m128i round_shift_bits = _mm_cvtsi32_si128(bits);
const __m256i round_const_bits = _mm256_set1_epi32((1 << bits) >> 1);
const uint8_t max_pixel = 255;
const __m256i clip_pixel = _mm256_set1_epi8((char)max_pixel);
const __m256i zero = _mm256_setzero_si256();
const __m256i wd32_start_pad_mask =
_mm256_loadu_si256((__m256i *)wd32_left_padding_mask);
const __m256i wd32_end_pad_mask =
_mm256_loadu_si256((__m256i *)wd32_right_padding_mask);
const __m256i wd8_end_pad_mask =
_mm256_loadu_si256((__m256i *)wd8_right_padding_mask);
prepare_filter_coeffs(av1_down2_symeven_half_filter, coeffs_x);
// The core horizontal SIMD processes 32 input pixels of 2 rows simultaneously
// to generate output corresponding to 2 rows. To streamline the core loop and
// eliminate the need for conditional checks, the remaining columns (16 or 8)
// are processed separately.
if (filtered_length % 32 == 0) {
for (int i = 0; i < height; i += 2) {
int filter_offset = 0;
for (int j = 0; j < filtered_length; j += 32) {
PROCESS_RESIZE_X_WD32
}
}
} else {
for (int i = 0; i < height; i += 2) {
int filter_offset = 0;
int remain_col = filtered_length % 32;
for (int j = 0; j + 32 <= filtered_length; j += 32) {
PROCESS_RESIZE_X_WD32
}
int wd_processed = filtered_length - remain_col;
if (remain_col > 15) {
remain_col = filtered_length % 16;
const int in_idx = i * in_stride + wd_processed - filter_offset;
const int out_idx = (i * dst_stride) + wd_processed / 2;
// a0 a1 --- a15
__m128i row0 = _mm_loadu_si128((__m128i *)&input[in_idx]);
// b0 b1 --- b15
__m128i row1 = _mm_loadu_si128((__m128i *)&input[in_idx + in_stride]);
// a0 a1 --- a15 || b0 b1 --- b15
__m256i r0 =
_mm256_permute2x128_si256(CAST_HI(row0), CAST_HI(row1), 0x20);
// a16 a17 --- a23
row0 = _mm_loadl_epi64((__m128i *)&input[in_idx + 16]);
// b16 b17 --- b23
row1 = _mm_loadl_epi64((__m128i *)&input[in_idx + 16 + in_stride]);
// a16-a23 x x x x| b16-b23 x x x x
__m256i r1 =
_mm256_permute2x128_si256(CAST_HI(row0), CAST_HI(row1), 0x20);
// Pad end pixels to the right, while processing the last pixels in the
// row.
const int is_last_cols16 = wd_processed + 16 == filtered_length;
if (is_last_cols16) {
r1 = _mm256_shuffle_epi8(r1, wd32_end_pad_mask);
}
// a0 a1 --- a15 || b0 b1 --- b15
s0[0] = r0;
// a2 a3 --- a17 || b2 b3 --- b17
s0[1] = _mm256_alignr_epi8(r1, r0, 2);
// a4 a5 --- a19 || b4 b5 --- b19
s0[2] = _mm256_alignr_epi8(r1, r0, 4);
// a6 a7 --- a21 || b6 b7 --- b21
s0[3] = _mm256_alignr_epi8(r1, r0, 6);
// result for 16 pixels (a0 to a15) of row0 and row1
__m256i res_out_0[2];
res_out_0[0] = res_out_0[1] = zero;
resize_convolve(s0, coeffs_x, res_out_0);
// r00 -r07
res_out_0[0] = _mm256_sra_epi32(
_mm256_add_epi32(res_out_0[0], round_const_bits), round_shift_bits);
// r10-r17
res_out_0[1] = _mm256_sra_epi32(
_mm256_add_epi32(res_out_0[1], round_const_bits), round_shift_bits);
// r00-r03 r10-r13 r04-r07 r14-r17
__m256i res_out_row01 = _mm256_packus_epi32(res_out_0[0], res_out_0[1]);
// r00-r03 r10-r13 r00-r03 r10-r13 | r04-r07 r14-r17 r04-r07 r14-r17
res_out_row01 = _mm256_packus_epi16(res_out_row01, res_out_row01);
res_out_row01 = _mm256_min_epu8(res_out_row01, clip_pixel);
res_out_row01 = _mm256_max_epu8(res_out_row01, zero);
// r00-r03 r10-r13 r04-r07 r14-r17
__m128i low_result =
CAST_LOW(_mm256_permute4x64_epi64(res_out_row01, 0xd8));
// r00-r03 r04-r07 r10-r13 r14-r17
low_result = _mm_shuffle_epi32(low_result, 0xd8);
_mm_storel_epi64((__m128i *)&intbuf[out_idx], low_result);
_mm_storel_epi64((__m128i *)&intbuf[out_idx + dst_stride],
_mm_unpackhi_epi64(low_result, low_result));
}
wd_processed = filtered_length - remain_col;
if (remain_col > 7) {
remain_col = filtered_length % 8;
const int in_idx = i * in_stride + wd_processed - filter_offset;
const int out_idx = (i * dst_stride) + wd_processed / 2;
// a0 a1 --- a15
__m128i row0 = _mm_loadu_si128((__m128i *)&input[in_idx]);
// b0 b1 --- b15
__m128i row1 = _mm_loadu_si128((__m128i *)&input[in_idx + in_stride]);
// a0 a1 --- a15 || b0 b1 --- b15
__m256i r0 =
_mm256_permute2x128_si256(CAST_HI(row0), CAST_HI(row1), 0x20);
// Pad end pixels to the right, while processing the last pixels in the
// row.
const int is_last_cols_8 = wd_processed + 8 == filtered_length;
if (is_last_cols_8) r0 = _mm256_shuffle_epi8(r0, wd8_end_pad_mask);
// a0 a1 a2 a3 a4 a5 a6 a7 | b0 b1 b2 b3 b4 b5 b6 b7
s0[0] = r0;
// a2 a3 a4 a5 a6 a7 a8 a9 | b2 b3 b4 b5 b6 b7 b8 b9
s0[1] = _mm256_bsrli_epi128(r0, 2);
// a4 a5 a6 a7 a8 a9 a10 a10 | b4 b5 b6 b7 b8 b9 b10 b10
s0[2] = _mm256_bsrli_epi128(r0, 4);
// a6 a7 a8 a9 a10 a10 a10 a10 | b6 b7 b8 b9 b10 b10 b10 b10
s0[3] = _mm256_bsrli_epi128(r0, 6);
__m256i res_out_0[2];
res_out_0[0] = res_out_0[1] = zero;
resize_convolve(s0, coeffs_x, res_out_0);
// r00 - r03 | r10 - r13
__m256i res_out =
_mm256_permute2x128_si256(res_out_0[0], res_out_0[1], 0x20);
// r00 - r03 | r10 - r13
res_out = _mm256_sra_epi32(_mm256_add_epi32(res_out, round_const_bits),
round_shift_bits);
// r00-r03 r00-r03 r10-r13 r10-r13
__m256i res_out_row01 = _mm256_packus_epi32(res_out, res_out);
// r00-r03 r00-r03 r00-r03 r00-r03 r10-r13 r10-r13 r10-r13 r10-r13
res_out_row01 = _mm256_packus_epi16(res_out_row01, res_out_row01);
res_out_row01 = _mm256_min_epu8(res_out_row01, clip_pixel);
res_out_row01 = _mm256_max_epu8(res_out_row01, zero);
xx_storel_32(intbuf + out_idx, CAST_LOW(res_out_row01));
xx_storel_32(intbuf + out_idx + dst_stride,
_mm256_extracti128_si256(res_out_row01, 1));
}
wd_processed = filtered_length - remain_col;
// When the remaining width is 2, the above code would not have taken
// care of padding required for (filtered_length - 4)th pixel. Hence,
// process that pixel again with the C code.
wd_processed = (remain_col == 2) ? wd_processed - 2 : wd_processed;
if (remain_col) {
const int in_idx = (in_stride * i);
const int out_idx = (wd_processed / 2) + width2 * i;
down2_symeven(input + in_idx, filtered_length, intbuf + out_idx,
wd_processed);
down2_symeven(input + in_idx + in_stride, filtered_length,
intbuf + out_idx + width2, wd_processed);
}
}
}
}

333
third_party/aom/av1/common/x86/resize_sse2.c vendored Normal file

@ -0,0 +1,333 @@
/*
* Copyright (c) 2024, Alliance for Open Media. All rights reserved
*
* This source code is subject to the terms of the BSD 2 Clause License and
* the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
* was not distributed with this source code in the LICENSE file, you can
* obtain it at www.aomedia.org/license/software. If the Alliance for Open
* Media Patent License 1.0 was not distributed with this source code in the
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
#include <immintrin.h>
#include "config/av1_rtcd.h"
#include "av1/common/resize.h"
#include "aom_dsp/x86/synonyms.h"
#define PROCESS_RESIZE_Y_WD8 \
/* ah0 ah1 ... ah7 */ \
const __m128i AH = _mm_add_epi16(l0, l7); \
/* bg0 bg1 ... bg7 */ \
const __m128i BG = _mm_add_epi16(l1, l6); \
/* cf0 cf1 ... cf7 */ \
const __m128i CF = _mm_add_epi16(l2, l5); \
/* de0 de1 ... de7 */ \
const __m128i DE = _mm_add_epi16(l3, l4); \
\
/* ah0 bg0 ... ah3 bg3 */ \
const __m128i AHBG_low = _mm_unpacklo_epi16(AH, BG); \
/* cf0 de0 ... cf3 de3 */ \
const __m128i CFDE_low = _mm_unpacklo_epi16(CF, DE); \
\
/* ah4 bg4... ah7 bg7 */ \
const __m128i AHBG_hi = _mm_unpackhi_epi16(AH, BG); \
/* cf4 de4... cf7 de7 */ \
const __m128i CFDE_hi = _mm_unpackhi_epi16(CF, DE); \
\
/* r00 r01 r02 r03 */ \
const __m128i r00 = _mm_madd_epi16(AHBG_low, coeffs_y[0]); \
const __m128i r01 = _mm_madd_epi16(CFDE_low, coeffs_y[1]); \
__m128i r0 = _mm_add_epi32(r00, r01); \
/* r04 r05 r06 r07 */ \
const __m128i r10 = _mm_madd_epi16(AHBG_hi, coeffs_y[0]); \
const __m128i r11 = _mm_madd_epi16(CFDE_hi, coeffs_y[1]); \
__m128i r1 = _mm_add_epi32(r10, r11); \
\
r0 = _mm_add_epi32(r0, round_const_bits); \
r1 = _mm_add_epi32(r1, round_const_bits); \
r0 = _mm_sra_epi32(r0, round_shift_bits); \
r1 = _mm_sra_epi32(r1, round_shift_bits); \
\
/* r00 ... r07 (8 values of each 16bit) */ \
const __m128i res_16b = _mm_packs_epi32(r0, r1); \
/* r00 ... r07 | r00 ... r07 (16 values of each 8bit) */ \
const __m128i res_8b0 = _mm_packus_epi16(res_16b, res_16b); \
\
__m128i res = _mm_min_epu8(res_8b0, clip_pixel); \
res = _mm_max_epu8(res, zero); \
_mm_storel_epi64((__m128i *)&output[(i / 2) * out_stride + j], res); \
\
l0 = l2; \
l1 = l3; \
l2 = l4; \
l3 = l5; \
l4 = l6; \
l5 = l7; \
data += 2 * stride;
static INLINE void prepare_filter_coeffs(const int16_t *filter,
__m128i *const coeffs /* [2] */) {
// f0 f1 f2 f3 x x x x
const __m128i sym_even_filter = _mm_loadl_epi64((__m128i *)filter);
// f1 f0 f3 f2 x x x x
const __m128i tmp1 = _mm_shufflelo_epi16(sym_even_filter, 0xb1);
// f3 f2 f3 f2 ...
coeffs[0] = _mm_shuffle_epi32(tmp1, 0x55);
// f1 f0 f1 f0 ...
coeffs[1] = _mm_shuffle_epi32(tmp1, 0x00);
}
bool av1_resize_vert_dir_sse2(uint8_t *intbuf, uint8_t *output, int out_stride,
int height, int height2, int stride,
int start_col) {
// For the GM tool, the input layer height or width is assured to be an even
// number. Hence the function 'down2_symodd()' is not invoked and SIMD
// optimization of the same is not implemented.
// When the input height is less than 8 and even, the potential input
// heights are limited to 2, 4, or 6. These scenarios require separate
// handling due to padding requirements. Invoking the C function here will
// eliminate the need for conditional statements within the subsequent SIMD
// code to manage these cases.
if (height & 1 || height < 8) {
return av1_resize_vert_dir_c(intbuf, output, out_stride, height, height2,
stride, start_col);
}
__m128i coeffs_y[2];
const int bits = FILTER_BITS;
const __m128i round_const_bits = _mm_set1_epi32((1 << bits) >> 1);
const __m128i round_shift_bits = _mm_cvtsi32_si128(bits);
const uint8_t max_pixel = 255;
const __m128i clip_pixel = _mm_set1_epi8((char)max_pixel);
const __m128i zero = _mm_setzero_si128();
prepare_filter_coeffs(av1_down2_symeven_half_filter, coeffs_y);
const int remain_col = stride % 8;
for (int j = start_col; j < stride - remain_col; j += 8) {
uint8_t *data = &intbuf[j];
// d0 ... d7
const __m128i l8_3 = _mm_loadl_epi64((__m128i *)(data + 0 * stride));
// Padding top 3 rows with the last available row at the top.
// a0 ... a7
const __m128i l8_0 = l8_3;
// b0 ... b7
const __m128i l8_1 = l8_3;
// c0 ... c7
const __m128i l8_2 = l8_3;
// e0 ... e7
const __m128i l8_4 = _mm_loadl_epi64((__m128i *)(data + 1 * stride));
// f0 ... f7
const __m128i l8_5 = _mm_loadl_epi64((__m128i *)(data + 2 * stride));
// Convert to 16bit as addition of 2 source pixel crosses 8 bit.
__m128i l0 = _mm_unpacklo_epi8(l8_0, zero); // A(128bit) = a0 - a7(16 bit)
__m128i l1 = _mm_unpacklo_epi8(l8_1, zero); // B(128bit) = b0 - b7(16 bit)
__m128i l2 = _mm_unpacklo_epi8(l8_2, zero); // C(128bit) = c0 - c7(16 bit)
__m128i l3 = _mm_unpacklo_epi8(l8_3, zero); // D(128bit) = d0 - d7(16 bit)
__m128i l4 = _mm_unpacklo_epi8(l8_4, zero); // E(128bit) = e0 - e7(16 bit)
__m128i l5 = _mm_unpacklo_epi8(l8_5, zero); // F(128bit) = f0 - f7(16 bit)
// Increment the pointer such that the loading starts from row G.
data = data + 3 * stride;
// The core vertical SIMD processes 2 input rows simultaneously to generate
// output corresponding to 1 row. To streamline the core loop and eliminate
// the need for conditional checks, the remaining 4 rows are processed
// separately.
for (int i = 0; i < height - 4; i += 2) {
// g0 ... g7
__m128i l8_6 = _mm_loadl_epi64((__m128i *)(data));
// h0 ... h7
__m128i l8_7 = _mm_loadl_epi64((__m128i *)(data + stride));
__m128i l6 = _mm_unpacklo_epi8(l8_6, zero); // G(128bit):g0-g7(16b)
__m128i l7 = _mm_unpacklo_epi8(l8_7, zero); // H(128bit):h0-h7(16b)
PROCESS_RESIZE_Y_WD8
}
__m128i l8_6 = _mm_loadl_epi64((__m128i *)(data));
__m128i l6 = _mm_unpacklo_epi8(l8_6, zero);
// Process the last 4 input rows here.
for (int i = height - 4; i < height; i += 2) {
__m128i l7 = l6;
PROCESS_RESIZE_Y_WD8
}
}
if (remain_col)
return av1_resize_vert_dir_c(intbuf, output, out_stride, height, height2,
stride, stride - remain_col);
return true;
}
// Blends a and b using mask and returns the result.
static INLINE __m128i blend(__m128i a, __m128i b, __m128i mask) {
const __m128i masked_b = _mm_and_si128(mask, b);
const __m128i masked_a = _mm_andnot_si128(mask, a);
return (_mm_or_si128(masked_a, masked_b));
}
// Masks used for width 16 pixels, with left and right padding
// requirements.
static const uint8_t left_padding_mask[16] = {
255, 255, 255, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
};
static const uint8_t right_padding_mask[16] = { 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 255, 255,
255, 255, 255, 255 };
static const uint8_t mask_16[16] = {
255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0,
};
void av1_resize_horz_dir_sse2(const uint8_t *const input, int in_stride,
uint8_t *intbuf, int height, int filtered_length,
int width2) {
assert(height % 2 == 0);
// Invoke C for width less than 16.
if (filtered_length < 16) {
av1_resize_horz_dir_c(input, in_stride, intbuf, height, filtered_length,
width2);
return;
}
__m128i coeffs_x[2];
const int bits = FILTER_BITS;
const int dst_stride = width2;
const int remain_col = filtered_length % 16;
const __m128i round_const_bits = _mm_set1_epi32((1 << bits) >> 1);
const __m128i round_shift_bits = _mm_cvtsi32_si128(bits);
const uint8_t max_pixel = 255;
const __m128i clip_pixel = _mm_set1_epi8((char)max_pixel);
const __m128i zero = _mm_setzero_si128();
const __m128i start_pad_mask = _mm_loadu_si128((__m128i *)left_padding_mask);
const __m128i end_pad_mask = _mm_loadu_si128((__m128i *)right_padding_mask);
const __m128i mask_even = _mm_loadu_si128((__m128i *)mask_16);
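// mask_even keeps the even-indexed bytes of a row as 16-bit lanes; shifting
// right by 8 before masking extracts the odd-indexed bytes.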
prepare_filter_coeffs(av1_down2_symeven_half_filter, coeffs_x);
for (int i = 0; i < height; ++i) {
int filter_offset = 0;
for (int j = 0; j <= filtered_length - 16; j += 16) {
const int in_idx = i * in_stride + j - filter_offset;
const int out_idx = i * dst_stride + j / 2;
// a0 a1 a2 a3 .... a15
__m128i row00 = _mm_loadu_si128((__m128i *)&input[in_idx]);
// a8 a9 a10 a11 .... a23
__m128i row01 =
_mm_loadu_si128((__m128i *)&input[in_idx + 5 + filter_offset]);
filter_offset = 3;
// Pad start pixels to the left, while processing the first pixels in the
// row.
if (j == 0) {
const __m128i start_pixel_row0 =
_mm_set1_epi8((char)input[i * in_stride]);
row00 =
blend(_mm_slli_si128(row00, 3), start_pixel_row0, start_pad_mask);
}
// Pad end pixels to the right, while processing the last pixels in the
// row.
const int is_last_cols16 = (j == filtered_length - 16);
if (is_last_cols16) {
const __m128i end_pixel_row0 =
_mm_set1_epi8((char)input[i * in_stride + filtered_length - 1]);
row01 = blend(row01, end_pixel_row0, end_pad_mask);
}
// a2 a3 a4 a5 a6 a7 a8 a9 .... a17
const __m128i row0_1 = _mm_unpacklo_epi64(_mm_srli_si128(row00, 2),
_mm_srli_si128(row01, 2));
// a4 a5 a6 a7 a8 a9 a10 a11 .... a19
const __m128i row0_2 = _mm_unpacklo_epi64(_mm_srli_si128(row00, 4),
_mm_srli_si128(row01, 4));
// a6 a7 a8 a9 a10 a11 a12 a13 .... a21
const __m128i row0_3 = _mm_unpacklo_epi64(_mm_srli_si128(row00, 6),
_mm_srli_si128(row01, 6));
// a0 a2 a4 a6 a8 a10 a12 a14 (each 16 bit)
const __m128i s0 = _mm_and_si128(row00, mask_even);
// a1 a3 a5 a7 a9 a11 a13 a15
const __m128i s1 = _mm_and_si128(_mm_srli_epi16(row00, 8), mask_even);
// a2 a4 a6 a8 a10 a12 a14 a16
const __m128i s2 = _mm_and_si128(row0_1, mask_even);
// a3 a5 a7 a9 a11 a13 a15 a17
const __m128i s3 = _mm_and_si128(_mm_srli_epi16(row0_1, 8), mask_even);
// a4 a6 a8 a10 a12 a14 a16 a18
const __m128i s4 = _mm_and_si128(row0_2, mask_even);
// a5 a7 a9 a11 a13 a15 a17 a19
const __m128i s5 = _mm_and_si128(_mm_srli_epi16(row0_2, 8), mask_even);
// a6 a8 a10 a12 a14 a16 a18 a20
const __m128i s6 = _mm_and_si128(row0_3, mask_even);
// a7 a9 a11 a13 a15 a17 a19 a21
const __m128i s7 = _mm_and_si128(_mm_srli_epi16(row0_3, 8), mask_even);
// a0a7 a2a9 a4a11 .... a12a19 a14a21
const __m128i s07 = _mm_add_epi16(s0, s7);
// a1a6 a3a8 a5a10 .... a13a18 a15a20
const __m128i s16 = _mm_add_epi16(s1, s6);
// a2a5 a4a7 a6a9 .... a14a17 a16a19
const __m128i s25 = _mm_add_epi16(s2, s5);
// a3a4 a5a6 a7a8 .... a15a16 a17a18
const __m128i s34 = _mm_add_epi16(s3, s4);
// a0a7 a1a6 a2a9 a3a8 a4a11 a5a10 a6a13 a7a12
const __m128i s1607_low = _mm_unpacklo_epi16(s07, s16);
// a2a5 a3a4 a4a7 a5a6 a6a9 a7a8 a8a11 a9a10
const __m128i s3425_low = _mm_unpacklo_epi16(s25, s34);
// a8a15 a9a14 a10a17 a11a16 a12a19 a13a18 a14a21 a15a20
const __m128i s1607_high = _mm_unpackhi_epi16(s07, s16);
// a10a13 a11a12 a12a15 a13a14 a14a17 a15a16 a16a19 a17a18
const __m128i s3425_high = _mm_unpackhi_epi16(s25, s34);
const __m128i r01_0 = _mm_madd_epi16(s3425_low, coeffs_x[1]);
const __m128i r01_1 = _mm_madd_epi16(s1607_low, coeffs_x[0]);
const __m128i r01_2 = _mm_madd_epi16(s3425_high, coeffs_x[1]);
const __m128i r01_3 = _mm_madd_epi16(s1607_high, coeffs_x[0]);
// Result of first 8 pixels of row0 (a0 to a7).
// r0_0 r0_1 r0_2 r0_3
__m128i r00 = _mm_add_epi32(r01_0, r01_1);
r00 = _mm_add_epi32(r00, round_const_bits);
r00 = _mm_sra_epi32(r00, round_shift_bits);
// Result of next 8 pixels of row0 (a8 to 15).
// r0_4 r0_5 r0_6 r0_7
__m128i r01 = _mm_add_epi32(r01_2, r01_3);
r01 = _mm_add_epi32(r01, round_const_bits);
r01 = _mm_sra_epi32(r01, round_shift_bits);
// r0_0 r0_1 r0_2 r0_3 r0_4 r0_5 r0_6 r0_7
const __m128i res_16 = _mm_packs_epi32(r00, r01);
const __m128i res_8 = _mm_packus_epi16(res_16, res_16);
__m128i res = _mm_min_epu8(res_8, clip_pixel);
res = _mm_max_epu8(res, zero);
// r0_0 r0_1 r0_2 r0_3 r0_4 r0_5 r0_6 r0_7
_mm_storel_epi64((__m128i *)&intbuf[out_idx], res);
}
int wd_processed = filtered_length - remain_col;
// When the remaining width is 2, the above code would not have taken
// care of padding required for (filtered_length - 4)th pixel. Hence,
// process that pixel again with the C code.
wd_processed = (remain_col == 2) ? wd_processed - 2 : wd_processed;
if (remain_col) {
const int in_idx = (in_stride * i);
const int out_idx = (wd_processed / 2) + width2 * i;
down2_symeven(input + in_idx, filtered_length, intbuf + out_idx,
wd_processed);
}
}
}

6
third_party/aom/av1/decoder/decodeframe.c vendored

@ -2241,6 +2241,12 @@ static AOM_INLINE void get_ls_tile_buffer(
if (tile_copy_mode && (size >> (tile_size_bytes * 8 - 1)) == 1) {
// The remaining bits in the top byte signal the row offset
int offset = (size >> (tile_size_bytes - 1) * 8) & 0x7f;
if (offset > row) {
aom_internal_error(
error_info, AOM_CODEC_CORRUPT_FRAME,
"Invalid row offset in tile copy mode: row=%d offset=%d", row,
offset);
}
// Currently, only use tiles in same column as reference tiles.
copy_data = tile_buffers[row - offset][col].data;


@ -13,6 +13,7 @@
#include <assert.h>
#include "config/aom_config.h"
#include "config/av1_rtcd.h"
#include "aom_dsp/aom_dsp_common.h"
#include "aom_dsp/arm/mem_neon.h"
@ -63,7 +64,7 @@ int64_t av1_block_error_neon(const tran_low_t *coeff, const tran_low_t *dqcoeff,
}
int64_t av1_block_error_lp_neon(const int16_t *coeff, const int16_t *dqcoeff,
int block_size) {
intptr_t block_size) {
uint64x2_t err_u64 = vdupq_n_u64(0);
assert(block_size >= 16);


@ -12,6 +12,7 @@
#include <assert.h>
#include "config/aom_config.h"
#include "config/av1_rtcd.h"
#include "aom_dsp/aom_dsp_common.h"
#include "aom_dsp/arm/aom_neon_sve_bridge.h"
@ -49,7 +50,7 @@ int64_t av1_block_error_sve(const tran_low_t *coeff, const tran_low_t *dqcoeff,
}
int64_t av1_block_error_lp_sve(const int16_t *coeff, const int16_t *dqcoeff,
int block_size) {
intptr_t block_size) {
if (block_size % 32 == 0) {
int64x2_t error[4] = { vdupq_n_s64(0), vdupq_n_s64(0), vdupq_n_s64(0),
vdupq_n_s64(0) };


@ -12,7 +12,7 @@
#include "aom_dsp/arm/sum_neon.h"
#include "config/aom_config.h"
#include "config/aom_dsp_rtcd.h"
#include "config/av1_rtcd.h"
static int32x4_t k_means_multiply_add_neon(const int16x8_t a) {
const int32x4_t l = vmull_s16(vget_low_s16(a), vget_low_s16(a));


@ -19,6 +19,7 @@
#include <stdint.h>
#include "config/aom_config.h"
#include "config/av1_rtcd.h"
#define CRC_LOOP(op, crc, type, buf, len) \
while ((len) >= sizeof(type)) { \


@ -15,7 +15,7 @@
#include "aom_dsp/arm/mem_neon.h"
#include "aom_dsp/arm/sum_neon.h"
#include "av1/encoder/arm/neon/pickrst_neon.h"
#include "av1/encoder/arm/pickrst_neon.h"
#include "av1/encoder/pickrst.h"
static INLINE void highbd_calc_proj_params_r0_r1_neon(

third_party/aom/av1/encoder/arm/highbd_pickrst_sve.c (vendored, new file)

@ -0,0 +1,441 @@
/*
* Copyright (c) 2024, Alliance for Open Media. All rights reserved
*
* This source code is subject to the terms of the BSD 2 Clause License and
* the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
* was not distributed with this source code in the LICENSE file, you can
* obtain it at www.aomedia.org/license/software. If the Alliance for Open
* Media Patent License 1.0 was not distributed with this source code in the
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
#include <arm_neon.h>
#include <arm_sve.h>
#include <string.h>
#include "config/aom_config.h"
#include "config/av1_rtcd.h"
#include "aom_dsp/arm/aom_neon_sve_bridge.h"
#include "aom_dsp/arm/mem_neon.h"
#include "aom_dsp/arm/sum_neon.h"
#include "aom_dsp/arm/transpose_neon.h"
#include "av1/common/restoration.h"
#include "av1/encoder/pickrst.h"
#include "av1/encoder/arm/pickrst_sve.h"
static INLINE uint16_t find_average_sve(const uint16_t *src, int src_stride,
int width, int height) {
uint64x2_t avg_u64 = vdupq_n_u64(0);
uint16x8_t ones = vdupq_n_u16(1);
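// Dotting each group of u16 samples with a vector of ones widens the sums
// straight into 64-bit accumulator lanes, so the running total cannot
// overflow even for large planes.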
// Use a predicate to compute the last columns.
svbool_t pattern = svwhilelt_b16_u32(0, width % 8 == 0 ? 8 : width % 8);
int h = height;
do {
int j = width;
const uint16_t *src_ptr = src;
while (j > 8) {
uint16x8_t s = vld1q_u16(src_ptr);
avg_u64 = aom_udotq_u16(avg_u64, s, ones);
j -= 8;
src_ptr += 8;
}
uint16x8_t s_end = svget_neonq_u16(svld1_u16(pattern, src_ptr));
avg_u64 = aom_udotq_u16(avg_u64, s_end, ones);
src += src_stride;
} while (--h != 0);
return (uint16_t)(vaddvq_u64(avg_u64) / (width * height));
}
static INLINE void compute_sub_avg(const uint16_t *buf, int buf_stride,
int16_t avg, int16_t *buf_avg,
int buf_avg_stride, int width, int height) {
uint16x8_t avg_u16 = vdupq_n_u16(avg);
// Use a predicate to compute the last columns.
svbool_t pattern = svwhilelt_b16_u32(0, width % 8 == 0 ? 8 : width % 8);
uint16x8_t avg_end = svget_neonq_u16(svdup_n_u16_z(pattern, avg));
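// Inactive lanes of both the predicated load and avg_end are zero, so the
// final full-width store writes zeros past the last real column; the caller
// pads the destination stride so this stays inside the row.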
do {
int j = width;
const uint16_t *buf_ptr = buf;
int16_t *buf_avg_ptr = buf_avg;
while (j > 8) {
uint16x8_t d = vld1q_u16(buf_ptr);
vst1q_s16(buf_avg_ptr, vreinterpretq_s16_u16(vsubq_u16(d, avg_u16)));
j -= 8;
buf_ptr += 8;
buf_avg_ptr += 8;
}
uint16x8_t d_end = svget_neonq_u16(svld1_u16(pattern, buf_ptr));
vst1q_s16(buf_avg_ptr, vreinterpretq_s16_u16(vsubq_u16(d_end, avg_end)));
buf += buf_stride;
buf_avg += buf_avg_stride;
} while (--height > 0);
}
static INLINE void copy_upper_triangle(int64_t *H, int64_t *H_tmp,
const int wiener_win2,
const int divider) {
for (int i = 0; i < wiener_win2 - 2; i = i + 2) {
// Transpose the first 2x2 square. It needs a special case as the element
// of the bottom left is on the diagonal.
int64x2_t row0 = vld1q_s64(H_tmp + i * wiener_win2 + i + 1);
int64x2_t row1 = vld1q_s64(H_tmp + (i + 1) * wiener_win2 + i + 1);
int64x2_t tr_row = aom_vtrn2q_s64(row0, row1);
vst1_s64(H_tmp + (i + 1) * wiener_win2 + i, vget_low_s64(row0));
vst1q_s64(H_tmp + (i + 2) * wiener_win2 + i, tr_row);
// Transpose and store all the remaining 2x2 squares of the line.
for (int j = i + 3; j < wiener_win2; j = j + 2) {
row0 = vld1q_s64(H_tmp + i * wiener_win2 + j);
row1 = vld1q_s64(H_tmp + (i + 1) * wiener_win2 + j);
int64x2_t tr_row0 = aom_vtrn1q_s64(row0, row1);
int64x2_t tr_row1 = aom_vtrn2q_s64(row0, row1);
vst1q_s64(H_tmp + (j + 0) * wiener_win2 + i, tr_row0);
vst1q_s64(H_tmp + (j + 1) * wiener_win2 + i, tr_row1);
}
}
for (int i = 0; i < wiener_win2 * wiener_win2; i++) {
H[i] += H_tmp[i] / divider;
}
}
// Transpose the matrix that has just been computed and accumulate it in M.
static INLINE void acc_transpose_M(int64_t *M, const int64_t *M_trn,
const int wiener_win, const int divider) {
for (int i = 0; i < wiener_win; ++i) {
for (int j = 0; j < wiener_win; ++j) {
int tr_idx = j * wiener_win + i;
*M++ += (int64_t)(M_trn[tr_idx] / divider);
}
}
}
// This function computes two matrices: the cross-correlation between the src
// buffer and dgd buffer (M), and the auto-covariance of the dgd buffer (H).
//
// M is of size 7 * 7. It needs to be filled such that multiplying one element
// from src with each element of a row of the wiener window will fill one
// column of M. However this is not very convenient in terms of memory
// accesses, as it means we do contiguous loads of dgd but strided stores to M.
// As a result, we use an intermediate matrix M_trn which is instead filled
// such that one row of the wiener window gives one row of M_trn. Once fully
// computed, M_trn is then transposed to return M.
//
// H is of size 49 * 49. It is filled by multiplying every pair of elements of
// the wiener window together. Since it is a symmetric matrix, we only compute
// the upper triangle, and then copy it down to the lower one. Here we fill it
// by taking each different pair of columns, and multiplying all the elements of
// the first one with all the elements of the second one, with a special case
// when multiplying a column by itself.
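// Roughly, for each pixel of the block this accumulates
//   M_trn[row * wiener_win + col] += src_avg(x, y) * dgd_avg(x + col, y + row)
// and H accumulates the corresponding dgd_avg * dgd_avg product for every
// pair of positions in the wiener window.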
static INLINE void highbd_compute_stats_win7_sve(
int16_t *dgd_avg, int dgd_avg_stride, int16_t *src_avg, int src_avg_stride,
int width, int height, int64_t *M, int64_t *H, int bit_depth_divider) {
const int wiener_win = 7;
const int wiener_win2 = wiener_win * wiener_win;
// Use a predicate to compute the last columns of the block for H.
svbool_t pattern = svwhilelt_b16_u32(0, width % 8 == 0 ? 8 : width % 8);
// Use intermediate matrices for H and M to perform the computation, they
// will be accumulated into the original H and M at the end.
int64_t M_trn[49];
memset(M_trn, 0, sizeof(M_trn));
int64_t H_tmp[49 * 49];
memset(H_tmp, 0, sizeof(H_tmp));
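// Note: H_tmp is 49 * 49 * 8 bytes (about 19 kB) of stack.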
do {
// Cross-correlation (M).
for (int row = 0; row < wiener_win; row++) {
int j = 0;
while (j < width) {
int16x8_t dgd[7];
load_s16_8x7(dgd_avg + row * dgd_avg_stride + j, 1, &dgd[0], &dgd[1],
&dgd[2], &dgd[3], &dgd[4], &dgd[5], &dgd[6]);
int16x8_t s = vld1q_s16(src_avg + j);
// Compute all the elements of one row of M.
compute_M_one_row_win7(s, dgd, M_trn, row);
j += 8;
}
}
// Auto-covariance (H).
int j = 0;
while (j < width - 8) {
for (int col0 = 0; col0 < wiener_win; col0++) {
int16x8_t dgd0[7];
load_s16_8x7(dgd_avg + j + col0, dgd_avg_stride, &dgd0[0], &dgd0[1],
&dgd0[2], &dgd0[3], &dgd0[4], &dgd0[5], &dgd0[6]);
// Perform computation of the first column with itself (28 elements).
// For the first column this will fill the upper triangle of the 7x7
// matrix at the top left of the H matrix. For the next columns this
// will fill the upper triangle of the other 7x7 matrices around H's
// diagonal.
compute_H_one_col(dgd0, col0, H_tmp, wiener_win, wiener_win2);
// All computation next to the matrix diagonal has already been done.
for (int col1 = col0 + 1; col1 < wiener_win; col1++) {
// Load second column.
int16x8_t dgd1[7];
load_s16_8x7(dgd_avg + j + col1, dgd_avg_stride, &dgd1[0], &dgd1[1],
&dgd1[2], &dgd1[3], &dgd1[4], &dgd1[5], &dgd1[6]);
// Compute all elements from the combination of both columns (49
// elements).
compute_H_two_rows_win7(dgd0, dgd1, col0, col1, H_tmp);
}
}
j += 8;
}
// Process remaining columns using a predicate to discard excess elements.
for (int col0 = 0; col0 < wiener_win; col0++) {
// Load first column.
int16x8_t dgd0[7];
dgd0[0] = svget_neonq_s16(
svld1_s16(pattern, dgd_avg + 0 * dgd_avg_stride + j + col0));
dgd0[1] = svget_neonq_s16(
svld1_s16(pattern, dgd_avg + 1 * dgd_avg_stride + j + col0));
dgd0[2] = svget_neonq_s16(
svld1_s16(pattern, dgd_avg + 2 * dgd_avg_stride + j + col0));
dgd0[3] = svget_neonq_s16(
svld1_s16(pattern, dgd_avg + 3 * dgd_avg_stride + j + col0));
dgd0[4] = svget_neonq_s16(
svld1_s16(pattern, dgd_avg + 4 * dgd_avg_stride + j + col0));
dgd0[5] = svget_neonq_s16(
svld1_s16(pattern, dgd_avg + 5 * dgd_avg_stride + j + col0));
dgd0[6] = svget_neonq_s16(
svld1_s16(pattern, dgd_avg + 6 * dgd_avg_stride + j + col0));
// Perform computation of the first column with itself (28 elements).
// For the first column this will fill the upper triangle of the 7x7
// matrix at the top left of the H matrix. For the next columns this
// will fill the upper triangle of the other 7x7 matrices around H's
// diagonal.
compute_H_one_col(dgd0, col0, H_tmp, wiener_win, wiener_win2);
// All computation next to the matrix diagonal has already been done.
for (int col1 = col0 + 1; col1 < wiener_win; col1++) {
// Load second column.
int16x8_t dgd1[7];
load_s16_8x7(dgd_avg + j + col1, dgd_avg_stride, &dgd1[0], &dgd1[1],
&dgd1[2], &dgd1[3], &dgd1[4], &dgd1[5], &dgd1[6]);
// Compute all elements from the combination of both columns (49
// elements).
compute_H_two_rows_win7(dgd0, dgd1, col0, col1, H_tmp);
}
}
dgd_avg += dgd_avg_stride;
src_avg += src_avg_stride;
} while (--height != 0);
// Transpose M_trn.
acc_transpose_M(M, M_trn, 7, bit_depth_divider);
// Copy upper triangle of H in the lower one.
copy_upper_triangle(H, H_tmp, wiener_win2, bit_depth_divider);
}
// This function computes two matrices: the cross-correlation between the src
// buffer and dgd buffer (M), and the auto-covariance of the dgd buffer (H).
//
// M is of size 5 * 5. It needs to be filled such that multiplying one element
// from src with each element of a row of the wiener window will fill one
// column of M. However this is not very convenient in terms of memory
// accesses, as it means we do contiguous loads of dgd but strided stores to M.
// As a result, we use an intermediate matrix M_trn which is instead filled
// such that one row of the wiener window gives one row of M_trn. Once fully
// computed, M_trn is then transposed to return M.
//
// H is of size 25 * 25. It is filled by multiplying every pair of elements of
// the wiener window together. Since it is a symmetric matrix, we only compute
// the upper triangle, and then copy it down to the lower one. Here we fill it
// by taking each different pair of columns, and multiplying all the elements of
// the first one with all the elements of the second one, with a special case
// when multiplying a column by itself.
static INLINE void highbd_compute_stats_win5_sve(
int16_t *dgd_avg, int dgd_avg_stride, int16_t *src_avg, int src_avg_stride,
int width, int height, int64_t *M, int64_t *H, int bit_depth_divider) {
const int wiener_win = 5;
const int wiener_win2 = wiener_win * wiener_win;
// Use a predicate to compute the last columns of the block for H.
svbool_t pattern = svwhilelt_b16_u32(0, width % 8 == 0 ? 8 : width % 8);
// Use intermediate matrices for H and M to perform the computation, they
// will be accumulated into the original H and M at the end.
int64_t M_trn[25];
memset(M_trn, 0, sizeof(M_trn));
int64_t H_tmp[25 * 25];
memset(H_tmp, 0, sizeof(H_tmp));
do {
// Cross-correlation (M).
for (int row = 0; row < wiener_win; row++) {
int j = 0;
while (j < width) {
int16x8_t dgd[5];
load_s16_8x5(dgd_avg + row * dgd_avg_stride + j, 1, &dgd[0], &dgd[1],
&dgd[2], &dgd[3], &dgd[4]);
int16x8_t s = vld1q_s16(src_avg + j);
// Compute all the elements of one row of M.
compute_M_one_row_win5(s, dgd, M_trn, row);
j += 8;
}
}
// Auto-covariance (H).
int j = 0;
while (j < width - 8) {
for (int col0 = 0; col0 < wiener_win; col0++) {
// Load first column.
int16x8_t dgd0[5];
load_s16_8x5(dgd_avg + j + col0, dgd_avg_stride, &dgd0[0], &dgd0[1],
&dgd0[2], &dgd0[3], &dgd0[4]);
// Perform computation of the first column with itself (15 elements).
// For the first column this will fill the upper triangle of the 5x5
// matrix at the top left of the H matrix. For the next columns this
// will fill the upper triangle of the other 5x5 matrices around H's
// diagonal.
compute_H_one_col(dgd0, col0, H_tmp, wiener_win, wiener_win2);
// All computation next to the matrix diagonal has already been done.
for (int col1 = col0 + 1; col1 < wiener_win; col1++) {
// Load second column.
int16x8_t dgd1[5];
load_s16_8x5(dgd_avg + j + col1, dgd_avg_stride, &dgd1[0], &dgd1[1],
&dgd1[2], &dgd1[3], &dgd1[4]);
// Compute all elements from the combination of both columns (25
// elements).
compute_H_two_rows_win5(dgd0, dgd1, col0, col1, H_tmp);
}
}
j += 8;
}
// Process remaining columns using a predicate to discard excess elements.
for (int col0 = 0; col0 < wiener_win; col0++) {
int16x8_t dgd0[5];
dgd0[0] = svget_neonq_s16(
svld1_s16(pattern, dgd_avg + 0 * dgd_avg_stride + j + col0));
dgd0[1] = svget_neonq_s16(
svld1_s16(pattern, dgd_avg + 1 * dgd_avg_stride + j + col0));
dgd0[2] = svget_neonq_s16(
svld1_s16(pattern, dgd_avg + 2 * dgd_avg_stride + j + col0));
dgd0[3] = svget_neonq_s16(
svld1_s16(pattern, dgd_avg + 3 * dgd_avg_stride + j + col0));
dgd0[4] = svget_neonq_s16(
svld1_s16(pattern, dgd_avg + 4 * dgd_avg_stride + j + col0));
// Perform computation of the first column with itself (15 elements).
// For the first column this will fill the upper triangle of the 5x5
// matrix at the top left of the H matrix. For the next columns this
// will fill the upper triangle of the other 5x5 matrices around H's
// diagonal.
compute_H_one_col(dgd0, col0, H_tmp, wiener_win, wiener_win2);
// All computation next to the matrix diagonal has already been done.
for (int col1 = col0 + 1; col1 < wiener_win; col1++) {
// Load second column.
int16x8_t dgd1[5];
load_s16_8x5(dgd_avg + j + col1, dgd_avg_stride, &dgd1[0], &dgd1[1],
&dgd1[2], &dgd1[3], &dgd1[4]);
// Compute all elements from the combination of both columns (25
// elements).
compute_H_two_rows_win5(dgd0, dgd1, col0, col1, H_tmp);
}
}
dgd_avg += dgd_avg_stride;
src_avg += src_avg_stride;
} while (--height != 0);
// Transpose M_trn.
acc_transpose_M(M, M_trn, 5, bit_depth_divider);
// Copy upper triangle of H in the lower one.
copy_upper_triangle(H, H_tmp, wiener_win2, bit_depth_divider);
}
void av1_compute_stats_highbd_sve(int wiener_win, const uint8_t *dgd8,
const uint8_t *src8, int16_t *dgd_avg,
int16_t *src_avg, int h_start, int h_end,
int v_start, int v_end, int dgd_stride,
int src_stride, int64_t *M, int64_t *H,
aom_bit_depth_t bit_depth) {
assert(wiener_win == WIENER_WIN || wiener_win == WIENER_WIN_CHROMA);
const uint16_t *src = CONVERT_TO_SHORTPTR(src8);
const uint16_t *dgd = CONVERT_TO_SHORTPTR(dgd8);
const int wiener_win2 = wiener_win * wiener_win;
const int wiener_halfwin = wiener_win >> 1;
const int32_t width = h_end - h_start;
const int32_t height = v_end - v_start;
uint8_t bit_depth_divider = 1;
if (bit_depth == AOM_BITS_12)
bit_depth_divider = 16;
else if (bit_depth == AOM_BITS_10)
bit_depth_divider = 4;
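// The divider scales the accumulated statistics back down for 10- and
// 12-bit input when they are added into M and H (see acc_transpose_M and
// copy_upper_triangle above).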
const uint16_t *dgd_start = &dgd[v_start * dgd_stride + h_start];
memset(H, 0, sizeof(*H) * wiener_win2 * wiener_win2);
memset(M, 0, sizeof(*M) * wiener_win * wiener_win);
const uint16_t avg = find_average_sve(dgd_start, dgd_stride, width, height);
// dgd_avg and src_avg have been memset to zero before calling this function
// so round up the stride to the next multiple of 8 so that we don't have to
// worry about a tail loop when computing M.
const int dgd_avg_stride = ((width + 2 * wiener_halfwin) & ~7) + 8;
const int src_avg_stride = (width & ~7) + 8;
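// Rounding the stride up past the row width guarantees that the full 8-lane
// store in compute_sub_avg's tail iteration never writes outside the
// (zero-initialised) row.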
// Compute (dgd - avg) and store it in dgd_avg.
// The wiener window will slide along the dgd frame, centered on each pixel.
// For the top left pixel and all the pixels on the side of the frame this
// means half of the window will be outside of the frame. As such the actual
// buffer that we need to subtract the avg from will be 2 * wiener_halfwin
// wider and 2 * wiener_halfwin higher than the original dgd buffer.
const int vert_offset = v_start - wiener_halfwin;
const int horiz_offset = h_start - wiener_halfwin;
const uint16_t *dgd_win = dgd + horiz_offset + vert_offset * dgd_stride;
compute_sub_avg(dgd_win, dgd_stride, avg, dgd_avg, dgd_avg_stride,
width + 2 * wiener_halfwin, height + 2 * wiener_halfwin);
// Compute (src - avg), downsample if necessary and store in src-avg.
const uint16_t *src_start = src + h_start + v_start * src_stride;
compute_sub_avg(src_start, src_stride, avg, src_avg, src_avg_stride, width,
height);
if (wiener_win == WIENER_WIN) {
highbd_compute_stats_win7_sve(dgd_avg, dgd_avg_stride, src_avg,
src_avg_stride, width, height, M, H,
bit_depth_divider);
} else {
highbd_compute_stats_win5_sve(dgd_avg, dgd_avg_stride, src_avg,
src_avg_stride, width, height, M, H,
bit_depth_divider);
}
}


@ -12,6 +12,7 @@
#include <arm_neon.h>
#include "aom_dsp/txfm_common.h"
#include "config/av1_rtcd.h"
static void transpose4x4(int16x8_t in[2], int16x4_t out[4]) {
int32x4x2_t b0 =


Some files were not shown because too many files changed in this diff.