Bug 1476408 - Update libaom to rev b25610052a1398032320008d69b51d2da94f5928; r=TD-Linux

Tags: #secure-revision Bug #: 1476408 Differential Revision: https://phabricator.services.mozilla.com/D2358 --HG-- extra : rebase_source : fa2438ada27a67e400617705014460b6d5ff485c
2018-07-23 14:28:45 -04:00 · 2018-07-23 14:28:45 -04:00 · 48e87ceaf3
--- a/media/libaom/README_MOZILLA
+++ b/media/libaom/README_MOZILLA
@@ -22,4 +22,4 @@ To update to a fork, use

 The last update was pulled from https://aomedia.googlesource.com/aom/

-The git commit ID used was d14c5bb4f336ef1842046089849dee4a301fbbf0 (Mon Jun 25 07:54:59 2018 -0700).
+The git commit ID used was b25610052a1398032320008d69b51d2da94f5928 (Mon Jul 23 18:08:58 2018 +0000).
--- a/media/libaom/config/generic/config/aom_config.asm
+++ b/media/libaom/config/generic/config/aom_config.asm
@@ -24,6 +24,7 @@ CONFIG_COEFFICIENT_RANGE_CHECKING equ 0
 CONFIG_COLLECT_INTER_MODE_RD_STATS equ 1
 CONFIG_COLLECT_RD_STATS equ 0
 CONFIG_DEBUG equ 0
+CONFIG_DENOISE equ 0
 CONFIG_DIST_8X8 equ 1
 CONFIG_ENTROPY_STATS equ 0
 CONFIG_FILEOPTIONS equ 1
--- a/media/libaom/config/generic/config/aom_config.h
+++ b/media/libaom/config/generic/config/aom_config.h
@@ -26,6 +26,7 @@
 #define CONFIG_COLLECT_INTER_MODE_RD_STATS 1
 #define CONFIG_COLLECT_RD_STATS 0
 #define CONFIG_DEBUG 0
+#define CONFIG_DENOISE 0
 #define CONFIG_DIST_8X8 1
 #define CONFIG_ENTROPY_STATS 0
 #define CONFIG_FILEOPTIONS 1
--- a/media/libaom/config/generic/config/av1_rtcd.h
+++ b/media/libaom/config/generic/config/av1_rtcd.h
@@ -63,22 +63,22 @@ void av1_build_compound_diffwtd_mask_d16_c(uint8_t *mask, DIFFWTD_MASK_TYPE mask
 void av1_build_compound_diffwtd_mask_highbd_c(uint8_t *mask, DIFFWTD_MASK_TYPE mask_type, const uint8_t *src0, int src0_stride, const uint8_t *src1, int src1_stride, int h, int w, int bd);
 #define av1_build_compound_diffwtd_mask_highbd av1_build_compound_diffwtd_mask_highbd_c

-void av1_convolve_2d_copy_sr_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+void av1_convolve_2d_copy_sr_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
 #define av1_convolve_2d_copy_sr av1_convolve_2d_copy_sr_c

-void av1_convolve_2d_scale_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_qn, const int x_step_qn, const int subpel_y_q4, const int y_step_qn, ConvolveParams *conv_params);
+void av1_convolve_2d_scale_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int x_step_qn, const int subpel_y_q4, const int y_step_qn, ConvolveParams *conv_params);
 #define av1_convolve_2d_scale av1_convolve_2d_scale_c

-void av1_convolve_2d_sr_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+void av1_convolve_2d_sr_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
 #define av1_convolve_2d_sr av1_convolve_2d_sr_c

 void av1_convolve_horiz_rs_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const int16_t *x_filters, int x0_qn, int x_step_qn);
 #define av1_convolve_horiz_rs av1_convolve_horiz_rs_c

-void av1_convolve_x_sr_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+void av1_convolve_x_sr_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
 #define av1_convolve_x_sr av1_convolve_x_sr_c

-void av1_convolve_y_sr_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+void av1_convolve_y_sr_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
 #define av1_convolve_y_sr av1_convolve_y_sr_c

 void av1_dr_prediction_z1_c(uint8_t *dst, ptrdiff_t stride, int bw, int bh, const uint8_t *above, const uint8_t *left, int upsample_above, int dx, int dy);
@@ -108,13 +108,13 @@ void av1_highbd_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint
 void av1_highbd_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
 #define av1_highbd_convolve8_vert av1_highbd_convolve8_vert_c

-void av1_highbd_convolve_2d_copy_sr_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+void av1_highbd_convolve_2d_copy_sr_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
 #define av1_highbd_convolve_2d_copy_sr av1_highbd_convolve_2d_copy_sr_c

-void av1_highbd_convolve_2d_scale_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int x_step_qn, const int subpel_y_q4, const int y_step_qn, ConvolveParams *conv_params, int bd);
+void av1_highbd_convolve_2d_scale_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int x_step_qn, const int subpel_y_q4, const int y_step_qn, ConvolveParams *conv_params, int bd);
 #define av1_highbd_convolve_2d_scale av1_highbd_convolve_2d_scale_c

-void av1_highbd_convolve_2d_sr_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+void av1_highbd_convolve_2d_sr_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
 #define av1_highbd_convolve_2d_sr av1_highbd_convolve_2d_sr_c

 void av1_highbd_convolve_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
@@ -126,10 +126,10 @@ void av1_highbd_convolve_copy_c(const uint8_t *src, ptrdiff_t src_stride, uint8_
 void av1_highbd_convolve_horiz_rs_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const int16_t *x_filters, int x0_qn, int x_step_qn, int bd);
 #define av1_highbd_convolve_horiz_rs av1_highbd_convolve_horiz_rs_c

-void av1_highbd_convolve_x_sr_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+void av1_highbd_convolve_x_sr_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
 #define av1_highbd_convolve_x_sr av1_highbd_convolve_x_sr_c

-void av1_highbd_convolve_y_sr_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+void av1_highbd_convolve_y_sr_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
 #define av1_highbd_convolve_y_sr av1_highbd_convolve_y_sr_c

 void av1_highbd_dr_prediction_z1_c(uint16_t *dst, ptrdiff_t stride, int bw, int bh, const uint16_t *above, const uint16_t *left, int upsample_above, int dx, int dy, int bd);
@@ -147,16 +147,16 @@ void av1_highbd_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int des
 void av1_highbd_iwht4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd);
 #define av1_highbd_iwht4x4_1_add av1_highbd_iwht4x4_1_add_c

-void av1_highbd_jnt_convolve_2d_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+void av1_highbd_jnt_convolve_2d_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
 #define av1_highbd_jnt_convolve_2d av1_highbd_jnt_convolve_2d_c

-void av1_highbd_jnt_convolve_2d_copy_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+void av1_highbd_jnt_convolve_2d_copy_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
 #define av1_highbd_jnt_convolve_2d_copy av1_highbd_jnt_convolve_2d_copy_c

-void av1_highbd_jnt_convolve_x_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+void av1_highbd_jnt_convolve_x_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
 #define av1_highbd_jnt_convolve_x av1_highbd_jnt_convolve_x_c

-void av1_highbd_jnt_convolve_y_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+void av1_highbd_jnt_convolve_y_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
 #define av1_highbd_jnt_convolve_y av1_highbd_jnt_convolve_y_c

 void av1_highbd_warp_affine_c(const int32_t *mat, const uint16_t *ref, int width, int height, int stride, uint16_t *pred, int p_col, int p_row, int p_width, int p_height, int p_stride, int subsampling_x, int subsampling_y, int bd, ConvolveParams *conv_params, int16_t alpha, int16_t beta, int16_t gamma, int16_t delta);
@@ -225,16 +225,16 @@ void av1_inv_txfm2d_add_8x8_c(const int32_t *input, uint16_t *output, int stride
 void av1_inv_txfm_add_c(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param);
 #define av1_inv_txfm_add av1_inv_txfm_add_c

-void av1_jnt_convolve_2d_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+void av1_jnt_convolve_2d_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
 #define av1_jnt_convolve_2d av1_jnt_convolve_2d_c

-void av1_jnt_convolve_2d_copy_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+void av1_jnt_convolve_2d_copy_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
 #define av1_jnt_convolve_2d_copy av1_jnt_convolve_2d_copy_c

-void av1_jnt_convolve_x_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+void av1_jnt_convolve_x_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
 #define av1_jnt_convolve_x av1_jnt_convolve_x_c

-void av1_jnt_convolve_y_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+void av1_jnt_convolve_y_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
 #define av1_jnt_convolve_y av1_jnt_convolve_y_c

 void av1_selfguided_restoration_c(const uint8_t *dgd8, int width, int height,
--- a/media/libaom/config/linux/arm/config/aom_config.asm
+++ b/media/libaom/config/linux/arm/config/aom_config.asm
@@ -24,6 +24,7 @@
 .equ CONFIG_COLLECT_INTER_MODE_RD_STATS, 1
 .equ CONFIG_COLLECT_RD_STATS, 0
 .equ CONFIG_DEBUG, 0
+.equ CONFIG_DENOISE, 0
 .equ CONFIG_DIST_8X8, 1
 .equ CONFIG_ENTROPY_STATS, 0
 .equ CONFIG_FILEOPTIONS, 1
--- a/media/libaom/config/linux/arm/config/aom_config.h
+++ b/media/libaom/config/linux/arm/config/aom_config.h
@@ -26,6 +26,7 @@
 #define CONFIG_COLLECT_INTER_MODE_RD_STATS 1
 #define CONFIG_COLLECT_RD_STATS 0
 #define CONFIG_DEBUG 0
+#define CONFIG_DENOISE 0
 #define CONFIG_DIST_8X8 1
 #define CONFIG_ENTROPY_STATS 0
 #define CONFIG_FILEOPTIONS 1
--- a/media/libaom/config/linux/arm/config/aom_dsp_rtcd.h
+++ b/media/libaom/config/linux/arm/config/aom_dsp_rtcd.h
@@ -1041,13 +1041,15 @@ void aom_lowbd_blend_a64_d16_mask_neon(uint8_t *dst, uint32_t dst_stride, const
 RTCD_EXTERN void (*aom_lowbd_blend_a64_d16_mask)(uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int w, int h, int subx, int suby, ConvolveParams *conv_params);

 void aom_lpf_horizontal_14_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
-#define aom_lpf_horizontal_14 aom_lpf_horizontal_14_c
+void aom_lpf_horizontal_14_neon(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
+RTCD_EXTERN void (*aom_lpf_horizontal_14)(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);

 void aom_lpf_horizontal_14_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
 #define aom_lpf_horizontal_14_dual aom_lpf_horizontal_14_dual_c

 void aom_lpf_horizontal_4_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
-#define aom_lpf_horizontal_4 aom_lpf_horizontal_4_c
+void aom_lpf_horizontal_4_neon(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
+RTCD_EXTERN void (*aom_lpf_horizontal_4)(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);

 void aom_lpf_horizontal_4_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
 #define aom_lpf_horizontal_4_dual aom_lpf_horizontal_4_dual_c
@@ -1074,13 +1076,15 @@ void aom_lpf_vertical_14_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0, c
 #define aom_lpf_vertical_14_dual aom_lpf_vertical_14_dual_c

 void aom_lpf_vertical_4_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
-#define aom_lpf_vertical_4 aom_lpf_vertical_4_c
+void aom_lpf_vertical_4_neon(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
+RTCD_EXTERN void (*aom_lpf_vertical_4)(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);

 void aom_lpf_vertical_4_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
 #define aom_lpf_vertical_4_dual aom_lpf_vertical_4_dual_c

 void aom_lpf_vertical_6_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
-#define aom_lpf_vertical_6 aom_lpf_vertical_6_c
+void aom_lpf_vertical_6_neon(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
+RTCD_EXTERN void (*aom_lpf_vertical_6)(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);

 void aom_lpf_vertical_6_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
 #define aom_lpf_vertical_6_dual aom_lpf_vertical_6_dual_c
@@ -1468,12 +1472,20 @@ static void setup_rtcd_internal(void)
    if (flags & HAS_NEON) aom_highbd_dc_predictor_8x8 = aom_highbd_dc_predictor_8x8_neon;
    aom_lowbd_blend_a64_d16_mask = aom_lowbd_blend_a64_d16_mask_c;
    if (flags & HAS_NEON) aom_lowbd_blend_a64_d16_mask = aom_lowbd_blend_a64_d16_mask_neon;
+    aom_lpf_horizontal_14 = aom_lpf_horizontal_14_c;
+    if (flags & HAS_NEON) aom_lpf_horizontal_14 = aom_lpf_horizontal_14_neon;
+    aom_lpf_horizontal_4 = aom_lpf_horizontal_4_c;
+    if (flags & HAS_NEON) aom_lpf_horizontal_4 = aom_lpf_horizontal_4_neon;
    aom_lpf_horizontal_6 = aom_lpf_horizontal_6_c;
    if (flags & HAS_NEON) aom_lpf_horizontal_6 = aom_lpf_horizontal_6_neon;
    aom_lpf_horizontal_8 = aom_lpf_horizontal_8_c;
    if (flags & HAS_NEON) aom_lpf_horizontal_8 = aom_lpf_horizontal_8_neon;
    aom_lpf_vertical_14 = aom_lpf_vertical_14_c;
    if (flags & HAS_NEON) aom_lpf_vertical_14 = aom_lpf_vertical_14_neon;
+    aom_lpf_vertical_4 = aom_lpf_vertical_4_c;
+    if (flags & HAS_NEON) aom_lpf_vertical_4 = aom_lpf_vertical_4_neon;
+    aom_lpf_vertical_6 = aom_lpf_vertical_6_c;
+    if (flags & HAS_NEON) aom_lpf_vertical_6 = aom_lpf_vertical_6_neon;
    aom_lpf_vertical_8 = aom_lpf_vertical_8_c;
    if (flags & HAS_NEON) aom_lpf_vertical_8 = aom_lpf_vertical_8_neon;
    aom_v_predictor_16x16 = aom_v_predictor_16x16_c;
--- a/media/libaom/config/linux/arm/config/av1_rtcd.h
+++ b/media/libaom/config/linux/arm/config/av1_rtcd.h
@@ -52,7 +52,8 @@ extern "C" {
 #endif

 void apply_selfguided_restoration_c(const uint8_t *dat, int width, int height, int stride, int eps, const int *xqd, uint8_t *dst, int dst_stride, int32_t *tmpbuf, int bit_depth, int highbd);
-#define apply_selfguided_restoration apply_selfguided_restoration_c
+void apply_selfguided_restoration_neon(const uint8_t *dat, int width, int height, int stride, int eps, const int *xqd, uint8_t *dst, int dst_stride, int32_t *tmpbuf, int bit_depth, int highbd);
+RTCD_EXTERN void (*apply_selfguided_restoration)(const uint8_t *dat, int width, int height, int stride, int eps, const int *xqd, uint8_t *dst, int dst_stride, int32_t *tmpbuf, int bit_depth, int highbd);

 void av1_build_compound_diffwtd_mask_c(uint8_t *mask, DIFFWTD_MASK_TYPE mask_type, const uint8_t *src0, int src0_stride, const uint8_t *src1, int src1_stride, int h, int w);
 #define av1_build_compound_diffwtd_mask av1_build_compound_diffwtd_mask_c
@@ -64,27 +65,27 @@ RTCD_EXTERN void (*av1_build_compound_diffwtd_mask_d16)(uint8_t *mask, DIFFWTD_M
 void av1_build_compound_diffwtd_mask_highbd_c(uint8_t *mask, DIFFWTD_MASK_TYPE mask_type, const uint8_t *src0, int src0_stride, const uint8_t *src1, int src1_stride, int h, int w, int bd);
 #define av1_build_compound_diffwtd_mask_highbd av1_build_compound_diffwtd_mask_highbd_c

-void av1_convolve_2d_copy_sr_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
-void av1_convolve_2d_copy_sr_neon(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
-RTCD_EXTERN void (*av1_convolve_2d_copy_sr)(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+void av1_convolve_2d_copy_sr_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+void av1_convolve_2d_copy_sr_neon(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+RTCD_EXTERN void (*av1_convolve_2d_copy_sr)(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);

-void av1_convolve_2d_scale_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_qn, const int x_step_qn, const int subpel_y_q4, const int y_step_qn, ConvolveParams *conv_params);
+void av1_convolve_2d_scale_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int x_step_qn, const int subpel_y_q4, const int y_step_qn, ConvolveParams *conv_params);
 #define av1_convolve_2d_scale av1_convolve_2d_scale_c

-void av1_convolve_2d_sr_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
-void av1_convolve_2d_sr_neon(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
-RTCD_EXTERN void (*av1_convolve_2d_sr)(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+void av1_convolve_2d_sr_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+void av1_convolve_2d_sr_neon(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+RTCD_EXTERN void (*av1_convolve_2d_sr)(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);

 void av1_convolve_horiz_rs_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const int16_t *x_filters, int x0_qn, int x_step_qn);
 #define av1_convolve_horiz_rs av1_convolve_horiz_rs_c

-void av1_convolve_x_sr_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
-void av1_convolve_x_sr_neon(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
-RTCD_EXTERN void (*av1_convolve_x_sr)(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+void av1_convolve_x_sr_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+void av1_convolve_x_sr_neon(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+RTCD_EXTERN void (*av1_convolve_x_sr)(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);

-void av1_convolve_y_sr_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
-void av1_convolve_y_sr_neon(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
-RTCD_EXTERN void (*av1_convolve_y_sr)(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+void av1_convolve_y_sr_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+void av1_convolve_y_sr_neon(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+RTCD_EXTERN void (*av1_convolve_y_sr)(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);

 void av1_dr_prediction_z1_c(uint8_t *dst, ptrdiff_t stride, int bw, int bh, const uint8_t *above, const uint8_t *left, int upsample_above, int dx, int dy);
 #define av1_dr_prediction_z1 av1_dr_prediction_z1_c
@@ -113,13 +114,13 @@ void av1_highbd_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint
 void av1_highbd_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
 #define av1_highbd_convolve8_vert av1_highbd_convolve8_vert_c

-void av1_highbd_convolve_2d_copy_sr_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+void av1_highbd_convolve_2d_copy_sr_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
 #define av1_highbd_convolve_2d_copy_sr av1_highbd_convolve_2d_copy_sr_c

-void av1_highbd_convolve_2d_scale_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int x_step_qn, const int subpel_y_q4, const int y_step_qn, ConvolveParams *conv_params, int bd);
+void av1_highbd_convolve_2d_scale_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int x_step_qn, const int subpel_y_q4, const int y_step_qn, ConvolveParams *conv_params, int bd);
 #define av1_highbd_convolve_2d_scale av1_highbd_convolve_2d_scale_c

-void av1_highbd_convolve_2d_sr_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+void av1_highbd_convolve_2d_sr_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
 #define av1_highbd_convolve_2d_sr av1_highbd_convolve_2d_sr_c

 void av1_highbd_convolve_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
@ -131,10 +132,10 @@ void av1_highbd_convolve_copy_c(const uint8_t *src, ptrdiff_t src_stride, uint8_
 void av1_highbd_convolve_horiz_rs_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const int16_t *x_filters, int x0_qn, int x_step_qn, int bd);
 #define av1_highbd_convolve_horiz_rs av1_highbd_convolve_horiz_rs_c

-void av1_highbd_convolve_x_sr_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+void av1_highbd_convolve_x_sr_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
 #define av1_highbd_convolve_x_sr av1_highbd_convolve_x_sr_c

-void av1_highbd_convolve_y_sr_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+void av1_highbd_convolve_y_sr_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
 #define av1_highbd_convolve_y_sr av1_highbd_convolve_y_sr_c

 void av1_highbd_dr_prediction_z1_c(uint16_t *dst, ptrdiff_t stride, int bw, int bh, const uint16_t *above, const uint16_t *left, int upsample_above, int dx, int dy, int bd);
@ -152,16 +153,16 @@ void av1_highbd_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int des
 void av1_highbd_iwht4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd);
 #define av1_highbd_iwht4x4_1_add av1_highbd_iwht4x4_1_add_c

-void av1_highbd_jnt_convolve_2d_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+void av1_highbd_jnt_convolve_2d_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
 #define av1_highbd_jnt_convolve_2d av1_highbd_jnt_convolve_2d_c

-void av1_highbd_jnt_convolve_2d_copy_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+void av1_highbd_jnt_convolve_2d_copy_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
 #define av1_highbd_jnt_convolve_2d_copy av1_highbd_jnt_convolve_2d_copy_c

-void av1_highbd_jnt_convolve_x_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+void av1_highbd_jnt_convolve_x_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
 #define av1_highbd_jnt_convolve_x av1_highbd_jnt_convolve_x_c

-void av1_highbd_jnt_convolve_y_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+void av1_highbd_jnt_convolve_y_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
 #define av1_highbd_jnt_convolve_y av1_highbd_jnt_convolve_y_c

 void av1_highbd_warp_affine_c(const int32_t *mat, const uint16_t *ref, int width, int height, int stride, uint16_t *pred, int p_col, int p_row, int p_width, int p_height, int p_stride, int subsampling_x, int subsampling_y, int bd, ConvolveParams *conv_params, int16_t alpha, int16_t beta, int16_t gamma, int16_t delta);
@ -228,28 +229,34 @@ void av1_inv_txfm2d_add_8x8_c(const int32_t *input, uint16_t *output, int stride
 #define av1_inv_txfm2d_add_8x8 av1_inv_txfm2d_add_8x8_c

 void av1_inv_txfm_add_c(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param);
-#define av1_inv_txfm_add av1_inv_txfm_add_c
+void av1_inv_txfm_add_neon(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param);
+RTCD_EXTERN void (*av1_inv_txfm_add)(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param);

-void av1_jnt_convolve_2d_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
-void av1_jnt_convolve_2d_neon(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
-RTCD_EXTERN void (*av1_jnt_convolve_2d)(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+void av1_jnt_convolve_2d_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+void av1_jnt_convolve_2d_neon(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+RTCD_EXTERN void (*av1_jnt_convolve_2d)(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);

-void av1_jnt_convolve_2d_copy_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
-void av1_jnt_convolve_2d_copy_neon(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
-RTCD_EXTERN void (*av1_jnt_convolve_2d_copy)(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+void av1_jnt_convolve_2d_copy_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+void av1_jnt_convolve_2d_copy_neon(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+RTCD_EXTERN void (*av1_jnt_convolve_2d_copy)(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);

-void av1_jnt_convolve_x_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
-void av1_jnt_convolve_x_neon(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
-RTCD_EXTERN void (*av1_jnt_convolve_x)(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+void av1_jnt_convolve_x_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+void av1_jnt_convolve_x_neon(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+RTCD_EXTERN void (*av1_jnt_convolve_x)(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);

-void av1_jnt_convolve_y_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
-void av1_jnt_convolve_y_neon(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
-RTCD_EXTERN void (*av1_jnt_convolve_y)(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+void av1_jnt_convolve_y_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+void av1_jnt_convolve_y_neon(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+RTCD_EXTERN void (*av1_jnt_convolve_y)(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);

 void av1_selfguided_restoration_c(const uint8_t *dgd8, int width, int height,
                                  int dgd_stride, int32_t *flt0, int32_t *flt1, int flt_stride,
                                  int sgr_params_idx, int bit_depth, int highbd);
-#define av1_selfguided_restoration av1_selfguided_restoration_c
+void av1_selfguided_restoration_neon(const uint8_t *dgd8, int width, int height,
+                                  int dgd_stride, int32_t *flt0, int32_t *flt1, int flt_stride,
+                                  int sgr_params_idx, int bit_depth, int highbd);
+RTCD_EXTERN void (*av1_selfguided_restoration)(const uint8_t *dgd8, int width, int height,
+                                  int dgd_stride, int32_t *flt0, int32_t *flt1, int flt_stride,
+                                  int sgr_params_idx, int bit_depth, int highbd);

 void av1_upsample_intra_edge_c(uint8_t *p, int sz);
 #define av1_upsample_intra_edge av1_upsample_intra_edge_c
@ -328,6 +335,8 @@ static void setup_rtcd_internal(void)

    (void)flags;

+    apply_selfguided_restoration = apply_selfguided_restoration_c;
+    if (flags & HAS_NEON) apply_selfguided_restoration = apply_selfguided_restoration_neon;
    av1_build_compound_diffwtd_mask_d16 = av1_build_compound_diffwtd_mask_d16_c;
    if (flags & HAS_NEON) av1_build_compound_diffwtd_mask_d16 = av1_build_compound_diffwtd_mask_d16_neon;
    av1_convolve_2d_copy_sr = av1_convolve_2d_copy_sr_c;
@ -338,6 +347,8 @@ static void setup_rtcd_internal(void)
    if (flags & HAS_NEON) av1_convolve_x_sr = av1_convolve_x_sr_neon;
    av1_convolve_y_sr = av1_convolve_y_sr_c;
    if (flags & HAS_NEON) av1_convolve_y_sr = av1_convolve_y_sr_neon;
+    av1_inv_txfm_add = av1_inv_txfm_add_c;
+    if (flags & HAS_NEON) av1_inv_txfm_add = av1_inv_txfm_add_neon;
    av1_jnt_convolve_2d = av1_jnt_convolve_2d_c;
    if (flags & HAS_NEON) av1_jnt_convolve_2d = av1_jnt_convolve_2d_neon;
    av1_jnt_convolve_2d_copy = av1_jnt_convolve_2d_copy_c;
@ -346,6 +357,8 @@ static void setup_rtcd_internal(void)
    if (flags & HAS_NEON) av1_jnt_convolve_x = av1_jnt_convolve_x_neon;
    av1_jnt_convolve_y = av1_jnt_convolve_y_c;
    if (flags & HAS_NEON) av1_jnt_convolve_y = av1_jnt_convolve_y_neon;
+    av1_selfguided_restoration = av1_selfguided_restoration_c;
+    if (flags & HAS_NEON) av1_selfguided_restoration = av1_selfguided_restoration_neon;
    av1_wiener_convolve_add_src = av1_wiener_convolve_add_src_c;
    if (flags & HAS_NEON) av1_wiener_convolve_add_src = av1_wiener_convolve_add_src_neon;
    cdef_filter_block = cdef_filter_block_c;
--- a/media/libaom/config/linux/ia32/config/aom_config.asm
+++ b/media/libaom/config/linux/ia32/config/aom_config.asm
@ -24,6 +24,7 @@ CONFIG_COEFFICIENT_RANGE_CHECKING equ 0
 CONFIG_COLLECT_INTER_MODE_RD_STATS equ 1
 CONFIG_COLLECT_RD_STATS equ 0
 CONFIG_DEBUG equ 0
+CONFIG_DENOISE equ 0
 CONFIG_DIST_8X8 equ 1
 CONFIG_ENTROPY_STATS equ 0
 CONFIG_FILEOPTIONS equ 1
--- a/media/libaom/config/linux/ia32/config/aom_config.h
+++ b/media/libaom/config/linux/ia32/config/aom_config.h
@ -26,6 +26,7 @@
 #define CONFIG_COLLECT_INTER_MODE_RD_STATS 1
 #define CONFIG_COLLECT_RD_STATS 0
 #define CONFIG_DEBUG 0
+#define CONFIG_DENOISE 0
 #define CONFIG_DIST_8X8 1
 #define CONFIG_ENTROPY_STATS 0
 #define CONFIG_FILEOPTIONS 1
--- a/media/libaom/config/linux/ia32/config/av1_rtcd.h
+++ b/media/libaom/config/linux/ia32/config/av1_rtcd.h
@ -69,33 +69,33 @@ void av1_build_compound_diffwtd_mask_highbd_ssse3(uint8_t *mask, DIFFWTD_MASK_TY
 void av1_build_compound_diffwtd_mask_highbd_avx2(uint8_t *mask, DIFFWTD_MASK_TYPE mask_type, const uint8_t *src0, int src0_stride, const uint8_t *src1, int src1_stride, int h, int w, int bd);
 RTCD_EXTERN void (*av1_build_compound_diffwtd_mask_highbd)(uint8_t *mask, DIFFWTD_MASK_TYPE mask_type, const uint8_t *src0, int src0_stride, const uint8_t *src1, int src1_stride, int h, int w, int bd);

-void av1_convolve_2d_copy_sr_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
-void av1_convolve_2d_copy_sr_sse2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
-void av1_convolve_2d_copy_sr_avx2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
-RTCD_EXTERN void (*av1_convolve_2d_copy_sr)(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+void av1_convolve_2d_copy_sr_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+void av1_convolve_2d_copy_sr_sse2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+void av1_convolve_2d_copy_sr_avx2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+RTCD_EXTERN void (*av1_convolve_2d_copy_sr)(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);

-void av1_convolve_2d_scale_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_qn, const int x_step_qn, const int subpel_y_q4, const int y_step_qn, ConvolveParams *conv_params);
-void av1_convolve_2d_scale_sse4_1(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_qn, const int x_step_qn, const int subpel_y_q4, const int y_step_qn, ConvolveParams *conv_params);
-RTCD_EXTERN void (*av1_convolve_2d_scale)(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_qn, const int x_step_qn, const int subpel_y_q4, const int y_step_qn, ConvolveParams *conv_params);
+void av1_convolve_2d_scale_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int x_step_qn, const int subpel_y_q4, const int y_step_qn, ConvolveParams *conv_params);
+void av1_convolve_2d_scale_sse4_1(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int x_step_qn, const int subpel_y_q4, const int y_step_qn, ConvolveParams *conv_params);
+RTCD_EXTERN void (*av1_convolve_2d_scale)(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int x_step_qn, const int subpel_y_q4, const int y_step_qn, ConvolveParams *conv_params);

-void av1_convolve_2d_sr_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
-void av1_convolve_2d_sr_sse2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
-void av1_convolve_2d_sr_avx2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
-RTCD_EXTERN void (*av1_convolve_2d_sr)(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+void av1_convolve_2d_sr_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+void av1_convolve_2d_sr_sse2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+void av1_convolve_2d_sr_avx2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+RTCD_EXTERN void (*av1_convolve_2d_sr)(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);

 void av1_convolve_horiz_rs_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const int16_t *x_filters, int x0_qn, int x_step_qn);
 void av1_convolve_horiz_rs_sse4_1(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const int16_t *x_filters, int x0_qn, int x_step_qn);
 RTCD_EXTERN void (*av1_convolve_horiz_rs)(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const int16_t *x_filters, int x0_qn, int x_step_qn);

-void av1_convolve_x_sr_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
-void av1_convolve_x_sr_sse2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
-void av1_convolve_x_sr_avx2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
-RTCD_EXTERN void (*av1_convolve_x_sr)(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+void av1_convolve_x_sr_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+void av1_convolve_x_sr_sse2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+void av1_convolve_x_sr_avx2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+RTCD_EXTERN void (*av1_convolve_x_sr)(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);

-void av1_convolve_y_sr_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
-void av1_convolve_y_sr_sse2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
-void av1_convolve_y_sr_avx2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
-RTCD_EXTERN void (*av1_convolve_y_sr)(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+void av1_convolve_y_sr_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+void av1_convolve_y_sr_sse2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+void av1_convolve_y_sr_avx2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+RTCD_EXTERN void (*av1_convolve_y_sr)(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);

 void av1_dr_prediction_z1_c(uint8_t *dst, ptrdiff_t stride, int bw, int bh, const uint8_t *above, const uint8_t *left, int upsample_above, int dx, int dy);
 #define av1_dr_prediction_z1 av1_dr_prediction_z1_c
@ -127,19 +127,19 @@ void av1_highbd_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint
 void av1_highbd_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
 #define av1_highbd_convolve8_vert av1_highbd_convolve8_vert_c

-void av1_highbd_convolve_2d_copy_sr_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
-void av1_highbd_convolve_2d_copy_sr_sse2(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
-void av1_highbd_convolve_2d_copy_sr_avx2(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
-RTCD_EXTERN void (*av1_highbd_convolve_2d_copy_sr)(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+void av1_highbd_convolve_2d_copy_sr_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+void av1_highbd_convolve_2d_copy_sr_sse2(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+void av1_highbd_convolve_2d_copy_sr_avx2(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+RTCD_EXTERN void (*av1_highbd_convolve_2d_copy_sr)(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);

-void av1_highbd_convolve_2d_scale_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int x_step_qn, const int subpel_y_q4, const int y_step_qn, ConvolveParams *conv_params, int bd);
-void av1_highbd_convolve_2d_scale_sse4_1(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int x_step_qn, const int subpel_y_q4, const int y_step_qn, ConvolveParams *conv_params, int bd);
-RTCD_EXTERN void (*av1_highbd_convolve_2d_scale)(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int x_step_qn, const int subpel_y_q4, const int y_step_qn, ConvolveParams *conv_params, int bd);
+void av1_highbd_convolve_2d_scale_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int x_step_qn, const int subpel_y_q4, const int y_step_qn, ConvolveParams *conv_params, int bd);
+void av1_highbd_convolve_2d_scale_sse4_1(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int x_step_qn, const int subpel_y_q4, const int y_step_qn, ConvolveParams *conv_params, int bd);
+RTCD_EXTERN void (*av1_highbd_convolve_2d_scale)(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int x_step_qn, const int subpel_y_q4, const int y_step_qn, ConvolveParams *conv_params, int bd);

-void av1_highbd_convolve_2d_sr_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
-void av1_highbd_convolve_2d_sr_ssse3(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
-void av1_highbd_convolve_2d_sr_avx2(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
-RTCD_EXTERN void (*av1_highbd_convolve_2d_sr)(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+void av1_highbd_convolve_2d_sr_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+void av1_highbd_convolve_2d_sr_ssse3(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+void av1_highbd_convolve_2d_sr_avx2(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+RTCD_EXTERN void (*av1_highbd_convolve_2d_sr)(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);

 void av1_highbd_convolve_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
 #define av1_highbd_convolve_avg av1_highbd_convolve_avg_c
@ -151,15 +151,15 @@ void av1_highbd_convolve_horiz_rs_c(const uint16_t *src, int src_stride, uint16_
 void av1_highbd_convolve_horiz_rs_sse4_1(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const int16_t *x_filters, int x0_qn, int x_step_qn, int bd);
 RTCD_EXTERN void (*av1_highbd_convolve_horiz_rs)(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const int16_t *x_filters, int x0_qn, int x_step_qn, int bd);

-void av1_highbd_convolve_x_sr_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
-void av1_highbd_convolve_x_sr_ssse3(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
-void av1_highbd_convolve_x_sr_avx2(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
-RTCD_EXTERN void (*av1_highbd_convolve_x_sr)(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+void av1_highbd_convolve_x_sr_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+void av1_highbd_convolve_x_sr_ssse3(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+void av1_highbd_convolve_x_sr_avx2(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+RTCD_EXTERN void (*av1_highbd_convolve_x_sr)(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);

-void av1_highbd_convolve_y_sr_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
-void av1_highbd_convolve_y_sr_ssse3(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
-void av1_highbd_convolve_y_sr_avx2(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
-RTCD_EXTERN void (*av1_highbd_convolve_y_sr)(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+void av1_highbd_convolve_y_sr_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+void av1_highbd_convolve_y_sr_ssse3(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+void av1_highbd_convolve_y_sr_avx2(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+RTCD_EXTERN void (*av1_highbd_convolve_y_sr)(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);

 void av1_highbd_dr_prediction_z1_c(uint16_t *dst, ptrdiff_t stride, int bw, int bh, const uint16_t *above, const uint16_t *left, int upsample_above, int dx, int dy, int bd);
 #define av1_highbd_dr_prediction_z1 av1_highbd_dr_prediction_z1_c
@ -176,25 +176,25 @@ void av1_highbd_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int des
 void av1_highbd_iwht4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd);
 #define av1_highbd_iwht4x4_1_add av1_highbd_iwht4x4_1_add_c

-void av1_highbd_jnt_convolve_2d_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
-void av1_highbd_jnt_convolve_2d_sse4_1(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
-void av1_highbd_jnt_convolve_2d_avx2(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
-RTCD_EXTERN void (*av1_highbd_jnt_convolve_2d)(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+void av1_highbd_jnt_convolve_2d_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+void av1_highbd_jnt_convolve_2d_sse4_1(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+void av1_highbd_jnt_convolve_2d_avx2(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+RTCD_EXTERN void (*av1_highbd_jnt_convolve_2d)(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);

-void av1_highbd_jnt_convolve_2d_copy_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
-void av1_highbd_jnt_convolve_2d_copy_sse4_1(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
-void av1_highbd_jnt_convolve_2d_copy_avx2(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
-RTCD_EXTERN void (*av1_highbd_jnt_convolve_2d_copy)(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+void av1_highbd_jnt_convolve_2d_copy_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+void av1_highbd_jnt_convolve_2d_copy_sse4_1(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+void av1_highbd_jnt_convolve_2d_copy_avx2(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+RTCD_EXTERN void (*av1_highbd_jnt_convolve_2d_copy)(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);

-void av1_highbd_jnt_convolve_x_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
-void av1_highbd_jnt_convolve_x_sse4_1(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
-void av1_highbd_jnt_convolve_x_avx2(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
-RTCD_EXTERN void (*av1_highbd_jnt_convolve_x)(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+void av1_highbd_jnt_convolve_x_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+void av1_highbd_jnt_convolve_x_sse4_1(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+void av1_highbd_jnt_convolve_x_avx2(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+RTCD_EXTERN void (*av1_highbd_jnt_convolve_x)(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);

-void av1_highbd_jnt_convolve_y_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
-void av1_highbd_jnt_convolve_y_sse4_1(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
-void av1_highbd_jnt_convolve_y_avx2(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
-RTCD_EXTERN void (*av1_highbd_jnt_convolve_y)(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+void av1_highbd_jnt_convolve_y_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+void av1_highbd_jnt_convolve_y_sse4_1(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+void av1_highbd_jnt_convolve_y_avx2(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+RTCD_EXTERN void (*av1_highbd_jnt_convolve_y)(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);

 void av1_highbd_warp_affine_c(const int32_t *mat, const uint16_t *ref, int width, int height, int stride, uint16_t *pred, int p_col, int p_row, int p_width, int p_height, int p_stride, int subsampling_x, int subsampling_y, int bd, ConvolveParams *conv_params, int16_t alpha, int16_t beta, int16_t gamma, int16_t delta);
 void av1_highbd_warp_affine_sse4_1(const int32_t *mat, const uint16_t *ref, int width, int height, int stride, uint16_t *pred, int p_col, int p_row, int p_width, int p_height, int p_stride, int subsampling_x, int subsampling_y, int bd, ConvolveParams *conv_params, int16_t alpha, int16_t beta, int16_t gamma, int16_t delta);
@ -272,25 +272,25 @@ void av1_inv_txfm_add_ssse3(const tran_low_t *dqcoeff, uint8_t *dst, int stride,
 void av1_inv_txfm_add_avx2(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param);
 RTCD_EXTERN void (*av1_inv_txfm_add)(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param);

-void av1_jnt_convolve_2d_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
-void av1_jnt_convolve_2d_ssse3(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
-void av1_jnt_convolve_2d_avx2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
-RTCD_EXTERN void (*av1_jnt_convolve_2d)(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+void av1_jnt_convolve_2d_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+void av1_jnt_convolve_2d_ssse3(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+void av1_jnt_convolve_2d_avx2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+RTCD_EXTERN void (*av1_jnt_convolve_2d)(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);

-void av1_jnt_convolve_2d_copy_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
-void av1_jnt_convolve_2d_copy_sse2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
-void av1_jnt_convolve_2d_copy_avx2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
-RTCD_EXTERN void (*av1_jnt_convolve_2d_copy)(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+void av1_jnt_convolve_2d_copy_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+void av1_jnt_convolve_2d_copy_sse2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+void av1_jnt_convolve_2d_copy_avx2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+RTCD_EXTERN void (*av1_jnt_convolve_2d_copy)(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);

-void av1_jnt_convolve_x_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
-void av1_jnt_convolve_x_sse2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
-void av1_jnt_convolve_x_avx2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
-RTCD_EXTERN void (*av1_jnt_convolve_x)(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+void av1_jnt_convolve_x_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+void av1_jnt_convolve_x_sse2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+void av1_jnt_convolve_x_avx2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+RTCD_EXTERN void (*av1_jnt_convolve_x)(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);

-void av1_jnt_convolve_y_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
-void av1_jnt_convolve_y_sse2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
-void av1_jnt_convolve_y_avx2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
-RTCD_EXTERN void (*av1_jnt_convolve_y)(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+void av1_jnt_convolve_y_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+void av1_jnt_convolve_y_sse2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+void av1_jnt_convolve_y_avx2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+RTCD_EXTERN void (*av1_jnt_convolve_y)(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);

 void av1_selfguided_restoration_c(const uint8_t *dgd8, int width, int height,
                                  int dgd_stride, int32_t *flt0, int32_t *flt1, int flt_stride,
--- a/media/libaom/config/linux/x64/config/aom_config.asm
+++ b/media/libaom/config/linux/x64/config/aom_config.asm
@ -24,6 +24,7 @@ CONFIG_COEFFICIENT_RANGE_CHECKING equ 0
 CONFIG_COLLECT_INTER_MODE_RD_STATS equ 1
 CONFIG_COLLECT_RD_STATS equ 0
 CONFIG_DEBUG equ 0
+CONFIG_DENOISE equ 0
 CONFIG_DIST_8X8 equ 1
 CONFIG_ENTROPY_STATS equ 0
 CONFIG_FILEOPTIONS equ 1
--- a/media/libaom/config/linux/x64/config/aom_config.h
+++ b/media/libaom/config/linux/x64/config/aom_config.h
@ -26,6 +26,7 @@
 #define CONFIG_COLLECT_INTER_MODE_RD_STATS 1
 #define CONFIG_COLLECT_RD_STATS 0
 #define CONFIG_DEBUG 0
+#define CONFIG_DENOISE 0
 #define CONFIG_DIST_8X8 1
 #define CONFIG_ENTROPY_STATS 0
 #define CONFIG_FILEOPTIONS 1
--- a/media/libaom/config/linux/x64/config/av1_rtcd.h
+++ b/media/libaom/config/linux/x64/config/av1_rtcd.h
@ -69,33 +69,33 @@ void av1_build_compound_diffwtd_mask_highbd_ssse3(uint8_t *mask, DIFFWTD_MASK_TY
 void av1_build_compound_diffwtd_mask_highbd_avx2(uint8_t *mask, DIFFWTD_MASK_TYPE mask_type, const uint8_t *src0, int src0_stride, const uint8_t *src1, int src1_stride, int h, int w, int bd);
 RTCD_EXTERN void (*av1_build_compound_diffwtd_mask_highbd)(uint8_t *mask, DIFFWTD_MASK_TYPE mask_type, const uint8_t *src0, int src0_stride, const uint8_t *src1, int src1_stride, int h, int w, int bd);

-void av1_convolve_2d_copy_sr_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
-void av1_convolve_2d_copy_sr_sse2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
-void av1_convolve_2d_copy_sr_avx2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
-RTCD_EXTERN void (*av1_convolve_2d_copy_sr)(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+void av1_convolve_2d_copy_sr_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+void av1_convolve_2d_copy_sr_sse2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+void av1_convolve_2d_copy_sr_avx2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+RTCD_EXTERN void (*av1_convolve_2d_copy_sr)(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);

-void av1_convolve_2d_scale_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_qn, const int x_step_qn, const int subpel_y_q4, const int y_step_qn, ConvolveParams *conv_params);
-void av1_convolve_2d_scale_sse4_1(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_qn, const int x_step_qn, const int subpel_y_q4, const int y_step_qn, ConvolveParams *conv_params);
-RTCD_EXTERN void (*av1_convolve_2d_scale)(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_qn, const int x_step_qn, const int subpel_y_q4, const int y_step_qn, ConvolveParams *conv_params);
+void av1_convolve_2d_scale_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int x_step_qn, const int subpel_y_q4, const int y_step_qn, ConvolveParams *conv_params);
+void av1_convolve_2d_scale_sse4_1(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int x_step_qn, const int subpel_y_q4, const int y_step_qn, ConvolveParams *conv_params);
+RTCD_EXTERN void (*av1_convolve_2d_scale)(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int x_step_qn, const int subpel_y_q4, const int y_step_qn, ConvolveParams *conv_params);

-void av1_convolve_2d_sr_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
-void av1_convolve_2d_sr_sse2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
-void av1_convolve_2d_sr_avx2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
-RTCD_EXTERN void (*av1_convolve_2d_sr)(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+void av1_convolve_2d_sr_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+void av1_convolve_2d_sr_sse2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+void av1_convolve_2d_sr_avx2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+RTCD_EXTERN void (*av1_convolve_2d_sr)(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);

 void av1_convolve_horiz_rs_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const int16_t *x_filters, int x0_qn, int x_step_qn);
 void av1_convolve_horiz_rs_sse4_1(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const int16_t *x_filters, int x0_qn, int x_step_qn);
 RTCD_EXTERN void (*av1_convolve_horiz_rs)(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const int16_t *x_filters, int x0_qn, int x_step_qn);

-void av1_convolve_x_sr_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
-void av1_convolve_x_sr_sse2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
-void av1_convolve_x_sr_avx2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
-RTCD_EXTERN void (*av1_convolve_x_sr)(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+void av1_convolve_x_sr_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+void av1_convolve_x_sr_sse2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+void av1_convolve_x_sr_avx2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+RTCD_EXTERN void (*av1_convolve_x_sr)(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);

-void av1_convolve_y_sr_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
-void av1_convolve_y_sr_sse2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
-void av1_convolve_y_sr_avx2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
-RTCD_EXTERN void (*av1_convolve_y_sr)(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+void av1_convolve_y_sr_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+void av1_convolve_y_sr_sse2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+void av1_convolve_y_sr_avx2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+RTCD_EXTERN void (*av1_convolve_y_sr)(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);

 void av1_dr_prediction_z1_c(uint8_t *dst, ptrdiff_t stride, int bw, int bh, const uint8_t *above, const uint8_t *left, int upsample_above, int dx, int dy);
 #define av1_dr_prediction_z1 av1_dr_prediction_z1_c
@ -130,19 +130,19 @@ void av1_highbd_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8
 void av1_highbd_convolve8_vert_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
 #define av1_highbd_convolve8_vert av1_highbd_convolve8_vert_sse2

-void av1_highbd_convolve_2d_copy_sr_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
-void av1_highbd_convolve_2d_copy_sr_sse2(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
-void av1_highbd_convolve_2d_copy_sr_avx2(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
-RTCD_EXTERN void (*av1_highbd_convolve_2d_copy_sr)(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+void av1_highbd_convolve_2d_copy_sr_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+void av1_highbd_convolve_2d_copy_sr_sse2(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+void av1_highbd_convolve_2d_copy_sr_avx2(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+RTCD_EXTERN void (*av1_highbd_convolve_2d_copy_sr)(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);

-void av1_highbd_convolve_2d_scale_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int x_step_qn, const int subpel_y_q4, const int y_step_qn, ConvolveParams *conv_params, int bd);
-void av1_highbd_convolve_2d_scale_sse4_1(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int x_step_qn, const int subpel_y_q4, const int y_step_qn, ConvolveParams *conv_params, int bd);
-RTCD_EXTERN void (*av1_highbd_convolve_2d_scale)(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int x_step_qn, const int subpel_y_q4, const int y_step_qn, ConvolveParams *conv_params, int bd);
+void av1_highbd_convolve_2d_scale_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int x_step_qn, const int subpel_y_q4, const int y_step_qn, ConvolveParams *conv_params, int bd);
+void av1_highbd_convolve_2d_scale_sse4_1(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int x_step_qn, const int subpel_y_q4, const int y_step_qn, ConvolveParams *conv_params, int bd);
+RTCD_EXTERN void (*av1_highbd_convolve_2d_scale)(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int x_step_qn, const int subpel_y_q4, const int y_step_qn, ConvolveParams *conv_params, int bd);

-void av1_highbd_convolve_2d_sr_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
-void av1_highbd_convolve_2d_sr_ssse3(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
-void av1_highbd_convolve_2d_sr_avx2(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
-RTCD_EXTERN void (*av1_highbd_convolve_2d_sr)(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+void av1_highbd_convolve_2d_sr_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+void av1_highbd_convolve_2d_sr_ssse3(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+void av1_highbd_convolve_2d_sr_avx2(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+RTCD_EXTERN void (*av1_highbd_convolve_2d_sr)(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);

 void av1_highbd_convolve_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
 #define av1_highbd_convolve_avg av1_highbd_convolve_avg_c
@ -154,15 +154,15 @@ void av1_highbd_convolve_horiz_rs_c(const uint16_t *src, int src_stride, uint16_
 void av1_highbd_convolve_horiz_rs_sse4_1(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const int16_t *x_filters, int x0_qn, int x_step_qn, int bd);
 RTCD_EXTERN void (*av1_highbd_convolve_horiz_rs)(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const int16_t *x_filters, int x0_qn, int x_step_qn, int bd);

-void av1_highbd_convolve_x_sr_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
-void av1_highbd_convolve_x_sr_ssse3(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
-void av1_highbd_convolve_x_sr_avx2(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
-RTCD_EXTERN void (*av1_highbd_convolve_x_sr)(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+void av1_highbd_convolve_x_sr_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+void av1_highbd_convolve_x_sr_ssse3(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+void av1_highbd_convolve_x_sr_avx2(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+RTCD_EXTERN void (*av1_highbd_convolve_x_sr)(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);

-void av1_highbd_convolve_y_sr_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
-void av1_highbd_convolve_y_sr_ssse3(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
-void av1_highbd_convolve_y_sr_avx2(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
-RTCD_EXTERN void (*av1_highbd_convolve_y_sr)(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+void av1_highbd_convolve_y_sr_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+void av1_highbd_convolve_y_sr_ssse3(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+void av1_highbd_convolve_y_sr_avx2(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+RTCD_EXTERN void (*av1_highbd_convolve_y_sr)(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);

 void av1_highbd_dr_prediction_z1_c(uint16_t *dst, ptrdiff_t stride, int bw, int bh, const uint16_t *above, const uint16_t *left, int upsample_above, int dx, int dy, int bd);
 #define av1_highbd_dr_prediction_z1 av1_highbd_dr_prediction_z1_c
@ -179,25 +179,25 @@ void av1_highbd_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int des
 void av1_highbd_iwht4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd);
 #define av1_highbd_iwht4x4_1_add av1_highbd_iwht4x4_1_add_c

-void av1_highbd_jnt_convolve_2d_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
-void av1_highbd_jnt_convolve_2d_sse4_1(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
-void av1_highbd_jnt_convolve_2d_avx2(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
-RTCD_EXTERN void (*av1_highbd_jnt_convolve_2d)(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+void av1_highbd_jnt_convolve_2d_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+void av1_highbd_jnt_convolve_2d_sse4_1(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+void av1_highbd_jnt_convolve_2d_avx2(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+RTCD_EXTERN void (*av1_highbd_jnt_convolve_2d)(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);

-void av1_highbd_jnt_convolve_2d_copy_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
-void av1_highbd_jnt_convolve_2d_copy_sse4_1(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
-void av1_highbd_jnt_convolve_2d_copy_avx2(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
-RTCD_EXTERN void (*av1_highbd_jnt_convolve_2d_copy)(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+void av1_highbd_jnt_convolve_2d_copy_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+void av1_highbd_jnt_convolve_2d_copy_sse4_1(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+void av1_highbd_jnt_convolve_2d_copy_avx2(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+RTCD_EXTERN void (*av1_highbd_jnt_convolve_2d_copy)(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);

-void av1_highbd_jnt_convolve_x_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
-void av1_highbd_jnt_convolve_x_sse4_1(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
-void av1_highbd_jnt_convolve_x_avx2(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
-RTCD_EXTERN void (*av1_highbd_jnt_convolve_x)(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+void av1_highbd_jnt_convolve_x_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+void av1_highbd_jnt_convolve_x_sse4_1(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+void av1_highbd_jnt_convolve_x_avx2(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+RTCD_EXTERN void (*av1_highbd_jnt_convolve_x)(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);

-void av1_highbd_jnt_convolve_y_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
-void av1_highbd_jnt_convolve_y_sse4_1(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
-void av1_highbd_jnt_convolve_y_avx2(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
-RTCD_EXTERN void (*av1_highbd_jnt_convolve_y)(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+void av1_highbd_jnt_convolve_y_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+void av1_highbd_jnt_convolve_y_sse4_1(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+void av1_highbd_jnt_convolve_y_avx2(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+RTCD_EXTERN void (*av1_highbd_jnt_convolve_y)(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);

 void av1_highbd_warp_affine_c(const int32_t *mat, const uint16_t *ref, int width, int height, int stride, uint16_t *pred, int p_col, int p_row, int p_width, int p_height, int p_stride, int subsampling_x, int subsampling_y, int bd, ConvolveParams *conv_params, int16_t alpha, int16_t beta, int16_t gamma, int16_t delta);
 void av1_highbd_warp_affine_sse4_1(const int32_t *mat, const uint16_t *ref, int width, int height, int stride, uint16_t *pred, int p_col, int p_row, int p_width, int p_height, int p_stride, int subsampling_x, int subsampling_y, int bd, ConvolveParams *conv_params, int16_t alpha, int16_t beta, int16_t gamma, int16_t delta);
@ -275,25 +275,25 @@ void av1_inv_txfm_add_ssse3(const tran_low_t *dqcoeff, uint8_t *dst, int stride,
 void av1_inv_txfm_add_avx2(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param);
 RTCD_EXTERN void (*av1_inv_txfm_add)(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param);

-void av1_jnt_convolve_2d_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
-void av1_jnt_convolve_2d_ssse3(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
-void av1_jnt_convolve_2d_avx2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
-RTCD_EXTERN void (*av1_jnt_convolve_2d)(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+void av1_jnt_convolve_2d_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+void av1_jnt_convolve_2d_ssse3(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+void av1_jnt_convolve_2d_avx2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+RTCD_EXTERN void (*av1_jnt_convolve_2d)(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);

-void av1_jnt_convolve_2d_copy_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
-void av1_jnt_convolve_2d_copy_sse2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
-void av1_jnt_convolve_2d_copy_avx2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
-RTCD_EXTERN void (*av1_jnt_convolve_2d_copy)(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+void av1_jnt_convolve_2d_copy_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+void av1_jnt_convolve_2d_copy_sse2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+void av1_jnt_convolve_2d_copy_avx2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+RTCD_EXTERN void (*av1_jnt_convolve_2d_copy)(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);

-void av1_jnt_convolve_x_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
-void av1_jnt_convolve_x_sse2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
-void av1_jnt_convolve_x_avx2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
-RTCD_EXTERN void (*av1_jnt_convolve_x)(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+void av1_jnt_convolve_x_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+void av1_jnt_convolve_x_sse2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+void av1_jnt_convolve_x_avx2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+RTCD_EXTERN void (*av1_jnt_convolve_x)(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);

-void av1_jnt_convolve_y_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
-void av1_jnt_convolve_y_sse2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
-void av1_jnt_convolve_y_avx2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
-RTCD_EXTERN void (*av1_jnt_convolve_y)(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+void av1_jnt_convolve_y_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+void av1_jnt_convolve_y_sse2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+void av1_jnt_convolve_y_avx2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+RTCD_EXTERN void (*av1_jnt_convolve_y)(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);

 void av1_selfguided_restoration_c(const uint8_t *dgd8, int width, int height,
                                  int dgd_stride, int32_t *flt0, int32_t *flt1, int flt_stride,
--- a/media/libaom/config/mac/x64/config/aom_config.asm
+++ b/media/libaom/config/mac/x64/config/aom_config.asm
@ -24,6 +24,7 @@ CONFIG_COEFFICIENT_RANGE_CHECKING equ 0
 CONFIG_COLLECT_INTER_MODE_RD_STATS equ 1
 CONFIG_COLLECT_RD_STATS equ 0
 CONFIG_DEBUG equ 0
+CONFIG_DENOISE equ 0
 CONFIG_DIST_8X8 equ 1
 CONFIG_ENTROPY_STATS equ 0
 CONFIG_FILEOPTIONS equ 1
--- a/media/libaom/config/mac/x64/config/aom_config.h
+++ b/media/libaom/config/mac/x64/config/aom_config.h
@ -26,6 +26,7 @@
 #define CONFIG_COLLECT_INTER_MODE_RD_STATS 1
 #define CONFIG_COLLECT_RD_STATS 0
 #define CONFIG_DEBUG 0
+#define CONFIG_DENOISE 0
 #define CONFIG_DIST_8X8 1
 #define CONFIG_ENTROPY_STATS 0
 #define CONFIG_FILEOPTIONS 1
--- a/media/libaom/config/mac/x64/config/av1_rtcd.h
+++ b/media/libaom/config/mac/x64/config/av1_rtcd.h
@ -69,33 +69,33 @@ void av1_build_compound_diffwtd_mask_highbd_ssse3(uint8_t *mask, DIFFWTD_MASK_TY
 void av1_build_compound_diffwtd_mask_highbd_avx2(uint8_t *mask, DIFFWTD_MASK_TYPE mask_type, const uint8_t *src0, int src0_stride, const uint8_t *src1, int src1_stride, int h, int w, int bd);
 RTCD_EXTERN void (*av1_build_compound_diffwtd_mask_highbd)(uint8_t *mask, DIFFWTD_MASK_TYPE mask_type, const uint8_t *src0, int src0_stride, const uint8_t *src1, int src1_stride, int h, int w, int bd);

-void av1_convolve_2d_copy_sr_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
-void av1_convolve_2d_copy_sr_sse2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
-void av1_convolve_2d_copy_sr_avx2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
-RTCD_EXTERN void (*av1_convolve_2d_copy_sr)(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+void av1_convolve_2d_copy_sr_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+void av1_convolve_2d_copy_sr_sse2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+void av1_convolve_2d_copy_sr_avx2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+RTCD_EXTERN void (*av1_convolve_2d_copy_sr)(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);

-void av1_convolve_2d_scale_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_qn, const int x_step_qn, const int subpel_y_q4, const int y_step_qn, ConvolveParams *conv_params);
-void av1_convolve_2d_scale_sse4_1(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_qn, const int x_step_qn, const int subpel_y_q4, const int y_step_qn, ConvolveParams *conv_params);
-RTCD_EXTERN void (*av1_convolve_2d_scale)(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_qn, const int x_step_qn, const int subpel_y_q4, const int y_step_qn, ConvolveParams *conv_params);
+void av1_convolve_2d_scale_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int x_step_qn, const int subpel_y_q4, const int y_step_qn, ConvolveParams *conv_params);
+void av1_convolve_2d_scale_sse4_1(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int x_step_qn, const int subpel_y_q4, const int y_step_qn, ConvolveParams *conv_params);
+RTCD_EXTERN void (*av1_convolve_2d_scale)(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int x_step_qn, const int subpel_y_q4, const int y_step_qn, ConvolveParams *conv_params);

-void av1_convolve_2d_sr_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
-void av1_convolve_2d_sr_sse2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
-void av1_convolve_2d_sr_avx2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
-RTCD_EXTERN void (*av1_convolve_2d_sr)(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+void av1_convolve_2d_sr_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+void av1_convolve_2d_sr_sse2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+void av1_convolve_2d_sr_avx2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+RTCD_EXTERN void (*av1_convolve_2d_sr)(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);

 void av1_convolve_horiz_rs_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const int16_t *x_filters, int x0_qn, int x_step_qn);
 void av1_convolve_horiz_rs_sse4_1(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const int16_t *x_filters, int x0_qn, int x_step_qn);
 RTCD_EXTERN void (*av1_convolve_horiz_rs)(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const int16_t *x_filters, int x0_qn, int x_step_qn);

-void av1_convolve_x_sr_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
-void av1_convolve_x_sr_sse2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
-void av1_convolve_x_sr_avx2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
-RTCD_EXTERN void (*av1_convolve_x_sr)(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+void av1_convolve_x_sr_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+void av1_convolve_x_sr_sse2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+void av1_convolve_x_sr_avx2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+RTCD_EXTERN void (*av1_convolve_x_sr)(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);

-void av1_convolve_y_sr_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
-void av1_convolve_y_sr_sse2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
-void av1_convolve_y_sr_avx2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
-RTCD_EXTERN void (*av1_convolve_y_sr)(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+void av1_convolve_y_sr_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+void av1_convolve_y_sr_sse2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+void av1_convolve_y_sr_avx2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+RTCD_EXTERN void (*av1_convolve_y_sr)(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);

 void av1_dr_prediction_z1_c(uint8_t *dst, ptrdiff_t stride, int bw, int bh, const uint8_t *above, const uint8_t *left, int upsample_above, int dx, int dy);
 #define av1_dr_prediction_z1 av1_dr_prediction_z1_c
@ -130,19 +130,19 @@ void av1_highbd_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8
 void av1_highbd_convolve8_vert_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
 #define av1_highbd_convolve8_vert av1_highbd_convolve8_vert_sse2

-void av1_highbd_convolve_2d_copy_sr_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
-void av1_highbd_convolve_2d_copy_sr_sse2(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
-void av1_highbd_convolve_2d_copy_sr_avx2(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
-RTCD_EXTERN void (*av1_highbd_convolve_2d_copy_sr)(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+void av1_highbd_convolve_2d_copy_sr_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+void av1_highbd_convolve_2d_copy_sr_sse2(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+void av1_highbd_convolve_2d_copy_sr_avx2(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+RTCD_EXTERN void (*av1_highbd_convolve_2d_copy_sr)(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);

-void av1_highbd_convolve_2d_scale_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int x_step_qn, const int subpel_y_q4, const int y_step_qn, ConvolveParams *conv_params, int bd);
-void av1_highbd_convolve_2d_scale_sse4_1(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int x_step_qn, const int subpel_y_q4, const int y_step_qn, ConvolveParams *conv_params, int bd);
-RTCD_EXTERN void (*av1_highbd_convolve_2d_scale)(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int x_step_qn, const int subpel_y_q4, const int y_step_qn, ConvolveParams *conv_params, int bd);
+void av1_highbd_convolve_2d_scale_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int x_step_qn, const int subpel_y_q4, const int y_step_qn, ConvolveParams *conv_params, int bd);
+void av1_highbd_convolve_2d_scale_sse4_1(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int x_step_qn, const int subpel_y_q4, const int y_step_qn, ConvolveParams *conv_params, int bd);
+RTCD_EXTERN void (*av1_highbd_convolve_2d_scale)(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int x_step_qn, const int subpel_y_q4, const int y_step_qn, ConvolveParams *conv_params, int bd);

-void av1_highbd_convolve_2d_sr_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
-void av1_highbd_convolve_2d_sr_ssse3(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
-void av1_highbd_convolve_2d_sr_avx2(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
-RTCD_EXTERN void (*av1_highbd_convolve_2d_sr)(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+void av1_highbd_convolve_2d_sr_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+void av1_highbd_convolve_2d_sr_ssse3(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+void av1_highbd_convolve_2d_sr_avx2(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+RTCD_EXTERN void (*av1_highbd_convolve_2d_sr)(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);

 void av1_highbd_convolve_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
 #define av1_highbd_convolve_avg av1_highbd_convolve_avg_c
@ -154,15 +154,15 @@ void av1_highbd_convolve_horiz_rs_c(const uint16_t *src, int src_stride, uint16_
 void av1_highbd_convolve_horiz_rs_sse4_1(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const int16_t *x_filters, int x0_qn, int x_step_qn, int bd);
 RTCD_EXTERN void (*av1_highbd_convolve_horiz_rs)(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const int16_t *x_filters, int x0_qn, int x_step_qn, int bd);

-void av1_highbd_convolve_x_sr_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
-void av1_highbd_convolve_x_sr_ssse3(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
-void av1_highbd_convolve_x_sr_avx2(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
-RTCD_EXTERN void (*av1_highbd_convolve_x_sr)(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+void av1_highbd_convolve_x_sr_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+void av1_highbd_convolve_x_sr_ssse3(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+void av1_highbd_convolve_x_sr_avx2(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+RTCD_EXTERN void (*av1_highbd_convolve_x_sr)(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);

-void av1_highbd_convolve_y_sr_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
-void av1_highbd_convolve_y_sr_ssse3(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
-void av1_highbd_convolve_y_sr_avx2(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
-RTCD_EXTERN void (*av1_highbd_convolve_y_sr)(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+void av1_highbd_convolve_y_sr_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+void av1_highbd_convolve_y_sr_ssse3(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+void av1_highbd_convolve_y_sr_avx2(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+RTCD_EXTERN void (*av1_highbd_convolve_y_sr)(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);

 void av1_highbd_dr_prediction_z1_c(uint16_t *dst, ptrdiff_t stride, int bw, int bh, const uint16_t *above, const uint16_t *left, int upsample_above, int dx, int dy, int bd);
 #define av1_highbd_dr_prediction_z1 av1_highbd_dr_prediction_z1_c
@ -179,25 +179,25 @@ void av1_highbd_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int des
 void av1_highbd_iwht4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd);
 #define av1_highbd_iwht4x4_1_add av1_highbd_iwht4x4_1_add_c

-void av1_highbd_jnt_convolve_2d_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
-void av1_highbd_jnt_convolve_2d_sse4_1(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
-void av1_highbd_jnt_convolve_2d_avx2(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
-RTCD_EXTERN void (*av1_highbd_jnt_convolve_2d)(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+void av1_highbd_jnt_convolve_2d_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+void av1_highbd_jnt_convolve_2d_sse4_1(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+void av1_highbd_jnt_convolve_2d_avx2(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+RTCD_EXTERN void (*av1_highbd_jnt_convolve_2d)(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);

-void av1_highbd_jnt_convolve_2d_copy_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
-void av1_highbd_jnt_convolve_2d_copy_sse4_1(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
-void av1_highbd_jnt_convolve_2d_copy_avx2(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
-RTCD_EXTERN void (*av1_highbd_jnt_convolve_2d_copy)(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+void av1_highbd_jnt_convolve_2d_copy_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+void av1_highbd_jnt_convolve_2d_copy_sse4_1(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+void av1_highbd_jnt_convolve_2d_copy_avx2(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+RTCD_EXTERN void (*av1_highbd_jnt_convolve_2d_copy)(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);

-void av1_highbd_jnt_convolve_x_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
-void av1_highbd_jnt_convolve_x_sse4_1(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
-void av1_highbd_jnt_convolve_x_avx2(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
-RTCD_EXTERN void (*av1_highbd_jnt_convolve_x)(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+void av1_highbd_jnt_convolve_x_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+void av1_highbd_jnt_convolve_x_sse4_1(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+void av1_highbd_jnt_convolve_x_avx2(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+RTCD_EXTERN void (*av1_highbd_jnt_convolve_x)(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);

-void av1_highbd_jnt_convolve_y_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
-void av1_highbd_jnt_convolve_y_sse4_1(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
-void av1_highbd_jnt_convolve_y_avx2(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
-RTCD_EXTERN void (*av1_highbd_jnt_convolve_y)(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+void av1_highbd_jnt_convolve_y_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+void av1_highbd_jnt_convolve_y_sse4_1(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+void av1_highbd_jnt_convolve_y_avx2(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+RTCD_EXTERN void (*av1_highbd_jnt_convolve_y)(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);

 void av1_highbd_warp_affine_c(const int32_t *mat, const uint16_t *ref, int width, int height, int stride, uint16_t *pred, int p_col, int p_row, int p_width, int p_height, int p_stride, int subsampling_x, int subsampling_y, int bd, ConvolveParams *conv_params, int16_t alpha, int16_t beta, int16_t gamma, int16_t delta);
 void av1_highbd_warp_affine_sse4_1(const int32_t *mat, const uint16_t *ref, int width, int height, int stride, uint16_t *pred, int p_col, int p_row, int p_width, int p_height, int p_stride, int subsampling_x, int subsampling_y, int bd, ConvolveParams *conv_params, int16_t alpha, int16_t beta, int16_t gamma, int16_t delta);
@ -275,25 +275,25 @@ void av1_inv_txfm_add_ssse3(const tran_low_t *dqcoeff, uint8_t *dst, int stride,
 void av1_inv_txfm_add_avx2(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param);
 RTCD_EXTERN void (*av1_inv_txfm_add)(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param);

-void av1_jnt_convolve_2d_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
-void av1_jnt_convolve_2d_ssse3(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
-void av1_jnt_convolve_2d_avx2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
-RTCD_EXTERN void (*av1_jnt_convolve_2d)(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+void av1_jnt_convolve_2d_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+void av1_jnt_convolve_2d_ssse3(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+void av1_jnt_convolve_2d_avx2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+RTCD_EXTERN void (*av1_jnt_convolve_2d)(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);

-void av1_jnt_convolve_2d_copy_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
-void av1_jnt_convolve_2d_copy_sse2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
-void av1_jnt_convolve_2d_copy_avx2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
-RTCD_EXTERN void (*av1_jnt_convolve_2d_copy)(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+void av1_jnt_convolve_2d_copy_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+void av1_jnt_convolve_2d_copy_sse2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+void av1_jnt_convolve_2d_copy_avx2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+RTCD_EXTERN void (*av1_jnt_convolve_2d_copy)(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);

-void av1_jnt_convolve_x_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
-void av1_jnt_convolve_x_sse2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
-void av1_jnt_convolve_x_avx2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
-RTCD_EXTERN void (*av1_jnt_convolve_x)(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+void av1_jnt_convolve_x_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+void av1_jnt_convolve_x_sse2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+void av1_jnt_convolve_x_avx2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+RTCD_EXTERN void (*av1_jnt_convolve_x)(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);

-void av1_jnt_convolve_y_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
-void av1_jnt_convolve_y_sse2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
-void av1_jnt_convolve_y_avx2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
-RTCD_EXTERN void (*av1_jnt_convolve_y)(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+void av1_jnt_convolve_y_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+void av1_jnt_convolve_y_sse2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+void av1_jnt_convolve_y_avx2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+RTCD_EXTERN void (*av1_jnt_convolve_y)(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);

 void av1_selfguided_restoration_c(const uint8_t *dgd8, int width, int height,
                                  int dgd_stride, int32_t *flt0, int32_t *flt1, int flt_stride,
--- a/media/libaom/config/win/ia32/config/aom_config.asm
+++ b/media/libaom/config/win/ia32/config/aom_config.asm
@ -24,6 +24,7 @@ CONFIG_COEFFICIENT_RANGE_CHECKING equ 0
 CONFIG_COLLECT_INTER_MODE_RD_STATS equ 1
 CONFIG_COLLECT_RD_STATS equ 0
 CONFIG_DEBUG equ 0
+CONFIG_DENOISE equ 0
 CONFIG_DIST_8X8 equ 1
 CONFIG_ENTROPY_STATS equ 0
 CONFIG_FILEOPTIONS equ 1
--- a/media/libaom/config/win/ia32/config/aom_config.h
+++ b/media/libaom/config/win/ia32/config/aom_config.h
@ -26,6 +26,7 @@
 #define CONFIG_COLLECT_INTER_MODE_RD_STATS 1
 #define CONFIG_COLLECT_RD_STATS 0
 #define CONFIG_DEBUG 0
+#define CONFIG_DENOISE 0
 #define CONFIG_DIST_8X8 1
 #define CONFIG_ENTROPY_STATS 0
 #define CONFIG_FILEOPTIONS 1
--- a/media/libaom/config/win/ia32/config/av1_rtcd.h
+++ b/media/libaom/config/win/ia32/config/av1_rtcd.h
@ -69,33 +69,33 @@ void av1_build_compound_diffwtd_mask_highbd_ssse3(uint8_t *mask, DIFFWTD_MASK_TY
 void av1_build_compound_diffwtd_mask_highbd_avx2(uint8_t *mask, DIFFWTD_MASK_TYPE mask_type, const uint8_t *src0, int src0_stride, const uint8_t *src1, int src1_stride, int h, int w, int bd);
 RTCD_EXTERN void (*av1_build_compound_diffwtd_mask_highbd)(uint8_t *mask, DIFFWTD_MASK_TYPE mask_type, const uint8_t *src0, int src0_stride, const uint8_t *src1, int src1_stride, int h, int w, int bd);

-void av1_convolve_2d_copy_sr_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
-void av1_convolve_2d_copy_sr_sse2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
-void av1_convolve_2d_copy_sr_avx2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
-RTCD_EXTERN void (*av1_convolve_2d_copy_sr)(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+void av1_convolve_2d_copy_sr_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+void av1_convolve_2d_copy_sr_sse2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+void av1_convolve_2d_copy_sr_avx2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+RTCD_EXTERN void (*av1_convolve_2d_copy_sr)(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);

-void av1_convolve_2d_scale_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_qn, const int x_step_qn, const int subpel_y_q4, const int y_step_qn, ConvolveParams *conv_params);
-void av1_convolve_2d_scale_sse4_1(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_qn, const int x_step_qn, const int subpel_y_q4, const int y_step_qn, ConvolveParams *conv_params);
-RTCD_EXTERN void (*av1_convolve_2d_scale)(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_qn, const int x_step_qn, const int subpel_y_q4, const int y_step_qn, ConvolveParams *conv_params);
+void av1_convolve_2d_scale_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int x_step_qn, const int subpel_y_q4, const int y_step_qn, ConvolveParams *conv_params);
+void av1_convolve_2d_scale_sse4_1(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int x_step_qn, const int subpel_y_q4, const int y_step_qn, ConvolveParams *conv_params);
+RTCD_EXTERN void (*av1_convolve_2d_scale)(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int x_step_qn, const int subpel_y_q4, const int y_step_qn, ConvolveParams *conv_params);

-void av1_convolve_2d_sr_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
-void av1_convolve_2d_sr_sse2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
-void av1_convolve_2d_sr_avx2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
-RTCD_EXTERN void (*av1_convolve_2d_sr)(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+void av1_convolve_2d_sr_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+void av1_convolve_2d_sr_sse2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+void av1_convolve_2d_sr_avx2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+RTCD_EXTERN void (*av1_convolve_2d_sr)(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);

 void av1_convolve_horiz_rs_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const int16_t *x_filters, int x0_qn, int x_step_qn);
 void av1_convolve_horiz_rs_sse4_1(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const int16_t *x_filters, int x0_qn, int x_step_qn);
 RTCD_EXTERN void (*av1_convolve_horiz_rs)(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const int16_t *x_filters, int x0_qn, int x_step_qn);

-void av1_convolve_x_sr_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
-void av1_convolve_x_sr_sse2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
-void av1_convolve_x_sr_avx2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
-RTCD_EXTERN void (*av1_convolve_x_sr)(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+void av1_convolve_x_sr_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+void av1_convolve_x_sr_sse2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+void av1_convolve_x_sr_avx2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+RTCD_EXTERN void (*av1_convolve_x_sr)(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);

-void av1_convolve_y_sr_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
-void av1_convolve_y_sr_sse2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
-void av1_convolve_y_sr_avx2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
-RTCD_EXTERN void (*av1_convolve_y_sr)(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+void av1_convolve_y_sr_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+void av1_convolve_y_sr_sse2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+void av1_convolve_y_sr_avx2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+RTCD_EXTERN void (*av1_convolve_y_sr)(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);

 void av1_dr_prediction_z1_c(uint8_t *dst, ptrdiff_t stride, int bw, int bh, const uint8_t *above, const uint8_t *left, int upsample_above, int dx, int dy);
 #define av1_dr_prediction_z1 av1_dr_prediction_z1_c
@ -127,19 +127,19 @@ void av1_highbd_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint
 void av1_highbd_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
 #define av1_highbd_convolve8_vert av1_highbd_convolve8_vert_c

-void av1_highbd_convolve_2d_copy_sr_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
-void av1_highbd_convolve_2d_copy_sr_sse2(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
-void av1_highbd_convolve_2d_copy_sr_avx2(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
-RTCD_EXTERN void (*av1_highbd_convolve_2d_copy_sr)(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+void av1_highbd_convolve_2d_copy_sr_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+void av1_highbd_convolve_2d_copy_sr_sse2(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+void av1_highbd_convolve_2d_copy_sr_avx2(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+RTCD_EXTERN void (*av1_highbd_convolve_2d_copy_sr)(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);

-void av1_highbd_convolve_2d_scale_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int x_step_qn, const int subpel_y_q4, const int y_step_qn, ConvolveParams *conv_params, int bd);
-void av1_highbd_convolve_2d_scale_sse4_1(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int x_step_qn, const int subpel_y_q4, const int y_step_qn, ConvolveParams *conv_params, int bd);
-RTCD_EXTERN void (*av1_highbd_convolve_2d_scale)(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int x_step_qn, const int subpel_y_q4, const int y_step_qn, ConvolveParams *conv_params, int bd);
+void av1_highbd_convolve_2d_scale_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int x_step_qn, const int subpel_y_q4, const int y_step_qn, ConvolveParams *conv_params, int bd);
+void av1_highbd_convolve_2d_scale_sse4_1(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int x_step_qn, const int subpel_y_q4, const int y_step_qn, ConvolveParams *conv_params, int bd);
+RTCD_EXTERN void (*av1_highbd_convolve_2d_scale)(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int x_step_qn, const int subpel_y_q4, const int y_step_qn, ConvolveParams *conv_params, int bd);

-void av1_highbd_convolve_2d_sr_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
-void av1_highbd_convolve_2d_sr_ssse3(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
-void av1_highbd_convolve_2d_sr_avx2(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
-RTCD_EXTERN void (*av1_highbd_convolve_2d_sr)(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+void av1_highbd_convolve_2d_sr_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+void av1_highbd_convolve_2d_sr_ssse3(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+void av1_highbd_convolve_2d_sr_avx2(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+RTCD_EXTERN void (*av1_highbd_convolve_2d_sr)(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);

 void av1_highbd_convolve_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
 #define av1_highbd_convolve_avg av1_highbd_convolve_avg_c
@ -151,15 +151,15 @@ void av1_highbd_convolve_horiz_rs_c(const uint16_t *src, int src_stride, uint16_
 void av1_highbd_convolve_horiz_rs_sse4_1(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const int16_t *x_filters, int x0_qn, int x_step_qn, int bd);
 RTCD_EXTERN void (*av1_highbd_convolve_horiz_rs)(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const int16_t *x_filters, int x0_qn, int x_step_qn, int bd);

-void av1_highbd_convolve_x_sr_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
-void av1_highbd_convolve_x_sr_ssse3(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
-void av1_highbd_convolve_x_sr_avx2(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
-RTCD_EXTERN void (*av1_highbd_convolve_x_sr)(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+void av1_highbd_convolve_x_sr_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+void av1_highbd_convolve_x_sr_ssse3(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+void av1_highbd_convolve_x_sr_avx2(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+RTCD_EXTERN void (*av1_highbd_convolve_x_sr)(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);

-void av1_highbd_convolve_y_sr_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
-void av1_highbd_convolve_y_sr_ssse3(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
-void av1_highbd_convolve_y_sr_avx2(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
-RTCD_EXTERN void (*av1_highbd_convolve_y_sr)(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+void av1_highbd_convolve_y_sr_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+void av1_highbd_convolve_y_sr_ssse3(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+void av1_highbd_convolve_y_sr_avx2(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+RTCD_EXTERN void (*av1_highbd_convolve_y_sr)(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);

 void av1_highbd_dr_prediction_z1_c(uint16_t *dst, ptrdiff_t stride, int bw, int bh, const uint16_t *above, const uint16_t *left, int upsample_above, int dx, int dy, int bd);
 #define av1_highbd_dr_prediction_z1 av1_highbd_dr_prediction_z1_c
@ -176,25 +176,25 @@ void av1_highbd_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int des
 void av1_highbd_iwht4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd);
 #define av1_highbd_iwht4x4_1_add av1_highbd_iwht4x4_1_add_c

-void av1_highbd_jnt_convolve_2d_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
-void av1_highbd_jnt_convolve_2d_sse4_1(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
-void av1_highbd_jnt_convolve_2d_avx2(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
-RTCD_EXTERN void (*av1_highbd_jnt_convolve_2d)(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+void av1_highbd_jnt_convolve_2d_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+void av1_highbd_jnt_convolve_2d_sse4_1(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+void av1_highbd_jnt_convolve_2d_avx2(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+RTCD_EXTERN void (*av1_highbd_jnt_convolve_2d)(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);

-void av1_highbd_jnt_convolve_2d_copy_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
-void av1_highbd_jnt_convolve_2d_copy_sse4_1(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
-void av1_highbd_jnt_convolve_2d_copy_avx2(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
-RTCD_EXTERN void (*av1_highbd_jnt_convolve_2d_copy)(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+void av1_highbd_jnt_convolve_2d_copy_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+void av1_highbd_jnt_convolve_2d_copy_sse4_1(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+void av1_highbd_jnt_convolve_2d_copy_avx2(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+RTCD_EXTERN void (*av1_highbd_jnt_convolve_2d_copy)(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);

-void av1_highbd_jnt_convolve_x_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
-void av1_highbd_jnt_convolve_x_sse4_1(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
-void av1_highbd_jnt_convolve_x_avx2(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
-RTCD_EXTERN void (*av1_highbd_jnt_convolve_x)(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+void av1_highbd_jnt_convolve_x_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+void av1_highbd_jnt_convolve_x_sse4_1(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+void av1_highbd_jnt_convolve_x_avx2(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+RTCD_EXTERN void (*av1_highbd_jnt_convolve_x)(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);

-void av1_highbd_jnt_convolve_y_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
-void av1_highbd_jnt_convolve_y_sse4_1(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
-void av1_highbd_jnt_convolve_y_avx2(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
-RTCD_EXTERN void (*av1_highbd_jnt_convolve_y)(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+void av1_highbd_jnt_convolve_y_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+void av1_highbd_jnt_convolve_y_sse4_1(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+void av1_highbd_jnt_convolve_y_avx2(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+RTCD_EXTERN void (*av1_highbd_jnt_convolve_y)(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);

 void av1_highbd_warp_affine_c(const int32_t *mat, const uint16_t *ref, int width, int height, int stride, uint16_t *pred, int p_col, int p_row, int p_width, int p_height, int p_stride, int subsampling_x, int subsampling_y, int bd, ConvolveParams *conv_params, int16_t alpha, int16_t beta, int16_t gamma, int16_t delta);
 void av1_highbd_warp_affine_sse4_1(const int32_t *mat, const uint16_t *ref, int width, int height, int stride, uint16_t *pred, int p_col, int p_row, int p_width, int p_height, int p_stride, int subsampling_x, int subsampling_y, int bd, ConvolveParams *conv_params, int16_t alpha, int16_t beta, int16_t gamma, int16_t delta);
@ -272,25 +272,25 @@ void av1_inv_txfm_add_ssse3(const tran_low_t *dqcoeff, uint8_t *dst, int stride,
 void av1_inv_txfm_add_avx2(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param);
 RTCD_EXTERN void (*av1_inv_txfm_add)(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param);

-void av1_jnt_convolve_2d_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
-void av1_jnt_convolve_2d_ssse3(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
-void av1_jnt_convolve_2d_avx2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
-RTCD_EXTERN void (*av1_jnt_convolve_2d)(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+void av1_jnt_convolve_2d_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+void av1_jnt_convolve_2d_ssse3(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+void av1_jnt_convolve_2d_avx2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+RTCD_EXTERN void (*av1_jnt_convolve_2d)(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);

-void av1_jnt_convolve_2d_copy_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
-void av1_jnt_convolve_2d_copy_sse2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
-void av1_jnt_convolve_2d_copy_avx2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
-RTCD_EXTERN void (*av1_jnt_convolve_2d_copy)(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+void av1_jnt_convolve_2d_copy_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+void av1_jnt_convolve_2d_copy_sse2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+void av1_jnt_convolve_2d_copy_avx2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+RTCD_EXTERN void (*av1_jnt_convolve_2d_copy)(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);

-void av1_jnt_convolve_x_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
-void av1_jnt_convolve_x_sse2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
-void av1_jnt_convolve_x_avx2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
-RTCD_EXTERN void (*av1_jnt_convolve_x)(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+void av1_jnt_convolve_x_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+void av1_jnt_convolve_x_sse2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+void av1_jnt_convolve_x_avx2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+RTCD_EXTERN void (*av1_jnt_convolve_x)(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);

-void av1_jnt_convolve_y_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
-void av1_jnt_convolve_y_sse2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
-void av1_jnt_convolve_y_avx2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
-RTCD_EXTERN void (*av1_jnt_convolve_y)(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+void av1_jnt_convolve_y_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+void av1_jnt_convolve_y_sse2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+void av1_jnt_convolve_y_avx2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+RTCD_EXTERN void (*av1_jnt_convolve_y)(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);

 void av1_selfguided_restoration_c(const uint8_t *dgd8, int width, int height,
                                  int dgd_stride, int32_t *flt0, int32_t *flt1, int flt_stride,
--- a/media/libaom/config/win/mingw32/config/aom_config.asm
+++ b/media/libaom/config/win/mingw32/config/aom_config.asm
@ -24,6 +24,7 @@ CONFIG_COEFFICIENT_RANGE_CHECKING equ 0
 CONFIG_COLLECT_INTER_MODE_RD_STATS equ 1
 CONFIG_COLLECT_RD_STATS equ 0
 CONFIG_DEBUG equ 0
+CONFIG_DENOISE equ 0
 CONFIG_DIST_8X8 equ 1
 CONFIG_ENTROPY_STATS equ 0
 CONFIG_FILEOPTIONS equ 1
--- a/media/libaom/config/win/mingw32/config/aom_config.h
+++ b/media/libaom/config/win/mingw32/config/aom_config.h
@ -26,6 +26,7 @@
 #define CONFIG_COLLECT_INTER_MODE_RD_STATS 1
 #define CONFIG_COLLECT_RD_STATS 0
 #define CONFIG_DEBUG 0
+#define CONFIG_DENOISE 0
 #define CONFIG_DIST_8X8 1
 #define CONFIG_ENTROPY_STATS 0
 #define CONFIG_FILEOPTIONS 1
--- a/media/libaom/config/win/mingw32/config/av1_rtcd.h
+++ b/media/libaom/config/win/mingw32/config/av1_rtcd.h
@ -69,33 +69,33 @@ void av1_build_compound_diffwtd_mask_highbd_ssse3(uint8_t *mask, DIFFWTD_MASK_TY
 void av1_build_compound_diffwtd_mask_highbd_avx2(uint8_t *mask, DIFFWTD_MASK_TYPE mask_type, const uint8_t *src0, int src0_stride, const uint8_t *src1, int src1_stride, int h, int w, int bd);
 RTCD_EXTERN void (*av1_build_compound_diffwtd_mask_highbd)(uint8_t *mask, DIFFWTD_MASK_TYPE mask_type, const uint8_t *src0, int src0_stride, const uint8_t *src1, int src1_stride, int h, int w, int bd);

-void av1_convolve_2d_copy_sr_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
-void av1_convolve_2d_copy_sr_sse2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
-void av1_convolve_2d_copy_sr_avx2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
-RTCD_EXTERN void (*av1_convolve_2d_copy_sr)(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+void av1_convolve_2d_copy_sr_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+void av1_convolve_2d_copy_sr_sse2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+void av1_convolve_2d_copy_sr_avx2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+RTCD_EXTERN void (*av1_convolve_2d_copy_sr)(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);

-void av1_convolve_2d_scale_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_qn, const int x_step_qn, const int subpel_y_q4, const int y_step_qn, ConvolveParams *conv_params);
-void av1_convolve_2d_scale_sse4_1(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_qn, const int x_step_qn, const int subpel_y_q4, const int y_step_qn, ConvolveParams *conv_params);
-RTCD_EXTERN void (*av1_convolve_2d_scale)(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_qn, const int x_step_qn, const int subpel_y_q4, const int y_step_qn, ConvolveParams *conv_params);
+void av1_convolve_2d_scale_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int x_step_qn, const int subpel_y_q4, const int y_step_qn, ConvolveParams *conv_params);
+void av1_convolve_2d_scale_sse4_1(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int x_step_qn, const int subpel_y_q4, const int y_step_qn, ConvolveParams *conv_params);
+RTCD_EXTERN void (*av1_convolve_2d_scale)(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int x_step_qn, const int subpel_y_q4, const int y_step_qn, ConvolveParams *conv_params);

-void av1_convolve_2d_sr_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
-void av1_convolve_2d_sr_sse2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
-void av1_convolve_2d_sr_avx2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
-RTCD_EXTERN void (*av1_convolve_2d_sr)(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+void av1_convolve_2d_sr_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+void av1_convolve_2d_sr_sse2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+void av1_convolve_2d_sr_avx2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+RTCD_EXTERN void (*av1_convolve_2d_sr)(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);

 void av1_convolve_horiz_rs_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const int16_t *x_filters, int x0_qn, int x_step_qn);
 void av1_convolve_horiz_rs_sse4_1(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const int16_t *x_filters, int x0_qn, int x_step_qn);
 RTCD_EXTERN void (*av1_convolve_horiz_rs)(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const int16_t *x_filters, int x0_qn, int x_step_qn);

-void av1_convolve_x_sr_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
-void av1_convolve_x_sr_sse2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
-void av1_convolve_x_sr_avx2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
-RTCD_EXTERN void (*av1_convolve_x_sr)(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+void av1_convolve_x_sr_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+void av1_convolve_x_sr_sse2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+void av1_convolve_x_sr_avx2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+RTCD_EXTERN void (*av1_convolve_x_sr)(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);

-void av1_convolve_y_sr_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
-void av1_convolve_y_sr_sse2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
-void av1_convolve_y_sr_avx2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
-RTCD_EXTERN void (*av1_convolve_y_sr)(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+void av1_convolve_y_sr_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+void av1_convolve_y_sr_sse2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+void av1_convolve_y_sr_avx2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+RTCD_EXTERN void (*av1_convolve_y_sr)(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);

 void av1_dr_prediction_z1_c(uint8_t *dst, ptrdiff_t stride, int bw, int bh, const uint8_t *above, const uint8_t *left, int upsample_above, int dx, int dy);
 #define av1_dr_prediction_z1 av1_dr_prediction_z1_c
@ -127,19 +127,19 @@ void av1_highbd_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint
 void av1_highbd_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
 #define av1_highbd_convolve8_vert av1_highbd_convolve8_vert_c

-void av1_highbd_convolve_2d_copy_sr_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
-void av1_highbd_convolve_2d_copy_sr_sse2(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
-void av1_highbd_convolve_2d_copy_sr_avx2(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
-RTCD_EXTERN void (*av1_highbd_convolve_2d_copy_sr)(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+void av1_highbd_convolve_2d_copy_sr_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+void av1_highbd_convolve_2d_copy_sr_sse2(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+void av1_highbd_convolve_2d_copy_sr_avx2(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+RTCD_EXTERN void (*av1_highbd_convolve_2d_copy_sr)(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);

-void av1_highbd_convolve_2d_scale_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int x_step_qn, const int subpel_y_q4, const int y_step_qn, ConvolveParams *conv_params, int bd);
-void av1_highbd_convolve_2d_scale_sse4_1(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int x_step_qn, const int subpel_y_q4, const int y_step_qn, ConvolveParams *conv_params, int bd);
-RTCD_EXTERN void (*av1_highbd_convolve_2d_scale)(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int x_step_qn, const int subpel_y_q4, const int y_step_qn, ConvolveParams *conv_params, int bd);
+void av1_highbd_convolve_2d_scale_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int x_step_qn, const int subpel_y_q4, const int y_step_qn, ConvolveParams *conv_params, int bd);
+void av1_highbd_convolve_2d_scale_sse4_1(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int x_step_qn, const int subpel_y_q4, const int y_step_qn, ConvolveParams *conv_params, int bd);
+RTCD_EXTERN void (*av1_highbd_convolve_2d_scale)(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int x_step_qn, const int subpel_y_q4, const int y_step_qn, ConvolveParams *conv_params, int bd);

-void av1_highbd_convolve_2d_sr_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
-void av1_highbd_convolve_2d_sr_ssse3(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
-void av1_highbd_convolve_2d_sr_avx2(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
-RTCD_EXTERN void (*av1_highbd_convolve_2d_sr)(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+void av1_highbd_convolve_2d_sr_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+void av1_highbd_convolve_2d_sr_ssse3(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+void av1_highbd_convolve_2d_sr_avx2(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+RTCD_EXTERN void (*av1_highbd_convolve_2d_sr)(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);

 void av1_highbd_convolve_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
 #define av1_highbd_convolve_avg av1_highbd_convolve_avg_c
@ -151,15 +151,15 @@ void av1_highbd_convolve_horiz_rs_c(const uint16_t *src, int src_stride, uint16_
 void av1_highbd_convolve_horiz_rs_sse4_1(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const int16_t *x_filters, int x0_qn, int x_step_qn, int bd);
 RTCD_EXTERN void (*av1_highbd_convolve_horiz_rs)(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const int16_t *x_filters, int x0_qn, int x_step_qn, int bd);

-void av1_highbd_convolve_x_sr_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
-void av1_highbd_convolve_x_sr_ssse3(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
-void av1_highbd_convolve_x_sr_avx2(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
-RTCD_EXTERN void (*av1_highbd_convolve_x_sr)(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+void av1_highbd_convolve_x_sr_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+void av1_highbd_convolve_x_sr_ssse3(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+void av1_highbd_convolve_x_sr_avx2(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+RTCD_EXTERN void (*av1_highbd_convolve_x_sr)(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);

-void av1_highbd_convolve_y_sr_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
-void av1_highbd_convolve_y_sr_ssse3(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
-void av1_highbd_convolve_y_sr_avx2(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
-RTCD_EXTERN void (*av1_highbd_convolve_y_sr)(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+void av1_highbd_convolve_y_sr_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+void av1_highbd_convolve_y_sr_ssse3(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+void av1_highbd_convolve_y_sr_avx2(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+RTCD_EXTERN void (*av1_highbd_convolve_y_sr)(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);

 void av1_highbd_dr_prediction_z1_c(uint16_t *dst, ptrdiff_t stride, int bw, int bh, const uint16_t *above, const uint16_t *left, int upsample_above, int dx, int dy, int bd);
 #define av1_highbd_dr_prediction_z1 av1_highbd_dr_prediction_z1_c
@ -176,25 +176,25 @@ void av1_highbd_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int des
 void av1_highbd_iwht4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd);
 #define av1_highbd_iwht4x4_1_add av1_highbd_iwht4x4_1_add_c

-void av1_highbd_jnt_convolve_2d_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
-void av1_highbd_jnt_convolve_2d_sse4_1(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
-void av1_highbd_jnt_convolve_2d_avx2(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
-RTCD_EXTERN void (*av1_highbd_jnt_convolve_2d)(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+void av1_highbd_jnt_convolve_2d_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+void av1_highbd_jnt_convolve_2d_sse4_1(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+void av1_highbd_jnt_convolve_2d_avx2(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+RTCD_EXTERN void (*av1_highbd_jnt_convolve_2d)(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);

-void av1_highbd_jnt_convolve_2d_copy_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
-void av1_highbd_jnt_convolve_2d_copy_sse4_1(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
-void av1_highbd_jnt_convolve_2d_copy_avx2(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
-RTCD_EXTERN void (*av1_highbd_jnt_convolve_2d_copy)(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+void av1_highbd_jnt_convolve_2d_copy_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+void av1_highbd_jnt_convolve_2d_copy_sse4_1(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+void av1_highbd_jnt_convolve_2d_copy_avx2(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+RTCD_EXTERN void (*av1_highbd_jnt_convolve_2d_copy)(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);

-void av1_highbd_jnt_convolve_x_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
-void av1_highbd_jnt_convolve_x_sse4_1(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
-void av1_highbd_jnt_convolve_x_avx2(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
-RTCD_EXTERN void (*av1_highbd_jnt_convolve_x)(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+void av1_highbd_jnt_convolve_x_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+void av1_highbd_jnt_convolve_x_sse4_1(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+void av1_highbd_jnt_convolve_x_avx2(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+RTCD_EXTERN void (*av1_highbd_jnt_convolve_x)(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);

-void av1_highbd_jnt_convolve_y_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
-void av1_highbd_jnt_convolve_y_sse4_1(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
-void av1_highbd_jnt_convolve_y_avx2(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
-RTCD_EXTERN void (*av1_highbd_jnt_convolve_y)(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+void av1_highbd_jnt_convolve_y_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+void av1_highbd_jnt_convolve_y_sse4_1(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+void av1_highbd_jnt_convolve_y_avx2(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+RTCD_EXTERN void (*av1_highbd_jnt_convolve_y)(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);

 void av1_highbd_warp_affine_c(const int32_t *mat, const uint16_t *ref, int width, int height, int stride, uint16_t *pred, int p_col, int p_row, int p_width, int p_height, int p_stride, int subsampling_x, int subsampling_y, int bd, ConvolveParams *conv_params, int16_t alpha, int16_t beta, int16_t gamma, int16_t delta);
 void av1_highbd_warp_affine_sse4_1(const int32_t *mat, const uint16_t *ref, int width, int height, int stride, uint16_t *pred, int p_col, int p_row, int p_width, int p_height, int p_stride, int subsampling_x, int subsampling_y, int bd, ConvolveParams *conv_params, int16_t alpha, int16_t beta, int16_t gamma, int16_t delta);
@ -272,25 +272,25 @@ void av1_inv_txfm_add_ssse3(const tran_low_t *dqcoeff, uint8_t *dst, int stride,
 void av1_inv_txfm_add_avx2(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param);
 RTCD_EXTERN void (*av1_inv_txfm_add)(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param);

-void av1_jnt_convolve_2d_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
-void av1_jnt_convolve_2d_ssse3(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
-void av1_jnt_convolve_2d_avx2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
-RTCD_EXTERN void (*av1_jnt_convolve_2d)(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+void av1_jnt_convolve_2d_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+void av1_jnt_convolve_2d_ssse3(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+void av1_jnt_convolve_2d_avx2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+RTCD_EXTERN void (*av1_jnt_convolve_2d)(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);

-void av1_jnt_convolve_2d_copy_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
-void av1_jnt_convolve_2d_copy_sse2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
-void av1_jnt_convolve_2d_copy_avx2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
-RTCD_EXTERN void (*av1_jnt_convolve_2d_copy)(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+void av1_jnt_convolve_2d_copy_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+void av1_jnt_convolve_2d_copy_sse2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+void av1_jnt_convolve_2d_copy_avx2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+RTCD_EXTERN void (*av1_jnt_convolve_2d_copy)(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);

-void av1_jnt_convolve_x_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
-void av1_jnt_convolve_x_sse2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
-void av1_jnt_convolve_x_avx2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
-RTCD_EXTERN void (*av1_jnt_convolve_x)(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+void av1_jnt_convolve_x_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+void av1_jnt_convolve_x_sse2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+void av1_jnt_convolve_x_avx2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+RTCD_EXTERN void (*av1_jnt_convolve_x)(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);

-void av1_jnt_convolve_y_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
-void av1_jnt_convolve_y_sse2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
-void av1_jnt_convolve_y_avx2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
-RTCD_EXTERN void (*av1_jnt_convolve_y)(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+void av1_jnt_convolve_y_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+void av1_jnt_convolve_y_sse2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+void av1_jnt_convolve_y_avx2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+RTCD_EXTERN void (*av1_jnt_convolve_y)(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);

 void av1_selfguided_restoration_c(const uint8_t *dgd8, int width, int height,
                                  int dgd_stride, int32_t *flt0, int32_t *flt1, int flt_stride,
--- a/media/libaom/config/win/mingw64/config/aom_config.asm
+++ b/media/libaom/config/win/mingw64/config/aom_config.asm
@ -24,6 +24,7 @@ CONFIG_COEFFICIENT_RANGE_CHECKING equ 0
 CONFIG_COLLECT_INTER_MODE_RD_STATS equ 1
 CONFIG_COLLECT_RD_STATS equ 0
 CONFIG_DEBUG equ 0
+CONFIG_DENOISE equ 0
 CONFIG_DIST_8X8 equ 1
 CONFIG_ENTROPY_STATS equ 0
 CONFIG_FILEOPTIONS equ 1
--- a/media/libaom/config/win/mingw64/config/aom_config.h
+++ b/media/libaom/config/win/mingw64/config/aom_config.h
@ -26,6 +26,7 @@
 #define CONFIG_COLLECT_INTER_MODE_RD_STATS 1
 #define CONFIG_COLLECT_RD_STATS 0
 #define CONFIG_DEBUG 0
+#define CONFIG_DENOISE 0
 #define CONFIG_DIST_8X8 1
 #define CONFIG_ENTROPY_STATS 0
 #define CONFIG_FILEOPTIONS 1
--- a/media/libaom/config/win/mingw64/config/av1_rtcd.h
+++ b/media/libaom/config/win/mingw64/config/av1_rtcd.h
@ -69,33 +69,33 @@ void av1_build_compound_diffwtd_mask_highbd_ssse3(uint8_t *mask, DIFFWTD_MASK_TY
 void av1_build_compound_diffwtd_mask_highbd_avx2(uint8_t *mask, DIFFWTD_MASK_TYPE mask_type, const uint8_t *src0, int src0_stride, const uint8_t *src1, int src1_stride, int h, int w, int bd);
 RTCD_EXTERN void (*av1_build_compound_diffwtd_mask_highbd)(uint8_t *mask, DIFFWTD_MASK_TYPE mask_type, const uint8_t *src0, int src0_stride, const uint8_t *src1, int src1_stride, int h, int w, int bd);

-void av1_convolve_2d_copy_sr_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
-void av1_convolve_2d_copy_sr_sse2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
-void av1_convolve_2d_copy_sr_avx2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
-RTCD_EXTERN void (*av1_convolve_2d_copy_sr)(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+void av1_convolve_2d_copy_sr_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+void av1_convolve_2d_copy_sr_sse2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+void av1_convolve_2d_copy_sr_avx2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+RTCD_EXTERN void (*av1_convolve_2d_copy_sr)(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);

-void av1_convolve_2d_scale_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_qn, const int x_step_qn, const int subpel_y_q4, const int y_step_qn, ConvolveParams *conv_params);
-void av1_convolve_2d_scale_sse4_1(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_qn, const int x_step_qn, const int subpel_y_q4, const int y_step_qn, ConvolveParams *conv_params);
-RTCD_EXTERN void (*av1_convolve_2d_scale)(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_qn, const int x_step_qn, const int subpel_y_q4, const int y_step_qn, ConvolveParams *conv_params);
+void av1_convolve_2d_scale_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int x_step_qn, const int subpel_y_q4, const int y_step_qn, ConvolveParams *conv_params);
+void av1_convolve_2d_scale_sse4_1(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int x_step_qn, const int subpel_y_q4, const int y_step_qn, ConvolveParams *conv_params);
+RTCD_EXTERN void (*av1_convolve_2d_scale)(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int x_step_qn, const int subpel_y_q4, const int y_step_qn, ConvolveParams *conv_params);

-void av1_convolve_2d_sr_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
-void av1_convolve_2d_sr_sse2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
-void av1_convolve_2d_sr_avx2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
-RTCD_EXTERN void (*av1_convolve_2d_sr)(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+void av1_convolve_2d_sr_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+void av1_convolve_2d_sr_sse2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+void av1_convolve_2d_sr_avx2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+RTCD_EXTERN void (*av1_convolve_2d_sr)(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);

 void av1_convolve_horiz_rs_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const int16_t *x_filters, int x0_qn, int x_step_qn);
 void av1_convolve_horiz_rs_sse4_1(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const int16_t *x_filters, int x0_qn, int x_step_qn);
 RTCD_EXTERN void (*av1_convolve_horiz_rs)(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const int16_t *x_filters, int x0_qn, int x_step_qn);

-void av1_convolve_x_sr_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
-void av1_convolve_x_sr_sse2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
-void av1_convolve_x_sr_avx2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
-RTCD_EXTERN void (*av1_convolve_x_sr)(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+void av1_convolve_x_sr_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+void av1_convolve_x_sr_sse2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+void av1_convolve_x_sr_avx2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+RTCD_EXTERN void (*av1_convolve_x_sr)(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);

-void av1_convolve_y_sr_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
-void av1_convolve_y_sr_sse2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
-void av1_convolve_y_sr_avx2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
-RTCD_EXTERN void (*av1_convolve_y_sr)(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+void av1_convolve_y_sr_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+void av1_convolve_y_sr_sse2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+void av1_convolve_y_sr_avx2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+RTCD_EXTERN void (*av1_convolve_y_sr)(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);

 void av1_dr_prediction_z1_c(uint8_t *dst, ptrdiff_t stride, int bw, int bh, const uint8_t *above, const uint8_t *left, int upsample_above, int dx, int dy);
 #define av1_dr_prediction_z1 av1_dr_prediction_z1_c
@ -130,19 +130,19 @@ void av1_highbd_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8
 void av1_highbd_convolve8_vert_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
 #define av1_highbd_convolve8_vert av1_highbd_convolve8_vert_sse2

-void av1_highbd_convolve_2d_copy_sr_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
-void av1_highbd_convolve_2d_copy_sr_sse2(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
-void av1_highbd_convolve_2d_copy_sr_avx2(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
-RTCD_EXTERN void (*av1_highbd_convolve_2d_copy_sr)(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+void av1_highbd_convolve_2d_copy_sr_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+void av1_highbd_convolve_2d_copy_sr_sse2(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+void av1_highbd_convolve_2d_copy_sr_avx2(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+RTCD_EXTERN void (*av1_highbd_convolve_2d_copy_sr)(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);

-void av1_highbd_convolve_2d_scale_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int x_step_qn, const int subpel_y_q4, const int y_step_qn, ConvolveParams *conv_params, int bd);
-void av1_highbd_convolve_2d_scale_sse4_1(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int x_step_qn, const int subpel_y_q4, const int y_step_qn, ConvolveParams *conv_params, int bd);
-RTCD_EXTERN void (*av1_highbd_convolve_2d_scale)(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int x_step_qn, const int subpel_y_q4, const int y_step_qn, ConvolveParams *conv_params, int bd);
+void av1_highbd_convolve_2d_scale_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int x_step_qn, const int subpel_y_q4, const int y_step_qn, ConvolveParams *conv_params, int bd);
+void av1_highbd_convolve_2d_scale_sse4_1(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int x_step_qn, const int subpel_y_q4, const int y_step_qn, ConvolveParams *conv_params, int bd);
+RTCD_EXTERN void (*av1_highbd_convolve_2d_scale)(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int x_step_qn, const int subpel_y_q4, const int y_step_qn, ConvolveParams *conv_params, int bd);

-void av1_highbd_convolve_2d_sr_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
-void av1_highbd_convolve_2d_sr_ssse3(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
-void av1_highbd_convolve_2d_sr_avx2(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
-RTCD_EXTERN void (*av1_highbd_convolve_2d_sr)(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+void av1_highbd_convolve_2d_sr_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+void av1_highbd_convolve_2d_sr_ssse3(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+void av1_highbd_convolve_2d_sr_avx2(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+RTCD_EXTERN void (*av1_highbd_convolve_2d_sr)(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);

 void av1_highbd_convolve_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
 #define av1_highbd_convolve_avg av1_highbd_convolve_avg_c
@ -154,15 +154,15 @@ void av1_highbd_convolve_horiz_rs_c(const uint16_t *src, int src_stride, uint16_
 void av1_highbd_convolve_horiz_rs_sse4_1(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const int16_t *x_filters, int x0_qn, int x_step_qn, int bd);
 RTCD_EXTERN void (*av1_highbd_convolve_horiz_rs)(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const int16_t *x_filters, int x0_qn, int x_step_qn, int bd);

-void av1_highbd_convolve_x_sr_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
-void av1_highbd_convolve_x_sr_ssse3(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
-void av1_highbd_convolve_x_sr_avx2(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
-RTCD_EXTERN void (*av1_highbd_convolve_x_sr)(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+void av1_highbd_convolve_x_sr_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+void av1_highbd_convolve_x_sr_ssse3(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+void av1_highbd_convolve_x_sr_avx2(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+RTCD_EXTERN void (*av1_highbd_convolve_x_sr)(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);

-void av1_highbd_convolve_y_sr_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
-void av1_highbd_convolve_y_sr_ssse3(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
-void av1_highbd_convolve_y_sr_avx2(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
-RTCD_EXTERN void (*av1_highbd_convolve_y_sr)(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+void av1_highbd_convolve_y_sr_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+void av1_highbd_convolve_y_sr_ssse3(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+void av1_highbd_convolve_y_sr_avx2(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+RTCD_EXTERN void (*av1_highbd_convolve_y_sr)(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);

 void av1_highbd_dr_prediction_z1_c(uint16_t *dst, ptrdiff_t stride, int bw, int bh, const uint16_t *above, const uint16_t *left, int upsample_above, int dx, int dy, int bd);
 #define av1_highbd_dr_prediction_z1 av1_highbd_dr_prediction_z1_c
@ -179,25 +179,25 @@ void av1_highbd_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int des
 void av1_highbd_iwht4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd);
 #define av1_highbd_iwht4x4_1_add av1_highbd_iwht4x4_1_add_c

-void av1_highbd_jnt_convolve_2d_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
-void av1_highbd_jnt_convolve_2d_sse4_1(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
-void av1_highbd_jnt_convolve_2d_avx2(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
-RTCD_EXTERN void (*av1_highbd_jnt_convolve_2d)(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+void av1_highbd_jnt_convolve_2d_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+void av1_highbd_jnt_convolve_2d_sse4_1(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+void av1_highbd_jnt_convolve_2d_avx2(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+RTCD_EXTERN void (*av1_highbd_jnt_convolve_2d)(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);

-void av1_highbd_jnt_convolve_2d_copy_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
-void av1_highbd_jnt_convolve_2d_copy_sse4_1(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
-void av1_highbd_jnt_convolve_2d_copy_avx2(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
-RTCD_EXTERN void (*av1_highbd_jnt_convolve_2d_copy)(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+void av1_highbd_jnt_convolve_2d_copy_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+void av1_highbd_jnt_convolve_2d_copy_sse4_1(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+void av1_highbd_jnt_convolve_2d_copy_avx2(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+RTCD_EXTERN void (*av1_highbd_jnt_convolve_2d_copy)(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);

-void av1_highbd_jnt_convolve_x_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
-void av1_highbd_jnt_convolve_x_sse4_1(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
-void av1_highbd_jnt_convolve_x_avx2(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
-RTCD_EXTERN void (*av1_highbd_jnt_convolve_x)(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+void av1_highbd_jnt_convolve_x_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+void av1_highbd_jnt_convolve_x_sse4_1(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+void av1_highbd_jnt_convolve_x_avx2(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+RTCD_EXTERN void (*av1_highbd_jnt_convolve_x)(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);

-void av1_highbd_jnt_convolve_y_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
-void av1_highbd_jnt_convolve_y_sse4_1(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
-void av1_highbd_jnt_convolve_y_avx2(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
-RTCD_EXTERN void (*av1_highbd_jnt_convolve_y)(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+void av1_highbd_jnt_convolve_y_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+void av1_highbd_jnt_convolve_y_sse4_1(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+void av1_highbd_jnt_convolve_y_avx2(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+RTCD_EXTERN void (*av1_highbd_jnt_convolve_y)(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);

 void av1_highbd_warp_affine_c(const int32_t *mat, const uint16_t *ref, int width, int height, int stride, uint16_t *pred, int p_col, int p_row, int p_width, int p_height, int p_stride, int subsampling_x, int subsampling_y, int bd, ConvolveParams *conv_params, int16_t alpha, int16_t beta, int16_t gamma, int16_t delta);
 void av1_highbd_warp_affine_sse4_1(const int32_t *mat, const uint16_t *ref, int width, int height, int stride, uint16_t *pred, int p_col, int p_row, int p_width, int p_height, int p_stride, int subsampling_x, int subsampling_y, int bd, ConvolveParams *conv_params, int16_t alpha, int16_t beta, int16_t gamma, int16_t delta);
@ -275,25 +275,25 @@ void av1_inv_txfm_add_ssse3(const tran_low_t *dqcoeff, uint8_t *dst, int stride,
 void av1_inv_txfm_add_avx2(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param);
 RTCD_EXTERN void (*av1_inv_txfm_add)(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param);

-void av1_jnt_convolve_2d_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
-void av1_jnt_convolve_2d_ssse3(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
-void av1_jnt_convolve_2d_avx2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
-RTCD_EXTERN void (*av1_jnt_convolve_2d)(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+void av1_jnt_convolve_2d_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+void av1_jnt_convolve_2d_ssse3(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+void av1_jnt_convolve_2d_avx2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+RTCD_EXTERN void (*av1_jnt_convolve_2d)(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);

-void av1_jnt_convolve_2d_copy_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
-void av1_jnt_convolve_2d_copy_sse2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
-void av1_jnt_convolve_2d_copy_avx2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
-RTCD_EXTERN void (*av1_jnt_convolve_2d_copy)(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+void av1_jnt_convolve_2d_copy_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+void av1_jnt_convolve_2d_copy_sse2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+void av1_jnt_convolve_2d_copy_avx2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+RTCD_EXTERN void (*av1_jnt_convolve_2d_copy)(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);

-void av1_jnt_convolve_x_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
-void av1_jnt_convolve_x_sse2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
-void av1_jnt_convolve_x_avx2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
-RTCD_EXTERN void (*av1_jnt_convolve_x)(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+void av1_jnt_convolve_x_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+void av1_jnt_convolve_x_sse2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+void av1_jnt_convolve_x_avx2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+RTCD_EXTERN void (*av1_jnt_convolve_x)(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);

-void av1_jnt_convolve_y_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
-void av1_jnt_convolve_y_sse2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
-void av1_jnt_convolve_y_avx2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
-RTCD_EXTERN void (*av1_jnt_convolve_y)(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+void av1_jnt_convolve_y_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+void av1_jnt_convolve_y_sse2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+void av1_jnt_convolve_y_avx2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+RTCD_EXTERN void (*av1_jnt_convolve_y)(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);

 void av1_selfguided_restoration_c(const uint8_t *dgd8, int width, int height,
                                  int dgd_stride, int32_t *flt0, int32_t *flt1, int flt_stride,
--- a/media/libaom/config/win/x64/config/aom_config.asm
+++ b/media/libaom/config/win/x64/config/aom_config.asm
@ -24,6 +24,7 @@ CONFIG_COEFFICIENT_RANGE_CHECKING equ 0
 CONFIG_COLLECT_INTER_MODE_RD_STATS equ 1
 CONFIG_COLLECT_RD_STATS equ 0
 CONFIG_DEBUG equ 0
+CONFIG_DENOISE equ 0
 CONFIG_DIST_8X8 equ 1
 CONFIG_ENTROPY_STATS equ 0
 CONFIG_FILEOPTIONS equ 1
--- a/media/libaom/config/win/x64/config/aom_config.h
+++ b/media/libaom/config/win/x64/config/aom_config.h
@ -26,6 +26,7 @@
 #define CONFIG_COLLECT_INTER_MODE_RD_STATS 1
 #define CONFIG_COLLECT_RD_STATS 0
 #define CONFIG_DEBUG 0
+#define CONFIG_DENOISE 0
 #define CONFIG_DIST_8X8 1
 #define CONFIG_ENTROPY_STATS 0
 #define CONFIG_FILEOPTIONS 1
--- a/media/libaom/config/win/x64/config/av1_rtcd.h
+++ b/media/libaom/config/win/x64/config/av1_rtcd.h
@ -69,33 +69,33 @@ void av1_build_compound_diffwtd_mask_highbd_ssse3(uint8_t *mask, DIFFWTD_MASK_TY
 void av1_build_compound_diffwtd_mask_highbd_avx2(uint8_t *mask, DIFFWTD_MASK_TYPE mask_type, const uint8_t *src0, int src0_stride, const uint8_t *src1, int src1_stride, int h, int w, int bd);
 RTCD_EXTERN void (*av1_build_compound_diffwtd_mask_highbd)(uint8_t *mask, DIFFWTD_MASK_TYPE mask_type, const uint8_t *src0, int src0_stride, const uint8_t *src1, int src1_stride, int h, int w, int bd);

-void av1_convolve_2d_copy_sr_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
-void av1_convolve_2d_copy_sr_sse2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
-void av1_convolve_2d_copy_sr_avx2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
-RTCD_EXTERN void (*av1_convolve_2d_copy_sr)(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+void av1_convolve_2d_copy_sr_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+void av1_convolve_2d_copy_sr_sse2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+void av1_convolve_2d_copy_sr_avx2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+RTCD_EXTERN void (*av1_convolve_2d_copy_sr)(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);

-void av1_convolve_2d_scale_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_qn, const int x_step_qn, const int subpel_y_q4, const int y_step_qn, ConvolveParams *conv_params);
-void av1_convolve_2d_scale_sse4_1(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_qn, const int x_step_qn, const int subpel_y_q4, const int y_step_qn, ConvolveParams *conv_params);
-RTCD_EXTERN void (*av1_convolve_2d_scale)(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_qn, const int x_step_qn, const int subpel_y_q4, const int y_step_qn, ConvolveParams *conv_params);
+void av1_convolve_2d_scale_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int x_step_qn, const int subpel_y_q4, const int y_step_qn, ConvolveParams *conv_params);
+void av1_convolve_2d_scale_sse4_1(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int x_step_qn, const int subpel_y_q4, const int y_step_qn, ConvolveParams *conv_params);
+RTCD_EXTERN void (*av1_convolve_2d_scale)(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int x_step_qn, const int subpel_y_q4, const int y_step_qn, ConvolveParams *conv_params);

-void av1_convolve_2d_sr_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
-void av1_convolve_2d_sr_sse2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
-void av1_convolve_2d_sr_avx2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
-RTCD_EXTERN void (*av1_convolve_2d_sr)(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+void av1_convolve_2d_sr_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+void av1_convolve_2d_sr_sse2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+void av1_convolve_2d_sr_avx2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+RTCD_EXTERN void (*av1_convolve_2d_sr)(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);

 void av1_convolve_horiz_rs_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const int16_t *x_filters, int x0_qn, int x_step_qn);
 void av1_convolve_horiz_rs_sse4_1(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const int16_t *x_filters, int x0_qn, int x_step_qn);
 RTCD_EXTERN void (*av1_convolve_horiz_rs)(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const int16_t *x_filters, int x0_qn, int x_step_qn);

-void av1_convolve_x_sr_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
-void av1_convolve_x_sr_sse2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
-void av1_convolve_x_sr_avx2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
-RTCD_EXTERN void (*av1_convolve_x_sr)(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+void av1_convolve_x_sr_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+void av1_convolve_x_sr_sse2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+void av1_convolve_x_sr_avx2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+RTCD_EXTERN void (*av1_convolve_x_sr)(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);

-void av1_convolve_y_sr_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
-void av1_convolve_y_sr_sse2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
-void av1_convolve_y_sr_avx2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
-RTCD_EXTERN void (*av1_convolve_y_sr)(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+void av1_convolve_y_sr_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+void av1_convolve_y_sr_sse2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+void av1_convolve_y_sr_avx2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+RTCD_EXTERN void (*av1_convolve_y_sr)(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);

 void av1_dr_prediction_z1_c(uint8_t *dst, ptrdiff_t stride, int bw, int bh, const uint8_t *above, const uint8_t *left, int upsample_above, int dx, int dy);
 #define av1_dr_prediction_z1 av1_dr_prediction_z1_c
@ -130,19 +130,19 @@ void av1_highbd_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8
 void av1_highbd_convolve8_vert_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
 #define av1_highbd_convolve8_vert av1_highbd_convolve8_vert_sse2

-void av1_highbd_convolve_2d_copy_sr_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
-void av1_highbd_convolve_2d_copy_sr_sse2(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
-void av1_highbd_convolve_2d_copy_sr_avx2(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
-RTCD_EXTERN void (*av1_highbd_convolve_2d_copy_sr)(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+void av1_highbd_convolve_2d_copy_sr_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+void av1_highbd_convolve_2d_copy_sr_sse2(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+void av1_highbd_convolve_2d_copy_sr_avx2(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+RTCD_EXTERN void (*av1_highbd_convolve_2d_copy_sr)(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);

-void av1_highbd_convolve_2d_scale_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int x_step_qn, const int subpel_y_q4, const int y_step_qn, ConvolveParams *conv_params, int bd);
-void av1_highbd_convolve_2d_scale_sse4_1(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int x_step_qn, const int subpel_y_q4, const int y_step_qn, ConvolveParams *conv_params, int bd);
-RTCD_EXTERN void (*av1_highbd_convolve_2d_scale)(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int x_step_qn, const int subpel_y_q4, const int y_step_qn, ConvolveParams *conv_params, int bd);
+void av1_highbd_convolve_2d_scale_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int x_step_qn, const int subpel_y_q4, const int y_step_qn, ConvolveParams *conv_params, int bd);
+void av1_highbd_convolve_2d_scale_sse4_1(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int x_step_qn, const int subpel_y_q4, const int y_step_qn, ConvolveParams *conv_params, int bd);
+RTCD_EXTERN void (*av1_highbd_convolve_2d_scale)(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int x_step_qn, const int subpel_y_q4, const int y_step_qn, ConvolveParams *conv_params, int bd);

-void av1_highbd_convolve_2d_sr_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
-void av1_highbd_convolve_2d_sr_ssse3(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
-void av1_highbd_convolve_2d_sr_avx2(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
-RTCD_EXTERN void (*av1_highbd_convolve_2d_sr)(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+void av1_highbd_convolve_2d_sr_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+void av1_highbd_convolve_2d_sr_ssse3(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+void av1_highbd_convolve_2d_sr_avx2(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+RTCD_EXTERN void (*av1_highbd_convolve_2d_sr)(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);

 void av1_highbd_convolve_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
 #define av1_highbd_convolve_avg av1_highbd_convolve_avg_c
@ -154,15 +154,15 @@ void av1_highbd_convolve_horiz_rs_c(const uint16_t *src, int src_stride, uint16_
 void av1_highbd_convolve_horiz_rs_sse4_1(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const int16_t *x_filters, int x0_qn, int x_step_qn, int bd);
 RTCD_EXTERN void (*av1_highbd_convolve_horiz_rs)(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const int16_t *x_filters, int x0_qn, int x_step_qn, int bd);

-void av1_highbd_convolve_x_sr_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
-void av1_highbd_convolve_x_sr_ssse3(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
-void av1_highbd_convolve_x_sr_avx2(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
-RTCD_EXTERN void (*av1_highbd_convolve_x_sr)(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+void av1_highbd_convolve_x_sr_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+void av1_highbd_convolve_x_sr_ssse3(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+void av1_highbd_convolve_x_sr_avx2(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+RTCD_EXTERN void (*av1_highbd_convolve_x_sr)(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);

-void av1_highbd_convolve_y_sr_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
-void av1_highbd_convolve_y_sr_ssse3(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
-void av1_highbd_convolve_y_sr_avx2(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
-RTCD_EXTERN void (*av1_highbd_convolve_y_sr)(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+void av1_highbd_convolve_y_sr_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+void av1_highbd_convolve_y_sr_ssse3(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+void av1_highbd_convolve_y_sr_avx2(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+RTCD_EXTERN void (*av1_highbd_convolve_y_sr)(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);

 void av1_highbd_dr_prediction_z1_c(uint16_t *dst, ptrdiff_t stride, int bw, int bh, const uint16_t *above, const uint16_t *left, int upsample_above, int dx, int dy, int bd);
 #define av1_highbd_dr_prediction_z1 av1_highbd_dr_prediction_z1_c
@ -179,25 +179,25 @@ void av1_highbd_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int des
 void av1_highbd_iwht4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd);
 #define av1_highbd_iwht4x4_1_add av1_highbd_iwht4x4_1_add_c

-void av1_highbd_jnt_convolve_2d_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
-void av1_highbd_jnt_convolve_2d_sse4_1(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
-void av1_highbd_jnt_convolve_2d_avx2(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
-RTCD_EXTERN void (*av1_highbd_jnt_convolve_2d)(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+void av1_highbd_jnt_convolve_2d_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+void av1_highbd_jnt_convolve_2d_sse4_1(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+void av1_highbd_jnt_convolve_2d_avx2(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+RTCD_EXTERN void (*av1_highbd_jnt_convolve_2d)(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);

-void av1_highbd_jnt_convolve_2d_copy_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
-void av1_highbd_jnt_convolve_2d_copy_sse4_1(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
-void av1_highbd_jnt_convolve_2d_copy_avx2(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
-RTCD_EXTERN void (*av1_highbd_jnt_convolve_2d_copy)(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+void av1_highbd_jnt_convolve_2d_copy_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+void av1_highbd_jnt_convolve_2d_copy_sse4_1(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+void av1_highbd_jnt_convolve_2d_copy_avx2(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+RTCD_EXTERN void (*av1_highbd_jnt_convolve_2d_copy)(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);

-void av1_highbd_jnt_convolve_x_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
-void av1_highbd_jnt_convolve_x_sse4_1(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
-void av1_highbd_jnt_convolve_x_avx2(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
-RTCD_EXTERN void (*av1_highbd_jnt_convolve_x)(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+void av1_highbd_jnt_convolve_x_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+void av1_highbd_jnt_convolve_x_sse4_1(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+void av1_highbd_jnt_convolve_x_avx2(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+RTCD_EXTERN void (*av1_highbd_jnt_convolve_x)(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);

-void av1_highbd_jnt_convolve_y_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
-void av1_highbd_jnt_convolve_y_sse4_1(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
-void av1_highbd_jnt_convolve_y_avx2(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
-RTCD_EXTERN void (*av1_highbd_jnt_convolve_y)(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+void av1_highbd_jnt_convolve_y_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+void av1_highbd_jnt_convolve_y_sse4_1(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+void av1_highbd_jnt_convolve_y_avx2(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+RTCD_EXTERN void (*av1_highbd_jnt_convolve_y)(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);

 void av1_highbd_warp_affine_c(const int32_t *mat, const uint16_t *ref, int width, int height, int stride, uint16_t *pred, int p_col, int p_row, int p_width, int p_height, int p_stride, int subsampling_x, int subsampling_y, int bd, ConvolveParams *conv_params, int16_t alpha, int16_t beta, int16_t gamma, int16_t delta);
 void av1_highbd_warp_affine_sse4_1(const int32_t *mat, const uint16_t *ref, int width, int height, int stride, uint16_t *pred, int p_col, int p_row, int p_width, int p_height, int p_stride, int subsampling_x, int subsampling_y, int bd, ConvolveParams *conv_params, int16_t alpha, int16_t beta, int16_t gamma, int16_t delta);
@ -275,25 +275,25 @@ void av1_inv_txfm_add_ssse3(const tran_low_t *dqcoeff, uint8_t *dst, int stride,
 void av1_inv_txfm_add_avx2(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param);
 RTCD_EXTERN void (*av1_inv_txfm_add)(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param);

-void av1_jnt_convolve_2d_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
-void av1_jnt_convolve_2d_ssse3(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
-void av1_jnt_convolve_2d_avx2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
-RTCD_EXTERN void (*av1_jnt_convolve_2d)(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+void av1_jnt_convolve_2d_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+void av1_jnt_convolve_2d_ssse3(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+void av1_jnt_convolve_2d_avx2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+RTCD_EXTERN void (*av1_jnt_convolve_2d)(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);

-void av1_jnt_convolve_2d_copy_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
-void av1_jnt_convolve_2d_copy_sse2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
-void av1_jnt_convolve_2d_copy_avx2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
-RTCD_EXTERN void (*av1_jnt_convolve_2d_copy)(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+void av1_jnt_convolve_2d_copy_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+void av1_jnt_convolve_2d_copy_sse2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+void av1_jnt_convolve_2d_copy_avx2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+RTCD_EXTERN void (*av1_jnt_convolve_2d_copy)(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);

-void av1_jnt_convolve_x_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
-void av1_jnt_convolve_x_sse2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
-void av1_jnt_convolve_x_avx2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
-RTCD_EXTERN void (*av1_jnt_convolve_x)(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+void av1_jnt_convolve_x_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+void av1_jnt_convolve_x_sse2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+void av1_jnt_convolve_x_avx2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+RTCD_EXTERN void (*av1_jnt_convolve_x)(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);

-void av1_jnt_convolve_y_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
-void av1_jnt_convolve_y_sse2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
-void av1_jnt_convolve_y_avx2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
-RTCD_EXTERN void (*av1_jnt_convolve_y)(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+void av1_jnt_convolve_y_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+void av1_jnt_convolve_y_sse2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+void av1_jnt_convolve_y_avx2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+RTCD_EXTERN void (*av1_jnt_convolve_y)(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);

 void av1_selfguided_restoration_c(const uint8_t *dgd8, int width, int height,
                                  int dgd_stride, int32_t *flt0, int32_t *flt1, int flt_stride,
--- a/media/libaom/sources.mozbuild
+++ b/media/libaom/sources.mozbuild
@ -58,14 +58,15 @@ files = {
    '../../third_party/aom/aom_util/debug_util.c',
    '../../third_party/aom/av1/av1_dx_iface.c',
    '../../third_party/aom/av1/common/alloccommon.c',
+    '../../third_party/aom/av1/common/arm/av1_inv_txfm_neon.c',
    '../../third_party/aom/av1/common/arm/av1_txfm_neon.c',
    '../../third_party/aom/av1/common/arm/blend_a64_hmask_neon.c',
    '../../third_party/aom/av1/common/arm/blend_a64_vmask_neon.c',
    '../../third_party/aom/av1/common/arm/cfl_neon.c',
    '../../third_party/aom/av1/common/arm/convolve_neon.c',
-    '../../third_party/aom/av1/common/arm/intrapred_neon.c',
    '../../third_party/aom/av1/common/arm/jnt_convolve_neon.c',
    '../../third_party/aom/av1/common/arm/reconinter_neon.c',
+    '../../third_party/aom/av1/common/arm/selfguided_neon.c',
    '../../third_party/aom/av1/common/arm/wiener_convolve_neon.c',
    '../../third_party/aom/av1/common/av1_inv_txfm1d.c',
    '../../third_party/aom/av1/common/av1_inv_txfm2d.c',
@ -82,7 +83,6 @@ files = {
    '../../third_party/aom/av1/common/entropy.c',
    '../../third_party/aom/av1/common/entropymode.c',
    '../../third_party/aom/av1/common/entropymv.c',
-    '../../third_party/aom/av1/common/filter.c',
    '../../third_party/aom/av1/common/frame_buffers.c',
    '../../third_party/aom/av1/common/idct.c',
    '../../third_party/aom/av1/common/mvref_common.c',
@ -109,6 +109,8 @@ files = {
    '../../third_party/aom/av1/decoder/dthread.c',
    '../../third_party/aom/av1/decoder/obu.c',
    '../../third_party/aom/av1/encoder/arm/neon/quantize_neon.c',
+    '../../third_party/aom/stats/aomstats.c',
+    '../../third_party/aom/stats/rate_hist.c',
  ],
  'GENERIC_EXPORTS': [
    '../../third_party/aom/aom/aom.h',
@ -174,7 +176,6 @@ files = {
    '../../third_party/aom/av1/common/entropy.c',
    '../../third_party/aom/av1/common/entropymode.c',
    '../../third_party/aom/av1/common/entropymv.c',
-    '../../third_party/aom/av1/common/filter.c',
    '../../third_party/aom/av1/common/frame_buffers.c',
    '../../third_party/aom/av1/common/idct.c',
    '../../third_party/aom/av1/common/mvref_common.c',
@ -200,6 +201,8 @@ files = {
    '../../third_party/aom/av1/decoder/detokenize.c',
    '../../third_party/aom/av1/decoder/dthread.c',
    '../../third_party/aom/av1/decoder/obu.c',
+    '../../third_party/aom/stats/aomstats.c',
+    '../../third_party/aom/stats/rate_hist.c',
  ],
  'IA32_EXPORTS': [
    '../../third_party/aom/aom/aom.h',
@ -298,7 +301,6 @@ files = {
    '../../third_party/aom/av1/common/entropy.c',
    '../../third_party/aom/av1/common/entropymode.c',
    '../../third_party/aom/av1/common/entropymv.c',
-    '../../third_party/aom/av1/common/filter.c',
    '../../third_party/aom/av1/common/frame_buffers.c',
    '../../third_party/aom/av1/common/idct.c',
    '../../third_party/aom/av1/common/mvref_common.c',
@ -361,6 +363,8 @@ files = {
    '../../third_party/aom/av1/decoder/detokenize.c',
    '../../third_party/aom/av1/decoder/dthread.c',
    '../../third_party/aom/av1/decoder/obu.c',
+    '../../third_party/aom/stats/aomstats.c',
+    '../../third_party/aom/stats/rate_hist.c',
  ],
  'X64_EXPORTS': [
    '../../third_party/aom/aom/aom.h',
@ -458,7 +462,6 @@ files = {
    '../../third_party/aom/av1/common/entropy.c',
    '../../third_party/aom/av1/common/entropymode.c',
    '../../third_party/aom/av1/common/entropymv.c',
-    '../../third_party/aom/av1/common/filter.c',
    '../../third_party/aom/av1/common/frame_buffers.c',
    '../../third_party/aom/av1/common/idct.c',
    '../../third_party/aom/av1/common/mvref_common.c',
@ -521,5 +524,7 @@ files = {
    '../../third_party/aom/av1/decoder/detokenize.c',
    '../../third_party/aom/av1/decoder/dthread.c',
    '../../third_party/aom/av1/decoder/obu.c',
+    '../../third_party/aom/stats/aomstats.c',
+    '../../third_party/aom/stats/rate_hist.c',
  ],
 }
--- a/third_party/aom/.cmake-format.py
+++ b/third_party/aom/.cmake-format.py
@ -1,3 +1,4 @@
+# Generated with cmake-format 0.3.6
 # How wide to allow formatted cmake files
 line_width = 80

--- a/third_party/aom/CHANGELOG
+++ b/third_party/aom/CHANGELOG
@ -1,631 +1,5 @@
-Next Release
-  - Incompatible changes:
-    The AV1 encoder's default keyframe interval changed to 128 from 9999.
-    Support for armv6 was removed.
+2018-06-28 v1.0.0
+  AOMedia Codec Workgroup Approved version 1.0

 2016-04-07 v0.1.0 "AOMedia Codec 1"
  This release is the first Alliance for Open Media codec.
-2015-11-09 v1.5.0 "Javan Whistling Duck"
-  This release improves upon the VP9 encoder and speeds up the encoding and
-  decoding processes.
-
-  - Upgrading:
-    This release is ABI incompatible with 1.4.0. It drops deprecated VP8
-    controls and adds a variety of VP9 controls for testing.
-
-    The vpxenc utility now prefers VP9 by default.
-
-  - Enhancements:
-    Faster VP9 encoding and decoding
-    Smaller library size by combining functions used by VP8 and VP9
-
-  - Bug Fixes:
-    A variety of fuzzing issues
-
-2015-04-03 v1.4.0 "Indian Runner Duck"
-  This release includes significant improvements to the VP9 codec.
-
-  - Upgrading:
-    This release is ABI incompatible with 1.3.0. It drops the compatibility
-    layer, requiring VPX_IMG_FMT_* instead of IMG_FMT_*, and adds several codec
-    controls for VP9.
-
-  - Enhancements:
-    Faster VP9 encoding and decoding
-    Multithreaded VP9 decoding (tile and frame-based)
-    Multithreaded VP9 encoding - on by default
-    YUV 4:2:2 and 4:4:4 support in VP9
-    10 and 12bit support in VP9
-    64bit ARM support by replacing ARM assembly with intrinsics
-
-  - Bug Fixes:
-    Fixes a VP9 bitstream issue in Profile 1. This only affected non-YUV 4:2:0
-    files.
-
-  - Known Issues:
-    Frame Parallel decoding fails for segmented and non-420 files.
-
-2013-11-15 v1.3.0 "Forest"
-  This release introduces the VP9 codec in a backward-compatible way.
-  All existing users of VP8 can continue to use the library without
-  modification. However, some VP8 options do not map to VP9 in the same manner.
-
-  The VP9 encoder in this release is not feature complete. Users interested in
-  the encoder are advised to use the git master branch and discuss issues on
-  libvpx mailing lists.
-
-  - Upgrading:
-    This release is ABI and API compatible with Duclair (v1.0.0). Users
-    of older releases should refer to the Upgrading notes in this document
-    for that release.
-
-  - Enhancements:
-      Get rid of bashisms in the main build scripts
-      Added usage info on command line options
-      Add lossless compression mode
-      Dll build of libvpx
-      Add additional Mac OS X targets: 10.7, 10.8 and 10.9 (darwin11-13)
-      Add option to disable documentation
-      configure: add --enable-external-build support
-      make: support V=1 as short form of verbose=yes
-      configure: support mingw-w64
-      configure: support hardfloat armv7 CHOSTS
-      configure: add support for android x86
-      Add estimated completion time to vpxenc
-      Don't exit on decode errors in vpxenc
-      vpxenc: support scaling prior to encoding
-      vpxdec: support scaling output
-      vpxenc: improve progress indicators with --skip
-      msvs: Don't link to winmm.lib
-      Add a new script for producing vcxproj files
-      Produce Visual Studio 10 and 11 project files
-      Produce Windows Phone project files
-      msvs-build: use msbuild for vs >= 2005
-      configure: default configure log to config.log
-      Add encoding option --static-thresh
-
-  - Speed:
-      Miscellaneous speed optimizations for VP8 and VP9.
-
-  - Quality:
-      In general, quality is consistent with the Eider release.
-
-  - Bug Fixes:
-      This release represents approximately a year of engineering effort,
-      and contains multiple bug fixes. Please refer to git history for details.
-
-
-2012-12-21 v1.2.0
-  This release acts as a checkpoint for a large amount of internal refactoring
-  and testing. It also contains a number of small bugfixes, so all users are
-  encouraged to upgrade.
-
-  - Upgrading:
-    This release is ABI and API compatible with Duclair (v1.0.0). Users
-    of older releases should refer to the Upgrading notes in this
-    document for that release.
-
-  - Enhancements:
-      VP8 optimizations for MIPS dspr2
-      vpxenc: add -quiet option
-
-  - Speed:
-      Encoder and decoder speed is consistent with the Eider release.
-
-  - Quality:
-      In general, quality is consistent with the Eider release.
-
-      Minor tweaks to ARNR filtering
-      Minor improvements to real time encoding with multiple temporal layers
-
-  - Bug Fixes:
-      Fixes multithreaded encoder race condition in loopfilter
-      Fixes multi-resolution threaded encoding
-      Fix potential encoder dead-lock after picture resize
-
-
-2012-05-09 v1.1.0 "Eider"
-  This introduces a number of enhancements, mostly focused on real-time
-  encoding. In addition, it fixes a decoder bug (first introduced in
-  Duclair) so all users of that release are encouraged to upgrade.
-
-  - Upgrading:
-    This release is ABI and API compatible with Duclair (v1.0.0). Users
-    of older releases should refer to the Upgrading notes in this
-    document for that release.
-
-    This release introduces a new temporal denoiser, controlled by the
-    VP8E_SET_NOISE_SENSITIVITY control. The temporal denoiser does not
-    currently take a strength parameter, so the control is effectively
-    a boolean - zero (off) or non-zero (on). For compatibility with
-    existing applications, the values accepted are the same as those
-    for the spatial denoiser (0-6). The temporal denoiser is enabled
-    by default, and the older spatial denoiser may be restored by
-    configuring with --disable-temporal-denoising. The temporal denoiser
-    is more computationally intensive than the spatial one.
-
-    This release removes support for a legacy, decode only API that was
-    supported, but deprecated, at the initial release of libvpx
-    (v0.9.0). This is not expected to have any impact. If you are
-    impacted, you can apply a reversion to commit 2bf8fb58 locally.
-    Please update to the latest libvpx API if you are affected.
-
-  - Enhancements:
-      Adds a motion compensated temporal denoiser to the encoder, which
-      gives higher quality than the older spatial denoiser. (See above
-      for notes on upgrading).
-
-      In addition, support for new compilers and platforms were added,
-      including:
-        improved support for XCode
-        Android x86 NDK build
-        OS/2 support
-        SunCC support
-
-      Changing resolution with vpx_codec_enc_config_set() is now
-      supported. Previously, reinitializing the codec was required to
-      change the input resolution.
-
-      The vpxenc application has initial support for producing multiple
-      encodes from the same input in one call. Resizing is not yet
-      supported, but varying other codec parameters is. Use -- to
-      delineate output streams. Options persist from one stream to the
-      next.
-
-      Also, the vpxenc application will now use a keyframe interval of
-      5 seconds by default. Use the --kf-max-dist option to override.
-
-  - Speed:
-      Decoder performance improved 2.5% versus Duclair. Encoder speed is
-      consistent with Duclair for most material. Two pass encoding of
-      slideshow-like material will see significant improvements.
-
-      Large realtime encoding speed gains at a small quality expense are
-      possible by configuring the on-the-fly bitpacking experiment with
-      --enable-onthefly-bitpacking. Realtime encoder can be up to 13%
-      faster (ARM) depending on the number of threads and bitrate
-      settings. This technique sees constant gain over the 5-16 speed
-      range. For VC style input the loss seen is up to 0.2dB. See commit
-      52cf4dca for further details.
-
-  - Quality:
-      On the whole, quality is consistent with the Duclair release. Some
-      tweaks:
-
-        Reduced blockiness in easy sections by applying a penalty to
-        intra modes.
-
-        Improved quality of static sections (like slideshows) with
-        two pass encoding.
-
-        Improved keyframe sizing with multiple temporal layers
-
-  - Bug Fixes:
-      Corrected alt-ref contribution to frame rate for visible updates
-      to the alt-ref buffer. This affected applications making manual
-      usage of the frame reference flags, or temporal layers.
-
-      Additional constraints were added to disable multi-frame quality
-      enhancement (MFQE) in sections of the frame where there is motion.
-      (#392)
-
-      Fixed corruption issues when vpx_codec_enc_config_set() was called
-      with spatial resampling enabled.
-
-      Fixed a decoder error introduced in Duclair where the segmentation
-      map was not being reinitialized on keyframes (#378)
-
-
-2012-01-27 v1.0.0 "Duclair"
-  Our fourth named release, focused on performance and features related to
-  real-time encoding. It also fixes a decoder crash bug introduced in
-  v0.9.7, so all users of that release are encouraged to upgrade.
-
-  - Upgrading:
-      This release is ABI incompatible with prior releases of libvpx, so the
-      "major" version number has been bumped to 1. You must recompile your
-      applications against the latest version of the libvpx headers. The
-      API remains compatible, and this should not require code changes in most
-      applications.
-
-  - Enhancements:
-      This release introduces several substantial new features to the encoder,
-      of particular interest to real time streaming applications.
-
-      Temporal scalability allows the encoder to produce a stream that can
-      be decimated to different frame rates, with independent rate targetting
-      for each substream.
-
-      Multiframe quality enhancement postprocessing can make visual quality
-      more consistent in the presence of frames that are substantially
-      different quality than the surrounding frames, as in the temporal
-      scalability case and in some forced keyframe scenarios.
-
-      Multiple-resolution encoding support allows the encoding of the
-      same content at different resolutions faster than encoding them
-      separately.
-
-  - Speed:
-      Optimization targets for this release included the decoder and the real-
-      time modes of the encoder. Decoder speed on x86 has improved 10.5% with
-      this release. Encoder improvements followed a curve where speeds 1-3
-      improved 4.0%-1.5%, speeds 4-8 improved <1%, and speeds 9-16 improved
-      1.5% to 10.5%, respectively. "Best" mode speed is consistent with the
-      Cayuga release.
-
-  - Quality:
-      Encoder quality in the single stream case is consistent with the Cayuga
-      release.
-
-  - Bug Fixes:
-      This release fixes an OOB read decoder crash bug present in v0.9.7
-      related to the clamping of motion vectors in SPLITMV blocks. This
-      behavior could be triggered by corrupt input or by starting
-      decoding from a P-frame.
-
-
-2011-08-15 v0.9.7-p1 "Cayuga" patch 1
-  This is an incremental bugfix release against Cayuga. All users of that
-  release are strongly encouraged to upgrade.
-
-    - Fix potential OOB reads (cdae03a)
-
-          An unbounded out of bounds read was discovered when the
-          decoder was requested to perform error concealment (new in
-          Cayuga) given a frame with corrupt partition sizes.
-
-          A bounded out of bounds read was discovered affecting all
-          versions of libvpx. Given an multipartition input frame that
-          is truncated between the mode/mv partition and the first
-          residiual paritition (in the block of partition offsets), up
-          to 3 extra bytes could have been read from the source buffer.
-          The code will not take any action regardless of the contents
-          of these undefined bytes, as the truncated buffer is detected
-          immediately following the read based on the calculated
-          starting position of the coefficient partition.
-
-    - Fix potential error concealment crash when the very first frame
-      is missing or corrupt (a609be5)
-
-    - Fix significant artifacts in error concealment (a4c2211, 99d870a)
-
-    - Revert 1-pass CBR rate control changes (e961317)
-      Further testing showed this change produced undesirable visual
-      artifacts, rolling back for now.
-
-
-2011-08-02 v0.9.7 "Cayuga"
-  Our third named release, focused on a faster, higher quality, encoder.
-
-  - Upgrading:
-    This release is backwards compatible with Aylesbury (v0.9.5) and
-    Bali (v0.9.6). Users of older releases should refer to the Upgrading
-    notes in this document for that release.
-
-  - Enhancements:
-          Stereo 3D format support for vpxenc
-          Runtime detection of available processor cores.
-          Allow specifying --end-usage by enum name
-          vpxdec: test for frame corruption
-          vpxenc: add quantizer histogram display
-          vpxenc: add rate histogram display
-          Set VPX_FRAME_IS_DROPPABLE
-          update configure for ios sdk 4.3
-          Avoid text relocations in ARM vp8 decoder
-          Generate a vpx.pc file for pkg-config.
-          New ways of passing encoded data between encoder and decoder.
-
-  - Speed:
-      This release includes across-the-board speed improvements to the
-      encoder. On x86, these measure at approximately 11.5% in Best mode,
-      21.5% in Good mode (speed 0), and 22.5% in Realtime mode (speed 6).
-      On ARM Cortex A9 with Neon extensions, real-time encoding of video
-      telephony content is 35% faster than Bali on single core and 48%
-      faster on multi-core. On the NVidia Tegra2 platform, real time
-      encoding is 40% faster than Bali.
-
-      Decoder speed was not a priority for this release, but improved
-      approximately 8.4% on x86.
-
-          Reduce motion vector search on alt-ref frame.
-          Encoder loopfilter running in its own thread
-          Reworked loopfilter to precalculate more parameters
-          SSE2/SSSE3 optimizations for build_predictors_mbuv{,_s}().
-          Make hor UV predict ~2x faster (73 vs 132 cycles) using SSSE3.
-          Removed redundant checks
-          Reduced structure sizes
-          utilize preload in ARMv6 MC/LPF/Copy routines
-          ARM optimized quantization, dfct, variance, subtract
-          Increase chrow row alignment to 16 bytes.
-          disable trellis optimization for first pass
-          Write SSSE3 sub-pixel filter function
-          Improve SSE2 half-pixel filter funtions
-          Add vp8_sub_pixel_variance16x8_ssse3 function
-          Reduce unnecessary distortion computation
-          Use diamond search to replace full search
-          Preload reference area in sub-pixel motion search (real-time mode)
-
-  - Quality:
-      This release focused primarily on one-pass use cases, including
-      video conferencing. Low latency data rate control was significantly
-      improved, improving streamability over bandwidth constrained links.
-      Added support for error concealment, allowing frames to maintain
-      visual quality in the presence of substantial packet loss.
-
-          Add rc_max_intra_bitrate_pct control
-          Limit size of initial keyframe in one-pass.
-          Improve framerate adaptation
-          Improved 1-pass CBR rate control
-          Improved KF insertion after fades to still.
-          Improved key frame detection.
-          Improved activity masking (lower PSNR impact for same SSIM boost)
-          Improved interaction between GF and ARFs
-          Adding error-concealment to the decoder.
-          Adding support for independent partitions
-          Adjusted rate-distortion constants
-
-
-  - Bug Fixes:
-          Removed firstpass motion map
-          Fix parallel make install
-          Fix multithreaded encoding for 1 MB wide frame
-          Fixed iwalsh_neon build problems with RVDS4.1
-          Fix semaphore emulation, spin-wait intrinsics on Windows
-          Fix build with xcode4 and simplify GLOBAL.
-          Mark ARM asm objects as allowing a non-executable stack.
-          Fix vpxenc encoding incorrect webm file header on big endian
-
-
-2011-03-07 v0.9.6 "Bali"
-  Our second named release, focused on a faster, higher quality, encoder.
-
-  - Upgrading:
-    This release is backwards compatible with Aylesbury (v0.9.5). Users
-    of older releases should refer to the Upgrading notes in this
-    document for that release.
-
-  - Enhancements:
-      vpxenc --psnr shows a summary when encode completes
-      --tune=ssim option to enable activity masking
-      improved postproc visualizations for development
-      updated support for Apple iOS to SDK 4.2
-      query decoder to determine which reference frames were updated
-      implemented error tracking in the decoder
-      fix pipe support on windows
-
-  - Speed:
-      Primary focus was on good quality mode, speed 0. Average improvement
-      on x86 about 40%, up to 100% on user-generated content at that speed.
-      Best quality mode speed improved 35%, and realtime speed 10-20%. This
-      release also saw significant improvement in realtime encoding speed
-      on ARM platforms.
-
-        Improved encoder threading
-        Dont pick encoder filter level when loopfilter is disabled.
-        Avoid double copying of key frames into alt and golden buffer
-        FDCT optimizations.
-        x86 sse2 temporal filter
-        SSSE3 version of fast quantizer
-        vp8_rd_pick_best_mbsegmentation code restructure
-        Adjusted breakout RD for SPLITMV
-        Changed segmentation check order
-        Improved rd_pick_intra4x4block
-        Adds armv6 optimized variance calculation
-        ARMv6 optimized sad16x16
-        ARMv6 optimized half pixel variance calculations
-        Full search SAD function optimization in SSE4.1
-        Improve MV prediction accuracy to achieve performance gain
-        Improve MV prediction in vp8_pick_inter_mode() for speed>3
-
-  - Quality:
-      Best quality mode improved PSNR 6.3%, and SSIM 6.1%. This release
-      also includes support for "activity masking," which greatly improves
-      SSIM at the expense of PSNR. For now, this feature is available with
-      the --tune=ssim option. Further experimentation in this area
-      is ongoing. This release also introduces a new rate control mode
-      called "CQ," which changes the allocation of bits within a clip to
-      the sections where they will have the most visual impact.
-
-        Tuning for the more exact quantizer.
-        Relax rate control for last few frames
-        CQ Mode
-        Limit key frame quantizer for forced key frames.
-        KF/GF Pulsing
-        Add simple version of activity masking.
-        make rdmult adaptive for intra in quantizer RDO
-        cap the best quantizer for 2nd order DC
-        change the threshold of DC check for encode breakout
-
-  - Bug Fixes:
-      Fix crash on Sparc Solaris.
-      Fix counter of fixed keyframe distance
-      ARNR filter pointer update bug fix
-      Fixed use of motion percentage in KF/GF group calc
-      Changed condition for using RD in Intra Mode
-      Fix encoder real-time only configuration.
-      Fix ARM encoder crash with multiple token partitions
-      Fixed bug first cluster timecode of webm file is wrong.
-      Fixed various encoder bugs with odd-sized images
-      vp8e_get_preview fixed when spatial resampling enabled
-      quantizer: fix assertion in fast quantizer path
-      Allocate source buffers to be multiples of 16
-      Fix for manual Golden frame frequency
-      Fix drastic undershoot in long form content
-
-
-2010-10-28 v0.9.5 "Aylesbury"
-  Our first named release, focused on a faster decoder, and a better encoder.
-
-  - Upgrading:
-    This release incorporates backwards-incompatible changes to the
-    ivfenc and ivfdec tools. These tools are now called vpxenc and vpxdec.
-
-    vpxdec
-      * the -q (quiet) option has been removed, and replaced with
-        -v (verbose). the output is quiet by default. Use -v to see
-        the version number of the binary.
-
-      * The default behavior is now to write output to a single file
-        instead of individual frames. The -y option has been removed.
-        Y4M output is the default.
-
-      * For raw I420/YV12 output instead of Y4M, the --i420 or --yv12
-        options must be specified.
-
-          $ ivfdec -o OUTPUT INPUT
-          $ vpxdec --i420 -o OUTPUT INPUT
-
-      * If an output file is not specified, the default is to write
-        Y4M to stdout. This makes piping more natural.
-
-          $ ivfdec -y -o - INPUT | ...
-          $ vpxdec INPUT | ...
-
-      * The output file has additional flexibility for formatting the
-        filename. It supports escape characters for constructing a
-        filename from the width, height, and sequence number. This
-        replaces the -p option. To get the equivalent:
-
-          $ ivfdec -p frame INPUT
-          $ vpxdec --i420 -o frame-%wx%h-%4.i420 INPUT
-
-    vpxenc
-      * The output file must be specified with -o, rather than as the
-        last argument.
-
-          $ ivfenc <options> INPUT OUTPUT
-          $ vpxenc <options> -o OUTPUT INPUT
-
-      * The output defaults to webm. To get IVF output, use the --ivf
-        option.
-
-          $ ivfenc <options> INPUT OUTPUT.ivf
-          $ vpxenc <options> -o OUTPUT.ivf --ivf INPUT
-
-
-  - Enhancements:
-      ivfenc and ivfdec have been renamed to vpxenc, vpxdec.
-      vpxdec supports .webm input
-      vpxdec writes .y4m by default
-      vpxenc writes .webm output by default
-      vpxenc --psnr now shows the average/overall PSNR at the end
-      ARM platforms now support runtime cpu detection
-      vpxdec visualizations added for motion vectors, block modes, references
-      vpxdec now silent by default
-      vpxdec --progress shows frame-by-frame timing information
-      vpxenc supports the distinction between --fps and --timebase
-      NASM is now a supported assembler
-      configure: enable PIC for shared libs by default
-      configure: add --enable-small
-      configure: support for ppc32-linux-gcc
-      configure: support for sparc-solaris-gcc
-
-  - Bugs:
-      Improve handling of invalid frames
-      Fix valgrind errors in the NEON loop filters.
-      Fix loopfilter delta zero transitions
-      Fix valgrind errors in vp8_sixtap_predict8x4_armv6().
-      Build fixes for darwin-icc
-
-  - Speed:
-      20-40% (average 28%) improvement in libvpx decoder speed,
-      including:
-        Rewrite vp8_short_walsh4x4_sse2()
-        Optimizations on the loopfilters.
-        Miscellaneous improvements for Atom
-        Add 4-tap version of 2nd-pass ARMv6 MC filter.
-        Improved multithread utilization
-        Better instruction choices on x86
-        reorder data to use wider instructions
-        Update NEON wide idcts
-        Make block access to frame buffer sequential
-        Improved subset block search
-        Bilinear subpixel optimizations for ssse3.
-        Decrease memory footprint
-
-      Encoder speed improvements (percentage gain not measured):
-        Skip unnecessary search of identical frames
-        Add SSE2 subtract functions
-        Improve bounds checking in vp8_diamond_search_sadx4()
-        Added vp8_fast_quantize_b_sse2
-
-  - Quality:
-      Over 7% overall PSNR improvement (6.3% SSIM) in "best" quality
-      encoding mode, and up to 60% improvement on very noisy, still
-      or slow moving source video
-
-        Motion compensated temporal filter for Alt-Ref Noise Reduction
-        Improved use of trellis quantization on 2nd order Y blocks
-        Tune effect of motion on KF/GF boost in two pass
-        Allow coefficient optimization for good quality speed 0.
-        Improved control of active min quantizer for two pass.
-        Enable ARFs for non-lagged compress
-
-2010-09-02 v0.9.2
-  - Enhancements:
-      Disable frame dropping by default
-      Improved multithreaded performance
-      Improved Force Key Frame Behaviour
-      Increased rate control buffer level precision
-      Fix bug in 1st pass motion compensation
-      ivfenc: correct fixed kf interval, --disable-kf
-  - Speed:
-      Changed above and left context data layout
-      Rework idct calling structure.
-      Removed unnecessary MB_MODE_INFO copies
-      x86: SSSE3 sixtap prediction
-      Reworked IDCT to include reconstruction (add) step
-      Swap alt/gold/new/last frame buffer ptrs instead of copying.
-      Improve SSE2 loopfilter functions
-      Change bitreader to use a larger window.
-      Avoid loopfilter reinitialization when possible
-  - Quality:
-      Normalize quantizer's zero bin and rounding factors
-      Add trellis quantization.
-      Make the quantizer exact.
-      Updates to ARNR filtering algorithm
-      Fix breakout thresh computation for golden & AltRef frames
-      Redo the forward 4x4 dct
-      Improve the accuracy of forward walsh-hadamard transform
-      Further adjustment of RD behaviour with Q and Zbin.
-  - Build System:
-      Allow linking of libs built with MinGW to MSVC
-      Fix target auto-detection on mingw32
-      Allow --cpu= to work for x86.
-      configure: pass original arguments through to make dist
-      Fix builds without runtime CPU detection
-      msvs: fix install of codec sources
-      msvs: Change devenv.com command line for better msys support
-      msvs: Add vs9 targets.
-      Add x86_64-linux-icc target
-  - Bugs:
-      Potential crashes on older MinGW builds
-      Fix two-pass framrate for Y4M input.
-      Fixed simple loop filter, other crashes on ARM v6
-      arm: fix missing dependency with --enable-shared
-      configure: support directories containing .o
-      Replace pinsrw (SSE) with MMX instructions
-      apple: include proper mach primatives
-      Fixed rate control bug with long key frame interval.
-      Fix DSO link errors on x86-64 when not using a version script
-      Fixed buffer selection for UV in AltRef filtering
-
-
-2010-06-17 v0.9.1
-  - Enhancements:
-      * ivfenc/ivfdec now support YUV4MPEG2 input and pipe I/O
-      * Speed optimizations
-  - Bugfixes:
-      * Rate control
-      * Prevent out-of-bounds accesses on invalid data
-  - Build system updates:
-      * Detect toolchain to be used automatically for native builds
-      * Support building shared libraries
-      * Better autotools emulation (--prefix, --libdir, DESTDIR)
-  - Updated LICENSE
-      * http://webmproject.blogspot.com/2010/06/changes-to-webm-open-source-license.html
-
-
-2010-05-18 v0.9.0
-  - Initial open source release. Welcome to WebM and VP8!
-
--- a/third_party/aom/CMakeLists.txt
+++ b/third_party/aom/CMakeLists.txt
@ -186,11 +186,9 @@ list(APPEND AOM_ENCODER_APP_UTIL_SOURCES
            "${AOM_ROOT}/examples/encoder_util.h"
            "${AOM_ROOT}/examples/encoder_util.c")

-if (ENABLE_EXAMPLES)
-  list(APPEND AOM_ENCODER_STATS_SOURCES "${AOM_ROOT}/stats/aomstats.c"
-              "${AOM_ROOT}/stats/aomstats.h" "${AOM_ROOT}/stats/rate_hist.c"
-              "${AOM_ROOT}/stats/rate_hist.h")
-endif ()
+list(APPEND AOM_ENCODER_STATS_SOURCES "${AOM_ROOT}/stats/aomstats.c"
+            "${AOM_ROOT}/stats/aomstats.h" "${AOM_ROOT}/stats/rate_hist.c"
+            "${AOM_ROOT}/stats/rate_hist.h")

 list(APPEND AOM_PKG_CONFIG_SOURCES "${AOM_CONFIG_DIR}/aom.pc")

--- a/third_party/aom/aom/aomcx.h
+++ b/third_party/aom/aom/aomcx.h
@ -854,6 +854,12 @@ enum aome_enc_control_id {
  /*!\brief Codec control function to set the path to the film grain parameters
   */
  AV1E_SET_FILM_GRAIN_TABLE,
+
+  /*!\brief Sets the noise level */
+  AV1E_SET_DENOISE_NOISE_LEVEL,
+
+  /*!\brief Sets the denoiser's block size */
+  AV1E_SET_DENOISE_BLOCK_SIZE,
 };

 /*!\brief aom 1-D scaling mode
@ -1165,6 +1171,14 @@ AOM_CTRL_USE_TYPE(AV1E_SET_FILM_GRAIN_TABLE, const char *)
 AOM_CTRL_USE_TYPE(AV1E_SET_CDF_UPDATE_MODE, int)
 #define AOM_CTRL_AV1E_SET_CDF_UPDATE_MODE

+#ifdef CONFIG_DENOISE  // NOTE(review): aom_config.h defines this as 0, so #ifdef tests definedness, not the value - confirm intent
+AOM_CTRL_USE_TYPE(AV1E_SET_DENOISE_NOISE_LEVEL, int)
+#define AOM_CTRL_AV1E_SET_DENOISE_NOISE_LEVEL
+
+AOM_CTRL_USE_TYPE(AV1E_SET_DENOISE_BLOCK_SIZE, unsigned int)
+#define AOM_CTRL_AV1E_SET_DENOISE_BLOCK_SIZE
+#endif  // CONFIG_DENOISE
+
 /*!\endcond */
 /*! @} - end defgroup aom_encoder */
 #ifdef __cplusplus
--- a/third_party/aom/aom/aomdx.h
+++ b/third_party/aom/aom/aomdx.h
@ -119,6 +119,12 @@ enum aom_dec_control_id {
  /** control function to get the bit depth of the stream. */
  AV1D_GET_BIT_DEPTH,

+  /** control function to get the image format of the stream. */
+  AV1D_GET_IMG_FORMAT,
+
+  /** control function to get the size of the tile. */
+  AV1D_GET_TILE_SIZE,
+
  /** control function to set the byte alignment of the planes in the reference
   * buffers. Valid values are power of 2, from 32 to 1024. A value of 0 sets
   * legacy alignment. I.e. Y plane is aligned to 32 bytes, U plane directly
@ -187,6 +193,12 @@ enum aom_dec_control_id {
   */
  AV1D_EXT_TILE_DEBUG,

+  /** control function to enable the row based multi-threading of decoding. A
+   * value that is equal to 1 indicates that row based multi-threading is
+   * enabled.
+   */
+  AV1D_SET_ROW_MT,
+
  /** control function to indicate whether bitstream is in Annex-B format. */
  AV1D_SET_IS_ANNEXB,

@ -238,6 +250,10 @@ AOM_CTRL_USE_TYPE(AV1D_GET_DISPLAY_SIZE, int *)
 #define AOM_CTRL_AV1D_GET_DISPLAY_SIZE
 AOM_CTRL_USE_TYPE(AV1D_GET_BIT_DEPTH, unsigned int *)
 #define AOM_CTRL_AV1D_GET_BIT_DEPTH
+AOM_CTRL_USE_TYPE(AV1D_GET_IMG_FORMAT, aom_img_fmt_t *)
+#define AOM_CTRL_AV1D_GET_IMG_FORMAT
+AOM_CTRL_USE_TYPE(AV1D_GET_TILE_SIZE, unsigned int *)
+#define AOM_CTRL_AV1D_GET_TILE_SIZE
 AOM_CTRL_USE_TYPE(AV1D_GET_FRAME_SIZE, int *)
 #define AOM_CTRL_AV1D_GET_FRAME_SIZE
 AOM_CTRL_USE_TYPE(AV1_INVERT_TILE_DECODE_ORDER, int)
@ -258,6 +274,8 @@ AOM_CTRL_USE_TYPE(AV1D_SET_EXT_REF_PTR, av1_ext_ref_frame_t *)
 #define AOM_CTRL_AV1D_SET_EXT_REF_PTR
 AOM_CTRL_USE_TYPE(AV1D_EXT_TILE_DEBUG, unsigned int)
 #define AOM_CTRL_AV1D_EXT_TILE_DEBUG
+AOM_CTRL_USE_TYPE(AV1D_SET_ROW_MT, unsigned int)
+#define AOM_CTRL_AV1D_SET_ROW_MT
 AOM_CTRL_USE_TYPE(AV1D_SET_IS_ANNEXB, unsigned int)
 #define AOM_CTRL_AV1D_SET_IS_ANNEXB
 AOM_CTRL_USE_TYPE(AV1D_SET_OPERATING_POINT, int)
--- a/third_party/aom/aom/internal/aom_codec_internal.h
+++ b/third_party/aom/aom/internal/aom_codec_internal.h
@ -417,7 +417,7 @@ struct aom_internal_error_info {
  aom_codec_err_t error_code;
  int has_detail;
  char detail[80];
-  int setjmp;
+  int setjmp;  // Boolean: whether 'jmp' is valid.
  jmp_buf jmp;
 };

--- a/third_party/aom/aom_dsp/aom_dsp.cmake
+++ b/third_party/aom/aom_dsp/aom_dsp.cmake
@ -83,6 +83,7 @@ list(APPEND AOM_DSP_COMMON_INTRIN_SSE4_1
 list(APPEND AOM_DSP_COMMON_INTRIN_AVX2
            "${AOM_ROOT}/aom_dsp/x86/aom_subpixel_8t_intrin_avx2.c"
            "${AOM_ROOT}/aom_dsp/x86/common_avx2.h"
+            "${AOM_ROOT}/aom_dsp/x86/txfm_common_avx2.h"
            "${AOM_ROOT}/aom_dsp/x86/convolve_avx2.h"
            "${AOM_ROOT}/aom_dsp/x86/fft_avx2.c"
            "${AOM_ROOT}/aom_dsp/x86/highbd_convolve_avx2.c"
@ -190,13 +191,16 @@ if(CONFIG_AV1_ENCODER)
              "${AOM_ROOT}/aom_dsp/x86/ssim_opt_x86_64.asm")

  list(APPEND AOM_DSP_ENCODER_INTRIN_AVX2
+              "${AOM_ROOT}/aom_dsp/x86/masked_sad_intrin_avx2.c"
+              "${AOM_ROOT}/aom_dsp/x86/subtract_avx2.c"
              "${AOM_ROOT}/aom_dsp/x86/highbd_quantize_intrin_avx2.c"
              "${AOM_ROOT}/aom_dsp/x86/sad4d_avx2.c"
              "${AOM_ROOT}/aom_dsp/x86/sad_avx2.c"
              "${AOM_ROOT}/aom_dsp/x86/sad_highbd_avx2.c"
              "${AOM_ROOT}/aom_dsp/x86/sad_impl_avx2.c"
              "${AOM_ROOT}/aom_dsp/x86/variance_avx2.c"
-              "${AOM_ROOT}/aom_dsp/x86/variance_impl_avx2.c")
+              "${AOM_ROOT}/aom_dsp/x86/variance_impl_avx2.c"
+              "${AOM_ROOT}/aom_dsp/x86/obmc_sad_avx2.c")

  list(APPEND AOM_DSP_ENCODER_ASM_SSSE3_X86_64
              "${AOM_ROOT}/aom_dsp/x86/quantize_ssse3_x86_64.asm")
@ -205,9 +209,11 @@ if(CONFIG_AV1_ENCODER)
              "${AOM_ROOT}/aom_dsp/x86/quantize_avx_x86_64.asm")

  list(APPEND AOM_DSP_ENCODER_INTRIN_SSSE3
+              "${AOM_ROOT}/aom_dsp/x86/masked_sad_intrin_ssse3.h"
              "${AOM_ROOT}/aom_dsp/x86/masked_sad_intrin_ssse3.c"
              "${AOM_ROOT}/aom_dsp/x86/masked_variance_intrin_ssse3.h"
              "${AOM_ROOT}/aom_dsp/x86/masked_variance_intrin_ssse3.c"
+              "${AOM_ROOT}/aom_dsp/x86/variance_impl_ssse3.c"
              "${AOM_ROOT}/aom_dsp/x86/jnt_variance_ssse3.c"
              "${AOM_ROOT}/aom_dsp/x86/jnt_sad_ssse3.c")

--- a/third_party/aom/aom_dsp/aom_dsp_rtcd.c
+++ b/third_party/aom/aom_dsp/aom_dsp_rtcd.c
@ -15,4 +15,4 @@

 #include "aom_ports/aom_once.h"

-void aom_dsp_rtcd() { once(setup_rtcd_internal); }
+void aom_dsp_rtcd() { aom_once(setup_rtcd_internal); }
--- a/third_party/aom/aom_dsp/aom_dsp_rtcd_defs.pl
+++ b/third_party/aom/aom_dsp/aom_dsp_rtcd_defs.pl
@ -377,7 +377,7 @@ add_proto qw/void aom_lpf_vertical_14_dual/, "uint8_t *s, int pitch, const uint8
 specialize qw/aom_lpf_vertical_14_dual sse2/;

 add_proto qw/void aom_lpf_vertical_6/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh";
-specialize qw/aom_lpf_vertical_6 sse2/;
+specialize qw/aom_lpf_vertical_6 sse2 neon/;

 add_proto qw/void aom_lpf_vertical_8/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh";
 specialize qw/aom_lpf_vertical_8 sse2 neon/;
@ -386,13 +386,13 @@ add_proto qw/void aom_lpf_vertical_8_dual/, "uint8_t *s, int pitch, const uint8_
 specialize qw/aom_lpf_vertical_8_dual sse2/;

 add_proto qw/void aom_lpf_vertical_4/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh";
-specialize qw/aom_lpf_vertical_4 sse2/;
+specialize qw/aom_lpf_vertical_4 sse2 neon/;

 add_proto qw/void aom_lpf_vertical_4_dual/, "uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1";
 specialize qw/aom_lpf_vertical_4_dual sse2/;

 add_proto qw/void aom_lpf_horizontal_14/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh";
-specialize qw/aom_lpf_horizontal_14 sse2/;
+specialize qw/aom_lpf_horizontal_14 sse2 neon/;

 add_proto qw/void aom_lpf_horizontal_14_dual/, "uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1";
 specialize qw/aom_lpf_horizontal_14_dual sse2/;
@ -410,7 +410,7 @@ add_proto qw/void aom_lpf_horizontal_8_dual/, "uint8_t *s, int pitch, const uint
 specialize qw/aom_lpf_horizontal_8_dual sse2/;

 add_proto qw/void aom_lpf_horizontal_4/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh";
-specialize qw/aom_lpf_horizontal_4 sse2/;
+specialize qw/aom_lpf_horizontal_4 sse2 neon/;

 add_proto qw/void aom_lpf_horizontal_4_dual/, "uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1";
 specialize qw/aom_lpf_horizontal_4_dual sse2/;
@ -564,7 +564,7 @@ if (aom_config("CONFIG_AV1_ENCODER") eq "yes") {
  # Block subtraction
  #
  add_proto qw/void aom_subtract_block/, "int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride";
-  specialize qw/aom_subtract_block neon msa sse2/;
+  specialize qw/aom_subtract_block neon msa sse2 avx2/;

  add_proto qw/void aom_highbd_subtract_block/, "int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride, int bd";
  specialize qw/aom_highbd_subtract_block sse2/;
@ -732,14 +732,14 @@ if (aom_config("CONFIG_AV1_ENCODER") eq "yes") {
  foreach (@block_sizes) {
    ($w, $h) = @$_;
    add_proto qw/unsigned int/, "aom_masked_sad${w}x${h}", "const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, const uint8_t *second_pred, const uint8_t *msk, int msk_stride, int invert_mask";
-    specialize "aom_masked_sad${w}x${h}", qw/ssse3/;
+    specialize "aom_masked_sad${w}x${h}", qw/ssse3 avx2/;
  }


    foreach (@block_sizes) {
      ($w, $h) = @$_;
      add_proto qw/unsigned int/, "aom_highbd_masked_sad${w}x${h}", "const uint8_t *src8, int src_stride, const uint8_t *ref8, int ref_stride, const uint8_t *second_pred8, const uint8_t *msk, int msk_stride, int invert_mask";
-      specialize "aom_highbd_masked_sad${w}x${h}", qw/ssse3/;
+      specialize "aom_highbd_masked_sad${w}x${h}", qw/ssse3 avx2/;
    }


@ -750,7 +750,7 @@ if (aom_config("CONFIG_AV1_ENCODER") eq "yes") {
    ($w, $h) = @$_;
    add_proto qw/unsigned int/, "aom_obmc_sad${w}x${h}", "const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask";
    if (! (($w == 128 && $h == 32) || ($w == 32 && $h == 128))) {
-       specialize "aom_obmc_sad${w}x${h}", qw/sse4_1/;
+       specialize "aom_obmc_sad${w}x${h}", qw/sse4_1 avx2/;
    }
  }

@ -759,7 +759,7 @@ if (aom_config("CONFIG_AV1_ENCODER") eq "yes") {
      ($w, $h) = @$_;
      add_proto qw/unsigned int/, "aom_highbd_obmc_sad${w}x${h}", "const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask";
      if (! (($w == 128 && $h == 32) || ($w == 32 && $h == 128))) {
-        specialize "aom_highbd_obmc_sad${w}x${h}", qw/sse4_1/;
+        specialize "aom_highbd_obmc_sad${w}x${h}", qw/sse4_1 avx2/;
      }
    }

@ -1102,6 +1102,7 @@ if (aom_config("CONFIG_AV1_ENCODER") eq "yes") {
    add_proto qw/unsigned int/, "aom_obmc_variance${w}x${h}", "const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse";
    add_proto qw/unsigned int/, "aom_obmc_sub_pixel_variance${w}x${h}", "const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse";
    specialize "aom_obmc_variance${w}x${h}", q/sse4_1/;
+    specialize "aom_obmc_sub_pixel_variance${w}x${h}", q/sse4_1/;
  }


@ -1539,9 +1540,7 @@ if (aom_config("CONFIG_AV1_ENCODER") eq "yes") {
  specialize qw/aom_comp_mask_pred ssse3 avx2/;

  add_proto qw/void aom_highbd_comp_mask_pred/, "uint16_t *comp_pred, const uint8_t *pred8, int width, int height, const uint8_t *ref8, int ref_stride, const uint8_t *mask, int mask_stride, int invert_mask";
-  add_proto qw/void aom_highbd_comp_mask_upsampled_pred/, "MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col, const MV *const mv, uint16_t *comp_pred, const uint8_t *pred8, int width,
-                                                           int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref8, int ref_stride, const uint8_t *mask, int mask_stride, int invert_mask, int bd";
-
+  specialize qw/aom_highbd_comp_mask_pred avx2/;

 }  # CONFIG_AV1_ENCODER

--- a/third_party/aom/aom_dsp/arm/intrapred_neon.c
+++ b/third_party/aom/aom_dsp/arm/intrapred_neon.c
@ -528,3 +528,63 @@ void aom_h_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride,
    }
  }
 }
+
+static INLINE void highbd_dc_predictor(uint16_t *dst, ptrdiff_t stride, int bw,
+                                       const uint16_t *above,
+                                       const uint16_t *left) {  // fills a bw x bw block with the rounded mean of above[0..bw) and left[0..bw)
+  assert(bw >= 4);
+  assert(IS_POWER_OF_TWO(bw));
+  int expected_dc, sum = 0;
+  const int count = bw * 2;  // bw samples from above plus bw samples from left
+  uint32x4_t sum_q = vdupq_n_u32(0);
+  uint32x2_t sum_d;
+  uint16_t *dst_1;
+  if (bw >= 8) {
+    for (int i = 0; i < bw; i += 8) {
+      sum_q = vpadalq_u16(sum_q, vld1q_u16(above));  // pairwise-add 8 u16 samples into the 4 u32 accumulators
+      sum_q = vpadalq_u16(sum_q, vld1q_u16(left));
+      above += 8;
+      left += 8;
+    }
+    sum_d = vadd_u32(vget_low_u32(sum_q), vget_high_u32(sum_q));  // fold 4 partial sums into 2
+    sum = vget_lane_s32(vreinterpret_s32_u64(vpaddl_u32(sum_d)), 0);  // fold the last 2 and read lane 0 as the scalar total
+    expected_dc = (sum + (count >> 1)) / count;  // adding count/2 first makes the division round to nearest
+    const uint16x8_t dc = vdupq_n_u16((uint16_t)expected_dc);  // broadcast the DC value to 8 lanes
+    for (int r = 0; r < bw; r++) {
+      dst_1 = dst;
+      for (int i = 0; i < bw; i += 8) {
+        vst1q_u16(dst_1, dc);
+        dst_1 += 8;
+      }
+      dst += stride;  // stride is counted in uint16_t elements (dst is uint16_t *)
+    }
+  } else {  // 4x4
+    sum_q = vaddl_u16(vld1_u16(above), vld1_u16(left));  // widening add: above[i] + left[i] as u32
+    sum_d = vadd_u32(vget_low_u32(sum_q), vget_high_u32(sum_q));
+    sum = vget_lane_s32(vreinterpret_s32_u64(vpaddl_u32(sum_d)), 0);
+    expected_dc = (sum + (count >> 1)) / count;
+    const uint16x4_t dc = vdup_n_u16((uint16_t)expected_dc);
+    for (int r = 0; r < bw; r++) {
+      vst1_u16(dst, dc);  // one 4-sample row per store
+      dst += stride;
+    }
+  }
+}
+
+#define intra_pred_highbd_sized_neon(type, width)               \
+  void aom_highbd_##type##_predictor_##width##x##width##_neon(  \
+      uint16_t *dst, ptrdiff_t stride, const uint16_t *above,   \
+      const uint16_t *left, int bd) {                           \
+    (void)bd;                                                   \
+    highbd_##type##_predictor(dst, stride, width, above, left); \
+  }
+
+#define intra_pred_square(type)           \
+  intra_pred_highbd_sized_neon(type, 4);  \
+  intra_pred_highbd_sized_neon(type, 8);  \
+  intra_pred_highbd_sized_neon(type, 16); \
+  intra_pred_highbd_sized_neon(type, 32); \
+  intra_pred_highbd_sized_neon(type, 64);
+
+intra_pred_square(dc);  // defines aom_highbd_dc_predictor_NxN_neon for N = 4, 8, 16, 32, 64; each ignores bd and calls highbd_dc_predictor
+#undef intra_pred_square  // helper macro is only needed for the instantiation above
--- a/third_party/aom/aom_dsp/arm/loopfilter_neon.c
+++ b/third_party/aom/aom_dsp/arm/loopfilter_neon.c
@ -52,6 +52,36 @@ static INLINE uint8x8_t lpf_mask(uint8x8_t p3q3, uint8x8_t p2q2, uint8x8_t p1q1,
  return mask_8x8;
 }

+static INLINE uint8x8_t lpf_mask2(uint8x8_t p1q1, uint8x8_t p0q0,
+                                  const uint8_t blimit, const uint8_t limit) {  // returns a per-byte 0xff/0x00 mask; used by lpf_4_neon
+  uint32x2x2_t p0q0_p1q1;
+  uint16x8_t temp_16x8;
+  uint16x4_t temp0_16x4, temp1_16x4;
+  const uint16x4_t blimit_16x4 = vdup_n_u16(blimit);
+  const uint8x8_t limit_8x8 = vdup_n_u8(limit);
+  uint8x8_t mask_8x8, temp_8x8;
+
+  mask_8x8 = vabd_u8(p1q1, p0q0);  // per-byte |p1q1 - p0q0|
+  mask_8x8 = vcle_u8(mask_8x8, limit_8x8);  // set where that difference is <= limit
+
+  temp_8x8 = vreinterpret_u8_u32(vrev64_u32(vreinterpret_u32_u8(mask_8x8)));  // swap the two 32-bit halves
+  mask_8x8 = vand_u8(mask_8x8, temp_8x8);  // the limit test must pass in both halves
+
+  p0q0_p1q1 = vtrn_u32(vreinterpret_u32_u8(p0q0), vreinterpret_u32_u8(p1q1));  // val[0] = low halves of both inputs, val[1] = high halves
+  temp_8x8 = vabd_u8(vreinterpret_u8_u32(p0q0_p1q1.val[0]),
+                     vreinterpret_u8_u32(p0q0_p1q1.val[1]));  // |low - high| of p0q0 in bytes 0-3, of p1q1 in bytes 4-7
+  temp_16x8 = vmovl_u8(temp_8x8);  // widen to u16 so the sum below cannot wrap
+  temp0_16x4 = vshl_n_u16(vget_low_u16(temp_16x8), 1);  // 2 * |low - high| of p0q0
+  temp1_16x4 = vshr_n_u16(vget_high_u16(temp_16x8), 1);  // |low - high| of p1q1, halved
+  temp0_16x4 = vadd_u16(temp0_16x4, temp1_16x4);
+  temp0_16x4 = vcle_u16(temp0_16x4, blimit_16x4);  // set where the combined difference is <= blimit
+  temp_8x8 = vmovn_u16(vcombine_u16(temp0_16x4, temp0_16x4));  // narrow to bytes, same result duplicated in both halves
+
+  mask_8x8 = vand_u8(mask_8x8, temp_8x8);  // both the limit and blimit tests must pass
+
+  return mask_8x8;
+}
+
 static INLINE uint8x8_t lpf_flat_mask4(uint8x8_t p3q3, uint8x8_t p2q2,
                                       uint8x8_t p1q1, uint8x8_t p0q0) {
  const uint8x8_t thresh_8x8 = vdup_n_u8(1);  // for bd==8 threshold is always 1
@ -523,6 +553,68 @@ static void lpf_6_neon(uint8x8_t *p2q2, uint8x8_t *p1q1, uint8x8_t *p0q0,
  }
 }

+static void lpf_4_neon(uint8x8_t *p1q1, uint8x8_t *p0q0, const uint8_t blimit,
+                       const uint8_t limit, const uint8_t thresh) {  // filters *p1q1 and *p0q0 in place
+  int32x2x2_t ps0_qs0, ps1_qs1;
+  int16x8_t filter_s16;
+  const uint8x8_t thresh_f4 = vdup_n_u8(thresh);
+  uint8x8_t mask_8x8, temp0_8x8, temp1_8x8;
+  int8x8_t ps0_s8, ps1_s8, qs0_s8, qs1_s8, temp_s8;
+  int8x8_t op0, oq0, op1, oq1;
+  int8x8_t pq_s0, pq_s1;
+  int8x8_t filter_s8, filter1_s8, filter2_s8;
+  int8x8_t hev_8x8;
+  const int8x8_t sign_mask = vdup_n_s8(0x80);  // XOR with 0x80 maps bytes between unsigned and offset-signed form
+  const int8x8_t val_4 = vdup_n_s8(4);
+  const int8x8_t val_3 = vdup_n_s8(3);
+
+  // Calculate filter mask
+  mask_8x8 = lpf_mask2(*p1q1, *p0q0, blimit, limit);
+
+  pq_s0 = veor_s8(vreinterpret_s8_u8(*p0q0), sign_mask);
+  pq_s1 = veor_s8(vreinterpret_s8_u8(*p1q1), sign_mask);
+
+  ps0_qs0 = vtrn_s32(vreinterpret_s32_s8(pq_s0), vreinterpret_s32_s8(pq_s0));  // val[0] = low half duplicated, val[1] = high half duplicated
+  ps1_qs1 = vtrn_s32(vreinterpret_s32_s8(pq_s1), vreinterpret_s32_s8(pq_s1));
+  ps0_s8 = vreinterpret_s8_s32(ps0_qs0.val[0]);
+  qs0_s8 = vreinterpret_s8_s32(ps0_qs0.val[1]);
+  ps1_s8 = vreinterpret_s8_s32(ps1_qs1.val[0]);
+  qs1_s8 = vreinterpret_s8_s32(ps1_qs1.val[1]);
+
+  // hev_mask
+  temp0_8x8 = vcgt_u8(vabd_u8(*p0q0, *p1q1), thresh_f4);  // set where |p0q0 - p1q1| > thresh
+  temp1_8x8 = vreinterpret_u8_u32(vrev64_u32(vreinterpret_u32_u8(temp0_8x8)));  // same test with the halves swapped
+  hev_8x8 = vreinterpret_s8_u8(vorr_u8(temp0_8x8, temp1_8x8));  // set if either half exceeds thresh
+
+  // add outer taps if we have high edge variance
+  filter_s8 = vqsub_s8(ps1_s8, qs1_s8);
+  filter_s8 = vand_s8(filter_s8, hev_8x8);
+
+  // inner taps
+  temp_s8 = vqsub_s8(qs0_s8, ps0_s8);
+  filter_s16 = vmovl_s8(filter_s8);
+  filter_s16 = vmlal_s8(filter_s16, temp_s8, val_3);  // filter + 3 * (qs0 - ps0), computed in 16 bits
+  filter_s8 = vqmovn_s16(filter_s16);  // saturate back to 8 bits
+  filter_s8 = vand_s8(filter_s8, vreinterpret_s8_u8(mask_8x8));  // zero the filter where the mask is clear
+
+  filter1_s8 = vqadd_s8(filter_s8, val_4);
+  filter2_s8 = vqadd_s8(filter_s8, val_3);
+  filter1_s8 = vshr_n_s8(filter1_s8, 3);  // (filter + 4) >> 3, arithmetic shift
+  filter2_s8 = vshr_n_s8(filter2_s8, 3);  // (filter + 3) >> 3, arithmetic shift
+
+  oq0 = veor_s8(vqsub_s8(qs0_s8, filter1_s8), sign_mask);  // XOR converts back to unsigned bytes
+  op0 = veor_s8(vqadd_s8(ps0_s8, filter2_s8), sign_mask);
+
+  filter_s8 = vrshr_n_s8(filter1_s8, 1);  // rounding shift: (filter1 + 1) >> 1
+  filter_s8 = vbic_s8(filter_s8, hev_8x8);  // outer samples are only adjusted where hev is clear
+
+  oq1 = veor_s8(vqsub_s8(qs1_s8, filter_s8), sign_mask);
+  op1 = veor_s8(vqadd_s8(ps1_s8, filter_s8), sign_mask);
+
+  *p0q0 = vreinterpret_u8_s8(vext_s8(op0, oq0, 4));  // upper half of op0 followed by lower half of oq0
+  *p1q1 = vreinterpret_u8_s8(vext_s8(op1, oq1, 4));
+}
+
 void aom_lpf_vertical_14_neon(uint8_t *src, int stride, const uint8_t *blimit,
                              const uint8_t *limit, const uint8_t *thresh) {
  uint8x16_t row0, row1, row2, row3;
@ -646,6 +738,125 @@ void aom_lpf_vertical_8_neon(uint8_t *src, int stride, const uint8_t *blimit,
  store_u8_8x4(src - 4, stride, p3q0, p2q1, p1q2, p0q3);
 }

+void aom_lpf_vertical_6_neon(uint8_t *src, int stride, const uint8_t *blimit,
+                             const uint8_t *limit, const uint8_t *thresh) {  // runs lpf_6_neon on the columns around src and writes the block back
+  uint32x2x2_t p2q2_p1q1, pxqy_p0q0;
+  uint32x2_t pq_rev;
+  uint8x8_t pxq0, p2q1, p1q2, p0qy;
+  uint8x8_t p0q0, p1q1, p2q2, pxqy;
+
+  // row0: px p2 p1 p0 | q0 q1 q2 qy
+  // row1: px p2 p1 p0 | q0 q1 q2 qy
+  // row2: px p2 p1 p0 | q0 q1 q2 qy
+  // row3: px p2 p1 p0 | q0 q1 q2 qy
+  load_u8_8x4(src - 4, stride, &pxq0, &p2q1, &p1q2, &p0qy);  // read starts 4 bytes before src
+
+  transpose_u8_8x4(&pxq0, &p2q1, &p1q2, &p0qy);  // presumably rows -> columns; helper is defined elsewhere
+
+  pq_rev = vrev64_u32(vreinterpret_u32_u8(p0qy));  // swap the two 32-bit halves
+  pxqy_p0q0 = vtrn_u32(vreinterpret_u32_u8(pxq0), pq_rev);  // val[0] = low halves of both inputs, val[1] = high halves
+
+  pq_rev = vrev64_u32(vreinterpret_u32_u8(p1q2));
+  p2q2_p1q1 = vtrn_u32(vreinterpret_u32_u8(p2q1), pq_rev);
+
+  p0q0 = vreinterpret_u8_u32(vrev64_u32(pxqy_p0q0.val[1]));
+  p1q1 = vreinterpret_u8_u32(vrev64_u32(p2q2_p1q1.val[1]));
+  p2q2 = vreinterpret_u8_u32(p2q2_p1q1.val[0]);
+  pxqy = vreinterpret_u8_u32(pxqy_p0q0.val[0]);
+
+  lpf_6_neon(&p2q2, &p1q1, &p0q0, *blimit, *limit, *thresh);  // pxqy is not passed to the filter
+
+  pq_rev = vrev64_u32(vreinterpret_u32_u8(p0q0));  // same shuffles as above, now rebuilding the pre-filter layout
+  pxqy_p0q0 = vtrn_u32(vreinterpret_u32_u8(pxqy), pq_rev);
+
+  pq_rev = vrev64_u32(vreinterpret_u32_u8(p1q1));
+  p2q2_p1q1 = vtrn_u32(vreinterpret_u32_u8(p2q2), pq_rev);
+
+  p0qy = vreinterpret_u8_u32(vrev64_u32(pxqy_p0q0.val[1]));
+  p1q2 = vreinterpret_u8_u32(vrev64_u32(p2q2_p1q1.val[1]));
+  p2q1 = vreinterpret_u8_u32(p2q2_p1q1.val[0]);
+  pxq0 = vreinterpret_u8_u32(pxqy_p0q0.val[0]);
+  transpose_u8_8x4(&pxq0, &p2q1, &p1q2, &p0qy);  // presumably back to row order - TODO confirm against the helper
+
+  store_u8_8x4(src - 4, stride, pxq0, p2q1, p1q2, p0qy);  // written to the same location the block was loaded from
+}
+
+void aom_lpf_vertical_4_neon(uint8_t *src, int stride, const uint8_t *blimit,
+                             const uint8_t *limit, const uint8_t *thresh) {  // runs lpf_4_neon on 4 rows starting 2 bytes before src
+  uint32x2x2_t p1q0_p0q1, p1q1_p0q0, p1p0_q1q0;
+  uint32x2_t pq_rev;
+  uint8x8_t UNINITIALIZED_IS_SAFE(p1p0), q0q1, p0q0, p1q1;
+
+  // row0: p1 p0 | q0 q1
+  // row1: p1 p0 | q0 q1
+  // row2: p1 p0 | q0 q1
+  // row3: p1 p0 | q0 q1
+  load_u8_4x1(src - 2, &p1p0, 0);  // rows 0 and 1 go to lanes 0 and 1 of p1p0
+  load_u8_4x1((src - 2) + 1 * stride, &p1p0, 1);
+  load_u8_4x1((src - 2) + 2 * stride, &q0q1, 0);  // rows 2 and 3 go to lanes 0 and 1 of q0q1
+  load_u8_4x1((src - 2) + 3 * stride, &q0q1, 1);
+
+  transpose_u8_4x4(&p1p0, &q0q1);  // presumably rows -> columns; helper is defined elsewhere
+
+  p1q0_p0q1 = vtrn_u32(vreinterpret_u32_u8(p1p0), vreinterpret_u32_u8(q0q1));  // val[0] = low halves of both inputs, val[1] = high halves
+
+  pq_rev = vrev64_u32(p1q0_p0q1.val[1]);  // swap the two 32-bit halves
+  p1q1_p0q0 = vtrn_u32(p1q0_p0q1.val[0], pq_rev);
+
+  p1q1 = vreinterpret_u8_u32(p1q1_p0q0.val[0]);
+  p0q0 = vreinterpret_u8_u32(p1q1_p0q0.val[1]);
+
+  lpf_4_neon(&p1q1, &p0q0, *blimit, *limit, *thresh);
+
+  p1p0_q1q0 = vtrn_u32(vreinterpret_u32_u8(p1q1), vreinterpret_u32_u8(p0q0));
+
+  p1p0 = vreinterpret_u8_u32(p1p0_q1q0.val[0]);
+  q0q1 = vreinterpret_u8_u32(vrev64_u32(p1p0_q1q0.val[1]));
+
+  transpose_u8_4x4(&p1p0, &q0q1);
+
+  store_u8_4x1(src - 2, p1p0, 0);  // NOTE(review): rows alternate p1p0/q0q1 here, unlike the loads; depends on transpose_u8_4x4's output layout - confirm
+  store_u8_4x1((src - 2) + 1 * stride, q0q1, 0);
+  store_u8_4x1((src - 2) + 2 * stride, p1p0, 1);
+  store_u8_4x1((src - 2) + 3 * stride, q0q1, 1);
+}
+
+void aom_lpf_horizontal_14_neon(uint8_t *src, int stride, const uint8_t *blimit,
+                                const uint8_t *limit, const uint8_t *thresh) {  // runs lpf_14_neon on the 7 rows above src and the 7 rows from src down
+  uint8x8_t p0q0, p1q1, p2q2, p3q3, p4q4, p5q5, UNINITIALIZED_IS_SAFE(p6q6);
+
+  load_u8_4x1(src - 7 * stride, &p6q6, 0);  // rows above src go to lane 0
+  load_u8_4x1(src - 6 * stride, &p5q5, 0);
+  load_u8_4x1(src - 5 * stride, &p4q4, 0);
+  load_u8_4x1(src - 4 * stride, &p3q3, 0);
+  load_u8_4x1(src - 3 * stride, &p2q2, 0);
+  load_u8_4x1(src - 2 * stride, &p1q1, 0);
+  load_u8_4x1(src - 1 * stride, &p0q0, 0);
+  load_u8_4x1(src + 0 * stride, &p0q0, 1);  // rows from src downwards go to lane 1 of the mirrored vector
+  load_u8_4x1(src + 1 * stride, &p1q1, 1);
+  load_u8_4x1(src + 2 * stride, &p2q2, 1);
+  load_u8_4x1(src + 3 * stride, &p3q3, 1);
+  load_u8_4x1(src + 4 * stride, &p4q4, 1);
+  load_u8_4x1(src + 5 * stride, &p5q5, 1);
+  load_u8_4x1(src + 6 * stride, &p6q6, 1);
+
+  lpf_14_neon(&p6q6, &p5q5, &p4q4, &p3q3, &p2q2, &p1q1, &p0q0, *blimit, *limit,
+              *thresh);
+
+  store_u8_4x1(src - 6 * stride, p5q5, 0);  // p6q6 is an input only; its two rows are not written back
+  store_u8_4x1(src - 5 * stride, p4q4, 0);
+  store_u8_4x1(src - 4 * stride, p3q3, 0);
+  store_u8_4x1(src - 3 * stride, p2q2, 0);
+  store_u8_4x1(src - 2 * stride, p1q1, 0);
+  store_u8_4x1(src - 1 * stride, p0q0, 0);
+  store_u8_4x1(src + 0 * stride, p0q0, 1);
+  store_u8_4x1(src + 1 * stride, p1q1, 1);
+  store_u8_4x1(src + 2 * stride, p2q2, 1);
+  store_u8_4x1(src + 3 * stride, p3q3, 1);
+  store_u8_4x1(src + 4 * stride, p4q4, 1);
+  store_u8_4x1(src + 5 * stride, p5q5, 1);
+}
+
 void aom_lpf_horizontal_8_neon(uint8_t *src, int stride, const uint8_t *blimit,
                               const uint8_t *limit, const uint8_t *thresh) {
  uint8x8_t p0q0, p1q1, p2q2, p3q3;
@ -698,3 +909,20 @@ void aom_lpf_horizontal_6_neon(uint8_t *src, int stride, const uint8_t *blimit,
  vst1_lane_u32((uint32_t *)(src + 1 * stride), vreinterpret_u32_u8(p1q1), 1);
  vst1_lane_u32((uint32_t *)(src + 2 * stride), vreinterpret_u32_u8(p2q2), 1);
 }
+
+void aom_lpf_horizontal_4_neon(uint8_t *src, int stride, const uint8_t *blimit,
+                               const uint8_t *limit, const uint8_t *thresh) {  // runs lpf_4_neon on the 2 rows above src and the 2 rows from src down
+  uint8x8_t p0q0, UNINITIALIZED_IS_SAFE(p1q1);
+
+  load_u8_4x1(src - 2 * stride, &p1q1, 0);  // rows above src go to lane 0
+  load_u8_4x1(src - 1 * stride, &p0q0, 0);
+  load_u8_4x1(src + 0 * stride, &p0q0, 1);  // rows from src downwards go to lane 1
+  load_u8_4x1(src + 1 * stride, &p1q1, 1);
+
+  lpf_4_neon(&p1q1, &p0q0, *blimit, *limit, *thresh);  // updates both vectors in place
+
+  store_u8_4x1(src - 2 * stride, p1q1, 0);  // all four rows are written back
+  store_u8_4x1(src - 1 * stride, p0q0, 0);
+  store_u8_4x1(src + 0 * stride, p0q0, 1);
+  store_u8_4x1(src + 1 * stride, p1q1, 1);
+}
--- a/third_party/aom/aom_dsp/bitreader_buffer.c
+++ b/third_party/aom/aom_dsp/bitreader_buffer.c
@ -8,11 +8,14 @@
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */
+
+#include <assert.h>
+
 #include "config/aom_config.h"

 #include "aom_dsp/bitreader_buffer.h"

-size_t aom_rb_bytes_read(struct aom_read_bit_buffer *rb) {
+size_t aom_rb_bytes_read(const struct aom_read_bit_buffer *rb) {
  return (rb->bit_offset + 7) >> 3;
 }

@ -31,6 +34,7 @@ int aom_rb_read_bit(struct aom_read_bit_buffer *rb) {
 }

 int aom_rb_read_literal(struct aom_read_bit_buffer *rb, int bits) {
+  assert(bits <= 31);
  int value = 0, bit;
  for (bit = bits - 1; bit >= 0; bit--) value |= aom_rb_read_bit(rb) << bit;
  return value;
@ -38,6 +42,7 @@ int aom_rb_read_literal(struct aom_read_bit_buffer *rb, int bits) {

 uint32_t aom_rb_read_unsigned_literal(struct aom_read_bit_buffer *rb,
                                      int bits) {
+  assert(bits <= 32);
  uint32_t value = 0;
  int bit;
  for (bit = bits - 1; bit >= 0; bit--)
--- a/third_party/aom/aom_dsp/bitreader_buffer.h
+++ b/third_party/aom/aom_dsp/bitreader_buffer.h
@ -31,7 +31,7 @@ struct aom_read_bit_buffer {
  aom_rb_error_handler error_handler;
 };

-size_t aom_rb_bytes_read(struct aom_read_bit_buffer *rb);
+size_t aom_rb_bytes_read(const struct aom_read_bit_buffer *rb);

 int aom_rb_read_bit(struct aom_read_bit_buffer *rb);

--- a/third_party/aom/aom_dsp/bitwriter_buffer.c
+++ b/third_party/aom/aom_dsp/bitwriter_buffer.c
@ -9,6 +9,7 @@
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

+#include <assert.h>
 #include <limits.h>
 #include <stdlib.h>

@ -49,12 +50,14 @@ void aom_wb_overwrite_bit(struct aom_write_bit_buffer *wb, int bit) {
 }

 void aom_wb_write_literal(struct aom_write_bit_buffer *wb, int data, int bits) {
+  assert(bits <= 31);
  int bit;
  for (bit = bits - 1; bit >= 0; bit--) aom_wb_write_bit(wb, (data >> bit) & 1);
 }

 void aom_wb_write_unsigned_literal(struct aom_write_bit_buffer *wb,
                                   uint32_t data, int bits) {
+  assert(bits <= 32);
  int bit;
  for (bit = bits - 1; bit >= 0; bit--) aom_wb_write_bit(wb, (data >> bit) & 1);
 }
--- a/third_party/aom/aom_dsp/grain_synthesis.c
+++ b/third_party/aom/aom_dsp/grain_synthesis.c
@ -17,6 +17,7 @@
 #include <stdio.h>
 #include <string.h>
 #include <stdlib.h>
+#include <assert.h>
 #include "aom_dsp/grain_synthesis.h"
 #include "aom_mem/aom_mem.h"

@ -237,7 +238,7 @@ static int grain_max;

 static uint16_t random_register = 0;  // random number generator register

-static void init_arrays(aom_film_grain_t *params, int luma_stride,
+static void init_arrays(const aom_film_grain_t *params, int luma_stride,
                        int chroma_stride, int ***pred_pos_luma_p,
                        int ***pred_pos_chroma_p, int **luma_grain_block,
                        int **cb_grain_block, int **cr_grain_block,
@ -331,7 +332,7 @@ static void init_arrays(aom_film_grain_t *params, int luma_stride,
      (int *)aom_malloc(sizeof(**cr_grain_block) * chroma_grain_samples);
 }

-static void dealloc_arrays(aom_film_grain_t *params, int ***pred_pos_luma,
+static void dealloc_arrays(const aom_film_grain_t *params, int ***pred_pos_luma,
                           int ***pred_pos_chroma, int **luma_grain_block,
                           int **cb_grain_block, int **cr_grain_block,
                           int **y_line_buf, int **cb_line_buf,
@ -396,10 +397,14 @@ static void init_random_generator(int luma_line, uint16_t seed) {
 }

 static void generate_luma_grain_block(
-    aom_film_grain_t *params, int **pred_pos_luma, int *luma_grain_block,
+    const aom_film_grain_t *params, int **pred_pos_luma, int *luma_grain_block,
    int luma_block_size_y, int luma_block_size_x, int luma_grain_stride,
    int left_pad, int top_pad, int right_pad, int bottom_pad) {
-  if (params->num_y_points == 0) return;
+  if (params->num_y_points == 0) {
+    memset(luma_grain_block, 0,
+           sizeof(*luma_grain_block) * luma_block_size_y * luma_grain_stride);
+    return;
+  }

  int bit_depth = params->bit_depth;
  int gauss_sec_shift = 12 - bit_depth + params->grain_scale_shift;
@ -431,7 +436,7 @@ static void generate_luma_grain_block(
 }

 static void generate_chroma_grain_blocks(
-    aom_film_grain_t *params,
+    const aom_film_grain_t *params,
    //                                  int** pred_pos_luma,
    int **pred_pos_chroma, int *luma_grain_block, int *cb_grain_block,
    int *cr_grain_block, int luma_grain_stride, int chroma_block_size_y,
@ -443,7 +448,7 @@ static void generate_chroma_grain_blocks(
  int num_pos_chroma = 2 * params->ar_coeff_lag * (params->ar_coeff_lag + 1);
  if (params->num_y_points > 0) ++num_pos_chroma;
  int rounding_offset = (1 << (params->ar_coeff_shift - 1));
-  int chroma_grain_samples = chroma_block_size_y * chroma_block_size_x;
+  int chroma_grain_block_size = chroma_block_size_y * chroma_grain_stride;

  if (params->num_cb_points || params->chroma_scaling_from_luma) {
    init_random_generator(7 << 5, params->random_seed);
@ -455,7 +460,8 @@ static void generate_chroma_grain_blocks(
             ((1 << gauss_sec_shift) >> 1)) >>
            gauss_sec_shift;
  } else {
-    memset(cr_grain_block, 0, sizeof(*cr_grain_block) * chroma_grain_samples);
+    memset(cb_grain_block, 0,
+           sizeof(*cb_grain_block) * chroma_grain_block_size);
  }

  if (params->num_cr_points || params->chroma_scaling_from_luma) {
@ -468,7 +474,8 @@ static void generate_chroma_grain_blocks(
             ((1 << gauss_sec_shift) >> 1)) >>
            gauss_sec_shift;
  } else {
-    memset(cb_grain_block, 0, sizeof(*cb_grain_block) * chroma_grain_samples);
+    memset(cr_grain_block, 0,
+           sizeof(*cr_grain_block) * chroma_grain_block_size);
  }

  for (int i = top_pad; i < chroma_block_size_y - bottom_pad; i++)
@ -522,7 +529,7 @@ static void generate_chroma_grain_blocks(
    }
 }

-static void init_scaling_function(int scaling_points[][2], int num_points,
+static void init_scaling_function(const int scaling_points[][2], int num_points,
                                  int scaling_lut[]) {
  if (num_points == 0) return;

@ -559,7 +566,7 @@ static int scale_LUT(int *scaling_lut, int index, int bit_depth) {
                             (bit_depth - 8));
 }

-static void add_noise_to_block(aom_film_grain_t *params, uint8_t *luma,
+static void add_noise_to_block(const aom_film_grain_t *params, uint8_t *luma,
                               uint8_t *cb, uint8_t *cr, int luma_stride,
                               int chroma_stride, int *luma_grain,
                               int *cb_grain, int *cr_grain,
@ -675,7 +682,7 @@ static void add_noise_to_block(aom_film_grain_t *params, uint8_t *luma,
 }

 static void add_noise_to_block_hbd(
-    aom_film_grain_t *params, uint16_t *luma, uint16_t *cb, uint16_t *cr,
+    const aom_film_grain_t *params, uint16_t *luma, uint16_t *cb, uint16_t *cr,
    int luma_stride, int chroma_stride, int *luma_grain, int *cb_grain,
    int *cr_grain, int luma_grain_stride, int chroma_grain_stride,
    int half_luma_height, int half_luma_width, int bit_depth,
@ -903,7 +910,7 @@ static void hor_boundary_overlap(int *top_block, int top_stride,
  }
 }

-void av1_add_film_grain(aom_film_grain_t *params, aom_image_t *src,
+void av1_add_film_grain(const aom_film_grain_t *params, const aom_image_t *src,
                        aom_image_t *dst) {
  uint8_t *luma, *cb, *cr;
  int height, width, luma_stride, chroma_stride;
@ -950,6 +957,11 @@ void av1_add_film_grain(aom_film_grain_t *params, aom_image_t *src,
      exit(1);
  }

+  assert(params->bit_depth == src->bit_depth);
+
+  dst->fmt = src->fmt;
+  dst->bit_depth = src->bit_depth;
+
  dst->r_w = src->r_w;
  dst->r_h = src->r_h;
  dst->d_w = src->d_w;
@ -999,15 +1011,13 @@ void av1_add_film_grain(aom_film_grain_t *params, aom_image_t *src,
  luma_stride = dst->stride[AOM_PLANE_Y] >> use_high_bit_depth;
  chroma_stride = dst->stride[AOM_PLANE_U] >> use_high_bit_depth;

-  params->bit_depth = dst->bit_depth;
-
  av1_add_film_grain_run(params, luma, cb, cr, height, width, luma_stride,
                         chroma_stride, use_high_bit_depth, chroma_subsamp_y,
                         chroma_subsamp_x, mc_identity);
  return;
 }

-void av1_add_film_grain_run(aom_film_grain_t *params, uint8_t *luma,
+void av1_add_film_grain_run(const aom_film_grain_t *params, uint8_t *luma,
                            uint8_t *cb, uint8_t *cr, int height, int width,
                            int luma_stride, int chroma_stride,
                            int use_high_bit_depth, int chroma_subsamp_y,
--- a/third_party/aom/aom_dsp/grain_synthesis.h
+++ b/third_party/aom/aom_dsp/grain_synthesis.h
@ -72,7 +72,7 @@ typedef struct {

  int clip_to_restricted_range;

-  int bit_depth;  // video bit depth
+  unsigned int bit_depth;  // video bit depth

  int chroma_scaling_from_luma;

@ -94,7 +94,7 @@ typedef struct {
 * \param[in]    luma_stride      luma plane stride
 * \param[in]    chroma_stride    chroma plane stride
 */
-void av1_add_film_grain_run(aom_film_grain_t *grain_params, uint8_t *luma,
+void av1_add_film_grain_run(const aom_film_grain_t *grain_params, uint8_t *luma,
                            uint8_t *cb, uint8_t *cr, int height, int width,
                            int luma_stride, int chroma_stride,
                            int use_high_bit_depth, int chroma_subsamp_y,
@ -106,10 +106,10 @@ void av1_add_film_grain_run(aom_film_grain_t *grain_params, uint8_t *luma,
 *
 * \param[in]    grain_params     Grain parameters
 * \param[in]    src              Source image
- * \param[in]    dst              Resulting image with grain
+ * \param[out]   dst              Resulting image with grain
 */
-void av1_add_film_grain(aom_film_grain_t *grain_params, aom_image_t *src,
-                        aom_image_t *dst);
+void av1_add_film_grain(const aom_film_grain_t *grain_params,
+                        const aom_image_t *src, aom_image_t *dst);

 #ifdef __cplusplus
 }  // extern "C"
--- a/third_party/aom/aom_dsp/noise_model.c
+++ b/third_party/aom/aom_dsp/noise_model.c
@ -1458,3 +1458,216 @@ int aom_wiener_denoise_2d(const uint8_t *const data[3], uint8_t *denoised[3],
  }
  return init_success;
 }
+
+// Context for aom_denoise_and_model_run(): configuration, per-frame scratch
+// buffers and the accumulated noise model.
+struct aom_denoise_and_model_t {
+  int block_size;
+  int bit_depth;
+  float noise_level;
+
+  // Size of current denoised buffer and flat_block buffer
+  int width;
+  int height;
+  int y_stride;
+  int uv_stride;
+  int num_blocks_w;
+  int num_blocks_h;
+
+  // Buffers for image and noise_psd allocated on the fly
+  float *noise_psd[3];
+  uint8_t *denoised[3];
+  uint8_t *flat_blocks;
+
+  aom_flat_block_finder_t flat_block_finder;
+  aom_noise_model_t noise_model;
+};
+
+// Allocates a zero-initialized context and its three block_size * block_size
+// noise PSD buffers. Returns NULL if any allocation fails.
+struct aom_denoise_and_model_t *aom_denoise_and_model_alloc(int bit_depth,
+                                                            int block_size,
+                                                            float noise_level) {
+  struct aom_denoise_and_model_t *ctx =
+      (struct aom_denoise_and_model_t *)aom_malloc(
+          sizeof(struct aom_denoise_and_model_t));
+  if (!ctx) {
+    fprintf(stderr, "Unable to allocate denoise_and_model struct\n");
+    return NULL;
+  }
+  memset(ctx, 0, sizeof(*ctx));
+
+  ctx->block_size = block_size;
+  ctx->noise_level = noise_level;
+  ctx->bit_depth = bit_depth;
+
+  ctx->noise_psd[0] =
+      aom_malloc(sizeof(*ctx->noise_psd[0]) * block_size * block_size);
+  ctx->noise_psd[1] =
+      aom_malloc(sizeof(*ctx->noise_psd[1]) * block_size * block_size);
+  ctx->noise_psd[2] =
+      aom_malloc(sizeof(*ctx->noise_psd[2]) * block_size * block_size);
+  if (!ctx->noise_psd[0] || !ctx->noise_psd[1] || !ctx->noise_psd[2]) {
+    fprintf(stderr, "Unable to allocate noise PSD buffers\n");
+    aom_denoise_and_model_free(ctx);
+    return NULL;
+  }
+  return ctx;
+}
+
+// Frees ctx and every buffer it owns. A NULL ctx is ignored.
+void aom_denoise_and_model_free(struct aom_denoise_and_model_t *ctx) {
+  if (!ctx) return;
+  aom_free(ctx->flat_blocks);
+  for (int i = 0; i < 3; ++i) {
+    aom_free(ctx->denoised[i]);
+    aom_free(ctx->noise_psd[i]);
+  }
+  aom_noise_model_free(&ctx->noise_model);
+  aom_flat_block_finder_free(&ctx->flat_block_finder);
+  aom_free(ctx);
+}
+
+// (Re)allocates the per-frame buffers and re-initializes the flat block finder
+// and the noise model when the dimensions or strides of sd differ from the
+// cached ones. Returns 1 on success (including when nothing had to change) and
+// 0 on failure.
+static int denoise_and_model_realloc_if_necessary(
+    struct aom_denoise_and_model_t *ctx, YV12_BUFFER_CONFIG *sd) {
+  if (ctx->width == sd->y_width && ctx->height == sd->y_height &&
+      ctx->y_stride == sd->y_stride && ctx->uv_stride == sd->uv_stride)
+    return 1;
+  const int use_highbd = (sd->flags & YV12_FLAG_HIGHBITDEPTH) != 0;
+  const int block_size = ctx->block_size;
+
+  // Forget the cached geometry until everything below has succeeded, so that a
+  // failed reallocation is retried on the next call instead of being mistaken
+  // for an up-to-date context whose buffers are NULL.
+  ctx->width = 0;
+  ctx->height = 0;
+  ctx->y_stride = 0;
+  ctx->uv_stride = 0;
+
+  for (int i = 0; i < 3; ++i) {
+    aom_free(ctx->denoised[i]);
+    ctx->denoised[i] = NULL;
+  }
+  aom_free(ctx->flat_blocks);
+  ctx->flat_blocks = NULL;
+
+  ctx->denoised[0] = aom_malloc((sd->y_stride * sd->y_height) << use_highbd);
+  ctx->denoised[1] = aom_malloc((sd->uv_stride * sd->uv_height) << use_highbd);
+  ctx->denoised[2] = aom_malloc((sd->uv_stride * sd->uv_height) << use_highbd);
+  if (!ctx->denoised[0] || !ctx->denoised[1] || !ctx->denoised[2]) {
+    fprintf(stderr, "Unable to allocate denoise buffers\n");
+    return 0;
+  }
+  ctx->num_blocks_w = (sd->y_width + ctx->block_size - 1) / ctx->block_size;
+  ctx->num_blocks_h = (sd->y_height + ctx->block_size - 1) / ctx->block_size;
+  ctx->flat_blocks = aom_malloc(ctx->num_blocks_w * ctx->num_blocks_h);
+  if (!ctx->flat_blocks) {
+    fprintf(stderr, "Unable to allocate flat_blocks buffer\n");
+    return 0;
+  }
+
+  aom_flat_block_finder_free(&ctx->flat_block_finder);
+  if (!aom_flat_block_finder_init(&ctx->flat_block_finder, ctx->block_size,
+                                  ctx->bit_depth, use_highbd)) {
+    fprintf(stderr, "Unable to init flat block finder\n");
+    return 0;
+  }
+
+  const aom_noise_model_params_t params = { AOM_NOISE_SHAPE_SQUARE, 3,
+                                            ctx->bit_depth, use_highbd };
+  aom_noise_model_free(&ctx->noise_model);
+  if (!aom_noise_model_init(&ctx->noise_model, params)) {
+    fprintf(stderr, "Unable to init noise model\n");
+    return 0;
+  }
+
+  // Simply use a flat PSD (although the flat blocks could be used to estimate
+  // an actual noise PSD).
+  const float y_noise_level =
+      aom_noise_psd_get_default_value(ctx->block_size, ctx->noise_level);
+  const float uv_noise_level = aom_noise_psd_get_default_value(
+      ctx->block_size >> sd->subsampling_x, ctx->noise_level);
+  for (int i = 0; i < block_size * block_size; ++i) {
+    ctx->noise_psd[0][i] = y_noise_level;
+    ctx->noise_psd[1][i] = ctx->noise_psd[2][i] = uv_noise_level;
+  }
+
+  // Everything is in place; remember the geometry these buffers match.
+  ctx->width = sd->y_width;
+  ctx->height = sd->y_height;
+  ctx->y_stride = sd->y_stride;
+  ctx->uv_stride = sd->uv_stride;
+  return 1;
+}
+
+// Denoises sd in place (when a noise estimate is available) and writes the
+// current grain parameters to film_grain. Returns 1 on success, 0 on error.
+int aom_denoise_and_model_run(struct aom_denoise_and_model_t *ctx,
+                              YV12_BUFFER_CONFIG *sd,
+                              aom_film_grain_t *film_grain) {
+  const int block_size = ctx->block_size;
+  const int use_highbd = (sd->flags & YV12_FLAG_HIGHBITDEPTH) != 0;
+  // High bit depth planes are addressed through CONVERT_TO_SHORTPTR.
+  uint8_t *raw_data[3] = {
+    use_highbd ? (uint8_t *)CONVERT_TO_SHORTPTR(sd->y_buffer) : sd->y_buffer,
+    use_highbd ? (uint8_t *)CONVERT_TO_SHORTPTR(sd->u_buffer) : sd->u_buffer,
+    use_highbd ? (uint8_t *)CONVERT_TO_SHORTPTR(sd->v_buffer) : sd->v_buffer,
+  };
+  const uint8_t *const data[3] = { raw_data[0], raw_data[1], raw_data[2] };
+  int strides[3] = { sd->y_stride, sd->uv_stride, sd->uv_stride };
+  int chroma_sub_log2[2] = { sd->subsampling_x, sd->subsampling_y };
+
+  if (!denoise_and_model_realloc_if_necessary(ctx, sd)) {
+    fprintf(stderr, "Unable to realloc buffers\n");
+    return 0;
+  }
+
+  aom_flat_block_finder_run(&ctx->flat_block_finder, data[0], sd->y_width,
+                            sd->y_height, strides[0], ctx->flat_blocks);
+
+  if (!aom_wiener_denoise_2d(data, ctx->denoised, sd->y_width, sd->y_height,
+                             strides, chroma_sub_log2, ctx->noise_psd,
+                             block_size, ctx->bit_depth, use_highbd)) {
+    fprintf(stderr, "Unable to denoise image\n");
+    return 0;
+  }
+
+  const aom_noise_status_t status = aom_noise_model_update(
+      &ctx->noise_model, data, (const uint8_t *const *)ctx->denoised,
+      sd->y_width, sd->y_height, strides, chroma_sub_log2, ctx->flat_blocks,
+      block_size);
+  int have_noise_estimate = 0;
+  if (status == AOM_NOISE_STATUS_OK) {
+    have_noise_estimate = 1;
+  } else if (status == AOM_NOISE_STATUS_DIFFERENT_NOISE_TYPE) {
+    aom_noise_model_save_latest(&ctx->noise_model);
+    have_noise_estimate = 1;
+  } else {
+    // Unable to update noise model; proceed if we have a previous estimate.
+    have_noise_estimate =
+        (ctx->noise_model.combined_state[0].strength_solver.num_equations > 0);
+  }
+
+  film_grain->apply_grain = 0;
+  if (have_noise_estimate) {
+    if (!aom_noise_model_get_grain_parameters(&ctx->noise_model, film_grain)) {
+      fprintf(stderr, "Unable to get grain parameters.\n");
+      return 0;
+    }
+    if (!film_grain->random_seed) {
+      film_grain->random_seed = 1071;
+    }
+    // Replace the input planes with their denoised versions.
+    memcpy(raw_data[0], ctx->denoised[0],
+           (strides[0] * sd->y_height) << use_highbd);
+    memcpy(raw_data[1], ctx->denoised[1],
+           (strides[1] * sd->uv_height) << use_highbd);
+    memcpy(raw_data[2], ctx->denoised[2],
+           (strides[2] * sd->uv_height) << use_highbd);
+  }
+  return 1;
+}
--- a/third_party/aom/aom_dsp/noise_model.h
+++ b/third_party/aom/aom_dsp/noise_model.h
@ -18,6 +18,7 @@ extern "C" {

 #include <stdint.h>
 #include "aom_dsp/grain_synthesis.h"
+#include "aom_scale/yv12config.h"

 /*!\brief Wrapper of data required to represent linear system of eqns and soln.
 */
@ -280,6 +281,42 @@ int aom_wiener_denoise_2d(const uint8_t *const data[3], uint8_t *denoised[3],
                          int w, int h, int stride[3], int chroma_sub_log2[2],
                          float *noise_psd[3], int block_size, int bit_depth,
                          int use_highbd);
+
+struct aom_denoise_and_model_t;
+
+/*!\brief Denoise the buffer and model the residual noise.
+ *
+ * This is meant to be called sequentially on input frames. The input buffer
+ * is denoised and the residual noise is modelled. The current noise estimate
+ * is populated in film_grain. Returns true on success. The grain.apply_grain
+ * parameter will be true when the input buffer was successfully denoised and
+ * grain was modelled. Returns false on error.
+ *
+ * \param[in]      ctx   Struct allocated with aom_denoise_and_model_alloc
+ *                       that holds some buffers for denoising and the current
+ *                       noise estimate.
+ * \param[in,out]   buf  The raw input buffer to be denoised.
+ * \param[out]    grain  Output film grain parameters
+ */
+int aom_denoise_and_model_run(struct aom_denoise_and_model_t *ctx,
+                              YV12_BUFFER_CONFIG *buf, aom_film_grain_t *grain);
+
+/*!\brief Allocates a context that can be used for denoising and noise modeling.
+ *
+ * \param[in]  bit_depth   Bit depth of buffers this will be run on.
+ * \param[in]  block_size  Block size for noise modeling and flat block
+ *                         estimation
+ * \param[in]  noise_level The noise_level (2.5 for moderate noise, and 5 for
+ *                         higher levels of noise)
+ */
+struct aom_denoise_and_model_t *aom_denoise_and_model_alloc(int bit_depth,
+                                                            int block_size,
+                                                            float noise_level);
+
+/*!\brief Frees the denoise context allocated with aom_denoise_and_model_alloc
+ */
+void aom_denoise_and_model_free(struct aom_denoise_and_model_t *denoise_model);
+
 #ifdef __cplusplus
 }  // extern "C"
 #endif  // __cplusplus
--- a/third_party/aom/aom_dsp/simd/v256_intrinsics.h
+++ b/third_party/aom/aom_dsp/simd/v256_intrinsics.h
@ -289,6 +289,15 @@ SIMD_INLINE v256 v256_shr_u32(v256 a, unsigned int c) {
 SIMD_INLINE v256 v256_shr_s32(v256 a, unsigned int c) {
  return c_v256_shr_s32(a, c);
 }
+SIMD_INLINE v256 v256_shl_64(v256 a, unsigned int c) {
+  return c_v256_shl_64(a, c);
+}
+SIMD_INLINE v256 v256_shr_u64(v256 a, unsigned int c) {
+  return c_v256_shr_u64(a, c);
+}
+SIMD_INLINE v256 v256_shr_s64(v256 a, unsigned int c) {
+  return c_v256_shr_s64(a, c);
+}

 SIMD_INLINE v256 v256_shr_n_byte(v256 a, unsigned int n) {
  return c_v256_shr_n_byte(a, n);
--- a/third_party/aom/aom_dsp/variance.c
+++ b/third_party/aom/aom_dsp/variance.c
@ -386,7 +386,7 @@ void aom_upsampled_pred_c(MACROBLOCKD *xd, const AV1_COMMON *const cm,
    }
  }

-  const InterpFilterParams filter =
+  const InterpFilterParams *filter =
      av1_get_interp_filter_params_with_block_size(EIGHTTAP_REGULAR, 8);

  if (!subpel_x_q3 && !subpel_y_q3) {
@ -413,12 +413,12 @@ void aom_upsampled_pred_c(MACROBLOCKD *xd, const AV1_COMMON *const cm,
    const int16_t *const kernel_y =
        av1_get_interp_filter_subpel_kernel(filter, subpel_y_q3 << 1);
    const int intermediate_height =
-        (((height - 1) * 8 + subpel_y_q3) >> 3) + filter.taps;
+        (((height - 1) * 8 + subpel_y_q3) >> 3) + filter->taps;
    assert(intermediate_height <= (MAX_SB_SIZE * 2 + 16) + 16);
-    aom_convolve8_horiz(ref - ref_stride * ((filter.taps >> 1) - 1), ref_stride,
-                        temp, MAX_SB_SIZE, kernel_x, 16, NULL, -1, width,
-                        intermediate_height);
-    aom_convolve8_vert(temp + MAX_SB_SIZE * ((filter.taps >> 1) - 1),
+    aom_convolve8_horiz(ref - ref_stride * ((filter->taps >> 1) - 1),
+                        ref_stride, temp, MAX_SB_SIZE, kernel_x, 16, NULL, -1,
+                        width, intermediate_height);
+    aom_convolve8_vert(temp + MAX_SB_SIZE * ((filter->taps >> 1) - 1),
                       MAX_SB_SIZE, comp_pred, width, NULL, -1, kernel_y, 16,
                       width, height);
  }
@ -974,7 +974,7 @@ void aom_highbd_upsampled_pred_c(MACROBLOCKD *xd,
    }
  }

-  const InterpFilterParams filter =
+  const InterpFilterParams *filter =
      av1_get_interp_filter_params_with_block_size(EIGHTTAP_REGULAR, 8);

  if (!subpel_x_q3 && !subpel_y_q3) {
@ -1004,14 +1004,14 @@ void aom_highbd_upsampled_pred_c(MACROBLOCKD *xd,
    const int16_t *const kernel_y =
        av1_get_interp_filter_subpel_kernel(filter, subpel_y_q3 << 1);
    const int intermediate_height =
-        (((height - 1) * 8 + subpel_y_q3) >> 3) + filter.taps;
+        (((height - 1) * 8 + subpel_y_q3) >> 3) + filter->taps;
    assert(intermediate_height <= (MAX_SB_SIZE * 2 + 16) + 16);
-    aom_highbd_convolve8_horiz(ref8 - ref_stride * ((filter.taps >> 1) - 1),
+    aom_highbd_convolve8_horiz(ref8 - ref_stride * ((filter->taps >> 1) - 1),
                               ref_stride, CONVERT_TO_BYTEPTR(temp),
                               MAX_SB_SIZE, kernel_x, 16, NULL, -1, width,
                               intermediate_height, bd);
    aom_highbd_convolve8_vert(
-        CONVERT_TO_BYTEPTR(temp + MAX_SB_SIZE * ((filter.taps >> 1) - 1)),
+        CONVERT_TO_BYTEPTR(temp + MAX_SB_SIZE * ((filter->taps >> 1) - 1)),
        MAX_SB_SIZE, CONVERT_TO_BYTEPTR(comp_pred), width, NULL, -1, kernel_y,
        16, width, height, bd);
  }
@ -1185,29 +1185,18 @@ void aom_highbd_comp_mask_pred_c(uint16_t *comp_pred, const uint8_t *pred8,
  }
 }

-void aom_highbd_comp_mask_upsampled_pred_c(
+void aom_highbd_comp_mask_upsampled_pred(
    MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col,
    const MV *const mv, uint16_t *comp_pred, const uint8_t *pred8, int width,
    int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref8,
    int ref_stride, const uint8_t *mask, int mask_stride, int invert_mask,
    int bd) {
-  int i, j;
-
-  uint16_t *pred = CONVERT_TO_SHORTPTR(pred8);
  aom_highbd_upsampled_pred(xd, cm, mi_row, mi_col, mv, comp_pred, width,
                            height, subpel_x_q3, subpel_y_q3, ref8, ref_stride,
                            bd);
-  for (i = 0; i < height; ++i) {
-    for (j = 0; j < width; ++j) {
-      if (!invert_mask)
-        comp_pred[j] = AOM_BLEND_A64(mask[j], comp_pred[j], pred[j]);
-      else
-        comp_pred[j] = AOM_BLEND_A64(mask[j], pred[j], comp_pred[j]);
-    }
-    comp_pred += width;
-    pred += width;
-    mask += mask_stride;
-  }
+  aom_highbd_comp_mask_pred(comp_pred, pred8, width, height,
+                            CONVERT_TO_BYTEPTR(comp_pred), width, mask,
+                            mask_stride, invert_mask);
 }

 #define HIGHBD_MASK_SUBPIX_VAR(W, H)                                           \
--- a/third_party/aom/aom_dsp/variance.h
+++ b/third_party/aom/aom_dsp/variance.h
@ -76,6 +76,13 @@ void aom_comp_mask_upsampled_pred(
    int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref,
    int ref_stride, const uint8_t *mask, int mask_stride, int invert_mask);

+void aom_highbd_comp_mask_upsampled_pred(
+    MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col,
+    const MV *const mv, uint16_t *comp_pred, const uint8_t *pred8, int width,
+    int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref8,
+    int ref_stride, const uint8_t *mask, int mask_stride, int invert_mask,
+    int bd);
+
 typedef unsigned int (*aom_obmc_sad_fn_t)(const uint8_t *pred, int pred_stride,
                                          const int32_t *wsrc,
                                          const int32_t *msk);
--- a/third_party/aom/aom_dsp/x86/aom_subpixel_8t_intrin_avx2.c
+++ b/third_party/aom/aom_dsp/x86/aom_subpixel_8t_intrin_avx2.c
@ -41,25 +41,176 @@
 #define MM256_BROADCASTSI128_SI256(x) _mm256_broadcastsi128_si256(x)
 #endif  // __clang__

-static void aom_filter_block1d16_h8_avx2(
+// Writes the low 32 bits of each 128-bit half of *a: the low half goes to
+// output_ptr and the high half to output_ptr + stride.
+static INLINE void xx_storeu2_epi32(const uint8_t *output_ptr,
+                                    const ptrdiff_t stride, const __m256i *a) {
+  const __m128i row0 = _mm256_castsi256_si128(*a);
+  const __m128i row1 = _mm256_extracti128_si256(*a, 1);
+  *((uint32_t *)(output_ptr)) = _mm_cvtsi128_si32(row0);
+  *((uint32_t *)(output_ptr + stride)) = _mm_cvtsi128_si32(row1);
+}
+
+// Returns a vector whose low 128-bit half is loaded from lo and whose high
+// half is loaded from hi, using 64-bit loads (_mm_loadl_epi64).
+static INLINE __m256i xx_loadu2_epi64(const void *hi, const void *lo) {
+  const __m128i lo_half = _mm_loadl_epi64((const __m128i *)(lo));
+  const __m128i hi_half = _mm_loadl_epi64((const __m128i *)(hi));
+  return _mm256_inserti128_si256(_mm256_castsi128_si256(lo_half), hi_half, 1);
+}
+
+// Writes the low 64 bits of each 128-bit half of *a: the low half goes to
+// output_ptr and the high half to output_ptr + stride.
+static INLINE void xx_storeu2_epi64(const uint8_t *output_ptr,
+                                    const ptrdiff_t stride, const __m256i *a) {
+  const __m128i row0 = _mm256_castsi256_si128(*a);
+  const __m128i row1 = _mm256_extractf128_si256(*a, 1);
+  _mm_storel_epi64((__m128i *)output_ptr, row0);
+  _mm_storel_epi64((__m128i *)(output_ptr + stride), row1);
+}
+
+// Returns a vector whose low 128-bit half is an unaligned load from lo and
+// whose high half is an unaligned load from hi.
+static INLINE __m256i xx_loadu2_mi128(const void *hi, const void *lo) {
+  const __m128i lo_half = _mm_loadu_si128((const __m128i *)(lo));
+  const __m128i hi_half = _mm_loadu_si128((const __m128i *)(hi));
+  return _mm256_inserti128_si256(_mm256_castsi128_si256(lo_half), hi_half, 1);
+}
+
+// Writes the low 128-bit half of *a to output_ptr and the high half to
+// output_ptr + stride using aligned stores (_mm_store_si128).
+static INLINE void xx_store2_mi128(const uint8_t *output_ptr,
+                                   const ptrdiff_t stride, const __m256i *a) {
+  const __m128i row0 = _mm256_castsi256_si128(*a);
+  const __m128i row1 = _mm256_extractf128_si256(*a, 1);
+  _mm_store_si128((__m128i *)output_ptr, row0);
+  _mm_store_si128((__m128i *)(output_ptr + stride), row1);
+}
+
+static void aom_filter_block1d4_h8_avx2(
    const uint8_t *src_ptr, ptrdiff_t src_pixels_per_line, uint8_t *output_ptr,
    ptrdiff_t output_pitch, uint32_t output_height, const int16_t *filter) {
  __m128i filtersReg;
-  __m256i addFilterReg64, filt1Reg, filt2Reg, filt3Reg, filt4Reg;
-  __m256i firstFilters, secondFilters, thirdFilters, forthFilters;
-  __m256i srcRegFilt32b1_1, srcRegFilt32b2_1, srcRegFilt32b2, srcRegFilt32b3;
-  __m256i srcReg32b1, srcReg32b2, filtersReg32;
+  __m256i addFilterReg32, filt1Reg, filt2Reg;
+  __m256i firstFilters, secondFilters;
+  __m256i srcRegFilt32b1_1, srcRegFilt32b2;
+  __m256i srcReg32b1;
  unsigned int i;
  ptrdiff_t src_stride, dst_stride;
-
-  // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64
-  addFilterReg64 = _mm256_set1_epi32((int)0x0400040u);
+  src_ptr -= 3;
+  addFilterReg32 = _mm256_set1_epi16(32);
  filtersReg = _mm_loadu_si128((const __m128i *)filter);
+  filtersReg = _mm_srai_epi16(filtersReg, 1);
  // converting the 16 bit (short) to 8 bit (byte) and have the same data
  // in both lanes of 128 bit register.
  filtersReg = _mm_packs_epi16(filtersReg, filtersReg);
  // have the same data in both lanes of a 256 bit register
-  filtersReg32 = MM256_BROADCASTSI128_SI256(filtersReg);
+  const __m256i filtersReg32 = MM256_BROADCASTSI128_SI256(filtersReg);
+
+  // duplicate only the first 32 bits
+  firstFilters = _mm256_shuffle_epi32(filtersReg32, 0);
+  // duplicate only the second 32 bits
+  secondFilters = _mm256_shuffle_epi32(filtersReg32, 0x55);
+
+  filt1Reg = _mm256_load_si256((__m256i const *)filt_d4_global_avx2);
+  filt2Reg = _mm256_load_si256((__m256i const *)(filt_d4_global_avx2 + 32));
+
+  // multiply the size of the source and destination stride by two
+  src_stride = src_pixels_per_line << 1;
+  dst_stride = output_pitch << 1;
+  for (i = output_height; i > 1; i -= 2) {
+    // load the 2 strides of source
+    srcReg32b1 = xx_loadu2_mi128(src_ptr + src_pixels_per_line, src_ptr);
+
+    // filter the source buffer
+    srcRegFilt32b1_1 = _mm256_shuffle_epi8(srcReg32b1, filt1Reg);
+
+    // multiply 4 adjacent elements with the filter and add the result
+    srcRegFilt32b1_1 = _mm256_maddubs_epi16(srcRegFilt32b1_1, firstFilters);
+
+    // filter the source buffer
+    srcRegFilt32b2 = _mm256_shuffle_epi8(srcReg32b1, filt2Reg);
+
+    // multiply 4 adjacent elements with the filter and add the result
+    srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, secondFilters);
+
+    srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, srcRegFilt32b2);
+
+    srcRegFilt32b1_1 =
+        _mm256_hadds_epi16(srcRegFilt32b1_1, _mm256_setzero_si256());
+
+    // shift by 6 bit each 16 bit
+    srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, addFilterReg32);
+    srcRegFilt32b1_1 = _mm256_srai_epi16(srcRegFilt32b1_1, 6);
+
+    // shrink to 8 bit each 16 bits, the first lane contain the first
+    // convolve result and the second lane contain the second convolve result
+    srcRegFilt32b1_1 =
+        _mm256_packus_epi16(srcRegFilt32b1_1, _mm256_setzero_si256());
+
+    src_ptr += src_stride;
+
+    xx_storeu2_epi32(output_ptr, output_pitch, &srcRegFilt32b1_1);
+    output_ptr += dst_stride;
+  }
+
+  // if the number of strides is odd.
+  // process only 4 bytes
+  if (i > 0) {
+    __m128i srcReg1, srcRegFilt1_1;
+    __m128i srcRegFilt2;
+
+    srcReg1 = _mm_loadu_si128((const __m128i *)(src_ptr));
+
+    // filter the source buffer
+    srcRegFilt1_1 = _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt1Reg));
+
+    // multiply 4 adjacent elements with the filter and add the result
+    srcRegFilt1_1 =
+        _mm_maddubs_epi16(srcRegFilt1_1, _mm256_castsi256_si128(firstFilters));
+
+    // filter the source buffer
+    srcRegFilt2 = _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt2Reg));
+
+    // multiply 4 adjacent elements with the filter and add the result
+    srcRegFilt2 =
+        _mm_maddubs_epi16(srcRegFilt2, _mm256_castsi256_si128(secondFilters));
+
+    srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1, srcRegFilt2);
+    srcRegFilt1_1 = _mm_hadds_epi16(srcRegFilt1_1, _mm_setzero_si128());
+    // shift by 6 bit each 16 bit
+    srcRegFilt1_1 =
+        _mm_adds_epi16(srcRegFilt1_1, _mm256_castsi256_si128(addFilterReg32));
+    srcRegFilt1_1 = _mm_srai_epi16(srcRegFilt1_1, 6);
+
+    // shrink to 8 bit each 16 bits, the first lane contain the first
+    // convolve result and the second lane contain the second convolve result
+    srcRegFilt1_1 = _mm_packus_epi16(srcRegFilt1_1, _mm_setzero_si128());
+
+    // save 4 bytes
+    *((uint32_t *)(output_ptr)) = _mm_cvtsi128_si32(srcRegFilt1_1);
+  }
+}
+
+static void aom_filter_block1d8_h8_avx2(
+    const uint8_t *src_ptr, ptrdiff_t src_pixels_per_line, uint8_t *output_ptr,
+    ptrdiff_t output_pitch, uint32_t output_height, const int16_t *filter) {
+  __m128i filtersReg;
+  __m256i addFilterReg32, filt1Reg, filt2Reg, filt3Reg, filt4Reg;
+  __m256i firstFilters, secondFilters, thirdFilters, forthFilters;
+  __m256i srcRegFilt32b1_1, srcRegFilt32b2, srcRegFilt32b3;
+  __m256i srcReg32b1;
+  unsigned int i;
+  ptrdiff_t src_stride, dst_stride;
+  src_ptr -= 3;
+  addFilterReg32 = _mm256_set1_epi16(32);
+  filtersReg = _mm_loadu_si128((const __m128i *)filter);
+  filtersReg = _mm_srai_epi16(filtersReg, 1);
+  // converting the 16 bit (short) to 8 bit (byte) and have the same data
+  // in both lanes of 128 bit register.
+  filtersReg = _mm_packs_epi16(filtersReg, filtersReg);
+  // have the same data in both lanes of a 256 bit register
+  const __m256i filtersReg32 = MM256_BROADCASTSI128_SI256(filtersReg);

  // duplicate only the first 16 bits (first and second byte)
  // across 256 bit register
@ -74,22 +212,17 @@ static void aom_filter_block1d16_h8_avx2(
  // across 256 bit register
  forthFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x706u));

-  filt1Reg = _mm256_load_si256((__m256i const *)filt1_global_avx2);
-  filt2Reg = _mm256_load_si256((__m256i const *)filt2_global_avx2);
-  filt3Reg = _mm256_load_si256((__m256i const *)filt3_global_avx2);
-  filt4Reg = _mm256_load_si256((__m256i const *)filt4_global_avx2);
+  filt1Reg = _mm256_load_si256((__m256i const *)filt_global_avx2);
+  filt2Reg = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32));
+  filt3Reg = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 2));
+  filt4Reg = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 3));

  // multiple the size of the source and destination stride by two
  src_stride = src_pixels_per_line << 1;
  dst_stride = output_pitch << 1;
  for (i = output_height; i > 1; i -= 2) {
    // load the 2 strides of source
-    srcReg32b1 =
-        _mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)(src_ptr - 3)));
-    srcReg32b1 = _mm256_inserti128_si256(
-        srcReg32b1,
-        _mm_loadu_si128((const __m128i *)(src_ptr + src_pixels_per_line - 3)),
-        1);
+    srcReg32b1 = xx_loadu2_mi128(src_ptr + src_pixels_per_line, src_ptr);

    // filter the source buffer
    srcRegFilt32b1_1 = _mm256_shuffle_epi8(srcReg32b1, filt1Reg);
@ -110,80 +243,31 @@ static void aom_filter_block1d16_h8_avx2(
    srcRegFilt32b3 = _mm256_maddubs_epi16(srcRegFilt32b3, secondFilters);
    srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, thirdFilters);

-    // add and saturate the results together
-    srcRegFilt32b1_1 = _mm256_adds_epi16(
-        srcRegFilt32b1_1, _mm256_min_epi16(srcRegFilt32b3, srcRegFilt32b2));
+    __m256i sum23 = _mm256_adds_epi16(srcRegFilt32b3, srcRegFilt32b2);
+    srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, sum23);

-    // reading 2 strides of the next 16 bytes
-    // (part of it was being read by earlier read)
-    srcReg32b2 =
-        _mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)(src_ptr + 5)));
-    srcReg32b2 = _mm256_inserti128_si256(
-        srcReg32b2,
-        _mm_loadu_si128((const __m128i *)(src_ptr + src_pixels_per_line + 5)),
-        1);
-
-    // add and saturate the results together
-    srcRegFilt32b1_1 = _mm256_adds_epi16(
-        srcRegFilt32b1_1, _mm256_max_epi16(srcRegFilt32b3, srcRegFilt32b2));
-
-    // filter the source buffer
-    srcRegFilt32b2_1 = _mm256_shuffle_epi8(srcReg32b2, filt1Reg);
-    srcRegFilt32b2 = _mm256_shuffle_epi8(srcReg32b2, filt4Reg);
-
-    // multiply 2 adjacent elements with the filter and add the result
-    srcRegFilt32b2_1 = _mm256_maddubs_epi16(srcRegFilt32b2_1, firstFilters);
-    srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, forthFilters);
-
-    // add and saturate the results together
-    srcRegFilt32b2_1 = _mm256_adds_epi16(srcRegFilt32b2_1, srcRegFilt32b2);
-
-    // filter the source buffer
-    srcRegFilt32b3 = _mm256_shuffle_epi8(srcReg32b2, filt2Reg);
-    srcRegFilt32b2 = _mm256_shuffle_epi8(srcReg32b2, filt3Reg);
-
-    // multiply 2 adjacent elements with the filter and add the result
-    srcRegFilt32b3 = _mm256_maddubs_epi16(srcRegFilt32b3, secondFilters);
-    srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, thirdFilters);
-
-    // add and saturate the results together
-    srcRegFilt32b2_1 = _mm256_adds_epi16(
-        srcRegFilt32b2_1, _mm256_min_epi16(srcRegFilt32b3, srcRegFilt32b2));
-    srcRegFilt32b2_1 = _mm256_adds_epi16(
-        srcRegFilt32b2_1, _mm256_max_epi16(srcRegFilt32b3, srcRegFilt32b2));
-
-    srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, addFilterReg64);
-
-    srcRegFilt32b2_1 = _mm256_adds_epi16(srcRegFilt32b2_1, addFilterReg64);
-
-    // shift by 7 bit each 16 bit
-    srcRegFilt32b1_1 = _mm256_srai_epi16(srcRegFilt32b1_1, 7);
-    srcRegFilt32b2_1 = _mm256_srai_epi16(srcRegFilt32b2_1, 7);
+    // shift by 6 bit each 16 bit
+    srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, addFilterReg32);
+    srcRegFilt32b1_1 = _mm256_srai_epi16(srcRegFilt32b1_1, 6);

    // shrink to 8 bit each 16 bits, the first lane contain the first
-    // convolve result and the second lane contain the second convolve
-    // result
-    srcRegFilt32b1_1 = _mm256_packus_epi16(srcRegFilt32b1_1, srcRegFilt32b2_1);
+    // convolve result and the second lane contain the second convolve result
+    srcRegFilt32b1_1 =
+        _mm256_packus_epi16(srcRegFilt32b1_1, _mm256_setzero_si256());

    src_ptr += src_stride;

-    // save 16 bytes
-    _mm_store_si128((__m128i *)output_ptr,
-                    _mm256_castsi256_si128(srcRegFilt32b1_1));
-
-    // save the next 16 bits
-    _mm_store_si128((__m128i *)(output_ptr + output_pitch),
-                    _mm256_extractf128_si256(srcRegFilt32b1_1, 1));
+    xx_storeu2_epi64(output_ptr, output_pitch, &srcRegFilt32b1_1);
    output_ptr += dst_stride;
  }

  // if the number of strides is odd.
-  // process only 16 bytes
+  // process only 8 bytes
  if (i > 0) {
-    __m128i srcReg1, srcReg2, srcRegFilt1_1, srcRegFilt2_1;
+    __m128i srcReg1, srcRegFilt1_1;
    __m128i srcRegFilt2, srcRegFilt3;

-    srcReg1 = _mm_loadu_si128((const __m128i *)(src_ptr - 3));
+    srcReg1 = _mm_loadu_si128((const __m128i *)(src_ptr));

    // filter the source buffer
    srcRegFilt1_1 = _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt1Reg));
@ -210,15 +294,172 @@ static void aom_filter_block1d16_h8_avx2(

    // add and saturate the results together
    srcRegFilt1_1 =
-        _mm_adds_epi16(srcRegFilt1_1, _mm_min_epi16(srcRegFilt3, srcRegFilt2));
+        _mm_adds_epi16(srcRegFilt1_1, _mm_adds_epi16(srcRegFilt3, srcRegFilt2));

-    // reading the next 16 bytes
+    // shift by 6 bit each 16 bit
+    srcRegFilt1_1 =
+        _mm_adds_epi16(srcRegFilt1_1, _mm256_castsi256_si128(addFilterReg32));
+    srcRegFilt1_1 = _mm_srai_epi16(srcRegFilt1_1, 6);
+
+    // shrink to 8 bit each 16 bits, the first lane contain the first
+    // convolve result and the second lane contain the second convolve
+    // result
+    srcRegFilt1_1 = _mm_packus_epi16(srcRegFilt1_1, _mm_setzero_si128());
+
+    // save 8 bytes
+    _mm_storel_epi64((__m128i *)output_ptr, srcRegFilt1_1);
+  }
+}
+
+static void aom_filter_block1d16_h8_avx2(
+    const uint8_t *src_ptr, ptrdiff_t src_pixels_per_line, uint8_t *output_ptr,
+    ptrdiff_t output_pitch, uint32_t output_height, const int16_t *filter) {
+  __m128i filtersReg;
+  __m256i addFilterReg32, filt1Reg, filt2Reg, filt3Reg, filt4Reg;
+  __m256i firstFilters, secondFilters, thirdFilters, forthFilters;
+  __m256i srcRegFilt32b1_1, srcRegFilt32b2_1, srcRegFilt32b2, srcRegFilt32b3;
+  __m256i srcReg32b1, srcReg32b2, filtersReg32;
+  unsigned int i;
+  ptrdiff_t src_stride, dst_stride;
+  src_ptr -= 3;
+  addFilterReg32 = _mm256_set1_epi16(32);
+  filtersReg = _mm_loadu_si128((const __m128i *)filter);
+  filtersReg = _mm_srai_epi16(filtersReg, 1);
+  // converting the 16 bit (short) to 8 bit (byte) and have the same data
+  // in both lanes of 128 bit register.
+  filtersReg = _mm_packs_epi16(filtersReg, filtersReg);
+  // have the same data in both lanes of a 256 bit register
+  filtersReg32 = MM256_BROADCASTSI128_SI256(filtersReg);
+
+  // duplicate only the first 16 bits (first and second byte)
+  // across 256 bit register
+  firstFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x100u));
+  // duplicate only the second 16 bits (third and fourth byte)
+  // across 256 bit register
+  secondFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x302u));
+  // duplicate only the third 16 bits (fifth and sixth byte)
+  // across 256 bit register
+  thirdFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x504u));
+  // duplicate only the fourth 16 bits (seventh and eighth byte)
+  // across 256 bit register
+  forthFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x706u));
+
+  filt1Reg = _mm256_load_si256((__m256i const *)filt_global_avx2);
+  filt2Reg = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32));
+  filt3Reg = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 2));
+  filt4Reg = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 3));
+
+  // multiply the size of the source and destination stride by two
+  src_stride = src_pixels_per_line << 1;
+  dst_stride = output_pitch << 1;
+  for (i = output_height; i > 1; i -= 2) {
+    // load the 2 strides of source
+    srcReg32b1 = xx_loadu2_mi128(src_ptr + src_pixels_per_line, src_ptr);
+
+    // filter the source buffer
+    srcRegFilt32b1_1 = _mm256_shuffle_epi8(srcReg32b1, filt1Reg);
+    srcRegFilt32b2 = _mm256_shuffle_epi8(srcReg32b1, filt4Reg);
+
+    // multiply 2 adjacent elements with the filter and add the result
+    srcRegFilt32b1_1 = _mm256_maddubs_epi16(srcRegFilt32b1_1, firstFilters);
+    srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, forthFilters);
+
+    // add and saturate the results together
+    srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, srcRegFilt32b2);
+
+    // filter the source buffer
+    srcRegFilt32b3 = _mm256_shuffle_epi8(srcReg32b1, filt2Reg);
+    srcRegFilt32b2 = _mm256_shuffle_epi8(srcReg32b1, filt3Reg);
+
+    // multiply 2 adjacent elements with the filter and add the result
+    srcRegFilt32b3 = _mm256_maddubs_epi16(srcRegFilt32b3, secondFilters);
+    srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, thirdFilters);
+
+    __m256i sum23 = _mm256_adds_epi16(srcRegFilt32b3, srcRegFilt32b2);
+    srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, sum23);
+
+    // reading 2 strides of the next 16 bytes
    // (part of it was being read by earlier read)
-    srcReg2 = _mm_loadu_si128((const __m128i *)(src_ptr + 5));
+    srcReg32b2 =
+        xx_loadu2_mi128(src_ptr + src_pixels_per_line + 8, src_ptr + 8);
+
+    // filter the source buffer
+    srcRegFilt32b2_1 = _mm256_shuffle_epi8(srcReg32b2, filt1Reg);
+    srcRegFilt32b2 = _mm256_shuffle_epi8(srcReg32b2, filt4Reg);
+
+    // multiply 2 adjacent elements with the filter and add the result
+    srcRegFilt32b2_1 = _mm256_maddubs_epi16(srcRegFilt32b2_1, firstFilters);
+    srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, forthFilters);
+
+    // add and saturate the results together
+    srcRegFilt32b2_1 = _mm256_adds_epi16(srcRegFilt32b2_1, srcRegFilt32b2);
+
+    // filter the source buffer
+    srcRegFilt32b3 = _mm256_shuffle_epi8(srcReg32b2, filt2Reg);
+    srcRegFilt32b2 = _mm256_shuffle_epi8(srcReg32b2, filt3Reg);
+
+    // multiply 2 adjacent elements with the filter and add the result
+    srcRegFilt32b3 = _mm256_maddubs_epi16(srcRegFilt32b3, secondFilters);
+    srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, thirdFilters);
+
+    // add and saturate the results together
+    srcRegFilt32b2_1 = _mm256_adds_epi16(
+        srcRegFilt32b2_1, _mm256_adds_epi16(srcRegFilt32b3, srcRegFilt32b2));
+
+    // shift by 6 bit each 16 bit
+    srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, addFilterReg32);
+    srcRegFilt32b2_1 = _mm256_adds_epi16(srcRegFilt32b2_1, addFilterReg32);
+    srcRegFilt32b1_1 = _mm256_srai_epi16(srcRegFilt32b1_1, 6);
+    srcRegFilt32b2_1 = _mm256_srai_epi16(srcRegFilt32b2_1, 6);
+
+    // shrink to 8 bit each 16 bits, the first lane contain the first
+    // convolve result and the second lane contain the second convolve result
+    srcRegFilt32b1_1 = _mm256_packus_epi16(srcRegFilt32b1_1, srcRegFilt32b2_1);
+
+    src_ptr += src_stride;
+
+    xx_store2_mi128(output_ptr, output_pitch, &srcRegFilt32b1_1);
+    output_ptr += dst_stride;
+  }
+
+  // if the number of strides is odd.
+  // process only 16 bytes
+  if (i > 0) {
+    __m128i srcReg1, srcReg2, srcRegFilt1_1, srcRegFilt2_1;
+    __m128i srcRegFilt2, srcRegFilt3;
+
+    srcReg1 = _mm_loadu_si128((const __m128i *)(src_ptr));
+
+    // filter the source buffer
+    srcRegFilt1_1 = _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt1Reg));
+    srcRegFilt2 = _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt4Reg));
+
+    // multiply 2 adjacent elements with the filter and add the result
+    srcRegFilt1_1 =
+        _mm_maddubs_epi16(srcRegFilt1_1, _mm256_castsi256_si128(firstFilters));
+    srcRegFilt2 =
+        _mm_maddubs_epi16(srcRegFilt2, _mm256_castsi256_si128(forthFilters));
+
+    // add and saturate the results together
+    srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1, srcRegFilt2);
+
+    // filter the source buffer
+    srcRegFilt3 = _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt2Reg));
+    srcRegFilt2 = _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt3Reg));
+
+    // multiply 2 adjacent elements with the filter and add the result
+    srcRegFilt3 =
+        _mm_maddubs_epi16(srcRegFilt3, _mm256_castsi256_si128(secondFilters));
+    srcRegFilt2 =
+        _mm_maddubs_epi16(srcRegFilt2, _mm256_castsi256_si128(thirdFilters));

    // add and saturate the results together
    srcRegFilt1_1 =
-        _mm_adds_epi16(srcRegFilt1_1, _mm_max_epi16(srcRegFilt3, srcRegFilt2));
+        _mm_adds_epi16(srcRegFilt1_1, _mm_adds_epi16(srcRegFilt3, srcRegFilt2));
+
+    // reading the next 16 bytes
+    // (part of it was being read by earlier read)
+    srcReg2 = _mm_loadu_si128((const __m128i *)(src_ptr + 8));

    // filter the source buffer
    srcRegFilt2_1 = _mm_shuffle_epi8(srcReg2, _mm256_castsi256_si128(filt1Reg));
@ -245,19 +486,16 @@ static void aom_filter_block1d16_h8_avx2(

    // add and saturate the results together
    srcRegFilt2_1 =
-        _mm_adds_epi16(srcRegFilt2_1, _mm_min_epi16(srcRegFilt3, srcRegFilt2));
-    srcRegFilt2_1 =
-        _mm_adds_epi16(srcRegFilt2_1, _mm_max_epi16(srcRegFilt3, srcRegFilt2));
+        _mm_adds_epi16(srcRegFilt2_1, _mm_adds_epi16(srcRegFilt3, srcRegFilt2));

+    // shift by 6 bit each 16 bit
    srcRegFilt1_1 =
-        _mm_adds_epi16(srcRegFilt1_1, _mm256_castsi256_si128(addFilterReg64));
+        _mm_adds_epi16(srcRegFilt1_1, _mm256_castsi256_si128(addFilterReg32));
+    srcRegFilt1_1 = _mm_srai_epi16(srcRegFilt1_1, 6);

    srcRegFilt2_1 =
-        _mm_adds_epi16(srcRegFilt2_1, _mm256_castsi256_si128(addFilterReg64));
-
-    // shift by 7 bit each 16 bit
-    srcRegFilt1_1 = _mm_srai_epi16(srcRegFilt1_1, 7);
-    srcRegFilt2_1 = _mm_srai_epi16(srcRegFilt2_1, 7);
+        _mm_adds_epi16(srcRegFilt2_1, _mm256_castsi256_si128(addFilterReg32));
+    srcRegFilt2_1 = _mm_srai_epi16(srcRegFilt2_1, 6);

    // shrink to 8 bit each 16 bits, the first lane contain the first
    // convolve result and the second lane contain the second convolve
@ -269,11 +507,11 @@ static void aom_filter_block1d16_h8_avx2(
  }
 }

-static void aom_filter_block1d16_v8_avx2(
+static void aom_filter_block1d8_v8_avx2(
    const uint8_t *src_ptr, ptrdiff_t src_pitch, uint8_t *output_ptr,
    ptrdiff_t out_pitch, uint32_t output_height, const int16_t *filter) {
  __m128i filtersReg;
-  __m256i addFilterReg64;
+  __m256i addFilterReg32;
  __m256i srcReg32b1, srcReg32b2, srcReg32b3, srcReg32b4, srcReg32b5;
  __m256i srcReg32b6, srcReg32b7, srcReg32b8, srcReg32b9, srcReg32b10;
  __m256i srcReg32b11, srcReg32b12, filtersReg32;
@ -281,11 +519,11 @@ static void aom_filter_block1d16_v8_avx2(
  unsigned int i;
  ptrdiff_t src_stride, dst_stride;

-  // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64
-  addFilterReg64 = _mm256_set1_epi32((int)0x0400040u);
+  addFilterReg32 = _mm256_set1_epi16(32);
  filtersReg = _mm_loadu_si128((const __m128i *)filter);
  // converting the 16 bit (short) to  8 bit (byte) and have the
  // same data in both lanes of 128 bit register.
+  filtersReg = _mm_srai_epi16(filtersReg, 1);
  filtersReg = _mm_packs_epi16(filtersReg, filtersReg);
  // have the same data in both lanes of a 256 bit register
  filtersReg32 = MM256_BROADCASTSI128_SI256(filtersReg);
@ -308,49 +546,178 @@ static void aom_filter_block1d16_v8_avx2(
  dst_stride = out_pitch << 1;

  // load 16 bytes 7 times in stride of src_pitch
-  srcReg32b1 =
-      _mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)(src_ptr)));
-  srcReg32b2 = _mm256_castsi128_si256(
-      _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch)));
-  srcReg32b3 = _mm256_castsi128_si256(
-      _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 2)));
-  srcReg32b4 = _mm256_castsi128_si256(
-      _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 3)));
-  srcReg32b5 = _mm256_castsi128_si256(
-      _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 4)));
-  srcReg32b6 = _mm256_castsi128_si256(
-      _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 5)));
+  srcReg32b1 = xx_loadu2_epi64(src_ptr + src_pitch, src_ptr);
+  srcReg32b3 =
+      xx_loadu2_epi64(src_ptr + src_pitch * 3, src_ptr + src_pitch * 2);
+  srcReg32b5 =
+      xx_loadu2_epi64(src_ptr + src_pitch * 5, src_ptr + src_pitch * 4);
+  srcReg32b7 = _mm256_castsi128_si256(
+      _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 6)));
+
+  // have each consecutive loads on the same 256 register
+  srcReg32b2 = _mm256_permute2x128_si256(srcReg32b1, srcReg32b3, 0x21);
+  srcReg32b4 = _mm256_permute2x128_si256(srcReg32b3, srcReg32b5, 0x21);
+  srcReg32b6 = _mm256_permute2x128_si256(srcReg32b5, srcReg32b7, 0x21);
+  // merge every two consecutive registers except the last one
+  srcReg32b10 = _mm256_unpacklo_epi8(srcReg32b1, srcReg32b2);
+  srcReg32b11 = _mm256_unpacklo_epi8(srcReg32b3, srcReg32b4);
+  srcReg32b2 = _mm256_unpacklo_epi8(srcReg32b5, srcReg32b6);
+
+  for (i = output_height; i > 1; i -= 2) {
+    // load the last 2 loads of 8 bytes and have every two
+    // consecutive loads in the same 256 bit register
+    srcReg32b8 = _mm256_castsi128_si256(
+        _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 7)));
+    srcReg32b7 = _mm256_inserti128_si256(srcReg32b7,
+                                         _mm256_castsi256_si128(srcReg32b8), 1);
+    srcReg32b9 = _mm256_castsi128_si256(
+        _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 8)));
+    srcReg32b8 = _mm256_inserti128_si256(srcReg32b8,
+                                         _mm256_castsi256_si128(srcReg32b9), 1);
+
+    // merge every two consecutive registers
+    // save
+    srcReg32b4 = _mm256_unpacklo_epi8(srcReg32b7, srcReg32b8);
+
+    // multiply 2 adjacent elements with the filter and add the result
+    srcReg32b10 = _mm256_maddubs_epi16(srcReg32b10, firstFilters);
+    srcReg32b6 = _mm256_maddubs_epi16(srcReg32b4, forthFilters);
+
+    // add and saturate the results together
+    srcReg32b10 = _mm256_adds_epi16(srcReg32b10, srcReg32b6);
+
+    // multiply 2 adjacent elements with the filter and add the result
+    srcReg32b8 = _mm256_maddubs_epi16(srcReg32b11, secondFilters);
+    srcReg32b12 = _mm256_maddubs_epi16(srcReg32b2, thirdFilters);
+
+    // add and saturate the results together
+    srcReg32b10 = _mm256_adds_epi16(srcReg32b10,
+                                    _mm256_adds_epi16(srcReg32b8, srcReg32b12));
+
+    // shift by 6 bit each 16 bit
+    srcReg32b10 = _mm256_adds_epi16(srcReg32b10, addFilterReg32);
+    srcReg32b10 = _mm256_srai_epi16(srcReg32b10, 6);
+
+    // shrink to 8 bit each 16 bits, the first lane contain the first
+    // convolve result and the second lane contain the second convolve
+    // result
+    srcReg32b1 = _mm256_packus_epi16(srcReg32b10, _mm256_setzero_si256());
+
+    src_ptr += src_stride;
+
+    xx_storeu2_epi64(output_ptr, out_pitch, &srcReg32b1);
+
+    output_ptr += dst_stride;
+
+    // save part of the registers for next strides
+    srcReg32b10 = srcReg32b11;
+    srcReg32b11 = srcReg32b2;
+    srcReg32b2 = srcReg32b4;
+    srcReg32b7 = srcReg32b9;
+  }
+  if (i > 0) {
+    __m128i srcRegFilt1, srcRegFilt4, srcRegFilt6, srcRegFilt8;
+    // load the last 8 bytes
+    srcRegFilt8 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 7));
+
+    // merge the last 2 results together
+    srcRegFilt4 =
+        _mm_unpacklo_epi8(_mm256_castsi256_si128(srcReg32b7), srcRegFilt8);
+
+    // multiply 2 adjacent elements with the filter and add the result
+    srcRegFilt1 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b10),
+                                    _mm256_castsi256_si128(firstFilters));
+    srcRegFilt4 =
+        _mm_maddubs_epi16(srcRegFilt4, _mm256_castsi256_si128(forthFilters));
+
+    // add and saturate the results together
+    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt4);
+
+    // multiply 2 adjacent elements with the filter and add the result
+    srcRegFilt4 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b11),
+                                    _mm256_castsi256_si128(secondFilters));
+
+    // multiply 2 adjacent elements with the filter and add the result
+    srcRegFilt6 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b2),
+                                    _mm256_castsi256_si128(thirdFilters));
+
+    // add and saturate the results together
+    srcRegFilt1 =
+        _mm_adds_epi16(srcRegFilt1, _mm_adds_epi16(srcRegFilt4, srcRegFilt6));
+
+    // shift by 6 bit each 16 bit
+    srcRegFilt1 =
+        _mm_adds_epi16(srcRegFilt1, _mm256_castsi256_si128(addFilterReg32));
+    srcRegFilt1 = _mm_srai_epi16(srcRegFilt1, 6);
+
+    // shrink to 8 bit each 16 bits, the first lane contain the first
+    // convolve result and the second lane contain the second convolve result
+    srcRegFilt1 = _mm_packus_epi16(srcRegFilt1, _mm_setzero_si128());
+
+    // save 8 bytes
+    _mm_storel_epi64((__m128i *)output_ptr, srcRegFilt1);
+  }
+}
+
+static void aom_filter_block1d16_v8_avx2(
+    const uint8_t *src_ptr, ptrdiff_t src_pitch, uint8_t *output_ptr,
+    ptrdiff_t out_pitch, uint32_t output_height, const int16_t *filter) {
+  __m128i filtersReg;
+  __m256i addFilterReg32;
+  __m256i srcReg32b1, srcReg32b2, srcReg32b3, srcReg32b4, srcReg32b5;
+  __m256i srcReg32b6, srcReg32b7, srcReg32b8, srcReg32b9, srcReg32b10;
+  __m256i srcReg32b11, srcReg32b12, filtersReg32;
+  __m256i firstFilters, secondFilters, thirdFilters, forthFilters;
+  unsigned int i;
+  ptrdiff_t src_stride, dst_stride;
+
+  addFilterReg32 = _mm256_set1_epi16(32);
+  filtersReg = _mm_loadu_si128((const __m128i *)filter);
+  // converting the 16 bit (short) to  8 bit (byte) and have the
+  // same data in both lanes of 128 bit register.
+  filtersReg = _mm_srai_epi16(filtersReg, 1);
+  filtersReg = _mm_packs_epi16(filtersReg, filtersReg);
+  // have the same data in both lanes of a 256 bit register
+  filtersReg32 = MM256_BROADCASTSI128_SI256(filtersReg);
+
+  // duplicate only the first 16 bits (first and second byte)
+  // across 256 bit register
+  firstFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x100u));
+  // duplicate only the second 16 bits (third and fourth byte)
+  // across 256 bit register
+  secondFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x302u));
+  // duplicate only the third 16 bits (fifth and sixth byte)
+  // across 256 bit register
+  thirdFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x504u));
+  // duplicate only the fourth 16 bits (seventh and eighth byte)
+  // across 256 bit register
+  forthFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x706u));
+
+  // multiply the size of the source and destination stride by two
+  src_stride = src_pitch << 1;
+  dst_stride = out_pitch << 1;
+
+  // load 16 bytes 7 times in stride of src_pitch
+  srcReg32b1 = xx_loadu2_mi128(src_ptr + src_pitch, src_ptr);
+  srcReg32b3 =
+      xx_loadu2_mi128(src_ptr + src_pitch * 3, src_ptr + src_pitch * 2);
+  srcReg32b5 =
+      xx_loadu2_mi128(src_ptr + src_pitch * 5, src_ptr + src_pitch * 4);
  srcReg32b7 = _mm256_castsi128_si256(
      _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 6)));

  // have each consecutive loads on the same 256 register
-  srcReg32b1 = _mm256_inserti128_si256(srcReg32b1,
-                                       _mm256_castsi256_si128(srcReg32b2), 1);
-  srcReg32b2 = _mm256_inserti128_si256(srcReg32b2,
-                                       _mm256_castsi256_si128(srcReg32b3), 1);
-  srcReg32b3 = _mm256_inserti128_si256(srcReg32b3,
-                                       _mm256_castsi256_si128(srcReg32b4), 1);
-  srcReg32b4 = _mm256_inserti128_si256(srcReg32b4,
-                                       _mm256_castsi256_si128(srcReg32b5), 1);
-  srcReg32b5 = _mm256_inserti128_si256(srcReg32b5,
-                                       _mm256_castsi256_si128(srcReg32b6), 1);
-  srcReg32b6 = _mm256_inserti128_si256(srcReg32b6,
-                                       _mm256_castsi256_si128(srcReg32b7), 1);
-
+  srcReg32b2 = _mm256_permute2x128_si256(srcReg32b1, srcReg32b3, 0x21);
+  srcReg32b4 = _mm256_permute2x128_si256(srcReg32b3, srcReg32b5, 0x21);
+  srcReg32b6 = _mm256_permute2x128_si256(srcReg32b5, srcReg32b7, 0x21);
  // merge every two consecutive registers except the last one
  srcReg32b10 = _mm256_unpacklo_epi8(srcReg32b1, srcReg32b2);
  srcReg32b1 = _mm256_unpackhi_epi8(srcReg32b1, srcReg32b2);

  // save
  srcReg32b11 = _mm256_unpacklo_epi8(srcReg32b3, srcReg32b4);
-
-  // save
  srcReg32b3 = _mm256_unpackhi_epi8(srcReg32b3, srcReg32b4);
-
-  // save
  srcReg32b2 = _mm256_unpacklo_epi8(srcReg32b5, srcReg32b6);
-
-  // save
  srcReg32b5 = _mm256_unpackhi_epi8(srcReg32b5, srcReg32b6);

  for (i = output_height; i > 1; i -= 2) {
@ -383,9 +750,7 @@ static void aom_filter_block1d16_v8_avx2(

    // add and saturate the results together
    srcReg32b10 = _mm256_adds_epi16(srcReg32b10,
-                                    _mm256_min_epi16(srcReg32b8, srcReg32b12));
-    srcReg32b10 = _mm256_adds_epi16(srcReg32b10,
-                                    _mm256_max_epi16(srcReg32b8, srcReg32b12));
+                                    _mm256_adds_epi16(srcReg32b8, srcReg32b12));

    // multiply 2 adjacent elements with the filter and add the result
    srcReg32b1 = _mm256_maddubs_epi16(srcReg32b1, firstFilters);
@ -399,16 +764,13 @@ static void aom_filter_block1d16_v8_avx2(

    // add and saturate the results together
    srcReg32b1 = _mm256_adds_epi16(srcReg32b1,
-                                   _mm256_min_epi16(srcReg32b8, srcReg32b12));
-    srcReg32b1 = _mm256_adds_epi16(srcReg32b1,
-                                   _mm256_max_epi16(srcReg32b8, srcReg32b12));
+                                   _mm256_adds_epi16(srcReg32b8, srcReg32b12));

-    srcReg32b10 = _mm256_adds_epi16(srcReg32b10, addFilterReg64);
-    srcReg32b1 = _mm256_adds_epi16(srcReg32b1, addFilterReg64);
-
-    // shift by 7 bit each 16 bit
-    srcReg32b10 = _mm256_srai_epi16(srcReg32b10, 7);
-    srcReg32b1 = _mm256_srai_epi16(srcReg32b1, 7);
+    // shift by 6 bit each 16 bit
+    srcReg32b10 = _mm256_adds_epi16(srcReg32b10, addFilterReg32);
+    srcReg32b1 = _mm256_adds_epi16(srcReg32b1, addFilterReg32);
+    srcReg32b10 = _mm256_srai_epi16(srcReg32b10, 6);
+    srcReg32b1 = _mm256_srai_epi16(srcReg32b1, 6);

    // shrink to 8 bit each 16 bits, the first lane contain the first
    // convolve result and the second lane contain the second convolve
@ -417,12 +779,7 @@ static void aom_filter_block1d16_v8_avx2(

    src_ptr += src_stride;

-    // save 16 bytes
-    _mm_store_si128((__m128i *)output_ptr, _mm256_castsi256_si128(srcReg32b1));
-
-    // save the next 16 bits
-    _mm_store_si128((__m128i *)(output_ptr + out_pitch),
-                    _mm256_extractf128_si256(srcReg32b1, 1));
+    xx_store2_mi128(output_ptr, out_pitch, &srcReg32b1);

    output_ptr += dst_stride;

@ -475,24 +832,17 @@ static void aom_filter_block1d16_v8_avx2(

    // add and saturate the results together
    srcRegFilt1 =
-        _mm_adds_epi16(srcRegFilt1, _mm_min_epi16(srcRegFilt4, srcRegFilt6));
+        _mm_adds_epi16(srcRegFilt1, _mm_adds_epi16(srcRegFilt4, srcRegFilt6));
    srcRegFilt3 =
-        _mm_adds_epi16(srcRegFilt3, _mm_min_epi16(srcRegFilt5, srcRegFilt7));
+        _mm_adds_epi16(srcRegFilt3, _mm_adds_epi16(srcRegFilt5, srcRegFilt7));

-    // add and saturate the results together
+    // shift by 6 bit each 16 bit
    srcRegFilt1 =
-        _mm_adds_epi16(srcRegFilt1, _mm_max_epi16(srcRegFilt4, srcRegFilt6));
+        _mm_adds_epi16(srcRegFilt1, _mm256_castsi256_si128(addFilterReg32));
    srcRegFilt3 =
-        _mm_adds_epi16(srcRegFilt3, _mm_max_epi16(srcRegFilt5, srcRegFilt7));
-
-    srcRegFilt1 =
-        _mm_adds_epi16(srcRegFilt1, _mm256_castsi256_si128(addFilterReg64));
-    srcRegFilt3 =
-        _mm_adds_epi16(srcRegFilt3, _mm256_castsi256_si128(addFilterReg64));
-
-    // shift by 7 bit each 16 bit
-    srcRegFilt1 = _mm_srai_epi16(srcRegFilt1, 7);
-    srcRegFilt3 = _mm_srai_epi16(srcRegFilt3, 7);
+        _mm_adds_epi16(srcRegFilt3, _mm256_castsi256_si128(addFilterReg32));
+    srcRegFilt1 = _mm_srai_epi16(srcRegFilt1, 6);
+    srcRegFilt3 = _mm_srai_epi16(srcRegFilt3, 6);

    // shrink to 8 bit each 16 bits, the first lane contain the first
    // convolve result and the second lane contain the second convolve
@ -506,21 +856,6 @@ static void aom_filter_block1d16_v8_avx2(

 #if HAVE_AVX2 && HAVE_SSSE3
 filter8_1dfunction aom_filter_block1d4_v8_ssse3;
-#if ARCH_X86_64
-filter8_1dfunction aom_filter_block1d8_v8_intrin_ssse3;
-filter8_1dfunction aom_filter_block1d8_h8_intrin_ssse3;
-filter8_1dfunction aom_filter_block1d4_h8_intrin_ssse3;
-#define aom_filter_block1d8_v8_avx2 aom_filter_block1d8_v8_intrin_ssse3
-#define aom_filter_block1d8_h8_avx2 aom_filter_block1d8_h8_intrin_ssse3
-#define aom_filter_block1d4_h8_avx2 aom_filter_block1d4_h8_intrin_ssse3
-#else  // ARCH_X86
-filter8_1dfunction aom_filter_block1d8_v8_ssse3;
-filter8_1dfunction aom_filter_block1d8_h8_ssse3;
-filter8_1dfunction aom_filter_block1d4_h8_ssse3;
-#define aom_filter_block1d8_v8_avx2 aom_filter_block1d8_v8_ssse3
-#define aom_filter_block1d8_h8_avx2 aom_filter_block1d8_h8_ssse3
-#define aom_filter_block1d4_h8_avx2 aom_filter_block1d4_h8_ssse3
-#endif  // ARCH_X86_64
 filter8_1dfunction aom_filter_block1d16_v2_ssse3;
 filter8_1dfunction aom_filter_block1d16_h2_ssse3;
 filter8_1dfunction aom_filter_block1d8_v2_ssse3;
--- a/third_party/aom/aom_dsp/x86/convolve_avx2.h
+++ b/third_party/aom/aom_dsp/x86/convolve_avx2.h
@ -13,31 +13,27 @@
 #define AOM_DSP_X86_CONVOLVE_AVX2_H_

 // filters for 16
-DECLARE_ALIGNED(32, static const uint8_t, filt1_global_avx2[32]) = {
-  0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8,
-  0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8
+DECLARE_ALIGNED(32, static const uint8_t, filt_global_avx2[]) = {
+  0,  1,  1,  2,  2, 3,  3,  4,  4,  5,  5,  6,  6,  7,  7,  8,  0,  1,  1,
+  2,  2,  3,  3,  4, 4,  5,  5,  6,  6,  7,  7,  8,  2,  3,  3,  4,  4,  5,
+  5,  6,  6,  7,  7, 8,  8,  9,  9,  10, 2,  3,  3,  4,  4,  5,  5,  6,  6,
+  7,  7,  8,  8,  9, 9,  10, 4,  5,  5,  6,  6,  7,  7,  8,  8,  9,  9,  10,
+  10, 11, 11, 12, 4, 5,  5,  6,  6,  7,  7,  8,  8,  9,  9,  10, 10, 11, 11,
+  12, 6,  7,  7,  8, 8,  9,  9,  10, 10, 11, 11, 12, 12, 13, 13, 14, 6,  7,
+  7,  8,  8,  9,  9, 10, 10, 11, 11, 12, 12, 13, 13, 14
 };

-DECLARE_ALIGNED(32, static const uint8_t, filt2_global_avx2[32]) = {
-  2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10,
-  2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10
-};
-
-DECLARE_ALIGNED(32, static const uint8_t, filt3_global_avx2[32]) = {
-  4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12,
-  4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12
-};
-
-DECLARE_ALIGNED(32, static const uint8_t, filt4_global_avx2[32]) = {
-  6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14,
-  6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14
+DECLARE_ALIGNED(32, static const uint8_t, filt_d4_global_avx2[]) = {
+  0, 1, 2, 3,  1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6, 0, 1, 2, 3,  1, 2,
+  3, 4, 2, 3,  4, 5, 3, 4, 5, 6, 4, 5, 6, 7, 5, 6, 7, 8, 6, 7,  8, 9,
+  7, 8, 9, 10, 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10,
 };

 static INLINE void prepare_coeffs_lowbd(
    const InterpFilterParams *const filter_params, const int subpel_q4,
    __m256i *const coeffs /* [4] */) {
  const int16_t *const filter = av1_get_interp_filter_subpel_kernel(
-      *filter_params, subpel_q4 & SUBPEL_MASK);
+      filter_params, subpel_q4 & SUBPEL_MASK);
  const __m128i coeffs_8 = _mm_loadu_si128((__m128i *)filter);
  const __m256i filter_coeffs = _mm256_broadcastsi128_si256(coeffs_8);

@ -65,7 +61,7 @@ static INLINE void prepare_coeffs(const InterpFilterParams *const filter_params,
                                  const int subpel_q4,
                                  __m256i *const coeffs /* [4] */) {
  const int16_t *filter = av1_get_interp_filter_subpel_kernel(
-      *filter_params, subpel_q4 & SUBPEL_MASK);
+      filter_params, subpel_q4 & SUBPEL_MASK);

  const __m128i coeff_8 = _mm_loadu_si128((__m128i *)filter);
  const __m256i coeff = _mm256_broadcastsi128_si256(coeff_8);
--- a/third_party/aom/aom_dsp/x86/convolve_sse2.h
+++ b/third_party/aom/aom_dsp/x86/convolve_sse2.h
@ -19,7 +19,7 @@ static INLINE void prepare_coeffs(const InterpFilterParams *const filter_params,
                                  const int subpel_q4,
                                  __m128i *const coeffs /* [4] */) {
  const int16_t *filter = av1_get_interp_filter_subpel_kernel(
-      *filter_params, subpel_q4 & SUBPEL_MASK);
+      filter_params, subpel_q4 & SUBPEL_MASK);
  const __m128i coeff = _mm_loadu_si128((__m128i *)filter);

  // coeffs 0 1 0 1 0 1 0 1
--- a/third_party/aom/aom_dsp/x86/highbd_convolve_avx2.c
+++ b/third_party/aom/aom_dsp/x86/highbd_convolve_avx2.c
@ -105,8 +105,8 @@ void aom_highbd_convolve_copy_avx2(const uint8_t *src8, ptrdiff_t src_stride,

 void av1_highbd_convolve_y_sr_avx2(const uint16_t *src, int src_stride,
                                   uint16_t *dst, int dst_stride, int w, int h,
-                                   InterpFilterParams *filter_params_x,
-                                   InterpFilterParams *filter_params_y,
+                                   const InterpFilterParams *filter_params_x,
+                                   const InterpFilterParams *filter_params_y,
                                   const int subpel_x_q4, const int subpel_y_q4,
                                   ConvolveParams *conv_params, int bd) {
  int i, j;
@ -254,8 +254,8 @@ void av1_highbd_convolve_y_sr_avx2(const uint16_t *src, int src_stride,

 void av1_highbd_convolve_x_sr_avx2(const uint16_t *src, int src_stride,
                                   uint16_t *dst, int dst_stride, int w, int h,
-                                   InterpFilterParams *filter_params_x,
-                                   InterpFilterParams *filter_params_y,
+                                   const InterpFilterParams *filter_params_x,
+                                   const InterpFilterParams *filter_params_y,
                                   const int subpel_x_q4, const int subpel_y_q4,
                                   ConvolveParams *conv_params, int bd) {
  int i, j;
--- a/third_party/aom/aom_dsp/x86/highbd_convolve_ssse3.c
+++ b/third_party/aom/aom_dsp/x86/highbd_convolve_ssse3.c
@ -18,8 +18,8 @@

 void av1_highbd_convolve_y_sr_ssse3(const uint16_t *src, int src_stride,
                                    uint16_t *dst, int dst_stride, int w, int h,
-                                    InterpFilterParams *filter_params_x,
-                                    InterpFilterParams *filter_params_y,
+                                    const InterpFilterParams *filter_params_x,
+                                    const InterpFilterParams *filter_params_y,
                                    const int subpel_x_q4,
                                    const int subpel_y_q4,
                                    ConvolveParams *conv_params, int bd) {
@ -166,8 +166,8 @@ void av1_highbd_convolve_y_sr_ssse3(const uint16_t *src, int src_stride,

 void av1_highbd_convolve_x_sr_ssse3(const uint16_t *src, int src_stride,
                                    uint16_t *dst, int dst_stride, int w, int h,
-                                    InterpFilterParams *filter_params_x,
-                                    InterpFilterParams *filter_params_y,
+                                    const InterpFilterParams *filter_params_x,
+                                    const InterpFilterParams *filter_params_y,
                                    const int subpel_x_q4,
                                    const int subpel_y_q4,
                                    ConvolveParams *conv_params, int bd) {
--- a/third_party/aom/aom_dsp/x86/highbd_variance_sse2.c
+++ b/third_party/aom/aom_dsp/x86/highbd_variance_sse2.c
@ -676,7 +676,7 @@ void aom_highbd_upsampled_pred_sse2(MACROBLOCKD *xd,
    }
  }

-  const InterpFilterParams filter =
+  const InterpFilterParams *filter =
      av1_get_interp_filter_params_with_block_size(EIGHTTAP_REGULAR, 8);

  if (!subpel_x_q3 && !subpel_y_q3) {
@ -726,14 +726,14 @@ void aom_highbd_upsampled_pred_sse2(MACROBLOCKD *xd,
    const int16_t *const kernel_y =
        av1_get_interp_filter_subpel_kernel(filter, subpel_y_q3 << 1);
    const int intermediate_height =
-        (((height - 1) * 8 + subpel_y_q3) >> 3) + filter.taps;
+        (((height - 1) * 8 + subpel_y_q3) >> 3) + filter->taps;
    assert(intermediate_height <= (MAX_SB_SIZE * 2 + 16) + 16);
-    aom_highbd_convolve8_horiz(ref8 - ref_stride * ((filter.taps >> 1) - 1),
+    aom_highbd_convolve8_horiz(ref8 - ref_stride * ((filter->taps >> 1) - 1),
                               ref_stride, CONVERT_TO_BYTEPTR(temp),
                               MAX_SB_SIZE, kernel_x, 16, NULL, -1, width,
                               intermediate_height, bd);
    aom_highbd_convolve8_vert(
-        CONVERT_TO_BYTEPTR(temp + MAX_SB_SIZE * ((filter.taps >> 1) - 1)),
+        CONVERT_TO_BYTEPTR(temp + MAX_SB_SIZE * ((filter->taps >> 1) - 1)),
        MAX_SB_SIZE, CONVERT_TO_BYTEPTR(comp_pred), width, NULL, -1, kernel_y,
        16, width, height, bd);
  }
--- a/third_party/aom/aom_dsp/x86/jnt_variance_ssse3.c
+++ b/third_party/aom/aom_dsp/x86/jnt_variance_ssse3.c
@ -22,118 +22,12 @@
 void aom_var_filter_block2d_bil_first_pass_ssse3(
    const uint8_t *a, uint16_t *b, unsigned int src_pixels_per_line,
    unsigned int pixel_step, unsigned int output_height,
-    unsigned int output_width, const uint8_t *filter) {
-  // Note: filter[0], filter[1] could be {128, 0}, where 128 will overflow
-  // in computation using _mm_maddubs_epi16.
-  // Change {128, 0} to {64, 0} and reduce FILTER_BITS by 1 to avoid overflow.
-  const int16_t round = (1 << (FILTER_BITS - 1)) >> 1;
-  const __m128i r = _mm_set1_epi16(round);
-  const uint8_t f0 = filter[0] >> 1;
-  const uint8_t f1 = filter[1] >> 1;
-  const __m128i filters = _mm_setr_epi8(f0, f1, f0, f1, f0, f1, f0, f1, f0, f1,
-                                        f0, f1, f0, f1, f0, f1);
-  const __m128i shuffle_mask =
-      _mm_setr_epi8(0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8);
-  unsigned int i, j;
-  (void)pixel_step;
-
-  if (output_width >= 8) {
-    for (i = 0; i < output_height; ++i) {
-      for (j = 0; j < output_width; j += 8) {
-        // load source
-        __m128i source_low = xx_loadl_64(a);
-        __m128i source_hi = _mm_setzero_si128();
-
-        // avoid load undefined memory
-        if (a + 8 != NULL) source_hi = xx_loadl_64(a + 8);
-        __m128i source = _mm_unpacklo_epi64(source_low, source_hi);
-
-        // shuffle to:
-        // { a[0], a[1], a[1], a[2], a[2], a[3], a[3], a[4],
-        //   a[4], a[5], a[5], a[6], a[6], a[7], a[7], a[8] }
-        __m128i source_shuffle = _mm_shuffle_epi8(source, shuffle_mask);
-
-        // b[i] = a[i] * filter[0] + a[i + 1] * filter[1]
-        __m128i res = _mm_maddubs_epi16(source_shuffle, filters);
-
-        // round
-        res = _mm_srai_epi16(_mm_add_epi16(res, r), FILTER_BITS - 1);
-
-        xx_storeu_128(b, res);
-
-        a += 8;
-        b += 8;
-      }
-
-      a += src_pixels_per_line - output_width;
-    }
-  } else {
-    for (i = 0; i < output_height; ++i) {
-      // load source, only first 5 values are meaningful:
-      // { a[0], a[1], a[2], a[3], a[4], xxxx }
-      __m128i source = xx_loadl_64(a);
-
-      // shuffle, up to the first 8 are useful
-      // { a[0], a[1], a[1], a[2], a[2], a[3], a[3], a[4],
-      //   a[4], a[5], a[5], a[6], a[6], a[7], a[7], a[8] }
-      __m128i source_shuffle = _mm_shuffle_epi8(source, shuffle_mask);
-
-      __m128i res = _mm_maddubs_epi16(source_shuffle, filters);
-      res = _mm_srai_epi16(_mm_add_epi16(res, r), FILTER_BITS - 1);
-
-      xx_storel_64(b, res);
-
-      a += src_pixels_per_line;
-      b += output_width;
-    }
-  }
-}
+    unsigned int output_width, const uint8_t *filter);

 void aom_var_filter_block2d_bil_second_pass_ssse3(
    const uint16_t *a, uint8_t *b, unsigned int src_pixels_per_line,
    unsigned int pixel_step, unsigned int output_height,
-    unsigned int output_width, const uint8_t *filter) {
-  const int16_t round = (1 << FILTER_BITS) >> 1;
-  const __m128i r = _mm_set1_epi32(round);
-  const __m128i filters =
-      _mm_setr_epi16(filter[0], filter[1], filter[0], filter[1], filter[0],
-                     filter[1], filter[0], filter[1]);
-  const __m128i shuffle_mask =
-      _mm_setr_epi8(0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15);
-  const __m128i mask =
-      _mm_setr_epi8(0, 4, 8, 12, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1);
-  unsigned int i, j;
-
-  for (i = 0; i < output_height; ++i) {
-    for (j = 0; j < output_width; j += 4) {
-      // load source as:
-      // { a[0], a[1], a[2], a[3], a[w], a[w+1], a[w+2], a[w+3] }
-      __m128i source1 = xx_loadl_64(a);
-      __m128i source2 = xx_loadl_64(a + pixel_step);
-      __m128i source = _mm_unpacklo_epi64(source1, source2);
-
-      // shuffle source to:
-      // { a[0], a[w], a[1], a[w+1], a[2], a[w+2], a[3], a[w+3] }
-      __m128i source_shuffle = _mm_shuffle_epi8(source, shuffle_mask);
-
-      // b[i] = a[i] * filter[0] + a[w + i] * filter[1]
-      __m128i res = _mm_madd_epi16(source_shuffle, filters);
-
-      // round
-      res = _mm_srai_epi32(_mm_add_epi32(res, r), FILTER_BITS);
-
-      // shuffle to get each lower 8 bit of every 32 bit
-      res = _mm_shuffle_epi8(res, mask);
-
-      xx_storel_32(b, res);
-
-      a += 4;
-      b += 4;
-    }
-
-    a += src_pixels_per_line - output_width;
-  }
-}
+    unsigned int output_width, const uint8_t *filter);

 static INLINE void compute_jnt_comp_avg(__m128i *p0, __m128i *p1,
                                        const __m128i *w, const __m128i *r,
--- a/third_party/aom/aom_dsp/x86/masked_sad_intrin_avx2.c
+++ b/third_party/aom/aom_dsp/x86/masked_sad_intrin_avx2.c
@ -0,0 +1,390 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <immintrin.h>
+#include <stdio.h>
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom_dsp/blend.h"
+#include "aom/aom_integer.h"
+#include "aom_dsp/x86/synonyms.h"
+#include "aom_dsp/x86/masked_sad_intrin_ssse3.h"
+// Masked SAD over a width x height block, consuming 32 pixels per iteration.
+static INLINE unsigned int masked_sad32xh_avx2(
+    const uint8_t *src_ptr, int src_stride, const uint8_t *a_ptr, int a_stride,
+    const uint8_t *b_ptr, int b_stride, const uint8_t *m_ptr, int m_stride,
+    int width, int height) {
+  int x, y;
+  __m256i res = _mm256_setzero_si256();
+  const __m256i mask_max = _mm256_set1_epi8((1 << AOM_BLEND_A64_ROUND_BITS));
+  const __m256i round_scale =
+      _mm256_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS));
+  for (y = 0; y < height; y++) {
+    for (x = 0; x < width; x += 32) {
+      const __m256i src = _mm256_lddqu_si256((const __m256i *)&src_ptr[x]);
+      const __m256i a = _mm256_lddqu_si256((const __m256i *)&a_ptr[x]);
+      const __m256i b = _mm256_lddqu_si256((const __m256i *)&b_ptr[x]);
+      const __m256i m = _mm256_lddqu_si256((const __m256i *)&m_ptr[x]);
+      const __m256i m_inv = _mm256_sub_epi8(mask_max, m);  // weight for b
+
+      // Blend 32 pixels as (a * m + b * m_inv), rounded and shifted down by
+      // AOM_BLEND_A64_ROUND_BITS via mulhrs. Assuming 0 <= m <= mask_max, each
+      // maddubs sum is at most 64 * 255, so it cannot saturate.
+      const __m256i data_l = _mm256_unpacklo_epi8(a, b);
+      const __m256i mask_l = _mm256_unpacklo_epi8(m, m_inv);
+      __m256i pred_l = _mm256_maddubs_epi16(data_l, mask_l);
+      pred_l = _mm256_mulhrs_epi16(pred_l, round_scale);
+
+      const __m256i data_r = _mm256_unpackhi_epi8(a, b);
+      const __m256i mask_r = _mm256_unpackhi_epi8(m, m_inv);
+      __m256i pred_r = _mm256_maddubs_epi16(data_r, mask_r);
+      pred_r = _mm256_mulhrs_epi16(pred_r, round_scale);
+
+      const __m256i pred = _mm256_packus_epi16(pred_l, pred_r);
+      res = _mm256_add_epi32(res, _mm256_sad_epu8(pred, src));
+    }
+
+    src_ptr += src_stride;
+    a_ptr += a_stride;
+    b_ptr += b_stride;
+    m_ptr += m_stride;
+  }
+  // Four partial SADs sit in 32-bit lanes 0, 2, 4, 6 of 'res'; sum them.
+  res = _mm256_shuffle_epi32(res, 0xd8);  // each half -> lanes 0, 1
+  res = _mm256_permute4x64_epi64(res, 0xd8);  // all four -> low half
+  res = _mm256_hadd_epi32(res, res);
+  res = _mm256_hadd_epi32(res, res);
+  int32_t sad = _mm256_extract_epi32(res, 0);
+  return (sad + 31) >> 6;
+}
+// Loads two unaligned 128-bit halves into one 256-bit vector (lo in low half).
+static INLINE __m256i xx_loadu2_m128i(const void *hi, const void *lo) {
+  const __m128i lower = _mm_lddqu_si128((const __m128i *)lo);
+  const __m128i upper = _mm_lddqu_si128((const __m128i *)hi);
+  // The cast leaves the upper half undefined; the insert then fills it.
+  return _mm256_inserti128_si256(_mm256_castsi128_si256(lower), upper, 1);
+}
+// Masked SAD for 16-wide blocks, two rows per iteration (expects even height).
+static INLINE unsigned int masked_sad16xh_avx2(
+    const uint8_t *src_ptr, int src_stride, const uint8_t *a_ptr, int a_stride,
+    const uint8_t *b_ptr, int b_stride, const uint8_t *m_ptr, int m_stride,
+    int height) {
+  int y;
+  __m256i res = _mm256_setzero_si256();
+  const __m256i mask_max = _mm256_set1_epi8((1 << AOM_BLEND_A64_ROUND_BITS));
+  const __m256i round_scale =
+      _mm256_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS));
+  for (y = 0; y < height; y += 2) {
+    const __m256i src = xx_loadu2_m128i(src_ptr + src_stride, src_ptr);
+    const __m256i a = xx_loadu2_m128i(a_ptr + a_stride, a_ptr);
+    const __m256i b = xx_loadu2_m128i(b_ptr + b_stride, b_ptr);
+    const __m256i m = xx_loadu2_m128i(m_ptr + m_stride, m_ptr);
+    const __m256i m_inv = _mm256_sub_epi8(mask_max, m);  // weight for b
+
+    // Blend 2 x 16 pixels as (a * m + b * m_inv), rounded and shifted down by
+    // AOM_BLEND_A64_ROUND_BITS via mulhrs. Assuming 0 <= m <= mask_max, each
+    // maddubs sum is at most 64 * 255, so it cannot saturate.
+    const __m256i data_l = _mm256_unpacklo_epi8(a, b);
+    const __m256i mask_l = _mm256_unpacklo_epi8(m, m_inv);
+    __m256i pred_l = _mm256_maddubs_epi16(data_l, mask_l);
+    pred_l = _mm256_mulhrs_epi16(pred_l, round_scale);
+
+    const __m256i data_r = _mm256_unpackhi_epi8(a, b);
+    const __m256i mask_r = _mm256_unpackhi_epi8(m, m_inv);
+    __m256i pred_r = _mm256_maddubs_epi16(data_r, mask_r);
+    pred_r = _mm256_mulhrs_epi16(pred_r, round_scale);
+
+    const __m256i pred = _mm256_packus_epi16(pred_l, pred_r);
+    res = _mm256_add_epi32(res, _mm256_sad_epu8(pred, src));
+
+    src_ptr += src_stride << 1;
+    a_ptr += a_stride << 1;
+    b_ptr += b_stride << 1;
+    m_ptr += m_stride << 1;
+  }
+  // Four partial SADs sit in 32-bit lanes 0, 2, 4, 6 of 'res'; sum them.
+  res = _mm256_shuffle_epi32(res, 0xd8);  // each half -> lanes 0, 1
+  res = _mm256_permute4x64_epi64(res, 0xd8);  // all four -> low half
+  res = _mm256_hadd_epi32(res, res);
+  res = _mm256_hadd_epi32(res, res);
+  int32_t sad = _mm256_extract_epi32(res, 0);
+  return (sad + 31) >> 6;
+}
+
+// Computes the 8-bit masked SAD of an m x n block, picking a kernel by width.
+//
+//   src, src_stride   block the blended prediction is compared against
+//   ref, ref_stride   first prediction
+//   second_pred       second prediction; handed to the kernels with stride m
+//   msk, msk_stride   per-pixel blend weights
+//   invert_mask       when non-zero, ref and second_pred trade places
+//   m, n              block width and height
+//
+// Widths 4 and 8 reuse the SSSE3 kernels, width 16 has its own AVX2 kernel,
+// and every other width goes to the kernel that steps 32 pixels at a time.
+// The selected kernel's result is returned unchanged.
+static INLINE unsigned int aom_masked_sad_avx2(
+    const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride,
+    const uint8_t *second_pred, const uint8_t *msk, int msk_stride,
+    int invert_mask, int m, int n) {
+  // Every kernel takes its two predictions as (a, a_stride, b, b_stride).
+  // Settle which input plays which role up front so that a single switch
+  // can serve both mask polarities.
+  const uint8_t *const a_ptr = invert_mask ? second_pred : ref;
+  const int a_stride = invert_mask ? m : ref_stride;
+  const uint8_t *const b_ptr = invert_mask ? ref : second_pred;
+  const int b_stride = invert_mask ? ref_stride : m;
+  unsigned int result;
+
+  switch (m) {
+    case 4:
+      result = aom_masked_sad4xh_ssse3(src, src_stride, a_ptr, a_stride, b_ptr,
+                                       b_stride, msk, msk_stride, n);
+      break;
+    case 8:
+      result = aom_masked_sad8xh_ssse3(src, src_stride, a_ptr, a_stride, b_ptr,
+                                       b_stride, msk, msk_stride, n);
+      break;
+    case 16:
+      result = masked_sad16xh_avx2(src, src_stride, a_ptr, a_stride, b_ptr,
+                                   b_stride, msk, msk_stride, n);
+      break;
+    default:
+      // Any other width; this kernel also needs the width itself.
+      result = masked_sad32xh_avx2(src, src_stride, a_ptr, a_stride, b_ptr,
+                                   b_stride, msk, msk_stride, m, n);
+      break;
+  }
+  return result;
+}
+// Defines the exported aom_masked_sad<m>x<n>_avx2() entry point for one size.
+#define MASKSADMXN_AVX2(m, n)                                                 \
+  unsigned int aom_masked_sad##m##x##n##_avx2(                                \
+      const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \
+      const uint8_t *second_pred, const uint8_t *msk, int msk_stride,         \
+      int invert_mask) {                                                      \
+    return aom_masked_sad_avx2(src, src_stride, ref, ref_stride, second_pred, \
+                               msk, msk_stride, invert_mask, m, n);           \
+  }
+
+MASKSADMXN_AVX2(4, 4)
+MASKSADMXN_AVX2(4, 8)
+MASKSADMXN_AVX2(8, 4)
+MASKSADMXN_AVX2(8, 8)
+MASKSADMXN_AVX2(8, 16)
+MASKSADMXN_AVX2(16, 8)
+MASKSADMXN_AVX2(16, 16)
+MASKSADMXN_AVX2(16, 32)
+MASKSADMXN_AVX2(32, 16)
+MASKSADMXN_AVX2(32, 32)
+MASKSADMXN_AVX2(32, 64)
+MASKSADMXN_AVX2(64, 32)
+MASKSADMXN_AVX2(64, 64)
+MASKSADMXN_AVX2(64, 128)
+MASKSADMXN_AVX2(128, 64)
+MASKSADMXN_AVX2(128, 128)
+MASKSADMXN_AVX2(4, 16)  // 1:4 and 4:1 aspect ratios from here on
+MASKSADMXN_AVX2(16, 4)
+MASKSADMXN_AVX2(8, 32)
+MASKSADMXN_AVX2(32, 8)
+MASKSADMXN_AVX2(16, 64)
+MASKSADMXN_AVX2(64, 16)
+// High-bitdepth masked SAD for 8-wide blocks, two rows per iteration.
+static INLINE unsigned int highbd_masked_sad8xh_avx2(
+    const uint8_t *src8, int src_stride, const uint8_t *a8, int a_stride,
+    const uint8_t *b8, int b_stride, const uint8_t *m_ptr, int m_stride,
+    int height) {
+  const uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src8);
+  const uint16_t *a_ptr = CONVERT_TO_SHORTPTR(a8);
+  const uint16_t *b_ptr = CONVERT_TO_SHORTPTR(b8);
+  int y;
+  __m256i res = _mm256_setzero_si256();
+  const __m256i mask_max = _mm256_set1_epi16((1 << AOM_BLEND_A64_ROUND_BITS));
+  const __m256i round_const =
+      _mm256_set1_epi32((1 << AOM_BLEND_A64_ROUND_BITS) >> 1);
+  const __m256i one = _mm256_set1_epi16(1);
+
+  for (y = 0; y < height; y += 2) {
+    const __m256i src = xx_loadu2_m128i(src_ptr + src_stride, src_ptr);
+    const __m256i a = xx_loadu2_m128i(a_ptr + a_stride, a_ptr);
+    const __m256i b = xx_loadu2_m128i(b_ptr + b_stride, b_ptr);
+    // Gather 8 mask bytes from each row, then zero-extend them to 16 bits
+    const __m256i m = _mm256_cvtepu8_epi16(_mm_unpacklo_epi64(
+        _mm_loadl_epi64((const __m128i *)(m_ptr)),
+        _mm_loadl_epi64((const __m128i *)(m_ptr + m_stride))));
+    const __m256i m_inv = _mm256_sub_epi16(mask_max, m);  // weight for b
+
+    const __m256i data_l = _mm256_unpacklo_epi16(a, b);
+    const __m256i mask_l = _mm256_unpacklo_epi16(m, m_inv);
+    __m256i pred_l = _mm256_madd_epi16(data_l, mask_l);
+    pred_l = _mm256_srai_epi32(_mm256_add_epi32(pred_l, round_const),
+                               AOM_BLEND_A64_ROUND_BITS);
+
+    const __m256i data_r = _mm256_unpackhi_epi16(a, b);
+    const __m256i mask_r = _mm256_unpackhi_epi16(m, m_inv);
+    __m256i pred_r = _mm256_madd_epi16(data_r, mask_r);
+    pred_r = _mm256_srai_epi32(_mm256_add_epi32(pred_r, round_const),
+                               AOM_BLEND_A64_ROUND_BITS);
+
+    // Note: the maximum value in pred_l/r is (2^bd)-1 < 2^15,
+    // so it is safe to do signed saturation here.
+    const __m256i pred = _mm256_packs_epi32(pred_l, pred_r);
+    // There is no 16-bit SAD instruction, so we have to synthesize
+    // a 16-element SAD. We do this by storing 8 32-bit partial SADs,
+    // and accumulating them at the end
+    const __m256i diff = _mm256_abs_epi16(_mm256_sub_epi16(pred, src));
+    res = _mm256_add_epi32(res, _mm256_madd_epi16(diff, one));
+
+    src_ptr += src_stride << 1;
+    a_ptr += a_stride << 1;
+    b_ptr += b_stride << 1;
+    m_ptr += m_stride << 1;
+  }
+  // 'res' holds eight 32-bit partial SADs: reduce each half, then add both.
+  res = _mm256_hadd_epi32(res, res);
+  res = _mm256_hadd_epi32(res, res);
+  int sad = _mm256_extract_epi32(res, 0) + _mm256_extract_epi32(res, 4);
+  return (sad + 31) >> 6;
+}
+// High-bitdepth masked SAD, consuming 16 pixels of a row per iteration.
+static INLINE unsigned int highbd_masked_sad16xh_avx2(
+    const uint8_t *src8, int src_stride, const uint8_t *a8, int a_stride,
+    const uint8_t *b8, int b_stride, const uint8_t *m_ptr, int m_stride,
+    int width, int height) {
+  const uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src8);
+  const uint16_t *a_ptr = CONVERT_TO_SHORTPTR(a8);
+  const uint16_t *b_ptr = CONVERT_TO_SHORTPTR(b8);
+  int x, y;
+  __m256i res = _mm256_setzero_si256();
+  const __m256i mask_max = _mm256_set1_epi16((1 << AOM_BLEND_A64_ROUND_BITS));
+  const __m256i round_const =
+      _mm256_set1_epi32((1 << AOM_BLEND_A64_ROUND_BITS) >> 1);
+  const __m256i one = _mm256_set1_epi16(1);
+
+  for (y = 0; y < height; y++) {
+    for (x = 0; x < width; x += 16) {
+      const __m256i src = _mm256_lddqu_si256((const __m256i *)&src_ptr[x]);
+      const __m256i a = _mm256_lddqu_si256((const __m256i *)&a_ptr[x]);
+      const __m256i b = _mm256_lddqu_si256((const __m256i *)&b_ptr[x]);
+      // Load 16 mask bytes and zero-extend them to 16 bits
+      const __m256i m =
+          _mm256_cvtepu8_epi16(_mm_lddqu_si128((const __m128i *)&m_ptr[x]));
+      const __m256i m_inv = _mm256_sub_epi16(mask_max, m);  // weight for b
+
+      const __m256i data_l = _mm256_unpacklo_epi16(a, b);
+      const __m256i mask_l = _mm256_unpacklo_epi16(m, m_inv);
+      __m256i pred_l = _mm256_madd_epi16(data_l, mask_l);
+      pred_l = _mm256_srai_epi32(_mm256_add_epi32(pred_l, round_const),
+                                 AOM_BLEND_A64_ROUND_BITS);
+
+      const __m256i data_r = _mm256_unpackhi_epi16(a, b);
+      const __m256i mask_r = _mm256_unpackhi_epi16(m, m_inv);
+      __m256i pred_r = _mm256_madd_epi16(data_r, mask_r);
+      pred_r = _mm256_srai_epi32(_mm256_add_epi32(pred_r, round_const),
+                                 AOM_BLEND_A64_ROUND_BITS);
+
+      // Note: the maximum value in pred_l/r is (2^bd)-1 < 2^15,
+      // so it is safe to do signed saturation here.
+      const __m256i pred = _mm256_packs_epi32(pred_l, pred_r);
+      // There is no 16-bit SAD instruction, so we have to synthesize
+      // a 16-element SAD. We do this by storing 8 32-bit partial SADs,
+      // and accumulating them at the end
+      const __m256i diff = _mm256_abs_epi16(_mm256_sub_epi16(pred, src));
+      res = _mm256_add_epi32(res, _mm256_madd_epi16(diff, one));
+    }
+
+    src_ptr += src_stride;
+    a_ptr += a_stride;
+    b_ptr += b_stride;
+    m_ptr += m_stride;
+  }
+  // 'res' holds eight 32-bit partial SADs: reduce each half, then add both.
+  res = _mm256_hadd_epi32(res, res);
+  res = _mm256_hadd_epi32(res, res);
+  int sad = _mm256_extract_epi32(res, 0) + _mm256_extract_epi32(res, 4);
+  return (sad + 31) >> 6;
+}
+
+// High-bitdepth masked SAD of an m x n block, with the kernel picked by width.
+//
+//   src, src_stride   block the blended prediction is compared against
+//   ref, ref_stride   first prediction
+//   second_pred       second prediction; handed to the kernels with stride m
+//   msk, msk_stride   per-pixel blend weights (8-bit)
+//   invert_mask       when non-zero, ref and second_pred trade places
+//   m, n              block width and height
+//
+// The pixel pointers are forwarded as-is; the kernels turn them into
+// uint16_t pointers with CONVERT_TO_SHORTPTR. Width 4 reuses the SSSE3
+// kernel, width 8 has its own AVX2 kernel, and every other width goes to
+// the kernel that steps 16 pixels at a time.
+static INLINE unsigned int aom_highbd_masked_sad_avx2(
+    const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride,
+    const uint8_t *second_pred, const uint8_t *msk, int msk_stride,
+    int invert_mask, int m, int n) {
+  // Settle the (a, b) roles once so a single switch serves both polarities.
+  const uint8_t *const a8 = invert_mask ? second_pred : ref;
+  const int a_stride = invert_mask ? m : ref_stride;
+  const uint8_t *const b8 = invert_mask ? ref : second_pred;
+  const int b_stride = invert_mask ? ref_stride : m;
+  unsigned int result;
+
+  switch (m) {
+    case 4:
+      result = aom_highbd_masked_sad4xh_ssse3(
+          src, src_stride, a8, a_stride, b8, b_stride, msk, msk_stride, n);
+      break;
+    case 8:
+      result = highbd_masked_sad8xh_avx2(src, src_stride, a8, a_stride, b8,
+                                         b_stride, msk, msk_stride, n);
+      break;
+    default:
+      result = highbd_masked_sad16xh_avx2(src, src_stride, a8, a_stride, b8,
+                                          b_stride, msk, msk_stride, m, n);
+      break;
+  }
+  return result;
+}
+// Defines the exported aom_highbd_masked_sad<m>x<n>_avx2() for one block size.
+#define HIGHBD_MASKSADMXN_AVX2(m, n)                                      \
+  unsigned int aom_highbd_masked_sad##m##x##n##_avx2(                     \
+      const uint8_t *src8, int src_stride, const uint8_t *ref8,           \
+      int ref_stride, const uint8_t *second_pred8, const uint8_t *msk,    \
+      int msk_stride, int invert_mask) {                                  \
+    return aom_highbd_masked_sad_avx2(src8, src_stride, ref8, ref_stride, \
+                                      second_pred8, msk, msk_stride,      \
+                                      invert_mask, m, n);                 \
+  }
+
+HIGHBD_MASKSADMXN_AVX2(4, 4)  // no ';': each line is a complete definition
+HIGHBD_MASKSADMXN_AVX2(4, 8)
+HIGHBD_MASKSADMXN_AVX2(8, 4)
+HIGHBD_MASKSADMXN_AVX2(8, 8)
+HIGHBD_MASKSADMXN_AVX2(8, 16)
+HIGHBD_MASKSADMXN_AVX2(16, 8)
+HIGHBD_MASKSADMXN_AVX2(16, 16)
+HIGHBD_MASKSADMXN_AVX2(16, 32)
+HIGHBD_MASKSADMXN_AVX2(32, 16)
+HIGHBD_MASKSADMXN_AVX2(32, 32)
+HIGHBD_MASKSADMXN_AVX2(32, 64)
+HIGHBD_MASKSADMXN_AVX2(64, 32)
+HIGHBD_MASKSADMXN_AVX2(64, 64)
+HIGHBD_MASKSADMXN_AVX2(64, 128)
+HIGHBD_MASKSADMXN_AVX2(128, 64)
+HIGHBD_MASKSADMXN_AVX2(128, 128)
+HIGHBD_MASKSADMXN_AVX2(4, 16)  // 1:4 and 4:1 aspect ratios from here on
+HIGHBD_MASKSADMXN_AVX2(16, 4)
+HIGHBD_MASKSADMXN_AVX2(8, 32)
+HIGHBD_MASKSADMXN_AVX2(32, 8)
+HIGHBD_MASKSADMXN_AVX2(16, 64)
+HIGHBD_MASKSADMXN_AVX2(64, 16)
--- a/third_party/aom/aom_dsp/x86/masked_sad_intrin_ssse3.c
+++ b/third_party/aom/aom_dsp/x86/masked_sad_intrin_ssse3.c
@ -19,6 +19,8 @@
 #include "aom/aom_integer.h"
 #include "aom_dsp/x86/synonyms.h"

+#include "aom_dsp/x86//masked_sad_intrin_ssse3.h"
+
 // For width a multiple of 16
 static INLINE unsigned int masked_sad_ssse3(const uint8_t *src_ptr,
                                            int src_stride,
@ -27,16 +29,6 @@ static INLINE unsigned int masked_sad_ssse3(const uint8_t *src_ptr,
                                            const uint8_t *m_ptr, int m_stride,
                                            int width, int height);

-static INLINE unsigned int masked_sad8xh_ssse3(
-    const uint8_t *src_ptr, int src_stride, const uint8_t *a_ptr, int a_stride,
-    const uint8_t *b_ptr, int b_stride, const uint8_t *m_ptr, int m_stride,
-    int height);
-
-static INLINE unsigned int masked_sad4xh_ssse3(
-    const uint8_t *src_ptr, int src_stride, const uint8_t *a_ptr, int a_stride,
-    const uint8_t *b_ptr, int b_stride, const uint8_t *m_ptr, int m_stride,
-    int height);
-
 #define MASKSADMXN_SSSE3(m, n)                                                \
  unsigned int aom_masked_sad##m##x##n##_ssse3(                               \
      const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \
@ -56,11 +48,11 @@ static INLINE unsigned int masked_sad4xh_ssse3(
      const uint8_t *second_pred, const uint8_t *msk, int msk_stride,         \
      int invert_mask) {                                                      \
    if (!invert_mask)                                                         \
-      return masked_sad8xh_ssse3(src, src_stride, ref, ref_stride,            \
-                                 second_pred, 8, msk, msk_stride, n);         \
+      return aom_masked_sad8xh_ssse3(src, src_stride, ref, ref_stride,        \
+                                     second_pred, 8, msk, msk_stride, n);     \
    else                                                                      \
-      return masked_sad8xh_ssse3(src, src_stride, second_pred, 8, ref,        \
-                                 ref_stride, msk, msk_stride, n);             \
+      return aom_masked_sad8xh_ssse3(src, src_stride, second_pred, 8, ref,    \
+                                     ref_stride, msk, msk_stride, n);         \
  }

 #define MASKSAD4XN_SSSE3(n)                                                   \
@ -69,11 +61,11 @@ static INLINE unsigned int masked_sad4xh_ssse3(
      const uint8_t *second_pred, const uint8_t *msk, int msk_stride,         \
      int invert_mask) {                                                      \
    if (!invert_mask)                                                         \
-      return masked_sad4xh_ssse3(src, src_stride, ref, ref_stride,            \
-                                 second_pred, 4, msk, msk_stride, n);         \
+      return aom_masked_sad4xh_ssse3(src, src_stride, ref, ref_stride,        \
+                                     second_pred, 4, msk, msk_stride, n);     \
    else                                                                      \
-      return masked_sad4xh_ssse3(src, src_stride, second_pred, 4, ref,        \
-                                 ref_stride, msk, msk_stride, n);             \
+      return aom_masked_sad4xh_ssse3(src, src_stride, second_pred, 4, ref,    \
+                                     ref_stride, msk, msk_stride, n);         \
  }

 MASKSADMXN_SSSE3(128, 128)
@ -145,10 +137,11 @@ static INLINE unsigned int masked_sad_ssse3(const uint8_t *src_ptr,
  return (sad + 31) >> 6;
 }

-static INLINE unsigned int masked_sad8xh_ssse3(
-    const uint8_t *src_ptr, int src_stride, const uint8_t *a_ptr, int a_stride,
-    const uint8_t *b_ptr, int b_stride, const uint8_t *m_ptr, int m_stride,
-    int height) {
+unsigned int aom_masked_sad8xh_ssse3(const uint8_t *src_ptr, int src_stride,
+                                     const uint8_t *a_ptr, int a_stride,
+                                     const uint8_t *b_ptr, int b_stride,
+                                     const uint8_t *m_ptr, int m_stride,
+                                     int height) {
  int y;
  __m128i res = _mm_setzero_si128();
  const __m128i mask_max = _mm_set1_epi8((1 << AOM_BLEND_A64_ROUND_BITS));
@ -189,10 +182,11 @@ static INLINE unsigned int masked_sad8xh_ssse3(
  return (sad + 31) >> 6;
 }

-static INLINE unsigned int masked_sad4xh_ssse3(
-    const uint8_t *src_ptr, int src_stride, const uint8_t *a_ptr, int a_stride,
-    const uint8_t *b_ptr, int b_stride, const uint8_t *m_ptr, int m_stride,
-    int height) {
+unsigned int aom_masked_sad4xh_ssse3(const uint8_t *src_ptr, int src_stride,
+                                     const uint8_t *a_ptr, int a_stride,
+                                     const uint8_t *b_ptr, int b_stride,
+                                     const uint8_t *m_ptr, int m_stride,
+                                     int height) {
  int y;
  __m128i res = _mm_setzero_si128();
  const __m128i mask_max = _mm_set1_epi8((1 << AOM_BLEND_A64_ROUND_BITS));
@ -238,11 +232,6 @@ static INLINE unsigned int highbd_masked_sad_ssse3(
    const uint8_t *b8, int b_stride, const uint8_t *m_ptr, int m_stride,
    int width, int height);

-static INLINE unsigned int highbd_masked_sad4xh_ssse3(
-    const uint8_t *src8, int src_stride, const uint8_t *a8, int a_stride,
-    const uint8_t *b8, int b_stride, const uint8_t *m_ptr, int m_stride,
-    int height);
-
 #define HIGHBD_MASKSADMXN_SSSE3(m, n)                                         \
  unsigned int aom_highbd_masked_sad##m##x##n##_ssse3(                        \
      const uint8_t *src8, int src_stride, const uint8_t *ref8,               \
@ -262,11 +251,13 @@ static INLINE unsigned int highbd_masked_sad4xh_ssse3(
      int ref_stride, const uint8_t *second_pred8, const uint8_t *msk,         \
      int msk_stride, int invert_mask) {                                       \
    if (!invert_mask)                                                          \
-      return highbd_masked_sad4xh_ssse3(src8, src_stride, ref8, ref_stride,    \
-                                        second_pred8, 4, msk, msk_stride, n);  \
+      return aom_highbd_masked_sad4xh_ssse3(src8, src_stride, ref8,            \
+                                            ref_stride, second_pred8, 4, msk,  \
+                                            msk_stride, n);                    \
    else                                                                       \
-      return highbd_masked_sad4xh_ssse3(src8, src_stride, second_pred8, 4,     \
-                                        ref8, ref_stride, msk, msk_stride, n); \
+      return aom_highbd_masked_sad4xh_ssse3(src8, src_stride, second_pred8, 4, \
+                                            ref8, ref_stride, msk, msk_stride, \
+                                            n);                                \
  }

 HIGHBD_MASKSADMXN_SSSE3(128, 128)
@ -350,10 +341,11 @@ static INLINE unsigned int highbd_masked_sad_ssse3(
  return (sad + 31) >> 6;
 }

-static INLINE unsigned int highbd_masked_sad4xh_ssse3(
-    const uint8_t *src8, int src_stride, const uint8_t *a8, int a_stride,
-    const uint8_t *b8, int b_stride, const uint8_t *m_ptr, int m_stride,
-    int height) {
+unsigned int aom_highbd_masked_sad4xh_ssse3(const uint8_t *src8, int src_stride,
+                                            const uint8_t *a8, int a_stride,
+                                            const uint8_t *b8, int b_stride,
+                                            const uint8_t *m_ptr, int m_stride,
+                                            int height) {
  const uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src8);
  const uint16_t *a_ptr = CONVERT_TO_SHORTPTR(a8);
  const uint16_t *b_ptr = CONVERT_TO_SHORTPTR(b8);
--- a/third_party/aom/aom_dsp/x86/masked_sad_intrin_ssse3.h
+++ b/third_party/aom/aom_dsp/x86/masked_sad_intrin_ssse3.h
@ -0,0 +1,33 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_DSP_X86_MASKED_SAD_INTRIN_SSSE3_H_
+#define AOM_DSP_X86_MASKED_SAD_INTRIN_SSSE3_H_
+
+#include <stdint.h>
+
+// Narrow-block masked SAD kernels shared by the SSSE3 and AVX2 code paths.
+unsigned int aom_masked_sad8xh_ssse3(
+    const uint8_t *src_ptr, int src_stride, const uint8_t *a_ptr, int a_stride,
+    const uint8_t *b_ptr, int b_stride, const uint8_t *m_ptr, int m_stride,
+    int height);
+
+unsigned int aom_masked_sad4xh_ssse3(
+    const uint8_t *src_ptr, int src_stride, const uint8_t *a_ptr, int a_stride,
+    const uint8_t *b_ptr, int b_stride, const uint8_t *m_ptr, int m_stride,
+    int height);
+
+unsigned int aom_highbd_masked_sad4xh_ssse3(
+    const uint8_t *src8, int src_stride, const uint8_t *a8, int a_stride,
+    const uint8_t *b8, int b_stride, const uint8_t *m_ptr, int m_stride,
+    int height);
+
+#endif  // AOM_DSP_X86_MASKED_SAD_INTRIN_SSSE3_H_
--- a/third_party/aom/aom_dsp/x86/obmc_sad_avx2.c
+++ b/third_party/aom/aom_dsp/x86/obmc_sad_avx2.c
@ -0,0 +1,270 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <immintrin.h>
+
+#include "config/aom_config.h"
+
+#include "aom_ports/mem.h"
+#include "aom/aom_integer.h"
+
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_dsp/x86/obmc_intrinsic_ssse3.h"
+#include "aom_dsp/x86/synonyms.h"
+
+////////////////////////////////////////////////////////////////////////////////
+// 8 bit
+////////////////////////////////////////////////////////////////////////////////
+
+// OBMC SAD kernel for 4-pixel-wide blocks; each loop iteration covers two
+// rows. Per pixel it forms |wsrc - pre * mask|, adds the rounding bias
+// (1 << 12) >> 1, shifts right by 12 and accumulates the result.
+//   pre, pre_stride  8-bit prediction pixels and the offset between rows.
+//   wsrc, mask       32-bit arrays read contiguously, 8 entries (two rows of
+//                    four) per iteration.
+//   height           block height; the body runs height >> 1 times, and at
+//                    least once because the loop is a do/while.
+// Returns xx_hsum_epi32_si32() of the two 128-bit accumulator halves added
+// together (presumably the sum of all eight 32-bit lanes).
+static INLINE unsigned int obmc_sad_w4_avx2(const uint8_t *pre,
+                                            const int pre_stride,
+                                            const int32_t *wsrc,
+                                            const int32_t *mask,
+                                            const int height) {
+  int n = 0;
+  __m256i v_sad_d = _mm256_setzero_si256();
+  const __m256i v_bias_d = _mm256_set1_epi32((1 << 12) >> 1);
+
+  do {
+    // Two consecutive rows; presumably xx_loadl_32 loads 4 bytes into the low
+    // lane, so the unpack leaves row 0 then row 1 in the low 8 bytes.
+    const __m128i v_p_b_0 = xx_loadl_32(pre);
+    const __m128i v_p_b_1 = xx_loadl_32(pre + pre_stride);
+    const __m128i v_p_b = _mm_unpacklo_epi32(v_p_b_0, v_p_b_1);
+    const __m256i v_m_d = _mm256_lddqu_si256((__m256i *)(mask + n));
+    const __m256i v_w_d = _mm256_lddqu_si256((__m256i *)(wsrc + n));
+
+    // Zero-extend the low 8 bytes to eight 32-bit lanes.
+    const __m256i v_p_d = _mm256_cvtepu8_epi32(v_p_b);
+
+    // Values in both pre and mask fit in 15 bits, and are packed at 32 bit
+    // boundaries. We use pmaddwd, as it has lower latency on Haswell
+    // than pmulld but produces the same result with these inputs.
+    const __m256i v_pm_d = _mm256_madd_epi16(v_p_d, v_m_d);
+
+    const __m256i v_diff_d = _mm256_sub_epi32(v_w_d, v_pm_d);
+    const __m256i v_absdiff_d = _mm256_abs_epi32(v_diff_d);
+
+    // Rounded absolute difference
+    const __m256i v_tmp_d = _mm256_add_epi32(v_absdiff_d, v_bias_d);
+    const __m256i v_rad_d = _mm256_srli_epi32(v_tmp_d, 12);
+
+    v_sad_d = _mm256_add_epi32(v_sad_d, v_rad_d);
+
+    n += 8;
+    pre += pre_stride << 1;
+  } while (n < 8 * (height >> 1));
+
+  // Fold the upper 128-bit half onto the lower one before the final reduction.
+  __m128i v_sad_d_0 = _mm256_castsi256_si128(v_sad_d);
+  __m128i v_sad_d_1 = _mm256_extracti128_si256(v_sad_d, 1);
+  v_sad_d_0 = _mm_add_epi32(v_sad_d_0, v_sad_d_1);
+  return xx_hsum_epi32_si32(v_sad_d_0);
+}
+
+// OBMC SAD kernel for blocks whose width is a power of two >= 8 (asserted).
+// It walks the block 8 pixels at a time; per pixel it forms
+// |wsrc - pre * mask|, adds (1 << 12) >> 1, shifts right by 12 and
+// accumulates.
+//   pre, pre_stride  8-bit prediction pixels and the offset between rows.
+//   wsrc, mask       32-bit arrays indexed by the running pixel count n, i.e.
+//                    read contiguously with no row padding.
+//   width, height    block dimensions; width * height pixels are processed.
+// Returns xx_hsum_epi32_si32() of the two 128-bit accumulator halves added
+// together (presumably the sum of all eight 32-bit lanes).
+static INLINE unsigned int obmc_sad_w8n_avx2(
+    const uint8_t *pre, const int pre_stride, const int32_t *wsrc,
+    const int32_t *mask, const int width, const int height) {
+  // pre is indexed with the contiguous counter n, so at the end of each row
+  // only the padding (stride minus width) has to be skipped.
+  const int pre_step = pre_stride - width;
+  int n = 0;
+  __m256i v_sad_d = _mm256_setzero_si256();
+  const __m256i v_bias_d = _mm256_set1_epi32((1 << 12) >> 1);
+  assert(width >= 8);
+  assert(IS_POWER_OF_TWO(width));
+
+  do {
+    const __m128i v_p0_b = xx_loadl_64(pre + n);
+    const __m256i v_m0_d = _mm256_lddqu_si256((__m256i *)(mask + n));
+    const __m256i v_w0_d = _mm256_lddqu_si256((__m256i *)(wsrc + n));
+
+    // Zero-extend the low 8 bytes to eight 32-bit lanes.
+    const __m256i v_p0_d = _mm256_cvtepu8_epi32(v_p0_b);
+
+    // Values in both pre and mask fit in 15 bits, and are packed at 32 bit
+    // boundaries. We use pmaddwd, as it has lower latency on Haswell
+    // than pmulld but produces the same result with these inputs.
+    const __m256i v_pm0_d = _mm256_madd_epi16(v_p0_d, v_m0_d);
+
+    const __m256i v_diff0_d = _mm256_sub_epi32(v_w0_d, v_pm0_d);
+    const __m256i v_absdiff0_d = _mm256_abs_epi32(v_diff0_d);
+
+    // Rounded absolute difference
+    const __m256i v_tmp_d = _mm256_add_epi32(v_absdiff0_d, v_bias_d);
+    const __m256i v_rad0_d = _mm256_srli_epi32(v_tmp_d, 12);
+
+    v_sad_d = _mm256_add_epi32(v_sad_d, v_rad0_d);
+
+    n += 8;
+
+    // width is a power of two, so this is true exactly at the end of a row.
+    if ((n & (width - 1)) == 0) pre += pre_step;
+  } while (n < width * height);
+
+  // Fold the upper 128-bit half onto the lower one before the final reduction.
+  __m128i v_sad_d_0 = _mm256_castsi256_si128(v_sad_d);
+  __m128i v_sad_d_1 = _mm256_extracti128_si256(v_sad_d, 1);
+  v_sad_d_0 = _mm_add_epi32(v_sad_d_0, v_sad_d_1);
+  return xx_hsum_epi32_si32(v_sad_d_0);
+}
+
+// Defines aom_obmc_sad<w>x<h>_avx2(). w and h are literals at every expansion
+// below, so `w == 4` is a constant condition: width 4 uses the two-rows-per-
+// iteration kernel and every other width uses the 8-pixel-stepping kernel.
+#define OBMCSADWXH(w, h)                                          \
+  unsigned int aom_obmc_sad##w##x##h##_avx2(                      \
+      const uint8_t *pre, int pre_stride, const int32_t *wsrc,    \
+      const int32_t *msk) {                                       \
+    if (w == 4) {                                                 \
+      return obmc_sad_w4_avx2(pre, pre_stride, wsrc, msk, h);     \
+    } else {                                                      \
+      return obmc_sad_w8n_avx2(pre, pre_stride, wsrc, msk, w, h); \
+    }                                                             \
+  }
+
+// One definition per block size.
+OBMCSADWXH(128, 128)
+OBMCSADWXH(128, 64)
+OBMCSADWXH(64, 128)
+OBMCSADWXH(64, 64)
+OBMCSADWXH(64, 32)
+OBMCSADWXH(32, 64)
+OBMCSADWXH(32, 32)
+OBMCSADWXH(32, 16)
+OBMCSADWXH(16, 32)
+OBMCSADWXH(16, 16)
+OBMCSADWXH(16, 8)
+OBMCSADWXH(8, 16)
+OBMCSADWXH(8, 8)
+OBMCSADWXH(8, 4)
+OBMCSADWXH(4, 8)
+OBMCSADWXH(4, 4)
+OBMCSADWXH(4, 16)
+OBMCSADWXH(16, 4)
+OBMCSADWXH(8, 32)
+OBMCSADWXH(32, 8)
+OBMCSADWXH(16, 64)
+OBMCSADWXH(64, 16)
+
+////////////////////////////////////////////////////////////////////////////////
+// High bit-depth
+////////////////////////////////////////////////////////////////////////////////
+
+// High bit-depth OBMC SAD kernel for 4-pixel-wide blocks; each loop iteration
+// covers two rows. Per pixel it forms |wsrc - pre * mask|, adds the rounding
+// bias (1 << 12) >> 1, shifts right by 12 and accumulates the result.
+//   pre8, pre_stride  prediction pointer, converted with
+//                     CONVERT_TO_SHORTPTR() to 16-bit samples; the stride is
+//                     applied to the converted pointer.
+//   wsrc, mask        32-bit arrays read contiguously, 8 entries per
+//                     iteration.
+//   height            block height; the body runs height >> 1 times, and at
+//                     least once because the loop is a do/while.
+// Returns xx_hsum_epi32_si32() of the two 128-bit accumulator halves added
+// together (presumably the sum of all eight 32-bit lanes).
+static INLINE unsigned int hbd_obmc_sad_w4_avx2(const uint8_t *pre8,
+                                                const int pre_stride,
+                                                const int32_t *wsrc,
+                                                const int32_t *mask,
+                                                const int height) {
+  const uint16_t *pre = CONVERT_TO_SHORTPTR(pre8);
+  int n = 0;
+  __m256i v_sad_d = _mm256_setzero_si256();
+  const __m256i v_bias_d = _mm256_set1_epi32((1 << 12) >> 1);
+  do {
+    // Two consecutive rows of four 16-bit samples, joined into one register
+    // (presumably xx_loadl_64 loads 8 bytes into the low half).
+    const __m128i v_p_w_0 = xx_loadl_64(pre);
+    const __m128i v_p_w_1 = xx_loadl_64(pre + pre_stride);
+    const __m128i v_p_w = _mm_unpacklo_epi64(v_p_w_0, v_p_w_1);
+    const __m256i v_m_d = _mm256_lddqu_si256((__m256i *)(mask + n));
+    const __m256i v_w_d = _mm256_lddqu_si256((__m256i *)(wsrc + n));
+
+    // Zero-extend the eight 16-bit samples to 32-bit lanes.
+    const __m256i v_p_d = _mm256_cvtepu16_epi32(v_p_w);
+
+    // Values in both pre and mask fit in 15 bits, and are packed at 32 bit
+    // boundaries. We use pmaddwd, as it has lower latency on Haswell
+    // than pmulld but produces the same result with these inputs.
+    const __m256i v_pm_d = _mm256_madd_epi16(v_p_d, v_m_d);
+
+    const __m256i v_diff_d = _mm256_sub_epi32(v_w_d, v_pm_d);
+    const __m256i v_absdiff_d = _mm256_abs_epi32(v_diff_d);
+
+    // Rounded absolute difference
+
+    const __m256i v_tmp_d = _mm256_add_epi32(v_absdiff_d, v_bias_d);
+    const __m256i v_rad_d = _mm256_srli_epi32(v_tmp_d, 12);
+
+    v_sad_d = _mm256_add_epi32(v_sad_d, v_rad_d);
+
+    n += 8;
+
+    pre += pre_stride << 1;
+  } while (n < 8 * (height >> 1));
+
+  // Fold the upper 128-bit half onto the lower one before the final reduction.
+  __m128i v_sad_d_0 = _mm256_castsi256_si128(v_sad_d);
+  __m128i v_sad_d_1 = _mm256_extracti128_si256(v_sad_d, 1);
+  v_sad_d_0 = _mm_add_epi32(v_sad_d_0, v_sad_d_1);
+  return xx_hsum_epi32_si32(v_sad_d_0);
+}
+
+// High bit-depth OBMC SAD kernel for blocks whose width is a power of two
+// >= 8 (asserted). It walks the block 8 samples at a time; per sample it
+// forms |wsrc - pre * mask|, adds (1 << 12) >> 1, shifts right by 12 and
+// accumulates.
+//   pre8, pre_stride  prediction pointer, converted with
+//                     CONVERT_TO_SHORTPTR() to 16-bit samples; the stride is
+//                     applied to the converted pointer.
+//   wsrc, mask        32-bit arrays indexed by the running sample count n,
+//                     i.e. read contiguously with no row padding.
+//   width, height     block dimensions; width * height samples are processed.
+// Returns xx_hsum_epi32_si32() of the two 128-bit accumulator halves added
+// together (presumably the sum of all eight 32-bit lanes).
+static INLINE unsigned int hbd_obmc_sad_w8n_avx2(
+    const uint8_t *pre8, const int pre_stride, const int32_t *wsrc,
+    const int32_t *mask, const int width, const int height) {
+  const uint16_t *pre = CONVERT_TO_SHORTPTR(pre8);
+  // pre is indexed with the contiguous counter n, so at the end of each row
+  // only the padding (stride minus width) has to be skipped.
+  const int pre_step = pre_stride - width;
+  int n = 0;
+  __m256i v_sad_d = _mm256_setzero_si256();
+  const __m256i v_bias_d = _mm256_set1_epi32((1 << 12) >> 1);
+
+  assert(width >= 8);
+  assert(IS_POWER_OF_TWO(width));
+
+  do {
+    const __m128i v_p0_w = _mm_lddqu_si128((__m128i *)(pre + n));
+    const __m256i v_m0_d = _mm256_lddqu_si256((__m256i *)(mask + n));
+    const __m256i v_w0_d = _mm256_lddqu_si256((__m256i *)(wsrc + n));
+
+    // Zero-extend the eight 16-bit samples to 32-bit lanes.
+    const __m256i v_p0_d = _mm256_cvtepu16_epi32(v_p0_w);
+
+    // Values in both pre and mask fit in 15 bits, and are packed at 32 bit
+    // boundaries. We use pmaddwd, as it has lower latency on Haswell
+    // than pmulld but produces the same result with these inputs.
+    const __m256i v_pm0_d = _mm256_madd_epi16(v_p0_d, v_m0_d);
+
+    const __m256i v_diff0_d = _mm256_sub_epi32(v_w0_d, v_pm0_d);
+    const __m256i v_absdiff0_d = _mm256_abs_epi32(v_diff0_d);
+
+    // Rounded absolute difference
+    const __m256i v_tmp_d = _mm256_add_epi32(v_absdiff0_d, v_bias_d);
+    const __m256i v_rad0_d = _mm256_srli_epi32(v_tmp_d, 12);
+
+    v_sad_d = _mm256_add_epi32(v_sad_d, v_rad0_d);
+
+    n += 8;
+
+    // width is a power of two (asserted above), so masking with width - 1
+    // detects the end of a row; same test as the 8-bit kernel in this file.
+    if ((n & (width - 1)) == 0) pre += pre_step;
+  } while (n < width * height);
+
+  // Fold the upper 128-bit half onto the lower one before the final reduction.
+  __m128i v_sad_d_0 = _mm256_castsi256_si128(v_sad_d);
+  __m128i v_sad_d_1 = _mm256_extracti128_si256(v_sad_d, 1);
+  v_sad_d_0 = _mm_add_epi32(v_sad_d_0, v_sad_d_1);
+  return xx_hsum_epi32_si32(v_sad_d_0);
+}
+
+// Defines aom_highbd_obmc_sad<w>x<h>_avx2(). w and h are literals at every
+// expansion below, so `w == 4` is a constant condition: width 4 uses the
+// two-rows-per-iteration kernel and every other width uses the
+// 8-sample-stepping kernel.
+#define HBD_OBMCSADWXH(w, h)                                           \
+  unsigned int aom_highbd_obmc_sad##w##x##h##_avx2(                    \
+      const uint8_t *pre, int pre_stride, const int32_t *wsrc,         \
+      const int32_t *mask) {                                           \
+    if (w == 4) {                                                      \
+      return hbd_obmc_sad_w4_avx2(pre, pre_stride, wsrc, mask, h);     \
+    } else {                                                           \
+      return hbd_obmc_sad_w8n_avx2(pre, pre_stride, wsrc, mask, w, h); \
+    }                                                                  \
+  }
+
+// One definition per block size.
+HBD_OBMCSADWXH(128, 128)
+HBD_OBMCSADWXH(128, 64)
+HBD_OBMCSADWXH(64, 128)
+HBD_OBMCSADWXH(64, 64)
+HBD_OBMCSADWXH(64, 32)
+HBD_OBMCSADWXH(32, 64)
+HBD_OBMCSADWXH(32, 32)
+HBD_OBMCSADWXH(32, 16)
+HBD_OBMCSADWXH(16, 32)
+HBD_OBMCSADWXH(16, 16)
+HBD_OBMCSADWXH(16, 8)
+HBD_OBMCSADWXH(8, 16)
+HBD_OBMCSADWXH(8, 8)
+HBD_OBMCSADWXH(8, 4)
+HBD_OBMCSADWXH(4, 8)
+HBD_OBMCSADWXH(4, 4)
+HBD_OBMCSADWXH(4, 16)
+HBD_OBMCSADWXH(16, 4)
+HBD_OBMCSADWXH(8, 32)
+HBD_OBMCSADWXH(32, 8)
+HBD_OBMCSADWXH(16, 64)
+HBD_OBMCSADWXH(64, 16)
--- a/third_party/aom/aom_dsp/x86/obmc_variance_sse4.c
+++ b/third_party/aom/aom_dsp/x86/obmc_variance_sse4.c
@ -26,6 +26,16 @@
 // 8 bit
 ////////////////////////////////////////////////////////////////////////////////

+// Local declarations of the SSSE3 two-pass bilinear filter helpers used by
+// the sub-pixel variance functions below. The first pass is defined in
+// aom_dsp/x86/variance_impl_ssse3.c (the second pass presumably is too);
+// these prototypes must stay in sync with those definitions by hand.
+void aom_var_filter_block2d_bil_first_pass_ssse3(
+    const uint8_t *a, uint16_t *b, unsigned int src_pixels_per_line,
+    unsigned int pixel_step, unsigned int output_height,
+    unsigned int output_width, const uint8_t *filter);
+
+void aom_var_filter_block2d_bil_second_pass_ssse3(
+    const uint16_t *a, uint8_t *b, unsigned int src_pixels_per_line,
+    unsigned int pixel_step, unsigned int output_height,
+    unsigned int output_width, const uint8_t *filter);
+
 static INLINE void obmc_variance_w4(const uint8_t *pre, const int pre_stride,
                                    const int32_t *wsrc, const int32_t *mask,
                                    unsigned int *const sse, int *const sum,
@ -152,6 +162,46 @@ OBMCVARWXH(32, 8)
 OBMCVARWXH(16, 64)
 OBMCVARWXH(64, 16)

+#include "config/aom_dsp_rtcd.h"
+
+// Defines aom_obmc_sub_pixel_variance<W>x<H>_sse4_1(): filters `pre` with the
+// two-pass bilinear helpers (first pass selected by xoffset, producing H + 1
+// rows; second pass selected by yoffset) and passes the W x H result, with
+// stride W, to aom_obmc_variance<W>x<H>_sse4_1().
+// Both intermediate buffers are automatic arrays; at 128x128 they total
+// 129 * 128 * 2 + 128 * 128 = 49408 bytes.
+#define OBMC_SUBPIX_VAR(W, H)                                                \
+  uint32_t aom_obmc_sub_pixel_variance##W##x##H##_sse4_1(                    \
+      const uint8_t *pre, int pre_stride, int xoffset, int yoffset,          \
+      const int32_t *wsrc, const int32_t *mask, unsigned int *sse) {         \
+    uint16_t fdata3[(H + 1) * W];                                            \
+    uint8_t temp2[H * W];                                                    \
+                                                                             \
+    aom_var_filter_block2d_bil_first_pass_ssse3(                             \
+        pre, fdata3, pre_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \
+    aom_var_filter_block2d_bil_second_pass_ssse3(                            \
+        fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]);            \
+                                                                             \
+    return aom_obmc_variance##W##x##H##_sse4_1(temp2, W, wsrc, mask, sse);   \
+  }
+
+// One definition per block size.
+OBMC_SUBPIX_VAR(128, 128)
+OBMC_SUBPIX_VAR(128, 64)
+OBMC_SUBPIX_VAR(64, 128)
+OBMC_SUBPIX_VAR(64, 64)
+OBMC_SUBPIX_VAR(64, 32)
+OBMC_SUBPIX_VAR(32, 64)
+OBMC_SUBPIX_VAR(32, 32)
+OBMC_SUBPIX_VAR(32, 16)
+OBMC_SUBPIX_VAR(16, 32)
+OBMC_SUBPIX_VAR(16, 16)
+OBMC_SUBPIX_VAR(16, 8)
+OBMC_SUBPIX_VAR(8, 16)
+OBMC_SUBPIX_VAR(8, 8)
+OBMC_SUBPIX_VAR(8, 4)
+OBMC_SUBPIX_VAR(4, 8)
+OBMC_SUBPIX_VAR(4, 4)
+OBMC_SUBPIX_VAR(4, 16)
+OBMC_SUBPIX_VAR(16, 4)
+OBMC_SUBPIX_VAR(8, 32)
+OBMC_SUBPIX_VAR(32, 8)
+OBMC_SUBPIX_VAR(16, 64)
+OBMC_SUBPIX_VAR(64, 16)
+
 ////////////////////////////////////////////////////////////////////////////////
 // High bit-depth
 ////////////////////////////////////////////////////////////////////////////////
--- a/third_party/aom/aom_dsp/x86/subtract_avx2.c
+++ b/third_party/aom/aom_dsp/x86/subtract_avx2.c
@ -0,0 +1,108 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#include <immintrin.h>
+
+#include "config/aom_dsp_rtcd.h"
+
+// Computes src - pred for 32 consecutive 8-bit pixels and stores the 32
+// 16-bit differences at diff_ptr. The loads are unaligned, but both stores
+// use _mm256_store_si256, so diff_ptr must be 32-byte aligned.
+static INLINE void subtract32_avx2(int16_t *diff_ptr, const uint8_t *src_ptr,
+                                   const uint8_t *pred_ptr) {
+  __m256i s = _mm256_lddqu_si256((__m256i *)(src_ptr));
+  __m256i p = _mm256_lddqu_si256((__m256i *)(pred_ptr));
+  // Zero-extend each 16-byte half to sixteen 16-bit lanes.
+  __m256i s_0 = _mm256_cvtepu8_epi16(_mm256_castsi256_si128(s));
+  __m256i s_1 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(s, 1));
+  __m256i p_0 = _mm256_cvtepu8_epi16(_mm256_castsi256_si128(p));
+  __m256i p_1 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(p, 1));
+  const __m256i d_0 = _mm256_sub_epi16(s_0, p_0);
+  const __m256i d_1 = _mm256_sub_epi16(s_1, p_1);
+  _mm256_store_si256((__m256i *)(diff_ptr), d_0);
+  _mm256_store_si256((__m256i *)(diff_ptr + 16), d_1);
+}
+
+// Writes src - pred for `rows` rows of 16 pixels as 16-bit differences.
+// Each row is written with an aligned 256-bit store, so every diff row must
+// be 32-byte aligned.
+static INLINE void aom_subtract_block_16xn_avx2(
+    int rows, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr,
+    ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride) {
+  for (int32_t row = 0; row < rows; ++row) {
+    // Zero-extend 16 source and 16 prediction bytes to 16-bit lanes.
+    const __m256i src_w =
+        _mm256_cvtepu8_epi16(_mm_lddqu_si128((__m128i *)(src_ptr)));
+    const __m256i pred_w =
+        _mm256_cvtepu8_epi16(_mm_lddqu_si128((__m128i *)(pred_ptr)));
+    _mm256_store_si256((__m256i *)(diff_ptr), _mm256_sub_epi16(src_w, pred_w));
+    src_ptr += src_stride;
+    pred_ptr += pred_stride;
+    diff_ptr += diff_stride;
+  }
+}
+
+// Writes src - pred for `rows` rows of 32 pixels, one subtract32_avx2() call
+// per row.
+static INLINE void aom_subtract_block_32xn_avx2(
+    int rows, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr,
+    ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride) {
+  for (int32_t row = 0; row < rows; ++row) {
+    subtract32_avx2(diff_ptr + row * diff_stride, src_ptr + row * src_stride,
+                    pred_ptr + row * pred_stride);
+  }
+}
+
+// Writes src - pred for `rows` rows of 64 pixels, as two 32-pixel chunks per
+// row.
+static INLINE void aom_subtract_block_64xn_avx2(
+    int rows, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr,
+    ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride) {
+  for (int32_t row = 0; row < rows; ++row) {
+    for (int col = 0; col < 64; col += 32) {
+      subtract32_avx2(diff_ptr + col, src_ptr + col, pred_ptr + col);
+    }
+    src_ptr += src_stride;
+    pred_ptr += pred_stride;
+    diff_ptr += diff_stride;
+  }
+}
+
+// Writes src - pred for `rows` rows of 128 pixels, as four 32-pixel chunks
+// per row.
+static INLINE void aom_subtract_block_128xn_avx2(
+    int rows, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr,
+    ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride) {
+  for (int32_t row = 0; row < rows; ++row) {
+    for (int col = 0; col < 128; col += 32) {
+      subtract32_avx2(diff_ptr + col, src_ptr + col, pred_ptr + col);
+    }
+    src_ptr += src_stride;
+    pred_ptr += pred_stride;
+    diff_ptr += diff_stride;
+  }
+}
+
+// Public entry point: dispatches on the block width. Widths 16, 32, 64 and
+// 128 use the AVX2 row kernels in this file; every other width is forwarded
+// unchanged to aom_subtract_block_sse2().
+void aom_subtract_block_avx2(int rows, int cols, int16_t *diff_ptr,
+                             ptrdiff_t diff_stride, const uint8_t *src_ptr,
+                             ptrdiff_t src_stride, const uint8_t *pred_ptr,
+                             ptrdiff_t pred_stride) {
+  if (cols == 16) {
+    aom_subtract_block_16xn_avx2(rows, diff_ptr, diff_stride, src_ptr,
+                                 src_stride, pred_ptr, pred_stride);
+  } else if (cols == 32) {
+    aom_subtract_block_32xn_avx2(rows, diff_ptr, diff_stride, src_ptr,
+                                 src_stride, pred_ptr, pred_stride);
+  } else if (cols == 64) {
+    aom_subtract_block_64xn_avx2(rows, diff_ptr, diff_stride, src_ptr,
+                                 src_stride, pred_ptr, pred_stride);
+  } else if (cols == 128) {
+    aom_subtract_block_128xn_avx2(rows, diff_ptr, diff_stride, src_ptr,
+                                  src_stride, pred_ptr, pred_stride);
+  } else {
+    aom_subtract_block_sse2(rows, cols, diff_ptr, diff_stride, src_ptr,
+                            src_stride, pred_ptr, pred_stride);
+  }
+}
--- a/third_party/aom/aom_dsp/x86/txfm_common_avx2.h
+++ b/third_party/aom/aom_dsp/x86/txfm_common_avx2.h
@ -0,0 +1,199 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_DSP_X86_TXFM_COMMON_AVX2_H_
+#define AOM_DSP_X86_TXFM_COMMON_AVX2_H_
+
+#include <emmintrin.h>
+#include "aom/aom_integer.h"
+#include "aom_dsp/x86/synonyms.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// Function-pointer type for kernels that read an array of __m256i vectors,
+// write an array of __m256i vectors and take a cos_bit argument (the name
+// suggests 1-D transform kernels).
+typedef void (*transform_1d_avx2)(const __m256i *input, __m256i *output,
+                                  int8_t cos_bit);
+
+// Broadcasts the 16-bit pair (a, b) to every 32-bit lane: a occupies the low
+// 16 bits of each lane and b the high 16 bits.
+static INLINE __m256i pair_set_w16_epi16(int16_t a, int16_t b) {
+  const uint32_t low_half = (uint16_t)(a);
+  const uint32_t high_half = ((uint32_t)(b)) << 16;
+  return _mm256_set1_epi32((int32_t)(low_half | high_half));
+}
+
+// Butterfly on sixteen 16-bit lanes, updating *in0 and *in1 in place.
+// in0/in1 are interleaved so that each _mm256_madd_epi16 yields
+// in0 * (low 16 bits of w) + in1 * (high 16 bits of w) per 32-bit lane;
+// w0 produces the new *in0 and w1 the new *in1. Each sum has _r added, is
+// shifted right arithmetically by cos_bit and is packed back to 16 bits with
+// signed saturation.
+static INLINE void btf_16_w16_avx2(const __m256i w0, const __m256i w1,
+                                   __m256i *in0, __m256i *in1, const __m256i _r,
+                                   const int32_t cos_bit) {
+  __m256i t0 = _mm256_unpacklo_epi16(*in0, *in1);
+  __m256i t1 = _mm256_unpackhi_epi16(*in0, *in1);
+  __m256i u0 = _mm256_madd_epi16(t0, w0);
+  __m256i u1 = _mm256_madd_epi16(t1, w0);
+  __m256i v0 = _mm256_madd_epi16(t0, w1);
+  __m256i v1 = _mm256_madd_epi16(t1, w1);
+
+  // Rounding offset supplied by the caller.
+  __m256i a0 = _mm256_add_epi32(u0, _r);
+  __m256i a1 = _mm256_add_epi32(u1, _r);
+  __m256i b0 = _mm256_add_epi32(v0, _r);
+  __m256i b1 = _mm256_add_epi32(v1, _r);
+
+  __m256i c0 = _mm256_srai_epi32(a0, cos_bit);
+  __m256i c1 = _mm256_srai_epi32(a1, cos_bit);
+  __m256i d0 = _mm256_srai_epi32(b0, cos_bit);
+  __m256i d1 = _mm256_srai_epi32(b1, cos_bit);
+
+  // Unpack and pack both work within 128-bit halves, so packing the lo/hi
+  // results restores the original element order.
+  *in0 = _mm256_packs_epi32(c0, c1);
+  *in1 = _mm256_packs_epi32(d0, d1);
+}
+
+// In-place 16-bit butterfly with signed saturation:
+// *in0 becomes in0 + in1 and *in1 becomes in0 - in1.
+static INLINE void btf_16_adds_subs_avx2(__m256i *in0, __m256i *in1) {
+  const __m256i sum = _mm256_adds_epi16(*in0, *in1);
+  const __m256i diff = _mm256_subs_epi16(*in0, *in1);
+  *in0 = sum;
+  *in1 = diff;
+}
+
+// In-place 32-bit butterfly (wrapping arithmetic):
+// *in0 becomes in0 + in1 and *in1 becomes in0 - in1.
+static INLINE void btf_32_add_sub_avx2(__m256i *in0, __m256i *in1) {
+  const __m256i sum = _mm256_add_epi32(*in0, *in1);
+  const __m256i diff = _mm256_sub_epi32(*in0, *in1);
+  *in0 = sum;
+  *in1 = diff;
+}
+
+// Out-of-place 16-bit butterfly with signed saturation:
+// *out0 = in0 + in1, *out1 = in0 - in1. The inputs are passed by value, so
+// the outputs may point at the caller's copies of them.
+static INLINE void btf_16_adds_subs_out_avx2(__m256i *out0, __m256i *out1,
+                                             __m256i in0, __m256i in1) {
+  *out0 = _mm256_adds_epi16(in0, in1);
+  *out1 = _mm256_subs_epi16(in0, in1);
+}
+
+// Out-of-place 32-bit butterfly (wrapping arithmetic):
+// *out0 = in0 + in1, *out1 = in0 - in1. The inputs are passed by value, so
+// the outputs may point at the caller's copies of them.
+static INLINE void btf_32_add_sub_out_avx2(__m256i *out0, __m256i *out1,
+                                           __m256i in0, __m256i in1) {
+  *out0 = _mm256_add_epi32(in0, in1);
+  *out1 = _mm256_sub_epi32(in0, in1);
+}
+
+// Loads sixteen 16-bit values from `a`. _mm256_load_si256 is the aligned
+// load, so `a` must be 32-byte aligned.
+static INLINE __m256i load_16bit_to_16bit_avx2(const int16_t *a) {
+  return _mm256_load_si256((const __m256i *)a);
+}
+
+// Fills out[0 .. out_size - 1] with rows of sixteen 16-bit values; row r is
+// read from in + r * stride.
+static INLINE void load_buffer_16bit_to_16bit_avx2(const int16_t *in,
+                                                   int stride, __m256i *out,
+                                                   int out_size) {
+  int row = 0;
+  while (row < out_size) {
+    const int16_t *const src_row = in + row * stride;
+    out[row] = load_16bit_to_16bit_avx2(src_row);
+    ++row;
+  }
+}
+
+// Same as the non-flipping loader but stores the rows in reverse order:
+// the row read from in + r * stride lands in out[out_size - 1 - r].
+static INLINE void load_buffer_16bit_to_16bit_flip_avx2(const int16_t *in,
+                                                        int stride,
+                                                        __m256i *out,
+                                                        int out_size) {
+  int row = 0;
+  while (row < out_size) {
+    const int16_t *const src_row = in + row * stride;
+    out[out_size - row - 1] = load_16bit_to_16bit_avx2(src_row);
+    ++row;
+  }
+}
+
+// Loads sixteen 32-bit values from `a` and narrows them to 16 bits with
+// signed saturation, preserving element order. Both halves use the unaligned
+// load intrinsic, so `a` has no alignment requirement.
+static INLINE __m256i load_32bit_to_16bit_w16_avx2(const int32_t *a) {
+  const __m256i a_low = _mm256_lddqu_si256((const __m256i *)a);
+  const __m256i a_high = _mm256_lddqu_si256((const __m256i *)(a + 8));
+  // packs works per 128-bit half, giving a[0..3], a[8..11], a[4..7],
+  // a[12..15]; the 0xD8 permute swaps the middle 64-bit quarters back.
+  const __m256i b = _mm256_packs_epi32(a_low, a_high);
+  return _mm256_permute4x64_epi64(b, 0xD8);
+}
+
+// Fills out[0 .. out_size - 1] with rows of sixteen values narrowed from
+// 32 to 16 bits; row r is read from in + r * stride.
+static INLINE void load_buffer_32bit_to_16bit_w16_avx2(const int32_t *in,
+                                                       int stride, __m256i *out,
+                                                       int out_size) {
+  int row = 0;
+  while (row < out_size) {
+    const int32_t *const src_row = in + row * stride;
+    out[row] = load_32bit_to_16bit_w16_avx2(src_row);
+    ++row;
+  }
+}
+
+// Transposes (per the function name) a 16x16 block of 16-bit values held in
+// in[0..15] into out[0..15]. It uses three rounds of unpacking (16-, 32- and
+// 64-bit) followed by 128-bit half permutes. Every intermediate lives in a
+// local array, so all of `in` is read before any of `out` is written.
+static INLINE void transpose_16bit_16x16_avx2(const __m256i *const in,
+                                              __m256i *const out) {
+  // Unpack 16 bit elements. Goes from:
+  // in[0]: 00 01 02 03  08 09 0a 0b  04 05 06 07  0c 0d 0e 0f
+  // in[1]: 10 11 12 13  18 19 1a 1b  14 15 16 17  1c 1d 1e 1f
+  // in[2]: 20 21 22 23  28 29 2a 2b  24 25 26 27  2c 2d 2e 2f
+  // in[3]: 30 31 32 33  38 39 3a 3b  34 35 36 37  3c 3d 3e 3f
+  // in[4]: 40 41 42 43  48 49 4a 4b  44 45 46 47  4c 4d 4e 4f
+  // in[5]: 50 51 52 53  58 59 5a 5b  54 55 56 57  5c 5d 5e 5f
+  // in[6]: 60 61 62 63  68 69 6a 6b  64 65 66 67  6c 6d 6e 6f
+  // in[7]: 70 71 72 73  78 79 7a 7b  74 75 76 77  7c 7d 7e 7f
+  // in[8]: 80 81 82 83  88 89 8a 8b  84 85 86 87  8c 8d 8e 8f
+  // to:
+  // a0:    00 10 01 11  02 12 03 13  04 14 05 15  06 16 07 17
+  // a1:    20 30 21 31  22 32 23 33  24 34 25 35  26 36 27 37
+  // a2:    40 50 41 51  42 52 43 53  44 54 45 55  46 56 47 57
+  // a3:    60 70 61 71  62 72 63 73  64 74 65 75  66 76 67 77
+  // ...
+  // Round 1: interleave 16-bit elements of adjacent row pairs; low-half
+  // results go to a[0..7], high-half results to a[8..15].
+  __m256i a[16];
+  for (int i = 0; i < 16; i += 2) {
+    a[i / 2 + 0] = _mm256_unpacklo_epi16(in[i], in[i + 1]);
+    a[i / 2 + 8] = _mm256_unpackhi_epi16(in[i], in[i + 1]);
+  }
+  // Round 2: same pattern on 32-bit elements.
+  __m256i b[16];
+  for (int i = 0; i < 16; i += 2) {
+    b[i / 2 + 0] = _mm256_unpacklo_epi32(a[i], a[i + 1]);
+    b[i / 2 + 8] = _mm256_unpackhi_epi32(a[i], a[i + 1]);
+  }
+  // Round 3: same pattern on 64-bit elements.
+  __m256i c[16];
+  for (int i = 0; i < 16; i += 2) {
+    c[i / 2 + 0] = _mm256_unpacklo_epi64(b[i], b[i + 1]);
+    c[i / 2 + 8] = _mm256_unpackhi_epi64(b[i], b[i + 1]);
+  }
+  // Selector 0x20 joins the low 128-bit halves of the two operands (rows
+  // 0..7 of out); 0x31 joins the high halves (rows 8..15 of out).
+  out[0 + 0] = _mm256_permute2x128_si256(c[0], c[1], 0x20);
+  out[1 + 0] = _mm256_permute2x128_si256(c[8], c[9], 0x20);
+  out[2 + 0] = _mm256_permute2x128_si256(c[4], c[5], 0x20);
+  out[3 + 0] = _mm256_permute2x128_si256(c[12], c[13], 0x20);
+
+  out[0 + 8] = _mm256_permute2x128_si256(c[0], c[1], 0x31);
+  out[1 + 8] = _mm256_permute2x128_si256(c[8], c[9], 0x31);
+  out[2 + 8] = _mm256_permute2x128_si256(c[4], c[5], 0x31);
+  out[3 + 8] = _mm256_permute2x128_si256(c[12], c[13], 0x31);
+
+  out[4 + 0] = _mm256_permute2x128_si256(c[0 + 2], c[1 + 2], 0x20);
+  out[5 + 0] = _mm256_permute2x128_si256(c[8 + 2], c[9 + 2], 0x20);
+  out[6 + 0] = _mm256_permute2x128_si256(c[4 + 2], c[5 + 2], 0x20);
+  out[7 + 0] = _mm256_permute2x128_si256(c[12 + 2], c[13 + 2], 0x20);
+
+  out[4 + 8] = _mm256_permute2x128_si256(c[0 + 2], c[1 + 2], 0x31);
+  out[5 + 8] = _mm256_permute2x128_si256(c[8 + 2], c[9 + 2], 0x31);
+  out[6 + 8] = _mm256_permute2x128_si256(c[4 + 2], c[5 + 2], 0x31);
+  out[7 + 8] = _mm256_permute2x128_si256(c[12 + 2], c[13 + 2], 0x31);
+}
+
+// Copies in[0 .. size - 1] to out in reverse order, walking `in` forwards.
+static INLINE void flip_buf_avx2(__m256i *in, __m256i *out, int size) {
+  for (int src = 0; src < size; ++src) {
+    const int dst = size - src - 1;
+    out[dst] = in[src];
+  }
+}
+
+// Shifts every 16-bit lane of in[0 .. size - 1] in place.
+//   bit > 0: left shift by bit.
+//   bit < 0: rounding arithmetic right shift by -bit; 1 << (-bit - 1) is
+//            added with signed saturation before shifting.
+//   bit == 0: the buffer is left untouched.
+static INLINE void round_shift_16bit_w16_avx2(__m256i *in, int size, int bit) {
+  if (bit == 0) return;
+
+  if (bit > 0) {
+    for (int i = 0; i < size; ++i) {
+      in[i] = _mm256_slli_epi16(in[i], bit);
+    }
+    return;
+  }
+
+  const int shift = -bit;
+  const __m256i round = _mm256_set1_epi16(1 << (shift - 1));
+  for (int i = 0; i < size; ++i) {
+    const __m256i biased = _mm256_adds_epi16(in[i], round);
+    in[i] = _mm256_srai_epi16(biased, shift);
+  }
+}
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif  // AOM_DSP_X86_TXFM_COMMON_AVX2_H_
--- a/third_party/aom/aom_dsp/x86/variance_avx2.c
+++ b/third_party/aom/aom_dsp/x86/variance_avx2.c
@ -324,6 +324,12 @@ static INLINE __m256i mm256_loadu2(const uint8_t *p0, const uint8_t *p1) {
  return _mm256_insertf128_si256(d, _mm_loadu_si128((const __m128i *)p0), 1);
 }

+// Builds a 256-bit vector from two unaligned 128-bit loads of 16-bit data:
+// p1 supplies the low half and p0 the high half.
+static INLINE __m256i mm256_loadu2_16(const uint16_t *p0, const uint16_t *p1) {
+  const __m128i low_half = _mm_loadu_si128((const __m128i *)p1);
+  const __m128i high_half = _mm_loadu_si128((const __m128i *)p0);
+  return _mm256_insertf128_si256(_mm256_castsi128_si256(low_half), high_half,
+                                 1);
+}
+
 static INLINE void comp_mask_pred_line_avx2(const __m256i s0, const __m256i s1,
                                            const __m256i a,
                                            uint8_t *comp_pred) {
@ -401,3 +407,110 @@ void aom_comp_mask_pred_avx2(uint8_t *comp_pred, const uint8_t *pred, int width,
    } while (i < height);
  }
 }
+
+// Blends sixteen 16-bit samples. With R = AOM_BLEND_A64_ROUND_BITS, each
+// output lane is
+//   (s0 * a + s1 * ((1 << R) - a) + ((1 << R) >> 1)) >> R
+// computed in 32 bits and packed back to 16 bits with signed saturation.
+//   s0, s1  the two sources, as 16-bit lanes.
+//   a       the per-sample weight applied to s0, as 16-bit lanes.
+static INLINE __m256i highbd_comp_mask_pred_line_avx2(const __m256i s0,
+                                                      const __m256i s1,
+                                                      const __m256i a) {
+  const __m256i alpha_max = _mm256_set1_epi16((1 << AOM_BLEND_A64_ROUND_BITS));
+  const __m256i round_const =
+      _mm256_set1_epi32((1 << AOM_BLEND_A64_ROUND_BITS) >> 1);
+  const __m256i a_inv = _mm256_sub_epi16(alpha_max, a);
+
+  // Interleave (s0, s1) with (a, a_inv) so one madd gives s0*a + s1*a_inv.
+  const __m256i s_lo = _mm256_unpacklo_epi16(s0, s1);
+  const __m256i a_lo = _mm256_unpacklo_epi16(a, a_inv);
+  const __m256i pred_lo = _mm256_madd_epi16(s_lo, a_lo);
+  const __m256i pred_l = _mm256_srai_epi32(
+      _mm256_add_epi32(pred_lo, round_const), AOM_BLEND_A64_ROUND_BITS);
+
+  const __m256i s_hi = _mm256_unpackhi_epi16(s0, s1);
+  const __m256i a_hi = _mm256_unpackhi_epi16(a, a_inv);
+  const __m256i pred_hi = _mm256_madd_epi16(s_hi, a_hi);
+  const __m256i pred_h = _mm256_srai_epi32(
+      _mm256_add_epi32(pred_hi, round_const), AOM_BLEND_A64_ROUND_BITS);
+
+  // Unpack and pack both work within 128-bit halves, so the lo/hi results
+  // come back in the original sample order.
+  const __m256i comp = _mm256_packs_epi32(pred_l, pred_h);
+
+  return comp;
+}
+
+// Blends two high bit-depth predictors with a per-pixel mask and writes the
+// result to comp_pred, whose rows are `width` samples apart.
+//   pred8, ref8   converted with CONVERT_TO_SHORTPTR() to 16-bit samples;
+//                 pred rows are `width` apart, ref rows are ref_stride apart.
+//   mask          one byte per pixel, rows mask_stride apart. It is the
+//                 weight applied to ref, or to pred when invert_mask is
+//                 non-zero; the other source gets the complementary weight.
+// Only widths 8, 16 and 32 are handled; any other width returns without
+// writing comp_pred. The width 8 path processes two rows per iteration.
+void aom_highbd_comp_mask_pred_avx2(uint16_t *comp_pred, const uint8_t *pred8,
+                                    int width, int height, const uint8_t *ref8,
+                                    int ref_stride, const uint8_t *mask,
+                                    int mask_stride, int invert_mask) {
+  int i = 0;
+  uint16_t *pred = CONVERT_TO_SHORTPTR(pred8);
+  uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
+  // src0 is the source weighted by the mask itself.
+  const uint16_t *src0 = invert_mask ? pred : ref;
+  const uint16_t *src1 = invert_mask ? ref : pred;
+  const int stride0 = invert_mask ? width : ref_stride;
+  const int stride1 = invert_mask ? ref_stride : width;
+  const __m256i zero = _mm256_setzero_si256();
+
+  if (width == 8) {
+    do {
+      // Row 0 goes to the low 128-bit half and row 1 to the high half.
+      const __m256i s0 = mm256_loadu2_16(src0 + stride0, src0);
+      const __m256i s1 = mm256_loadu2_16(src1 + stride1, src1);
+
+      // The second mask row is mask_stride bytes after the first, matching
+      // the two-row advance at the bottom of the loop.
+      const __m128i m_l = _mm_loadl_epi64((const __m128i *)mask);
+      const __m128i m_h =
+          _mm_loadl_epi64((const __m128i *)(mask + mask_stride));
+
+      __m256i m = _mm256_castsi128_si256(m_l);
+      m = _mm256_insertf128_si256(m, m_h, 1);
+      // Zero-extend the low 8 mask bytes of each half to 16 bits.
+      const __m256i m_16 = _mm256_unpacklo_epi8(m, zero);
+
+      const __m256i comp = highbd_comp_mask_pred_line_avx2(s0, s1, m_16);
+
+      _mm_storeu_si128((__m128i *)(comp_pred), _mm256_castsi256_si128(comp));
+
+      _mm_storeu_si128((__m128i *)(comp_pred + width),
+                       _mm256_extractf128_si256(comp, 1));
+
+      src0 += (stride0 << 1);
+      src1 += (stride1 << 1);
+      mask += (mask_stride << 1);
+      comp_pred += (width << 1);
+      i += 2;
+    } while (i < height);
+  } else if (width == 16) {
+    do {
+      const __m256i s0 = _mm256_loadu_si256((const __m256i *)(src0));
+      const __m256i s1 = _mm256_loadu_si256((const __m256i *)(src1));
+      const __m256i m_16 =
+          _mm256_cvtepu8_epi16(_mm_loadu_si128((const __m128i *)mask));
+
+      const __m256i comp = highbd_comp_mask_pred_line_avx2(s0, s1, m_16);
+
+      _mm256_storeu_si256((__m256i *)comp_pred, comp);
+
+      src0 += stride0;
+      src1 += stride1;
+      mask += mask_stride;
+      comp_pred += width;
+      i += 1;
+    } while (i < height);
+  } else if (width == 32) {
+    do {
+      // s0/s1 are samples 0..15 of each source, s2/s3 are samples 16..31.
+      const __m256i s0 = _mm256_loadu_si256((const __m256i *)src0);
+      const __m256i s2 = _mm256_loadu_si256((const __m256i *)(src0 + 16));
+      const __m256i s1 = _mm256_loadu_si256((const __m256i *)src1);
+      const __m256i s3 = _mm256_loadu_si256((const __m256i *)(src1 + 16));
+
+      const __m256i m01_16 =
+          _mm256_cvtepu8_epi16(_mm_loadu_si128((const __m128i *)mask));
+      const __m256i m23_16 =
+          _mm256_cvtepu8_epi16(_mm_loadu_si128((const __m128i *)(mask + 16)));
+
+      const __m256i comp = highbd_comp_mask_pred_line_avx2(s0, s1, m01_16);
+      const __m256i comp1 = highbd_comp_mask_pred_line_avx2(s2, s3, m23_16);
+
+      _mm256_storeu_si256((__m256i *)comp_pred, comp);
+      _mm256_storeu_si256((__m256i *)(comp_pred + 16), comp1);
+
+      src0 += stride0;
+      src1 += stride1;
+      mask += mask_stride;
+      comp_pred += width;
+      i += 1;
+    } while (i < height);
+  }
+}
--- a/third_party/aom/aom_dsp/x86/variance_impl_ssse3.c
+++ b/third_party/aom/aom_dsp/x86/variance_impl_ssse3.c
@ -0,0 +1,129 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <tmmintrin.h>
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom_dsp/x86/synonyms.h"
+
+void aom_var_filter_block2d_bil_first_pass_ssse3(
+    const uint8_t *a, uint16_t *b, unsigned int src_pixels_per_line,
+    unsigned int pixel_step, unsigned int output_height,
+    unsigned int output_width, const uint8_t *filter) {
+  // Note: filter[0], filter[1] could be {128, 0}, and 128 does not fit in the
+  // signed 8-bit multiplier that _mm_maddubs_epi16 expects. Both taps are
+  // halved ({128, 0} -> {64, 0}) and FILTER_BITS - 1 is used for rounding.
+  const int16_t round = (1 << (FILTER_BITS - 1)) >> 1;
+  const __m128i r = _mm_set1_epi16(round);
+  const uint8_t f0 = filter[0] >> 1;
+  const uint8_t f1 = filter[1] >> 1;
+  const __m128i filters = _mm_setr_epi8(f0, f1, f0, f1, f0, f1, f0, f1, f0, f1,
+                                        f0, f1, f0, f1, f0, f1);
+  unsigned int i, j;
+  (void)pixel_step;
+
+  if (output_width >= 8) {
+    for (i = 0; i < output_height; ++i) {
+      for (j = 0; j < output_width; j += 8) {
+        // load source; the neighbor is read at a + 1 (pixel_step is unused)
+        __m128i source_low = xx_loadl_64(a);
+        __m128i source_hi = xx_loadl_64(a + 1);
+
+        // unpack to:
+        // { a[0], a[1], a[1], a[2], a[2], a[3], a[3], a[4],
+        //   a[4], a[5], a[5], a[6], a[6], a[7], a[7], a[8] }
+        __m128i source = _mm_unpacklo_epi8(source_low, source_hi);
+
+        // res[i] = a[i] * f0 + a[i + 1] * f1, using the halved taps f0/f1
+        __m128i res = _mm_maddubs_epi16(source, filters);
+
+        // round, shifting by FILTER_BITS - 1 since the taps were halved
+        res = _mm_srai_epi16(_mm_add_epi16(res, r), FILTER_BITS - 1);
+
+        xx_storeu_128(b, res);
+
+        a += 8;
+        b += 8;
+      }
+
+      a += src_pixels_per_line - output_width;
+    }
+  } else {
+    const __m128i shuffle_mask =
+        _mm_setr_epi8(0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8);
+    for (i = 0; i < output_height; ++i) {
+      // load source, only first 5 values are meaningful:
+      // { a[0], a[1], a[2], a[3], a[4], xxxx }
+      __m128i source = xx_loadl_64(a);
+
+      // shuffle, up to the first 8 are useful
+      // { a[0], a[1], a[1], a[2], a[2], a[3], a[3], a[4],
+      //   a[4], a[5], a[5], a[6], a[6], a[7], a[7], a[8] }
+      __m128i source_shuffle = _mm_shuffle_epi8(source, shuffle_mask);
+
+      __m128i res = _mm_maddubs_epi16(source_shuffle, filters);
+      res = _mm_srai_epi16(_mm_add_epi16(res, r), FILTER_BITS - 1);
+
+      xx_storel_64(b, res);
+
+      a += src_pixels_per_line;
+      b += output_width;
+    }
+  }
+}
+
+void aom_var_filter_block2d_bil_second_pass_ssse3(
+    const uint16_t *a, uint8_t *b, unsigned int src_pixels_per_line,
+    unsigned int pixel_step, unsigned int output_height,
+    unsigned int output_width, const uint8_t *filter) {
+  const int16_t round = (1 << FILTER_BITS) >> 1;
+  const __m128i r = _mm_set1_epi32(round);
+  const __m128i filters =
+      _mm_setr_epi16(filter[0], filter[1], filter[0], filter[1], filter[0],
+                     filter[1], filter[0], filter[1]);
+  const __m128i shuffle_mask =
+      _mm_setr_epi8(0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15);
+  const __m128i mask =
+      _mm_setr_epi8(0, 4, 8, 12, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1);
+  unsigned int i, j;
+
+  for (i = 0; i < output_height; ++i) {
+    for (j = 0; j < output_width; j += 4) {
+      // load source as (w = pixel_step):
+      // { a[0], a[1], a[2], a[3], a[w], a[w+1], a[w+2], a[w+3] }
+      __m128i source1 = xx_loadl_64(a);
+      __m128i source2 = xx_loadl_64(a + pixel_step);
+      __m128i source = _mm_unpacklo_epi64(source1, source2);
+
+      // shuffle source to:
+      // { a[0], a[w], a[1], a[w+1], a[2], a[w+2], a[3], a[w+3] }
+      __m128i source_shuffle = _mm_shuffle_epi8(source, shuffle_mask);
+
+      // b[i] = a[i] * filter[0] + a[w + i] * filter[1]
+      __m128i res = _mm_madd_epi16(source_shuffle, filters);
+
+      // round and shift right by FILTER_BITS
+      res = _mm_srai_epi32(_mm_add_epi32(res, r), FILTER_BITS);
+
+      // shuffle to pack the low 8 bits of each 32-bit result into bytes 0..3
+      res = _mm_shuffle_epi8(res, mask);
+
+      xx_storel_32(b, res);
+
+      a += 4;
+      b += 4;
+    }
+
+    a += src_pixels_per_line - output_width;
+  }
+}
--- a/third_party/aom/aom_dsp/x86/variance_sse2.c
+++ b/third_party/aom/aom_dsp/x86/variance_sse2.c
@ -569,7 +569,7 @@ void aom_upsampled_pred_sse2(MACROBLOCKD *xd, const struct AV1Common *const cm,
    }
  }

-  const InterpFilterParams filter =
+  const InterpFilterParams *filter =
      av1_get_interp_filter_params_with_block_size(EIGHTTAP_REGULAR, 8);

  if (!subpel_x_q3 && !subpel_y_q3) {
@ -633,12 +633,12 @@ void aom_upsampled_pred_sse2(MACROBLOCKD *xd, const struct AV1Common *const cm,
    const int16_t *const kernel_y =
        av1_get_interp_filter_subpel_kernel(filter, subpel_y_q3 << 1);
    const int intermediate_height =
-        (((height - 1) * 8 + subpel_y_q3) >> 3) + filter.taps;
+        (((height - 1) * 8 + subpel_y_q3) >> 3) + filter->taps;
    assert(intermediate_height <= (MAX_SB_SIZE * 2 + 16) + 16);
-    aom_convolve8_horiz(ref - ref_stride * ((filter.taps >> 1) - 1), ref_stride,
-                        temp, MAX_SB_SIZE, kernel_x, 16, NULL, -1, width,
-                        intermediate_height);
-    aom_convolve8_vert(temp + MAX_SB_SIZE * ((filter.taps >> 1) - 1),
+    aom_convolve8_horiz(ref - ref_stride * ((filter->taps >> 1) - 1),
+                        ref_stride, temp, MAX_SB_SIZE, kernel_x, 16, NULL, -1,
+                        width, intermediate_height);
+    aom_convolve8_vert(temp + MAX_SB_SIZE * ((filter->taps >> 1) - 1),
                       MAX_SB_SIZE, comp_pred, width, NULL, -1, kernel_y, 16,
                       width, height);
  }
--- a/third_party/aom/aom_ports/aom_once.h
+++ b/third_party/aom/aom_ports/aom_once.h
@ -17,7 +17,7 @@
 /* Implement a function wrapper to guarantee initialization
 * thread-safety for library singletons.
 *
- * NOTE: These functions use static locks, and can only be
+ * NOTE: This function uses static locks, and can only be
 * used with one common argument per compilation unit. So
 *
 * file1.c:
@ -25,8 +25,8 @@
 *   ...
 *   aom_once(foo);
 *
- *   file2.c:
- *     aom_once(bar);
+ * file2.c:
+ *   aom_once(bar);
 *
 * will ensure foo() and bar() are each called only once, but in
 *
@ -46,19 +46,19 @@
 * local initializers are not thread-safe in MSVC prior to Visual
 * Studio 2015.
 *
- * As a static, once_state will be zero-initialized as program start.
+ * As a static, aom_once_state will be zero-initialized at program start.
 */
-static LONG once_state;
-static void once(void (*func)(void)) {
-  /* Try to advance once_state from its initial value of 0 to 1.
+static LONG aom_once_state;
+static void aom_once(void (*func)(void)) {
+  /* Try to advance aom_once_state from its initial value of 0 to 1.
   * Only one thread can succeed in doing so.
   */
-  if (InterlockedCompareExchange(&once_state, 1, 0) == 0) {
-    /* We're the winning thread, having set once_state to 1.
+  if (InterlockedCompareExchange(&aom_once_state, 1, 0) == 0) {
+    /* We're the winning thread, having set aom_once_state to 1.
     * Call our function. */
    func();
-    /* Now advance once_state to 2, unblocking any other threads. */
-    InterlockedIncrement(&once_state);
+    /* Now advance aom_once_state to 2, unblocking any other threads. */
+    InterlockedIncrement(&aom_once_state);
    return;
  }

@ -66,10 +66,10 @@ static void once(void (*func)(void)) {
   * the state variable so we don't return before func()
   * has finished executing elsewhere.
   *
-   * Try to advance once_state from 2 to 2, which is only possible
+   * Try to advance aom_once_state from 2 to 2, which is only possible
   * after the winning thead advances it from 1 to 2.
   */
-  while (InterlockedCompareExchange(&once_state, 2, 2) != 2) {
+  while (InterlockedCompareExchange(&aom_once_state, 2, 2) != 2) {
    /* State isn't yet 2. Try again.
     *
     * We are used for singleton initialization functions,
@ -83,8 +83,8 @@ static void once(void (*func)(void)) {
    Sleep(0);
  }

-  /* We've seen once_state advance to 2, so we know func()
-   * has been called. And we've left once_state as we found it,
+  /* We've seen aom_once_state advance to 2, so we know func()
+   * has been called. And we've left aom_once_state as we found it,
   * so other threads will have the same experience.
   *
   * It's safe to return now.
@ -95,7 +95,7 @@ static void once(void (*func)(void)) {
 #elif CONFIG_MULTITHREAD && defined(__OS2__)
 #define INCL_DOS
 #include <os2.h>
-static void once(void (*func)(void)) {
+static void aom_once(void (*func)(void)) {
  static int done;

  /* If the initialization is complete, return early. */
@ -117,18 +117,15 @@ static void once(void (*func)(void)) {

 #elif CONFIG_MULTITHREAD && HAVE_PTHREAD_H
 #include <pthread.h>
-static void once(void (*func)(void)) {
+static void aom_once(void (*func)(void)) {
  static pthread_once_t lock = PTHREAD_ONCE_INIT;
  pthread_once(&lock, func);
 }

 #else
-/* No-op version that performs no synchronization. *_rtcd() is idempotent,
- * so as long as your platform provides atomic loads/stores of pointers
- * no synchronization is strictly necessary.
- */
+/* Default version that performs no synchronization. */

-static void once(void (*func)(void)) {
+static void aom_once(void (*func)(void)) {
  static int done;

  if (!done) {
--- a/third_party/aom/aom_scale/aom_scale_rtcd.c
+++ b/third_party/aom/aom_scale/aom_scale_rtcd.c
@ -15,4 +15,4 @@

 #include "aom_ports/aom_once.h"

-void aom_scale_rtcd() { once(setup_rtcd_internal); }
+void aom_scale_rtcd() { aom_once(setup_rtcd_internal); }
--- a/third_party/aom/aom_scale/generic/yv12config.c
+++ b/third_party/aom/aom_scale/generic/yv12config.c
@ -51,6 +51,10 @@ int aom_realloc_frame_buffer(YV12_BUFFER_CONFIG *ybf, int width, int height,
                             aom_codec_frame_buffer_t *fb,
                             aom_get_frame_buffer_cb_fn_t cb, void *cb_priv) {
  if (ybf) {
+#if CONFIG_SIZE_LIMIT
+    if (width > DECODE_WIDTH_LIMIT || height > DECODE_HEIGHT_LIMIT) return -1;
+#endif
+
    const int aom_byte_align = (byte_alignment == 0) ? 1 : byte_alignment;
    const int aligned_width = (width + 7) & ~7;
    const int aligned_height = (height + 7) & ~7;
@ -154,7 +158,7 @@ int aom_realloc_frame_buffer(YV12_BUFFER_CONFIG *ybf, int width, int height,
                                       (uv_border_h * uv_stride) + uv_border_w,
                                   aom_byte_align);

-    ybf->use_external_refernce_buffers = 0;
+    ybf->use_external_reference_buffers = 0;

    if (use_highbitdepth) {
      if (ybf->y_buffer_8bit) aom_free(ybf->y_buffer_8bit);
--- a/third_party/aom/aom_scale/yv12config.h
+++ b/third_party/aom/aom_scale/yv12config.h
@ -81,7 +81,7 @@ typedef struct yv12_buffer_config {

  // Indicate whether y_buffer, u_buffer, and v_buffer points to the internally
  // allocated memory or external buffers.
-  int use_external_refernce_buffers;
+  int use_external_reference_buffers;
  // This is needed to store y_buffer, u_buffer, and v_buffer when set reference
  // uses an external refernece, and restore those buffer pointers after the
  // external reference frame is no longer used.
--- a/third_party/aom/aom_util/aom_thread.h
+++ b/third_party/aom/aom_util/aom_thread.h
@ -369,7 +369,8 @@ typedef enum {
 } AVxWorkerStatus;

 // Function to be called by the worker thread. Takes two opaque pointers as
-// arguments (data1 and data2), and should return false in case of error.
+// arguments (data1 and data2). Should return true on success and return false
+// in case of error.
 typedef int (*AVxWorkerHook)(void *, void *);

 // Platform-dependent implementation details for the worker.
@ -382,7 +383,7 @@ typedef struct {
  AVxWorkerHook hook;  // hook to call
  void *data1;         // first argument passed to 'hook'
  void *data2;         // second argument passed to 'hook'
-  int had_error;       // return value of the last call to 'hook'
+  int had_error;       // true if a call to 'hook' returned false
 } AVxWorker;

 // The interface for all thread-worker related functions. All these functions
--- a/third_party/aom/apps/aomdec.c
+++ b/third_party/aom/apps/aomdec.c
@ -83,6 +83,8 @@ static const arg_def_t outputfile =
    ARG_DEF("o", "output", 1, "Output file name pattern (see below)");
 static const arg_def_t threadsarg =
    ARG_DEF("t", "threads", 1, "Max threads to use");
+static const arg_def_t rowmtarg =
+    ARG_DEF(NULL, "row-mt", 1, "Enable row based multi-threading");
 static const arg_def_t verbosearg =
    ARG_DEF("v", "verbose", 0, "Show version string");
 static const arg_def_t scalearg =
@ -114,12 +116,12 @@ static const arg_def_t outallarg = ARG_DEF(
    NULL, "all-layers", 0, "Output all decoded frames of a scalable bitstream");

 static const arg_def_t *all_args[] = {
-  &help,           &codecarg,   &use_yv12,    &use_i420,      &flipuvarg,
-  &rawvideo,       &noblitarg,  &progressarg, &limitarg,      &skiparg,
-  &postprocarg,    &summaryarg, &outputfile,  &threadsarg,    &verbosearg,
-  &scalearg,       &fb_arg,     &md5arg,      &framestatsarg, &continuearg,
-  &outbitdeptharg, &tilem,      &tiler,       &tilec,         &isannexb,
-  &oppointarg,     &outallarg,  NULL
+  &help,        &codecarg,       &use_yv12,    &use_i420,   &flipuvarg,
+  &rawvideo,    &noblitarg,      &progressarg, &limitarg,   &skiparg,
+  &postprocarg, &summaryarg,     &outputfile,  &threadsarg, &rowmtarg,
+  &verbosearg,  &scalearg,       &fb_arg,      &md5arg,     &framestatsarg,
+  &continuearg, &outbitdeptharg, &tilem,       &tiler,      &tilec,
+  &isannexb,    &oppointarg,     &outallarg,   NULL
 };

 #if CONFIG_LIBYUV
@ -512,6 +514,7 @@ static int main_loop(int argc, const char **argv_) {
  int do_scale = 0;
  int operating_point = 0;
  int output_all_layers = 0;
+  unsigned int row_mt = 0;
  aom_image_t *scaled_img = NULL;
  aom_image_t *img_shifted = NULL;
  int frame_avail, got_data, flush_decoder = 0;
@ -601,6 +604,15 @@ static int main_loop(int argc, const char **argv_) {
      summary = 1;
    } else if (arg_match(&arg, &threadsarg, argi)) {
      cfg.threads = arg_parse_uint(&arg);
+#if !CONFIG_MULTITHREAD
+      if (cfg.threads > 1) {
+        die("Error: --threads=%d is not supported when CONFIG_MULTITHREAD = "
+            "0.\n",
+            cfg.threads);
+      }
+#endif
+    } else if (arg_match(&arg, &rowmtarg, argi)) {
+      row_mt = arg_parse_uint(&arg);
    } else if (arg_match(&arg, &verbosearg, argi)) {
      quiet = 0;
    } else if (arg_match(&arg, &scalearg, argi)) {
@ -763,6 +775,11 @@ static int main_loop(int argc, const char **argv_) {
            aom_codec_error(&decoder));
    goto fail;
  }
+
+  if (aom_codec_control(&decoder, AV1D_SET_ROW_MT, row_mt)) {
+    fprintf(stderr, "Failed to set row_mt: %s\n", aom_codec_error(&decoder));
+    goto fail;
+  }
 #endif

  if (arg_skip) fprintf(stderr, "Skipping first %d frames.\n", arg_skip);
@ -910,9 +927,8 @@ static int main_loop(int argc, const char **argv_) {
        // Shift up or down if necessary
        if (output_bit_depth != 0) {
          const aom_img_fmt_t shifted_fmt =
-              output_bit_depth == 8
-                  ? img->fmt ^ (img->fmt & AOM_IMG_FMT_HIGHBITDEPTH)
-                  : img->fmt | AOM_IMG_FMT_HIGHBITDEPTH;
+              output_bit_depth == 8 ? img->fmt & ~AOM_IMG_FMT_HIGHBITDEPTH
+                                    : img->fmt | AOM_IMG_FMT_HIGHBITDEPTH;

          if (shifted_fmt != img->fmt || output_bit_depth != img->bit_depth) {
            if (img_shifted &&
--- a/third_party/aom/apps/aomenc.c
+++ b/third_party/aom/apps/aomenc.c
@ -475,6 +475,13 @@ static const arg_def_t film_grain_test =
 static const arg_def_t film_grain_table =
    ARG_DEF(NULL, "film-grain-table", 1,
            "Path to file containing film grain parameters");
+#if CONFIG_DENOISE
+static const arg_def_t denoise_noise_level =
+    ARG_DEF(NULL, "denoise-noise-level", 1,
+            "Amount of noise (from 0 = don't denoise, to 50)");
+static const arg_def_t denoise_block_size =
+    ARG_DEF(NULL, "denoise-block-size", 1, "Denoise block size (default = 32)");
+#endif
 static const arg_def_t enable_ref_frame_mvs =
    ARG_DEF(NULL, "enable-ref-frame-mvs", 1,
            "Enable temporal mv prediction (default is 1)");
@ -656,6 +663,10 @@ static const arg_def_t *av1_args[] = { &cpu_used_av1,
                                       &timing_info,
                                       &film_grain_test,
                                       &film_grain_table,
+#if CONFIG_DENOISE
+                                       &denoise_noise_level,
+                                       &denoise_block_size,
+#endif
                                       &enable_ref_frame_mvs,
                                       &bitdeptharg,
                                       &inbitdeptharg,
@ -708,6 +719,10 @@ static const int av1_arg_ctrl_map[] = { AOME_SET_CPUUSED,
                                        AV1E_SET_TIMING_INFO_TYPE,
                                        AV1E_SET_FILM_GRAIN_TEST_VECTOR,
                                        AV1E_SET_FILM_GRAIN_TABLE,
+#if CONFIG_DENOISE
+                                        AV1E_SET_DENOISE_NOISE_LEVEL,
+                                        AV1E_SET_DENOISE_BLOCK_SIZE,
+#endif
                                        AV1E_SET_ENABLE_REF_FRAME_MVS,
                                        AV1E_SET_ENABLE_DF,
                                        AV1E_SET_ENABLE_ORDER_HINT,
--- a/third_party/aom/av1/av1.cmake
+++ b/third_party/aom/av1/av1.cmake
@ -45,7 +45,6 @@ list(APPEND AOM_AV1_COMMON_SOURCES
            "${AOM_ROOT}/av1/common/entropymv.c"
            "${AOM_ROOT}/av1/common/entropymv.h"
            "${AOM_ROOT}/av1/common/enums.h"
-            "${AOM_ROOT}/av1/common/filter.c"
            "${AOM_ROOT}/av1/common/filter.h"
            "${AOM_ROOT}/av1/common/frame_buffers.c"
            "${AOM_ROOT}/av1/common/frame_buffers.h"
@ -274,7 +273,10 @@ list(APPEND AOM_AV1_ENCODER_INTRIN_SSE4_1
 list(APPEND AOM_AV1_ENCODER_INTRIN_AVX2
            "${AOM_ROOT}/av1/encoder/x86/av1_quantize_avx2.c"
            "${AOM_ROOT}/av1/encoder/x86/av1_highbd_quantize_avx2.c"
-            "${AOM_ROOT}/av1/encoder/x86/error_intrin_avx2.c")
+            "${AOM_ROOT}/av1/encoder/x86/error_intrin_avx2.c"
+            "${AOM_ROOT}/av1/encoder/x86/av1_fwd_txfm_avx2.h"
+            "${AOM_ROOT}/av1/encoder/x86/av1_fwd_txfm2d_avx2.c"
+            "${AOM_ROOT}/av1/encoder/x86/wedge_utils_avx2.c")

 list(APPEND AOM_AV1_ENCODER_INTRIN_NEON
            "${AOM_ROOT}/av1/encoder/arm/neon/quantize_neon.c")
@ -296,7 +298,9 @@ list(APPEND AOM_AV1_COMMON_INTRIN_NEON
            "${AOM_ROOT}/av1/common/arm/blend_a64_vmask_neon.c"
            "${AOM_ROOT}/av1/common/arm/reconinter_neon.c"
            "${AOM_ROOT}/av1/common/arm/wiener_convolve_neon.c"
-            "${AOM_ROOT}/av1/common/arm/intrapred_neon.c"
+            "${AOM_ROOT}/av1/common/arm/selfguided_neon.c"
+            "${AOM_ROOT}/av1/common/arm/av1_inv_txfm_neon.c"
+            "${AOM_ROOT}/av1/common/arm/av1_inv_txfm_neon.h"
            "${AOM_ROOT}/av1/common/cdef_block_neon.c")

 list(APPEND AOM_AV1_ENCODER_INTRIN_SSE4_2
--- a/third_party/aom/av1/av1_cx_iface.c
+++ b/third_party/aom/av1/av1_cx_iface.c
@ -94,6 +94,10 @@ struct av1_extracfg {
  int enable_warped_motion;  // sequence level
  int allow_warped_motion;   // frame level
  int enable_superres;
+#if CONFIG_DENOISE
+  float noise_level;
+  int noise_block_size;
+#endif
 };

 static struct av1_extracfg default_extra_cfg = {
@ -160,6 +164,10 @@ static struct av1_extracfg default_extra_cfg = {
  1,                            // enable_warped_motion at sequence level
  1,                            // allow_warped_motion at frame level
  1,                            // superres
+#if CONFIG_DENOISE
+  0,   // noise_level
+  32,  // noise_block_size
+#endif
 };

 struct aom_codec_alg_priv {
@ -464,7 +472,7 @@ static aom_codec_err_t set_encoder_config(
    oxcf->buffer_model.num_units_in_decoding_tick = cfg->g_timebase.num;
    oxcf->timing_info.equal_picture_interval = 0;
    oxcf->decoder_model_info_present_flag = 1;
-    oxcf->buffer_removal_delay_present = 1;
+    oxcf->buffer_removal_time_present = 1;
    oxcf->display_model_info_present_flag = 1;
  }
  if (oxcf->init_framerate > 180) {
@ -612,6 +620,10 @@ static aom_codec_err_t set_encoder_config(
    oxcf->film_grain_test_vector = extra_cfg->film_grain_test_vector;
    oxcf->film_grain_table_filename = extra_cfg->film_grain_table_filename;
  }
+#if CONFIG_DENOISE
+  oxcf->noise_level = extra_cfg->noise_level;
+  oxcf->noise_block_size = extra_cfg->noise_block_size;
+#endif
  oxcf->large_scale_tile = cfg->large_scale_tile;
  oxcf->single_tile_decoding =
      (oxcf->large_scale_tile) ? extra_cfg->single_tile_decoding : 0;
@ -710,7 +722,7 @@ static aom_codec_err_t encoder_set_config(aom_codec_alg_priv_t *ctx,
    ctx->cfg = *cfg;
    set_encoder_config(&ctx->oxcf, &ctx->cfg, &ctx->extra_cfg);
    // On profile change, request a key frame
-    force_key |= ctx->cpi->common.profile != ctx->oxcf.profile;
+    force_key |= ctx->cpi->common.seq_params.profile != ctx->oxcf.profile;
    av1_change_config(ctx->cpi, &ctx->oxcf);
  }

@ -1055,6 +1067,23 @@ static aom_codec_err_t ctrl_set_film_grain_table(aom_codec_alg_priv_t *ctx,
  return update_extra_cfg(ctx, &extra_cfg);
 }

+#if CONFIG_DENOISE
+static aom_codec_err_t ctrl_set_denoise_noise_level(aom_codec_alg_priv_t *ctx,
+                                                    va_list args) {
+  struct av1_extracfg extra_cfg = ctx->extra_cfg;
+  extra_cfg.noise_level =
+      ((float)CAST(AV1E_SET_DENOISE_NOISE_LEVEL, args)) / 10.0f;
+  return update_extra_cfg(ctx, &extra_cfg);
+}
+
+static aom_codec_err_t ctrl_set_denoise_block_size(aom_codec_alg_priv_t *ctx,
+                                                   va_list args) {
+  struct av1_extracfg extra_cfg = ctx->extra_cfg;
+  extra_cfg.noise_block_size = CAST(AV1E_SET_DENOISE_BLOCK_SIZE, args);
+  return update_extra_cfg(ctx, &extra_cfg);
+}
+#endif
+
 static aom_codec_err_t ctrl_set_deltaq_mode(aom_codec_alg_priv_t *ctx,
                                            va_list args) {
  struct av1_extracfg extra_cfg = ctx->extra_cfg;
@ -1119,7 +1148,7 @@ static aom_codec_err_t encoder_init(aom_codec_ctx_t *ctx,
    }

    priv->extra_cfg = default_extra_cfg;
-    once(av1_initialize_enc);
+    aom_once(av1_initialize_enc);

    res = validate_config(priv, &priv->cfg, &priv->extra_cfg);

@ -1200,6 +1229,9 @@ static aom_codec_err_t encoder_encode(aom_codec_alg_priv_t *ctx,

  volatile aom_enc_frame_flags_t flags = enc_flags;

+  // The jmp_buf is valid only for the duration of the function that calls
+  // setjmp(). Therefore, this function must reset the 'setjmp' field to 0
+  // before it returns.
  if (setjmp(cpi->common.error.jmp)) {
    cpi->common.error.setjmp = 0;
    res = update_error_state(ctx, &cpi->common.error);
@ -1259,7 +1291,6 @@ static aom_codec_err_t encoder_encode(aom_codec_alg_priv_t *ctx,
      if (cx_data_sz < ctx->cx_data_sz / 2) {
        aom_internal_error(&cpi->common.error, AOM_CODEC_ERROR,
                           "Compressed data buffer too small");
-        return AOM_CODEC_ERROR;
      }
    }

@ -1275,8 +1306,8 @@ static aom_codec_err_t encoder_encode(aom_codec_alg_priv_t *ctx,
                                         !img, timebase)) {
      if (cpi->common.seq_params.frame_id_numbers_present_flag) {
        if (cpi->common.invalid_delta_frame_id_minus_1) {
-          ctx->base.err_detail = "Invalid delta_frame_id_minus_1";
-          return AOM_CODEC_ERROR;
+          aom_internal_error(&cpi->common.error, AOM_CODEC_ERROR,
+                             "Invalid delta_frame_id_minus_1");
        }
      }
      cpi->seq_params_locked = 1;
@ -1305,7 +1336,7 @@ static aom_codec_err_t encoder_encode(aom_codec_alg_priv_t *ctx,
          // OBUs are preceded/succeeded by an unsigned leb128 coded integer.
          if (write_uleb_obu_size(obu_header_size, obu_payload_size,
                                  ctx->pending_cx_data) != AOM_CODEC_OK) {
-            return AOM_CODEC_ERROR;
+            aom_internal_error(&cpi->common.error, AOM_CODEC_ERROR, NULL);
          }

          frame_size += obu_header_size + obu_payload_size + length_field_size;
@ -1315,7 +1346,7 @@ static aom_codec_err_t encoder_encode(aom_codec_alg_priv_t *ctx,
          size_t curr_frame_size = frame_size;
          if (av1_convert_sect5obus_to_annexb(cx_data, &curr_frame_size) !=
              AOM_CODEC_OK) {
-            return AOM_CODEC_ERROR;
+            aom_internal_error(&cpi->common.error, AOM_CODEC_ERROR, NULL);
          }
          frame_size = curr_frame_size;

@ -1327,7 +1358,7 @@ static aom_codec_err_t encoder_encode(aom_codec_alg_priv_t *ctx,
          }
          if (write_uleb_obu_size(0, (uint32_t)frame_size, cx_data) !=
              AOM_CODEC_OK) {
-            return AOM_CODEC_ERROR;
+            aom_internal_error(&cpi->common.error, AOM_CODEC_ERROR, NULL);
          }
          frame_size += length_field_size;
        }
@ -1358,7 +1389,7 @@ static aom_codec_err_t encoder_encode(aom_codec_alg_priv_t *ctx,
        }
        if (write_uleb_obu_size(0, (uint32_t)tu_size, ctx->pending_cx_data) !=
            AOM_CODEC_OK) {
-          return AOM_CODEC_ERROR;
+          aom_internal_error(&cpi->common.error, AOM_CODEC_ERROR, NULL);
        }
        ctx->pending_cx_data_sz += length_field_size;
      }
@ -1710,6 +1741,10 @@ static aom_codec_ctrl_fn_map_t encoder_ctrl_maps[] = {
  { AV1E_SET_SINGLE_TILE_DECODING, ctrl_set_single_tile_decoding },
  { AV1E_SET_FILM_GRAIN_TEST_VECTOR, ctrl_set_film_grain_test_vector },
  { AV1E_SET_FILM_GRAIN_TABLE, ctrl_set_film_grain_table },
+#if CONFIG_DENOISE
+  { AV1E_SET_DENOISE_NOISE_LEVEL, ctrl_set_denoise_noise_level },
+  { AV1E_SET_DENOISE_BLOCK_SIZE, ctrl_set_denoise_block_size },
+#endif  // CONFIG_DENOISE
  { AV1E_ENABLE_MOTION_VECTOR_UNIT_TEST, ctrl_enable_motion_vector_unit_test },

  // Getters
@ -1728,7 +1763,7 @@ static aom_codec_enc_cfg_map_t encoder_usage_cfg_map[] = {
    {
        // NOLINT
        0,  // g_usage
-        8,  // g_threads
+        0,  // g_threads
        0,  // g_profile

        320,         // g_width
@ -1810,7 +1845,7 @@ CODEC_INTERFACE(aom_codec_av1_cx) = {
      NULL,  // aom_codec_peek_si_fn_t
      NULL,  // aom_codec_get_si_fn_t
      NULL,  // aom_codec_decode_fn_t
-      NULL,  // aom_codec_frame_get_fn_t
+      NULL,  // aom_codec_get_frame_fn_t
      NULL   // aom_codec_set_fb_fn_t
  },
  {
--- a/third_party/aom/av1/av1_dx_iface.c
+++ b/third_party/aom/av1/av1_dx_iface.c
@ -50,6 +50,7 @@ struct aom_codec_alg_priv {
  int decode_tile_col;
  unsigned int tile_mode;
  unsigned int ext_tile_debug;
+  unsigned int row_mt;
  EXTERNAL_REFERENCES ext_refs;
  unsigned int is_annexb;
  int operating_point;
@ -61,7 +62,7 @@ struct aom_codec_alg_priv {
  int last_submit_worker_id;
  int next_output_worker_id;
  int available_threads;
-  aom_image_t *image_with_grain;
+  aom_image_t *image_with_grain[MAX_NUM_SPATIAL_LAYERS];
  int need_resync;  // wait for key/intra-only frame
  // BufferPool that holds all reference frames. Shared by all the FrameWorkers.
  BufferPool *buffer_pool;
@ -101,7 +102,7 @@ static aom_codec_err_t decoder_init(aom_codec_ctx_t *ctx,
      // default values
      priv->cfg.cfg.ext_partition = 1;
    }
-    priv->image_with_grain = NULL;
+    av1_zero(priv->image_with_grain);
  }

  return AOM_CODEC_OK;
@ -139,7 +140,9 @@ static aom_codec_err_t decoder_destroy(aom_codec_alg_priv_t *ctx) {

  aom_free(ctx->frame_workers);
  aom_free(ctx->buffer_pool);
-  if (ctx->image_with_grain) aom_img_free(ctx->image_with_grain);
+  for (int i = 0; i < MAX_NUM_SPATIAL_LAYERS; i++) {
+    if (ctx->image_with_grain[i]) aom_img_free(ctx->image_with_grain[i]);
+  }
  aom_free(ctx);
  return AOM_CODEC_OK;
 }
@ -339,16 +342,16 @@ static int frame_worker_hook(void *arg1, void *arg2) {
  const uint8_t *data = frame_worker_data->data;
  (void)arg2;

-  frame_worker_data->result = av1_receive_compressed_data(
-      frame_worker_data->pbi, frame_worker_data->data_size, &data);
+  int result = av1_receive_compressed_data(frame_worker_data->pbi,
+                                           frame_worker_data->data_size, &data);
  frame_worker_data->data_end = data;

-  if (frame_worker_data->result != 0) {
+  if (result != 0) {
    // Check decode result in serial decode.
    frame_worker_data->pbi->cur_buf->buf.corrupted = 1;
    frame_worker_data->pbi->need_resync = 1;
  }
-  return !frame_worker_data->result;
+  return !result;
 }

 static aom_codec_err_t init_decoder(aom_codec_alg_priv_t *ctx) {
@ -429,6 +432,7 @@ static aom_codec_err_t init_decoder(aom_codec_alg_priv_t *ctx) {
    frame_worker_data->pbi->operating_point = ctx->operating_point;
    frame_worker_data->pbi->output_all_layers = ctx->output_all_layers;
    frame_worker_data->pbi->ext_tile_debug = ctx->ext_tile_debug;
+    frame_worker_data->pbi->row_mt = ctx->row_mt;

    worker->hook = (AVxWorkerHook)frame_worker_hook;
    if (!winterface->reset(worker)) {
@ -489,6 +493,7 @@ static aom_codec_err_t decode_one(aom_codec_alg_priv_t *ctx,
  frame_worker_data->pbi->dec_tile_row = ctx->decode_tile_row;
  frame_worker_data->pbi->dec_tile_col = ctx->decode_tile_col;
  frame_worker_data->pbi->ext_tile_debug = ctx->ext_tile_debug;
+  frame_worker_data->pbi->row_mt = ctx->row_mt;
  frame_worker_data->pbi->ext_refs = ctx->ext_refs;

  frame_worker_data->pbi->common.is_annexb = ctx->is_annexb;
@ -592,21 +597,31 @@ static aom_codec_err_t decoder_decode(aom_codec_alg_priv_t *ctx,
  return res;
 }

-aom_image_t *add_grain_if_needed(aom_image_t *img, aom_image_t *grain_img_buf,
-                                 aom_film_grain_t *grain_params) {
+// If grain_params->apply_grain is false, returns img. Otherwise, adds film
+// grain to img, saves the result in *grain_img_ptr (allocating *grain_img_ptr
+// if necessary), and returns *grain_img_ptr.
+static aom_image_t *add_grain_if_needed(aom_image_t *img,
+                                        aom_image_t **grain_img_ptr,
+                                        aom_film_grain_t *grain_params) {
  if (!grain_params->apply_grain) return img;

-  if (grain_img_buf &&
-      (img->d_w != grain_img_buf->d_w || img->d_h != grain_img_buf->d_h ||
-       img->fmt != grain_img_buf->fmt || !(img->d_h % 2) || !(img->d_w % 2))) {
-    aom_img_free(grain_img_buf);
-    grain_img_buf = NULL;
+  aom_image_t *grain_img_buf = *grain_img_ptr;
+
+  const int w_even = ALIGN_POWER_OF_TWO(img->d_w, 1);
+  const int h_even = ALIGN_POWER_OF_TWO(img->d_h, 1);
+
+  if (grain_img_buf) {
+    const int alloc_w = ALIGN_POWER_OF_TWO(grain_img_buf->d_w, 1);
+    const int alloc_h = ALIGN_POWER_OF_TWO(grain_img_buf->d_h, 1);
+    if (w_even != alloc_w || h_even != alloc_h ||
+        img->fmt != grain_img_buf->fmt) {
+      aom_img_free(grain_img_buf);
+      grain_img_buf = NULL;
+    }
  }
  if (!grain_img_buf) {
-    int w_even = img->d_w % 2 ? img->d_w + 1 : img->d_w;
-    int h_even = img->d_h % 2 ? img->d_h + 1 : img->d_h;
    grain_img_buf = aom_img_alloc(NULL, img->fmt, w_even, h_even, 16);
-    grain_img_buf->bit_depth = img->bit_depth;
+    *grain_img_ptr = grain_img_buf;
  }

  av1_add_film_grain(grain_params, img, grain_img_buf);
@ -649,8 +664,6 @@ static aom_image_t *decoder_get_frame(aom_codec_alg_priv_t *ctx,
        aom_film_grain_t *grain_params;
        if (av1_get_raw_frame(frame_worker_data->pbi, *index, &sd,
                              &grain_params) == 0) {
-          *index += 1;  // Advance the iterator to point to the next image
-
          AV1Decoder *const pbi = frame_worker_data->pbi;
          AV1_COMMON *const cm = &pbi->common;
          RefCntBuffer *const frame_bufs = cm->buffer_pool->frame_bufs;
@ -659,6 +672,7 @@ static aom_image_t *decoder_get_frame(aom_codec_alg_priv_t *ctx,
          yuvconfig2image(&ctx->img, sd, frame_worker_data->user_priv);

          if (!pbi->ext_tile_debug && cm->large_scale_tile) {
+            *index += 1;  // Advance the iterator to point to the next image
            img = &ctx->img;
            img->img_data = pbi->tile_list_output;
            img->sz = pbi->tile_list_size;
@ -688,11 +702,14 @@ static aom_image_t *decoder_get_frame(aom_codec_alg_priv_t *ctx,
            const int tile_col = AOMMIN(pbi->dec_tile_col, cm->tile_cols - 1);
            const int mi_col = tile_col * cm->tile_width;
            const int ssx = ctx->img.x_chroma_shift;
+            const int is_hbd =
+                (ctx->img.fmt & AOM_IMG_FMT_HIGHBITDEPTH) ? 1 : 0;
            int plane;
-            ctx->img.planes[0] += mi_col * MI_SIZE;
+            ctx->img.planes[0] += mi_col * MI_SIZE * (1 + is_hbd);
            if (num_planes > 1) {
              for (plane = 1; plane < MAX_MB_PLANE; ++plane) {
-                ctx->img.planes[plane] += mi_col * (MI_SIZE >> ssx);
+                ctx->img.planes[plane] +=
+                    mi_col * (MI_SIZE >> ssx) * (1 + is_hbd);
              }
            }
            ctx->img.d_w =
@ -703,7 +720,10 @@ static aom_image_t *decoder_get_frame(aom_codec_alg_priv_t *ctx,
          img = &ctx->img;
          img->temporal_id = cm->temporal_layer_id;
          img->spatial_id = cm->spatial_layer_id;
-          return add_grain_if_needed(img, ctx->image_with_grain, grain_params);
+          aom_image_t *res = add_grain_if_needed(
+              img, &ctx->image_with_grain[*index], grain_params);
+          *index += 1;  // Advance the iterator to point to the next image
+          return res;
        }
      } else {
        // Decoding failed. Release the worker thread.
@ -999,7 +1019,7 @@ static aom_codec_err_t ctrl_get_bit_depth(aom_codec_alg_priv_t *ctx,
      FrameWorkerData *const frame_worker_data =
          (FrameWorkerData *)worker->data1;
      const AV1_COMMON *const cm = &frame_worker_data->pbi->common;
-      *bit_depth = cm->bit_depth;
+      *bit_depth = cm->seq_params.bit_depth;
      return AOM_CODEC_OK;
    } else {
      return AOM_CODEC_ERROR;
@ -1009,6 +1029,64 @@ static aom_codec_err_t ctrl_get_bit_depth(aom_codec_alg_priv_t *ctx,
  return AOM_CODEC_INVALID_PARAM;
 }

+static aom_img_fmt_t get_img_format(int subsampling_x, int subsampling_y,
+                                    int use_highbitdepth) {
+  aom_img_fmt_t fmt = 0;
+
+  if (subsampling_x == 0 && subsampling_y == 0)
+    fmt = AOM_IMG_FMT_I444;
+  else if (subsampling_x == 1 && subsampling_y == 0)
+    fmt = AOM_IMG_FMT_I422;
+  else if (subsampling_x == 1 && subsampling_y == 1)
+    fmt = AOM_IMG_FMT_I420;
+
+  if (use_highbitdepth) fmt |= AOM_IMG_FMT_HIGHBITDEPTH;
+  return fmt;
+}
+
+static aom_codec_err_t ctrl_get_img_format(aom_codec_alg_priv_t *ctx,
+                                           va_list args) {
+  aom_img_fmt_t *const img_fmt = va_arg(args, aom_img_fmt_t *);
+  AVxWorker *const worker = &ctx->frame_workers[ctx->next_output_worker_id];
+
+  if (img_fmt) {
+    if (worker) {
+      FrameWorkerData *const frame_worker_data =
+          (FrameWorkerData *)worker->data1;
+      const AV1_COMMON *const cm = &frame_worker_data->pbi->common;
+
+      *img_fmt = get_img_format(cm->seq_params.subsampling_x,
+                                cm->seq_params.subsampling_y,
+                                cm->seq_params.use_highbitdepth);
+      return AOM_CODEC_OK;
+    } else {
+      return AOM_CODEC_ERROR;
+    }
+  }
+
+  return AOM_CODEC_INVALID_PARAM;
+}
+
+static aom_codec_err_t ctrl_get_tile_size(aom_codec_alg_priv_t *ctx,
+                                          va_list args) {
+  unsigned int *const tile_size = va_arg(args, unsigned int *);
+  AVxWorker *const worker = &ctx->frame_workers[ctx->next_output_worker_id];
+
+  if (tile_size) {
+    if (worker) {
+      FrameWorkerData *const frame_worker_data =
+          (FrameWorkerData *)worker->data1;
+      const AV1_COMMON *const cm = &frame_worker_data->pbi->common;
+      *tile_size =
+          ((cm->tile_width * MI_SIZE) << 16) + cm->tile_height * MI_SIZE;
+      return AOM_CODEC_OK;
+    } else {
+      return AOM_CODEC_ERROR;
+    }
+  }
+  return AOM_CODEC_INVALID_PARAM;
+}
+
 static aom_codec_err_t ctrl_set_invert_tile_order(aom_codec_alg_priv_t *ctx,
                                                  va_list args) {
  ctx->invert_tile_order = va_arg(args, int);
@ -1124,6 +1202,12 @@ static aom_codec_err_t ctrl_ext_tile_debug(aom_codec_alg_priv_t *ctx,
  return AOM_CODEC_OK;
 }

+static aom_codec_err_t ctrl_set_row_mt(aom_codec_alg_priv_t *ctx,
+                                       va_list args) {
+  ctx->row_mt = va_arg(args, unsigned int);
+  return AOM_CODEC_OK;
+}
+
 static aom_codec_ctrl_fn_map_t decoder_ctrl_maps[] = {
  { AV1_COPY_REFERENCE, ctrl_copy_reference },

@ -1145,6 +1229,7 @@ static aom_codec_ctrl_fn_map_t decoder_ctrl_maps[] = {
  { AV1D_SET_OUTPUT_ALL_LAYERS, ctrl_set_output_all_layers },
  { AV1_SET_INSPECTION_CALLBACK, ctrl_set_inspection_callback },
  { AV1D_EXT_TILE_DEBUG, ctrl_ext_tile_debug },
+  { AV1D_SET_ROW_MT, ctrl_set_row_mt },
  { AV1D_SET_EXT_REF_PTR, ctrl_set_ext_ref_ptr },

  // Getters
@ -1152,6 +1237,8 @@ static aom_codec_ctrl_fn_map_t decoder_ctrl_maps[] = {
  { AOMD_GET_LAST_QUANTIZER, ctrl_get_last_quantizer },
  { AOMD_GET_LAST_REF_UPDATES, ctrl_get_last_ref_updates },
  { AV1D_GET_BIT_DEPTH, ctrl_get_bit_depth },
+  { AV1D_GET_IMG_FORMAT, ctrl_get_img_format },
+  { AV1D_GET_TILE_SIZE, ctrl_get_tile_size },
  { AV1D_GET_DISPLAY_SIZE, ctrl_get_render_size },
  { AV1D_GET_FRAME_SIZE, ctrl_get_frame_size },
  { AV1_GET_ACCOUNTING, ctrl_get_accounting },
@ -1180,7 +1267,7 @@ CODEC_INTERFACE(aom_codec_av1_dx) = {
      decoder_peek_si,    // aom_codec_peek_si_fn_t
      decoder_get_si,     // aom_codec_get_si_fn_t
      decoder_decode,     // aom_codec_decode_fn_t
-      decoder_get_frame,  // aom_codec_frame_get_fn_t
+      decoder_get_frame,  // aom_codec_get_frame_fn_t
      decoder_set_fb_fn,  // aom_codec_set_fb_fn_t
  },
  {
--- a/third_party/aom/av1/common/alloccommon.c
+++ b/third_party/aom/av1/common/alloccommon.c
@ -137,11 +137,11 @@ void av1_alloc_restoration_buffers(AV1_COMMON *cm) {
  // Now we need to allocate enough space to store the line buffers for the
  // stripes
  const int frame_w = cm->superres_upscaled_width;
-  const int use_highbd = cm->use_highbitdepth ? 1 : 0;
+  const int use_highbd = cm->seq_params.use_highbitdepth ? 1 : 0;

  for (int p = 0; p < num_planes; ++p) {
    const int is_uv = p > 0;
-    const int ss_x = is_uv && cm->subsampling_x;
+    const int ss_x = is_uv && cm->seq_params.subsampling_x;
    const int plane_w = ((frame_w + ss_x) >> ss_x) + 2 * RESTORATION_EXTRA_HORZ;
    const int stride = ALIGN_POWER_OF_TWO(plane_w, 5);
    const int buf_size = num_stripes * stride * RESTORATION_CTX_VERT
--- a/third_party/aom/av1/common/arm/av1_inv_txfm_neon.c
+++ b/third_party/aom/av1/common/arm/av1_inv_txfm_neon.c
@ -0,0 +1,844 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+#include "config/av1_rtcd.h"
+
+#include "av1/common/av1_inv_txfm1d.h"
+#include "av1/common/av1_inv_txfm1d_cfg.h"
+#include "av1/common/av1_txfm.h"
+#include "av1/common/enums.h"
+#include "av1/common/idct.h"
+#include "av1/common/arm/av1_inv_txfm_neon.h"
+
+static INLINE TxSetType find_TxSetType(TX_SIZE tx_size) {
+  const TX_SIZE tx_size_sqr_up = txsize_sqr_up_map[tx_size];
+  TxSetType tx_set_type;
+  if (tx_size_sqr_up > TX_32X32) {
+    tx_set_type = EXT_TX_SET_DCTONLY;
+  } else if (tx_size_sqr_up == TX_32X32) {
+    tx_set_type = EXT_TX_SET_DCT_IDTX;
+  } else {
+    tx_set_type = EXT_TX_SET_ALL16;
+  }
+  return tx_set_type;
+}
+
+// 1D itx types
+typedef enum ATTRIBUTE_PACKED {
+  IDCT_1D,
+  IADST_1D,
+  IFLIPADST_1D = IADST_1D,
+  IIDENTITY_1D,
+  ITX_TYPES_1D,
+} ITX_TYPE_1D;
+
+static const ITX_TYPE_1D vitx_1d_tab[TX_TYPES] = {
+  IDCT_1D,      IADST_1D,     IDCT_1D,      IADST_1D,
+  IFLIPADST_1D, IDCT_1D,      IFLIPADST_1D, IADST_1D,
+  IFLIPADST_1D, IIDENTITY_1D, IDCT_1D,      IIDENTITY_1D,
+  IADST_1D,     IIDENTITY_1D, IFLIPADST_1D, IIDENTITY_1D,
+};
+
+static const ITX_TYPE_1D hitx_1d_tab[TX_TYPES] = {
+  IDCT_1D,      IDCT_1D,      IADST_1D,     IADST_1D,
+  IDCT_1D,      IFLIPADST_1D, IFLIPADST_1D, IFLIPADST_1D,
+  IADST_1D,     IIDENTITY_1D, IIDENTITY_1D, IDCT_1D,
+  IIDENTITY_1D, IADST_1D,     IIDENTITY_1D, IFLIPADST_1D,
+};
+
+// 1D functions
+static const transform_1d_neon lowbd_txfm_all_1d_arr[TX_SIZES][ITX_TYPES_1D] = {
+  { av1_idct4_new, av1_iadst4_new, av1_iidentity4_c },
+  { av1_idct8_new, av1_iadst8_new, av1_iidentity8_c },
+  { av1_idct16_new, av1_iadst16_new, av1_iidentity16_c },
+  { av1_idct32_new, NULL, NULL },
+  { av1_idct64_new, NULL, NULL },
+};
+
+// Functions for blocks with eob at DC and within
+// topleft 8x8, 16x16, 32x32 corner
+static const transform_1d_neon
+    lowbd_txfm_all_1d_zeros_w8_arr[TX_SIZES][ITX_TYPES_1D][4] = {
+      {
+          { av1_idct4_new, av1_idct4_new, NULL, NULL },
+          { av1_iadst4_new, av1_iadst4_new, NULL, NULL },
+          { av1_iidentity4_c, av1_iidentity4_c, NULL, NULL },
+      },
+      { { av1_idct8_new, av1_idct8_new, NULL, NULL },
+        { av1_iadst8_new, av1_iadst8_new, NULL, NULL },
+        { av1_iidentity8_c, av1_iidentity8_c, NULL, NULL } },
+      {
+          { av1_idct16_new, av1_idct16_new, av1_idct16_new, NULL },
+          { av1_iadst16_new, av1_iadst16_new, av1_iadst16_new, NULL },
+          { av1_iidentity16_c, av1_iidentity16_c, av1_iidentity16_c, NULL },
+      },
+      { { av1_idct32_new, av1_idct32_new, av1_idct32_new, av1_idct32_new },
+        { NULL, NULL, NULL, NULL },
+        { av1_iidentity32_c, av1_iidentity32_c, av1_iidentity32_c,
+          av1_iidentity32_c } },
+      { { av1_idct64_new, av1_idct64_new, av1_idct64_new, av1_idct64_new },
+        { NULL, NULL, NULL, NULL },
+        { NULL, NULL, NULL, NULL } }
+    };
+static INLINE void lowbd_inv_txfm2d_add_idtx_neon(const int32_t *input,
+                                                  uint8_t *output, int stride,
+                                                  TX_TYPE tx_type,
+                                                  TX_SIZE tx_size, int eob) {
+  DECLARE_ALIGNED(32, int, txfm_buf[32 * 32 + 32 + 32]);
+  int32_t *temp_in = txfm_buf;
+
+  int eobx, eoby;
+  get_eobx_eoby_scan_default(&eobx, &eoby, tx_size, eob);
+  const int8_t *shift = inv_txfm_shift_ls[tx_size];
+  const int txw_idx = get_txw_idx(tx_size);
+  const int txh_idx = get_txh_idx(tx_size);
+  const int cos_bit_col = inv_cos_bit_col[txw_idx][txh_idx];
+  const int cos_bit_row = inv_cos_bit_row[txw_idx][txh_idx];
+  const int txfm_size_col = tx_size_wide[tx_size];
+  const int txfm_size_row = tx_size_high[tx_size];
+  const int buf_size_nonzero_h_div8 = (eoby + 8) >> 3;
+
+  const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row);
+  const int buf_offset = AOMMAX(txfm_size_row, txfm_size_col);
+
+  int32_t *temp_out = temp_in + buf_offset;
+  int32_t *buf = temp_out + buf_offset;
+  int32_t *buf_ptr = buf;
+  const int8_t stage_range[MAX_TXFM_STAGE_NUM] = { 16 };
+  int r, bd = 8;
+
+  const int fun_idx_x = lowbd_txfm_all_1d_zeros_idx[eobx];
+  const int fun_idx_y = lowbd_txfm_all_1d_zeros_idx[eoby];
+  const transform_1d_neon row_txfm =
+      lowbd_txfm_all_1d_zeros_w8_arr[txw_idx][hitx_1d_tab[tx_type]][fun_idx_x];
+  const transform_1d_neon col_txfm =
+      lowbd_txfm_all_1d_zeros_w8_arr[txh_idx][vitx_1d_tab[tx_type]][fun_idx_y];
+
+  assert(col_txfm != NULL);
+  assert(row_txfm != NULL);
+
+  // row tx
+  int row_start = (buf_size_nonzero_h_div8 * 8);
+  for (int i = 0; i < row_start; i++) {
+    if (abs(rect_type) == 1) {
+      for (int j = 0; j < txfm_size_col; j++)
+        temp_in[j] = round_shift((int64_t)input[j] * NewInvSqrt2, NewSqrt2Bits);
+      row_txfm(temp_in, buf_ptr, cos_bit_row, stage_range);
+    } else {
+      row_txfm(input, buf_ptr, cos_bit_row, stage_range);
+    }
+    av1_round_shift_array(buf_ptr, txfm_size_col, -shift[0]);
+    input += txfm_size_col;
+    buf_ptr += txfm_size_col;
+  }
+
+  // Doing memset for the rows which are not processed in row transform.
+  memset(buf_ptr, 0,
+         sizeof(int32_t) * txfm_size_col * (txfm_size_row - row_start));
+
+  // col tx
+  for (int c = 0; c < txfm_size_col; c++) {
+    for (r = 0; r < txfm_size_row; ++r) temp_in[r] = buf[r * txfm_size_col + c];
+
+    col_txfm(temp_in, temp_out, cos_bit_col, stage_range);
+    av1_round_shift_array(temp_out, txfm_size_row, -shift[1]);
+
+    for (r = 0; r < txfm_size_row; ++r) {
+      output[r * stride + c] =
+          highbd_clip_pixel_add(output[r * stride + c], temp_out[r], bd);
+    }
+  }
+}
+
+static INLINE void lowbd_inv_txfm2d_add_v_identity_neon(
+    const int32_t *input, uint8_t *output, int stride, TX_TYPE tx_type,
+    TX_SIZE tx_size, int eob) {
+  DECLARE_ALIGNED(32, int, txfm_buf[32 * 32 + 32 + 32]);
+  int32_t *temp_in = txfm_buf;
+
+  int eobx, eoby;
+  get_eobx_eoby_scan_v_identity(&eobx, &eoby, tx_size, eob);
+  const int8_t *shift = inv_txfm_shift_ls[tx_size];
+  const int txw_idx = get_txw_idx(tx_size);
+  const int txh_idx = get_txh_idx(tx_size);
+  const int cos_bit_col = inv_cos_bit_col[txw_idx][txh_idx];
+  const int cos_bit_row = inv_cos_bit_row[txw_idx][txh_idx];
+  const int txfm_size_col = tx_size_wide[tx_size];
+  const int txfm_size_row = tx_size_high[tx_size];
+  const int buf_size_nonzero_h_div8 = (eoby + 8) >> 3;
+
+  const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row);
+  const int buf_offset = AOMMAX(txfm_size_row, txfm_size_col);
+
+  int32_t *temp_out = temp_in + buf_offset;
+  int32_t *buf = temp_out + buf_offset;
+  int32_t *buf_ptr = buf;
+  const int8_t stage_range[MAX_TXFM_STAGE_NUM] = { 16 };
+  int r, bd = 8;
+
+  const int fun_idx_x = lowbd_txfm_all_1d_zeros_idx[eobx];
+  const int fun_idx_y = lowbd_txfm_all_1d_zeros_idx[eoby];
+  const transform_1d_neon row_txfm =
+      lowbd_txfm_all_1d_zeros_w8_arr[txw_idx][hitx_1d_tab[tx_type]][fun_idx_x];
+  const transform_1d_neon col_txfm =
+      lowbd_txfm_all_1d_zeros_w8_arr[txh_idx][vitx_1d_tab[tx_type]][fun_idx_y];
+
+  assert(col_txfm != NULL);
+  assert(row_txfm != NULL);
+  int ud_flip, lr_flip;
+  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+
+  // row tx
+  int row_start = (buf_size_nonzero_h_div8 * 8);
+  for (int i = 0; i < row_start; i++) {
+    if (abs(rect_type) == 1) {
+      for (int j = 0; j < txfm_size_col; j++)
+        temp_in[j] = round_shift((int64_t)input[j] * NewInvSqrt2, NewSqrt2Bits);
+      row_txfm(temp_in, buf_ptr, cos_bit_row, stage_range);
+    } else {
+      row_txfm(input, buf_ptr, cos_bit_row, stage_range);
+    }
+    av1_round_shift_array(buf_ptr, txfm_size_col, -shift[0]);
+    input += txfm_size_col;
+    buf_ptr += txfm_size_col;
+  }
+  // Doing memset for the rows which are not processed in row transform.
+  memset(buf_ptr, 0,
+         sizeof(int32_t) * txfm_size_col * (txfm_size_row - row_start));
+
+  // col tx
+  for (int c = 0; c < txfm_size_col; c++) {
+    if (lr_flip == 0) {
+      for (r = 0; r < txfm_size_row; ++r)
+        temp_in[r] = buf[r * txfm_size_col + c];
+    } else {
+      // flip left right
+      for (r = 0; r < txfm_size_row; ++r)
+        temp_in[r] = buf[r * txfm_size_col + (txfm_size_col - c - 1)];
+    }
+    col_txfm(temp_in, temp_out, cos_bit_col, stage_range);
+    av1_round_shift_array(temp_out, txfm_size_row, -shift[1]);
+
+    if (ud_flip == 0) {
+      for (r = 0; r < txfm_size_row; ++r) {
+        output[r * stride + c] =
+            highbd_clip_pixel_add(output[r * stride + c], temp_out[r], bd);
+      }
+    } else {
+      // flip upside down
+      for (r = 0; r < txfm_size_row; ++r) {
+        output[r * stride + c] = highbd_clip_pixel_add(
+            output[r * stride + c], temp_out[txfm_size_row - r - 1], bd);
+      }
+    }
+  }
+}
+
+static INLINE void lowbd_inv_txfm2d_add_h_identity_neon(
+    const int32_t *input, uint8_t *output, int stride, TX_TYPE tx_type,
+    TX_SIZE tx_size, int eob) {
+  DECLARE_ALIGNED(32, int, txfm_buf[32 * 32 + 32 + 32]);
+  int32_t *temp_in = txfm_buf;
+
+  int eobx, eoby;
+  get_eobx_eoby_scan_h_identity(&eobx, &eoby, tx_size, eob);
+  const int8_t *shift = inv_txfm_shift_ls[tx_size];
+  const int txw_idx = get_txw_idx(tx_size);
+  const int txh_idx = get_txh_idx(tx_size);
+  const int cos_bit_col = inv_cos_bit_col[txw_idx][txh_idx];
+  const int cos_bit_row = inv_cos_bit_row[txw_idx][txh_idx];
+  const int txfm_size_col = tx_size_wide[tx_size];
+  const int txfm_size_row = tx_size_high[tx_size];
+  const int buf_size_nonzero_h_div8 = (eoby + 8) >> 3;
+
+  const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row);
+  const int buf_offset = AOMMAX(txfm_size_row, txfm_size_col);
+
+  int32_t *temp_out = temp_in + buf_offset;
+  int32_t *buf = temp_out + buf_offset;
+  int32_t *buf_ptr = buf;
+  const int8_t stage_range[MAX_TXFM_STAGE_NUM] = { 16 };
+  int r, bd = 8;
+
+  const int fun_idx_x = lowbd_txfm_all_1d_zeros_idx[eobx];
+  const int fun_idx_y = lowbd_txfm_all_1d_zeros_idx[eoby];
+  const transform_1d_neon row_txfm =
+      lowbd_txfm_all_1d_zeros_w8_arr[txw_idx][hitx_1d_tab[tx_type]][fun_idx_x];
+  const transform_1d_neon col_txfm =
+      lowbd_txfm_all_1d_zeros_w8_arr[txh_idx][vitx_1d_tab[tx_type]][fun_idx_y];
+
+  assert(col_txfm != NULL);
+  assert(row_txfm != NULL);
+  int ud_flip, lr_flip;
+  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+
+  // row tx
+  int row_start = (buf_size_nonzero_h_div8 * 8);
+  for (int i = 0; i < row_start; i++) {
+    if (abs(rect_type) == 1) {
+      for (int j = 0; j < txfm_size_col; j++)
+        temp_in[j] = round_shift((int64_t)input[j] * NewInvSqrt2, NewSqrt2Bits);
+      row_txfm(temp_in, buf_ptr, cos_bit_row, stage_range);
+    } else {
+      row_txfm(input, buf_ptr, cos_bit_row, stage_range);
+    }
+    av1_round_shift_array(buf_ptr, txfm_size_col, -shift[0]);
+    input += txfm_size_col;
+    buf_ptr += txfm_size_col;
+  }
+  // Doing memset for the rows which are not processed in row transform.
+  memset(buf_ptr, 0,
+         sizeof(int32_t) * txfm_size_col * (txfm_size_row - row_start));
+
+  // col tx
+  for (int c = 0; c < txfm_size_col; c++) {
+    if (lr_flip == 0) {
+      for (r = 0; r < txfm_size_row; ++r)
+        temp_in[r] = buf[r * txfm_size_col + c];
+    } else {
+      // flip left right
+      for (r = 0; r < txfm_size_row; ++r)
+        temp_in[r] = buf[r * txfm_size_col + (txfm_size_col - c - 1)];
+    }
+    col_txfm(temp_in, temp_out, cos_bit_col, stage_range);
+    av1_round_shift_array(temp_out, txfm_size_row, -shift[1]);
+
+    if (ud_flip == 0) {
+      for (r = 0; r < txfm_size_row; ++r) {
+        output[r * stride + c] =
+            highbd_clip_pixel_add(output[r * stride + c], temp_out[r], bd);
+      }
+    } else {
+      // flip upside down
+      for (r = 0; r < txfm_size_row; ++r) {
+        output[r * stride + c] = highbd_clip_pixel_add(
+            output[r * stride + c], temp_out[txfm_size_row - r - 1], bd);
+      }
+    }
+  }
+}
+
+static INLINE void lowbd_inv_txfm2d_add_4x4_neon(const int32_t *input,
+                                                 uint8_t *output, int stride,
+                                                 TX_TYPE tx_type,
+                                                 TX_SIZE tx_size, int eob) {
+  (void)eob;
+  DECLARE_ALIGNED(32, int, txfm_buf[4 * 4 + 8 + 8]);
+  int32_t *temp_in = txfm_buf;
+
+  const int8_t *shift = inv_txfm_shift_ls[tx_size];
+  const int txw_idx = get_txw_idx(tx_size);
+  const int txh_idx = get_txh_idx(tx_size);
+  const int cos_bit_row = inv_cos_bit_row[txw_idx][txh_idx];
+  const int cos_bit_col = inv_cos_bit_col[txw_idx][txh_idx];
+  const int txfm_size_col = tx_size_wide[tx_size];
+  const int txfm_size_row = tx_size_high[tx_size];
+  const int buf_offset = AOMMAX(txfm_size_row, txfm_size_col);
+  int32_t *temp_out = temp_in + buf_offset;
+  int32_t *buf = temp_out + buf_offset;
+  int32_t *buf_ptr = buf;
+  const int8_t stage_range[MAX_TXFM_STAGE_NUM] = { 16 };
+  int r, bd = 8;
+  const transform_1d_neon row_txfm =
+      lowbd_txfm_all_1d_arr[txw_idx][hitx_1d_tab[tx_type]];
+  const transform_1d_neon col_txfm =
+      lowbd_txfm_all_1d_arr[txh_idx][vitx_1d_tab[tx_type]];
+
+  int ud_flip, lr_flip;
+  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+
+  for (int i = 0; i < txfm_size_row; i++) {
+    row_txfm(input, buf_ptr, cos_bit_row, stage_range);
+
+    input += txfm_size_col;
+    buf_ptr += txfm_size_col;
+  }
+
+  for (int c = 0; c < txfm_size_col; ++c) {
+    if (lr_flip == 0) {
+      for (r = 0; r < txfm_size_row; ++r)
+        temp_in[r] = buf[r * txfm_size_col + c];
+    } else {
+      // flip left right
+      for (r = 0; r < txfm_size_row; ++r)
+        temp_in[r] = buf[r * txfm_size_col + (txfm_size_col - c - 1)];
+    }
+    col_txfm(temp_in, temp_out, cos_bit_col, stage_range);
+    av1_round_shift_array(temp_out, txfm_size_row, -shift[1]);
+
+    if (ud_flip == 0) {
+      for (r = 0; r < txfm_size_row; ++r) {
+        output[r * stride + c] =
+            highbd_clip_pixel_add(output[r * stride + c], temp_out[r], bd);
+      }
+    } else {
+      // flip upside down
+      for (r = 0; r < txfm_size_row; ++r) {
+        output[r * stride + c] = highbd_clip_pixel_add(
+            output[r * stride + c], temp_out[txfm_size_row - r - 1], bd);
+      }
+    }
+  }
+}
+
+void lowbd_inv_txfm2d_add_4x8_neon(const int32_t *input, uint8_t *output,
+                                   int stride, TX_TYPE tx_type, TX_SIZE tx_size,
+                                   int eob) {
+  (void)eob;
+  DECLARE_ALIGNED(32, int, txfm_buf[4 * 8 + 8 + 8]);
+  int32_t *temp_in = txfm_buf;
+
+  const int8_t *shift = inv_txfm_shift_ls[tx_size];
+  const int txw_idx = get_txw_idx(tx_size);
+  const int txh_idx = get_txh_idx(tx_size);
+  const int cos_bit_row = inv_cos_bit_row[txw_idx][txh_idx];
+  const int cos_bit_col = inv_cos_bit_col[txw_idx][txh_idx];
+  const int txfm_size_col = tx_size_wide[tx_size];
+  const int txfm_size_row = tx_size_high[tx_size];
+  const int buf_offset = AOMMAX(txfm_size_row, txfm_size_col);
+  int32_t *temp_out = temp_in + buf_offset;
+  int32_t *buf = temp_out + buf_offset;
+  int32_t *buf_ptr = buf;
+  const int8_t stage_range[MAX_TXFM_STAGE_NUM] = { 16 };
+  int r, bd = 8;
+  const transform_1d_neon row_txfm =
+      lowbd_txfm_all_1d_arr[txw_idx][hitx_1d_tab[tx_type]];
+  const transform_1d_neon col_txfm =
+      lowbd_txfm_all_1d_arr[txh_idx][vitx_1d_tab[tx_type]];
+
+  int ud_flip, lr_flip;
+  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+
+  for (int i = 0; i < txfm_size_row; i++) {
+    for (int j = 0; j < txfm_size_col; j++)
+      temp_in[j] = round_shift((int64_t)input[j] * NewInvSqrt2, NewSqrt2Bits);
+
+    row_txfm(temp_in, buf_ptr, cos_bit_row, stage_range);
+    input += txfm_size_col;
+    buf_ptr += txfm_size_col;
+  }
+
+  for (int c = 0; c < txfm_size_col; ++c) {
+    if (lr_flip == 0) {
+      for (r = 0; r < txfm_size_row; ++r)
+        temp_in[r] = buf[r * txfm_size_col + c];
+    } else {
+      // flip left right
+      for (r = 0; r < txfm_size_row; ++r)
+        temp_in[r] = buf[r * txfm_size_col + (txfm_size_col - c - 1)];
+    }
+    col_txfm(temp_in, temp_out, cos_bit_col, stage_range);
+    av1_round_shift_array(temp_out, txfm_size_row, -shift[1]);
+
+    if (ud_flip == 0) {
+      for (r = 0; r < txfm_size_row; ++r) {
+        output[r * stride + c] =
+            highbd_clip_pixel_add(output[r * stride + c], temp_out[r], bd);
+      }
+    } else {
+      // flip upside down
+      for (r = 0; r < txfm_size_row; ++r) {
+        output[r * stride + c] = highbd_clip_pixel_add(
+            output[r * stride + c], temp_out[txfm_size_row - r - 1], bd);
+      }
+    }
+  }
+}
+
+void lowbd_inv_txfm2d_add_8x4_neon(const int32_t *input, uint8_t *output,
+                                   int stride, TX_TYPE tx_type, TX_SIZE tx_size,
+                                   int eob) {
+  (void)eob;
+  DECLARE_ALIGNED(32, int, txfm_buf[8 * 4 + 8 + 8]);
+  int32_t *temp_in = txfm_buf;
+
+  const int8_t *shift = inv_txfm_shift_ls[tx_size];
+  const int txw_idx = get_txw_idx(tx_size);
+  const int txh_idx = get_txh_idx(tx_size);
+  const int cos_bit_row = inv_cos_bit_row[txw_idx][txh_idx];
+  const int cos_bit_col = inv_cos_bit_col[txw_idx][txh_idx];
+  const int txfm_size_col = tx_size_wide[tx_size];
+  const int txfm_size_row = tx_size_high[tx_size];
+  const int buf_offset = AOMMAX(txfm_size_row, txfm_size_col);
+  int32_t *temp_out = temp_in + buf_offset;
+  int32_t *buf = temp_out + buf_offset;
+  int32_t *buf_ptr = buf;
+  const int8_t stage_range[MAX_TXFM_STAGE_NUM] = { 16 };
+  int r, bd = 8;
+  const transform_1d_neon row_txfm =
+      lowbd_txfm_all_1d_arr[txw_idx][hitx_1d_tab[tx_type]];
+  const transform_1d_neon col_txfm =
+      lowbd_txfm_all_1d_arr[txh_idx][vitx_1d_tab[tx_type]];
+
+  int ud_flip, lr_flip;
+  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+
+  for (int i = 0; i < txfm_size_row; i++) {
+    for (int j = 0; j < txfm_size_col; j++)
+      temp_in[j] = round_shift((int64_t)input[j] * NewInvSqrt2, NewSqrt2Bits);
+
+    row_txfm(temp_in, buf_ptr, cos_bit_row, stage_range);
+    input += txfm_size_col;
+    buf_ptr += txfm_size_col;
+  }
+
+  for (int c = 0; c < txfm_size_col; ++c) {
+    if (lr_flip == 0) {
+      for (r = 0; r < txfm_size_row; ++r)
+        temp_in[r] = buf[r * txfm_size_col + c];
+    } else {
+      // flip left right
+      for (r = 0; r < txfm_size_row; ++r)
+        temp_in[r] = buf[r * txfm_size_col + (txfm_size_col - c - 1)];
+    }
+    col_txfm(temp_in, temp_out, cos_bit_col, stage_range);
+    av1_round_shift_array(temp_out, txfm_size_row, -shift[1]);
+
+    if (ud_flip == 0) {
+      for (r = 0; r < txfm_size_row; ++r) {
+        output[r * stride + c] =
+            highbd_clip_pixel_add(output[r * stride + c], temp_out[r], bd);
+      }
+    } else {
+      // flip upside down
+      for (r = 0; r < txfm_size_row; ++r) {
+        output[r * stride + c] = highbd_clip_pixel_add(
+            output[r * stride + c], temp_out[txfm_size_row - r - 1], bd);
+      }
+    }
+  }
+}
+
+void lowbd_inv_txfm2d_add_4x16_neon(const int32_t *input, uint8_t *output,
+                                    int stride, TX_TYPE tx_type,
+                                    TX_SIZE tx_size, int eob) {
+  (void)eob;
+  DECLARE_ALIGNED(32, int, txfm_buf[4 * 16 + 16 + 16]);
+  int32_t *temp_in = txfm_buf;
+
+  const int8_t *shift = inv_txfm_shift_ls[tx_size];
+  const int txw_idx = get_txw_idx(tx_size);
+  const int txh_idx = get_txh_idx(tx_size);
+  const int cos_bit_row = inv_cos_bit_row[txw_idx][txh_idx];
+  const int cos_bit_col = inv_cos_bit_col[txw_idx][txh_idx];
+  const int txfm_size_col = tx_size_wide[tx_size];
+  const int txfm_size_row = tx_size_high[tx_size];
+  const int buf_offset = AOMMAX(txfm_size_row, txfm_size_col);
+  int32_t *temp_out = temp_in + buf_offset;
+  int32_t *buf = temp_out + buf_offset;
+  int32_t *buf_ptr = buf;
+  const int8_t stage_range[MAX_TXFM_STAGE_NUM] = { 16 };
+  int r, bd = 8;
+  const transform_1d_neon row_txfm =
+      lowbd_txfm_all_1d_arr[txw_idx][hitx_1d_tab[tx_type]];
+  const transform_1d_neon col_txfm =
+      lowbd_txfm_all_1d_arr[txh_idx][vitx_1d_tab[tx_type]];
+
+  int ud_flip, lr_flip;
+  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+
+  for (int i = 0; i < txfm_size_row; i++) {
+    row_txfm(input, buf_ptr, cos_bit_row, stage_range);
+    av1_round_shift_array(buf_ptr, txfm_size_col, -shift[0]);
+    input += txfm_size_col;
+    buf_ptr += txfm_size_col;
+  }
+
+  for (int c = 0; c < txfm_size_col; ++c) {
+    if (lr_flip == 0) {
+      for (r = 0; r < txfm_size_row; ++r)
+        temp_in[r] = buf[r * txfm_size_col + c];
+    } else {
+      // flip left right
+      for (r = 0; r < txfm_size_row; ++r)
+        temp_in[r] = buf[r * txfm_size_col + (txfm_size_col - c - 1)];
+    }
+    col_txfm(temp_in, temp_out, cos_bit_col, stage_range);
+    av1_round_shift_array(temp_out, txfm_size_row, -shift[1]);
+
+    if (ud_flip == 0) {
+      for (r = 0; r < txfm_size_row; ++r) {
+        output[r * stride + c] =
+            highbd_clip_pixel_add(output[r * stride + c], temp_out[r], bd);
+      }
+    } else {
+      // flip upside down
+      for (r = 0; r < txfm_size_row; ++r) {
+        output[r * stride + c] = highbd_clip_pixel_add(
+            output[r * stride + c], temp_out[txfm_size_row - r - 1], bd);
+      }
+    }
+  }
+}
+
+void lowbd_inv_txfm2d_add_16x4_neon(const int32_t *input, uint8_t *output,
+                                    int stride, TX_TYPE tx_type,
+                                    TX_SIZE tx_size, int eob) {
+  (void)eob;
+
+  DECLARE_ALIGNED(32, int, txfm_buf[16 * 4 + 16 + 16]);
+  int32_t *temp_in = txfm_buf;
+
+  const int8_t *shift = inv_txfm_shift_ls[tx_size];
+  const int txw_idx = get_txw_idx(tx_size);
+  const int txh_idx = get_txh_idx(tx_size);
+  const int cos_bit_row = inv_cos_bit_row[txw_idx][txh_idx];
+  const int cos_bit_col = inv_cos_bit_col[txw_idx][txh_idx];
+  const int txfm_size_col = tx_size_wide[tx_size];
+  const int txfm_size_row = tx_size_high[tx_size];
+  const int buf_offset = AOMMAX(txfm_size_row, txfm_size_col);
+  int32_t *temp_out = temp_in + buf_offset;
+  int32_t *buf = temp_out + buf_offset;
+  int32_t *buf_ptr = buf;
+  const int8_t stage_range[MAX_TXFM_STAGE_NUM] = { 16 };
+  int r, bd = 8;
+  const transform_1d_neon row_txfm =
+      lowbd_txfm_all_1d_arr[txw_idx][hitx_1d_tab[tx_type]];
+  const transform_1d_neon col_txfm =
+      lowbd_txfm_all_1d_arr[txh_idx][vitx_1d_tab[tx_type]];
+
+  int ud_flip, lr_flip;
+  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+
+  for (int i = 0; i < txfm_size_row; i++) {
+    row_txfm(input, buf_ptr, cos_bit_row, stage_range);
+    av1_round_shift_array(buf_ptr, txfm_size_col, -shift[0]);
+    input += txfm_size_col;
+    buf_ptr += txfm_size_col;
+  }
+
+  for (int c = 0; c < txfm_size_col; ++c) {
+    if (lr_flip == 0) {
+      for (r = 0; r < txfm_size_row; ++r)
+        temp_in[r] = buf[r * txfm_size_col + c];
+    } else {
+      // flip left right
+      for (r = 0; r < txfm_size_row; ++r)
+        temp_in[r] = buf[r * txfm_size_col + (txfm_size_col - c - 1)];
+    }
+    col_txfm(temp_in, temp_out, cos_bit_col, stage_range);
+    av1_round_shift_array(temp_out, txfm_size_row, -shift[1]);
+
+    if (ud_flip == 0) {
+      for (r = 0; r < txfm_size_row; ++r) {
+        output[r * stride + c] =
+            highbd_clip_pixel_add(output[r * stride + c], temp_out[r], bd);
+      }
+    } else {
+      // flip upside down
+      for (r = 0; r < txfm_size_row; ++r) {
+        output[r * stride + c] = highbd_clip_pixel_add(
+            output[r * stride + c], temp_out[txfm_size_row - r - 1], bd);
+      }
+    }
+  }
+}
+
+// Low bit-depth (bd = 8) inverse 2-D transform followed by add-and-clip into
+// |output|, used for transform types where neither pass is an identity.
+//
+//   input   coefficients, consumed row by row with a pitch of the transform
+//           width; only the first |row_start| rows (derived from |eob|) are
+//           read.
+//   output  destination pixels; each is replaced by
+//           highbd_clip_pixel_add(pixel, residual, 8).
+//   stride  pitch of |output|.
+//   eob     end-of-block position; converted to (eobx, eoby) to choose the
+//           1-D kernel variants and the number of rows to transform.
+static INLINE void lowbd_inv_txfm2d_add_no_identity_neon(
+    const int32_t *input, uint8_t *output, int stride, TX_TYPE tx_type,
+    TX_SIZE tx_size, int eob) {
+  // Scratch layout: temp_in | temp_out | buf. Declared as int32_t so that the
+  // storage type matches the int32_t pointers that alias it below.
+  DECLARE_ALIGNED(32, int32_t, txfm_buf[64 * 64 + 64 + 64]);
+  int32_t *temp_in = txfm_buf;
+
+  int eobx, eoby, ud_flip, lr_flip;
+  get_eobx_eoby_scan_default(&eobx, &eoby, tx_size, eob);
+  const int8_t *shift = inv_txfm_shift_ls[tx_size];
+  const int txw_idx = get_txw_idx(tx_size);
+  const int txh_idx = get_txh_idx(tx_size);
+  const int cos_bit_col = inv_cos_bit_col[txw_idx][txh_idx];
+  const int cos_bit_row = inv_cos_bit_row[txw_idx][txh_idx];
+  const int txfm_size_col = tx_size_wide[tx_size];
+  const int txfm_size_row = tx_size_high[tx_size];
+  const int buf_size_nonzero_h_div8 = (eoby + 8) >> 3;
+  const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row);
+  // Loop-invariant: when |rect_type| is +/-1 every input row is pre-scaled by
+  // NewInvSqrt2 before the row transform.
+  const int scale_input = (abs(rect_type) == 1);
+  const int buf_offset = AOMMAX(txfm_size_row, txfm_size_col);
+
+  int32_t *temp_out = temp_in + buf_offset;
+  int32_t *buf = temp_out + buf_offset;
+  int32_t *buf_ptr = buf;
+  // Only element 0 is set to 16; the remaining entries are zero-initialised.
+  const int8_t stage_range[MAX_TXFM_STAGE_NUM] = { 16 };
+  const int bd = 8;
+
+  const int fun_idx_x = lowbd_txfm_all_1d_zeros_idx[eobx];
+  const int fun_idx_y = lowbd_txfm_all_1d_zeros_idx[eoby];
+  const transform_1d_neon row_txfm =
+      lowbd_txfm_all_1d_zeros_w8_arr[txw_idx][hitx_1d_tab[tx_type]][fun_idx_x];
+  const transform_1d_neon col_txfm =
+      lowbd_txfm_all_1d_zeros_w8_arr[txh_idx][vitx_1d_tab[tx_type]][fun_idx_y];
+
+  assert(col_txfm != NULL);
+  assert(row_txfm != NULL);
+
+  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+  // Number of rows fed to the row transform: eoby + 1 rounded up to a
+  // multiple of 8.
+  const int row_start = (buf_size_nonzero_h_div8 << 3);
+
+  // Row pass.
+  for (int i = 0; i < row_start; i++) {
+    if (scale_input) {
+      for (int j = 0; j < txfm_size_col; j++)
+        temp_in[j] = round_shift((int64_t)input[j] * NewInvSqrt2, NewSqrt2Bits);
+      row_txfm(temp_in, buf_ptr, cos_bit_row, stage_range);
+    } else {
+      row_txfm(input, buf_ptr, cos_bit_row, stage_range);
+    }
+    av1_round_shift_array(buf_ptr, txfm_size_col, -shift[0]);
+    input += txfm_size_col;
+    buf_ptr += txfm_size_col;
+  }
+
+  // Zero the rows of the intermediate buffer that the row pass skipped, so
+  // the column pass below reads defined values for every row.
+  memset(buf_ptr, 0,
+         sizeof(int32_t) * txfm_size_col * (txfm_size_row - row_start));
+
+  // Column pass: gather one column (mirrored when lr_flip is set), transform
+  // it, then add it to the destination (reversed when ud_flip is set).
+  for (int c = 0; c < txfm_size_col; c++) {
+    if (lr_flip == 0) {
+      for (int r = 0; r < txfm_size_row; ++r)
+        temp_in[r] = buf[r * txfm_size_col + c];
+    } else {
+      // flip left right
+      for (int r = 0; r < txfm_size_row; ++r)
+        temp_in[r] = buf[r * txfm_size_col + (txfm_size_col - c - 1)];
+    }
+    col_txfm(temp_in, temp_out, cos_bit_col, stage_range);
+    av1_round_shift_array(temp_out, txfm_size_row, -shift[1]);
+
+    if (ud_flip == 0) {
+      for (int r = 0; r < txfm_size_row; ++r) {
+        output[r * stride + c] =
+            highbd_clip_pixel_add(output[r * stride + c], temp_out[r], bd);
+      }
+    } else {
+      // flip upside down
+      for (int r = 0; r < txfm_size_row; ++r) {
+        output[r * stride + c] = highbd_clip_pixel_add(
+            output[r * stride + c], temp_out[txfm_size_row - r - 1], bd);
+      }
+    }
+  }
+}
+
+// Routes a low bit-depth inverse 2-D transform to the routine matching the
+// identity structure of |tx_type|. All arguments are forwarded unchanged.
+static INLINE void lowbd_inv_txfm2d_add_universe_neon(
+    const int32_t *input, uint8_t *output, int stride, TX_TYPE tx_type,
+    TX_SIZE tx_size, int eob) {
+  if (tx_type == IDTX) {
+    lowbd_inv_txfm2d_add_idtx_neon(input, output, stride, tx_type, tx_size,
+                                   eob);
+  } else if (tx_type == H_DCT || tx_type == H_ADST || tx_type == H_FLIPADST) {
+    lowbd_inv_txfm2d_add_v_identity_neon(input, output, stride, tx_type,
+                                         tx_size, eob);
+  } else if (tx_type == V_DCT || tx_type == V_ADST || tx_type == V_FLIPADST) {
+    lowbd_inv_txfm2d_add_h_identity_neon(input, output, stride, tx_type,
+                                         tx_size, eob);
+  } else {
+    // Every remaining transform type.
+    lowbd_inv_txfm2d_add_no_identity_neon(input, output, stride, tx_type,
+                                          tx_size, eob);
+  }
+}
+// Copies |rows| rows of coefficients stored with a pitch of 32 into
+// |mod_input| with a pitch of 64, zero-filling columns 32..63 of each row.
+static INLINE void lowbd_widen_input_32_to_64(const int32_t *input,
+                                              int32_t *mod_input, int rows) {
+  for (int row = 0; row < rows; ++row) {
+    memcpy(mod_input + row * 64, input + row * 32, 32 * sizeof(*mod_input));
+    memset(mod_input + row * 64 + 32, 0, 32 * sizeof(*mod_input));
+  }
+}
+
+// Low bit-depth inverse 2-D transform entry point: selects the implementation
+// for |tx_size| and adds the reconstructed residual into |output|.
+//
+// Sizes with a 4-sample dimension use dedicated routines; all others go
+// through lowbd_inv_txfm2d_add_universe_neon(). For 64-wide sizes the input
+// is first repacked from a pitch of 32 to a pitch of 64.
+void av1_lowbd_inv_txfm2d_add_neon(const int32_t *input, uint8_t *output,
+                                   int stride, TX_TYPE tx_type, TX_SIZE tx_size,
+                                   int eob) {
+  switch (tx_size) {
+    case TX_4X4:
+      lowbd_inv_txfm2d_add_4x4_neon(input, output, stride, tx_type, tx_size,
+                                    eob);
+      break;
+
+    case TX_4X8:
+      lowbd_inv_txfm2d_add_4x8_neon(input, output, stride, tx_type, tx_size,
+                                    eob);
+      break;
+
+    case TX_8X4:
+      lowbd_inv_txfm2d_add_8x4_neon(input, output, stride, tx_type, tx_size,
+                                    eob);
+      break;
+
+    case TX_4X16:
+      lowbd_inv_txfm2d_add_4x16_neon(input, output, stride, tx_type, tx_size,
+                                     eob);
+      break;
+
+    case TX_16X4:
+      lowbd_inv_txfm2d_add_16x4_neon(input, output, stride, tx_type, tx_size,
+                                     eob);
+      break;
+
+    case TX_64X16: {
+      int32_t mod_input[64 * 16];
+      lowbd_widen_input_32_to_64(input, mod_input, 16);
+      lowbd_inv_txfm2d_add_universe_neon(mod_input, output, stride, tx_type,
+                                         tx_size, eob);
+    } break;
+
+    case TX_64X32: {
+      int32_t mod_input[64 * 32];
+      lowbd_widen_input_32_to_64(input, mod_input, 32);
+      lowbd_inv_txfm2d_add_universe_neon(mod_input, output, stride, tx_type,
+                                         tx_size, eob);
+    } break;
+
+    case TX_64X64: {
+      // Only the first 32 rows are populated; rows 32..63 of mod_input are
+      // left unwritten.
+      // NOTE(review): this relies on the callee never reading input rows
+      // beyond 32 for this size -- confirm for every reachable tx_type.
+      int32_t mod_input[64 * 64];
+      lowbd_widen_input_32_to_64(input, mod_input, 32);
+      lowbd_inv_txfm2d_add_universe_neon(mod_input, output, stride, tx_type,
+                                         tx_size, eob);
+    } break;
+
+    // TX_16X64 and TX_32X64 need no repacking and share the generic path.
+    case TX_16X64:
+    case TX_32X64:
+    default:
+      lowbd_inv_txfm2d_add_universe_neon(input, output, stride, tx_type,
+                                         tx_size, eob);
+      break;
+  }
+}
+// Inverse transform and add for one block. Lossless blocks are handed to the
+// C implementation; everything else takes the NEON low bit-depth path using
+// the type, size and eob carried in |txfm_param|.
+void av1_inv_txfm_add_neon(const tran_low_t *dqcoeff, uint8_t *dst, int stride,
+                           const TxfmParam *txfm_param) {
+  if (txfm_param->lossless) {
+    av1_inv_txfm_add_c(dqcoeff, dst, stride, txfm_param);
+    return;
+  }
+  av1_lowbd_inv_txfm2d_add_neon(dqcoeff, dst, stride, txfm_param->tx_type,
+                                txfm_param->tx_size, txfm_param->eob);
+}
--- a/third_party/aom/av1/common/arm/av1_inv_txfm_neon.h
+++ b/third_party/aom/av1/common/arm/av1_inv_txfm_neon.h
@ -0,0 +1,152 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#ifndef AV1_COMMON_ARM_AV1_INV_TXFM_NEON_H_
+#define AV1_COMMON_ARM_AV1_INV_TXFM_NEON_H_
+
+#include "config/aom_config.h"
+#include "config/av1_rtcd.h"
+
+#include "aom/aom_integer.h"
+#include "av1/common/enums.h"
+#include "av1/common/av1_inv_txfm1d.h"
+#include "av1/common/av1_inv_txfm1d_cfg.h"
+#include "av1/common/av1_txfm.h"
+
+// Signature of a 1-D inverse transform kernel: reads coefficients from
+// |input| and writes the result to |output|. |cos_bit| and |stage_ptr| are
+// passed through from the caller's per-size configuration.
+typedef void (*transform_1d_neon)(const int32_t *input, int32_t *output,
+                                  const int8_t cos_bit,
+                                  const int8_t *stage_ptr);
+
+// Per-size lookup tables used by get_eobx_eoby_scan_default(). Each table is
+// indexed by (eob - 1) >> tx_size_wide_log2_eob[tx_size]. Every entry packs
+// two values: eobx in the low byte (entry & 0xFF) and eoby in the high byte
+// (entry >> 8).
+DECLARE_ALIGNED(16, static const int16_t, av1_eob_to_eobxy_8x8_default[8]) = {
+  0x0707, 0x0707, 0x0707, 0x0707, 0x0707, 0x0707, 0x0707, 0x0707,
+};
+
+DECLARE_ALIGNED(16, static const int16_t,
+                av1_eob_to_eobxy_16x16_default[16]) = {
+  0x0707, 0x0707, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f,
+  0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f,
+};
+
+DECLARE_ALIGNED(16, static const int16_t,
+                av1_eob_to_eobxy_32x32_default[32]) = {
+  0x0707, 0x0f0f, 0x0f0f, 0x0f0f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f,
+  0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f,
+  0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f,
+  0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, av1_eob_to_eobxy_8x16_default[16]) = {
+  0x0707, 0x0707, 0x0707, 0x0707, 0x0707, 0x0f07, 0x0f07, 0x0f07,
+  0x0f07, 0x0f07, 0x0f07, 0x0f07, 0x0f07, 0x0f07, 0x0f07, 0x0f07,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, av1_eob_to_eobxy_16x8_default[8]) = {
+  0x0707, 0x0707, 0x070f, 0x070f, 0x070f, 0x070f, 0x070f, 0x070f,
+};
+
+DECLARE_ALIGNED(16, static const int16_t,
+                av1_eob_to_eobxy_16x32_default[32]) = {
+  0x0707, 0x0707, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f,
+  0x0f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f,
+  0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f,
+  0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f,
+};
+
+DECLARE_ALIGNED(16, static const int16_t,
+                av1_eob_to_eobxy_32x16_default[16]) = {
+  0x0707, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f1f, 0x0f1f, 0x0f1f, 0x0f1f,
+  0x0f1f, 0x0f1f, 0x0f1f, 0x0f1f, 0x0f1f, 0x0f1f, 0x0f1f, 0x0f1f,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, av1_eob_to_eobxy_8x32_default[32]) = {
+  0x0707, 0x0707, 0x0707, 0x0707, 0x0707, 0x0f07, 0x0f07, 0x0f07,
+  0x0f07, 0x0f07, 0x0f07, 0x0f07, 0x0f07, 0x1f07, 0x1f07, 0x1f07,
+  0x1f07, 0x1f07, 0x1f07, 0x1f07, 0x1f07, 0x1f07, 0x1f07, 0x1f07,
+  0x1f07, 0x1f07, 0x1f07, 0x1f07, 0x1f07, 0x1f07, 0x1f07, 0x1f07,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, av1_eob_to_eobxy_32x8_default[8]) = {
+  0x0707, 0x070f, 0x070f, 0x071f, 0x071f, 0x071f, 0x071f, 0x071f,
+};
+
+// Table selector indexed by TX_SIZE. A NULL slot means no table exists for
+// that size, so get_eobx_eoby_scan_default() must not be called with it
+// (unless eob == 1, which returns before the lookup).
+// NOTE(review): the NULL slots presumably correspond to sizes with a
+// 4-sample dimension -- confirm against the TX_SIZE enum order.
+DECLARE_ALIGNED(16, static const int16_t *,
+                av1_eob_to_eobxy_default[TX_SIZES_ALL]) = {
+  NULL,
+  av1_eob_to_eobxy_8x8_default,
+  av1_eob_to_eobxy_16x16_default,
+  av1_eob_to_eobxy_32x32_default,
+  av1_eob_to_eobxy_32x32_default,
+  NULL,
+  NULL,
+  av1_eob_to_eobxy_8x16_default,
+  av1_eob_to_eobxy_16x8_default,
+  av1_eob_to_eobxy_16x32_default,
+  av1_eob_to_eobxy_32x16_default,
+  av1_eob_to_eobxy_32x32_default,
+  av1_eob_to_eobxy_32x32_default,
+  NULL,
+  NULL,
+  av1_eob_to_eobxy_8x32_default,
+  av1_eob_to_eobxy_32x8_default,
+  av1_eob_to_eobxy_16x32_default,
+  av1_eob_to_eobxy_32x16_default,
+};
+
+// Bucket index for a value in [0, 31]: 0 -> 0, 1..7 -> 1, 8..15 -> 2,
+// 16..31 -> 3.
+static const int lowbd_txfm_all_1d_zeros_idx[32] = {
+  0, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2,
+  3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
+};
+
+// Log2 of the transform block width as used for eob lookups, indexed by
+// TX_SIZE (a width of 64 maps to 32, i.e. 5).
+static const int tx_size_wide_log2_eob[TX_SIZES_ALL] = {
+  2, 3, 4, 5, 5, 2, 3, 3, 4, 4, 5, 5, 5, 2, 4, 3, 5, 4, 5,
+};
+
+// Rounds a position in [0, 31] up to the last index of its group:
+// 0 -> 0, 1..7 -> 7, 8..15 -> 15, 16..31 -> 31. Read-only lookup data, so it
+// is const like the other tables in this header.
+static const int eob_fill[32] = {
+  0,  7,  7,  7,  7,  7,  7,  7,  15, 15, 15, 15, 15, 15, 15, 15,
+  31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+};
+
+// Derives (*eobx, *eoby) from |eob| for the default scan. eob == 1 yields
+// (0, 0); otherwise the packed entry for the row containing coefficient
+// eob - 1 is looked up and split into its low byte (eobx) and high byte
+// (eoby).
+static INLINE void get_eobx_eoby_scan_default(int *eobx, int *eoby,
+                                              TX_SIZE tx_size, int eob) {
+  int packed = 0;
+  if (eob != 1) {
+    const int row = (eob - 1) >> tx_size_wide_log2_eob[tx_size];
+    packed = av1_eob_to_eobxy_default[tx_size][row];
+  }
+  *eobx = packed & 0xFF;
+  *eoby = packed >> 8;
+}
+
+// Derives (*eobx, *eoby) from |eob| for the vertical-identity case. With
+// rows = min(32, transform height): eobx is (eob - 1) / rows, and eoby is
+// rows - 1 once eob - 1 reaches that value, else eob_fill[eob - 1].
+static INLINE void get_eobx_eoby_scan_v_identity(int *eobx, int *eoby,
+                                                 TX_SIZE tx_size, int eob) {
+  const int last = eob - 1;
+  const int rows = AOMMIN(32, tx_size_high[tx_size]);
+  *eobx = last / rows;
+  if (last >= rows - 1) {
+    *eoby = rows - 1;
+  } else {
+    *eoby = eob_fill[last];
+  }
+}
+
+// Derives (*eobx, *eoby) from |eob| for the horizontal-identity case. With
+// cols = min(32, transform width): eobx is cols - 1 once eob - 1 reaches that
+// value, else eob_fill[eob - 1]; eoby is eob_fill[(eob - 1) / cols].
+static INLINE void get_eobx_eoby_scan_h_identity(int *eobx, int *eoby,
+                                                 TX_SIZE tx_size, int eob) {
+  const int last = eob - 1;
+  const int cols = AOMMIN(32, tx_size_wide[tx_size]);
+  if (last >= cols - 1) {
+    *eobx = cols - 1;
+  } else {
+    *eobx = eob_fill[last];
+  }
+  const int temp_eoby = last / cols;
+  assert(temp_eoby < 32);
+  *eoby = eob_fill[temp_eoby];
+}
+
+#endif  // AV1_COMMON_ARM_AV1_INV_TXFM_NEON_H_
--- a/third_party/aom/av1/common/arm/convolve_neon.c
+++ b/third_party/aom/av1/common/arm/convolve_neon.c
@ -164,8 +164,8 @@ static INLINE uint8x8_t convolve8_vert_8x4_s32(

 void av1_convolve_x_sr_neon(const uint8_t *src, int src_stride, uint8_t *dst,
                            int dst_stride, int w, int h,
-                            InterpFilterParams *filter_params_x,
-                            InterpFilterParams *filter_params_y,
+                            const InterpFilterParams *filter_params_x,
+                            const InterpFilterParams *filter_params_y,
                            const int subpel_x_q4, const int subpel_y_q4,
                            ConvolveParams *conv_params) {
  const uint8_t horiz_offset = filter_params_x->taps / 2 - 1;
@ -182,7 +182,7 @@ void av1_convolve_x_sr_neon(const uint8_t *src, int src_stride, uint8_t *dst,
         ((conv_params->round_0 + conv_params->round_1) == 2 * FILTER_BITS));

  const int16_t *x_filter = av1_get_interp_filter_subpel_kernel(
-      *filter_params_x, subpel_x_q4 & SUBPEL_MASK);
+      filter_params_x, subpel_x_q4 & SUBPEL_MASK);

  const int16x8_t shift_round_0 = vdupq_n_s16(-conv_params->round_0);
  const int16x8_t shift_by_bits = vdupq_n_s16(-bits);
@ -485,8 +485,8 @@ void av1_convolve_x_sr_neon(const uint8_t *src, int src_stride, uint8_t *dst,

 void av1_convolve_y_sr_neon(const uint8_t *src, int src_stride, uint8_t *dst,
                            int dst_stride, int w, int h,
-                            InterpFilterParams *filter_params_x,
-                            InterpFilterParams *filter_params_y,
+                            const InterpFilterParams *filter_params_x,
+                            const InterpFilterParams *filter_params_y,
                            const int subpel_x_q4, const int subpel_y_q4,
                            ConvolveParams *conv_params) {
  const int vert_offset = filter_params_y->taps / 2 - 1;
@ -502,7 +502,7 @@ void av1_convolve_y_sr_neon(const uint8_t *src, int src_stride, uint8_t *dst,
         ((conv_params->round_0 + conv_params->round_1) == (2 * FILTER_BITS)));

  const int16_t *y_filter = av1_get_interp_filter_subpel_kernel(
-      *filter_params_y, subpel_y_q4 & SUBPEL_MASK);
+      filter_params_y, subpel_y_q4 & SUBPEL_MASK);

  if (w <= 4) {
    uint8x8_t d01, d23;
@ -680,8 +680,8 @@ void av1_convolve_y_sr_neon(const uint8_t *src, int src_stride, uint8_t *dst,

 void av1_convolve_2d_sr_neon(const uint8_t *src, int src_stride, uint8_t *dst,
                             int dst_stride, int w, int h,
-                             InterpFilterParams *filter_params_x,
-                             InterpFilterParams *filter_params_y,
+                             const InterpFilterParams *filter_params_x,
+                             const InterpFilterParams *filter_params_y,
                             const int subpel_x_q4, const int subpel_y_q4,
                             ConvolveParams *conv_params) {
  int im_dst_stride;
@ -711,7 +711,7 @@ void av1_convolve_2d_sr_neon(const uint8_t *src, int src_stride, uint8_t *dst,
  const int16x8_t vec_round_bits = vdupq_n_s16(-round_bits);
  const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
  const int16_t *x_filter = av1_get_interp_filter_subpel_kernel(
-      *filter_params_x, subpel_x_q4 & SUBPEL_MASK);
+      filter_params_x, subpel_x_q4 & SUBPEL_MASK);

  int16_t x_filter_tmp[8];
  int16x8_t filter_x_coef = vld1q_s16(x_filter);
@ -896,7 +896,7 @@ void av1_convolve_2d_sr_neon(const uint8_t *src, int src_stride, uint8_t *dst,
    const int32_t sub_const = (1 << (offset_bits - conv_params->round_1)) +
                              (1 << (offset_bits - conv_params->round_1 - 1));
    const int16_t *y_filter = av1_get_interp_filter_subpel_kernel(
-        *filter_params_y, subpel_y_q4 & SUBPEL_MASK);
+        filter_params_y, subpel_y_q4 & SUBPEL_MASK);

    const int32x4_t round_shift_vec = vdupq_n_s32(-(conv_params->round_1));
    const int32x4_t offset_const = vdupq_n_s32(1 << offset_bits);
@ -1086,8 +1086,8 @@ void av1_convolve_2d_sr_neon(const uint8_t *src, int src_stride, uint8_t *dst,
 }
 void av1_convolve_2d_copy_sr_neon(const uint8_t *src, int src_stride,
                                  uint8_t *dst, int dst_stride, int w, int h,
-                                  InterpFilterParams *filter_params_x,
-                                  InterpFilterParams *filter_params_y,
+                                  const InterpFilterParams *filter_params_x,
+                                  const InterpFilterParams *filter_params_y,
                                  const int subpel_x_q4, const int subpel_y_q4,
                                  ConvolveParams *conv_params) {
  (void)filter_params_x;
--- a/third_party/aom/av1/common/arm/intrapred_neon.c
+++ b/third_party/aom/av1/common/arm/intrapred_neon.c
@ -1,79 +0,0 @@
-/*
- *
- * Copyright (c) 2018, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-#include <arm_neon.h>
-#include <assert.h>
-
-#include "aom_mem/aom_mem.h"
-#include "aom_ports/mem.h"
-#include "av1/common/arm/mem_neon.h"
-#include "config/aom_dsp_rtcd.h"
-
-static INLINE void highbd_dc_predictor_neon(uint16_t *dst, ptrdiff_t stride,
-                                            int bw, const uint16_t *above,
-                                            const uint16_t *left) {
-  assert(bw >= 4);
-  assert(IS_POWER_OF_TWO(bw));
-  int expected_dc, sum = 0;
-  const int count = bw * 2;
-  uint32x4_t sum_q = vdupq_n_u32(0);
-  uint32x2_t sum_d;
-  uint16_t *dst_1;
-  if (bw >= 8) {
-    for (int i = 0; i < bw; i += 8) {
-      sum_q = vpadalq_u16(sum_q, vld1q_u16(above));
-      sum_q = vpadalq_u16(sum_q, vld1q_u16(left));
-      above += 8;
-      left += 8;
-    }
-    sum_d = vadd_u32(vget_low_u32(sum_q), vget_high_u32(sum_q));
-    sum = vget_lane_s32(vreinterpret_s32_u64(vpaddl_u32(sum_d)), 0);
-    expected_dc = (sum + (count >> 1)) / count;
-    const uint16x8_t dc = vdupq_n_u16((uint16_t)expected_dc);
-    for (int r = 0; r < bw; r++) {
-      dst_1 = dst;
-      for (int i = 0; i < bw; i += 8) {
-        vst1q_u16(dst_1, dc);
-        dst_1 += 8;
-      }
-      dst += stride;
-    }
-  } else {  // 4x4
-    sum_q = vaddl_u16(vld1_u16(above), vld1_u16(left));
-    sum_d = vadd_u32(vget_low_u32(sum_q), vget_high_u32(sum_q));
-    sum = vget_lane_s32(vreinterpret_s32_u64(vpaddl_u32(sum_d)), 0);
-    expected_dc = (sum + (count >> 1)) / count;
-    const uint16x4_t dc = vdup_n_u16((uint16_t)expected_dc);
-    for (int r = 0; r < bw; r++) {
-      vst1_u16(dst, dc);
-      dst += stride;
-    }
-  }
-}
-
-#define intra_pred_highbd_sized(type, width)                         \
-  void aom_highbd_##type##_predictor_##width##x##width##_neon(       \
-      uint16_t *dst, ptrdiff_t stride, const uint16_t *above,        \
-      const uint16_t *left, int bd) {                                \
-    (void)bd;                                                        \
-    highbd_##type##_predictor_neon(dst, stride, width, above, left); \
-  }
-
-#define intra_pred_square(type)      \
-  intra_pred_highbd_sized(type, 4);  \
-  intra_pred_highbd_sized(type, 8);  \
-  intra_pred_highbd_sized(type, 16); \
-  intra_pred_highbd_sized(type, 32); \
-  intra_pred_highbd_sized(type, 64);
-
-intra_pred_square(dc);
-
-#undef intra_pred_square
--- a/third_party/aom/av1/common/arm/jnt_convolve_neon.c
+++ b/third_party/aom/av1/common/arm/jnt_convolve_neon.c
@ -515,8 +515,8 @@ static INLINE void jnt_convolve_2d_vert_neon(

 void av1_jnt_convolve_2d_neon(const uint8_t *src, int src_stride, uint8_t *dst8,
                              int dst8_stride, int w, int h,
-                              InterpFilterParams *filter_params_x,
-                              InterpFilterParams *filter_params_y,
+                              const InterpFilterParams *filter_params_x,
+                              const InterpFilterParams *filter_params_y,
                              const int subpel_x_q4, const int subpel_y_q4,
                              ConvolveParams *conv_params) {
  assert(!(w % 4));
@ -532,9 +532,9 @@ void av1_jnt_convolve_2d_neon(const uint8_t *src, int src_stride, uint8_t *dst8,
  const int round_0 = conv_params->round_0 - 1;
  const uint8_t *src_ptr = src - vert_offset * src_stride - horiz_offset;
  const int16_t *x_filter = av1_get_interp_filter_subpel_kernel(
-      *filter_params_x, subpel_x_q4 & SUBPEL_MASK);
+      filter_params_x, subpel_x_q4 & SUBPEL_MASK);
  const int16_t *y_filter = av1_get_interp_filter_subpel_kernel(
-      *filter_params_y, subpel_y_q4 & SUBPEL_MASK);
+      filter_params_y, subpel_y_q4 & SUBPEL_MASK);

  int16_t x_filter_tmp[8];
  int16x8_t filter_x_coef = vld1q_s16(x_filter);
@ -553,8 +553,8 @@ void av1_jnt_convolve_2d_neon(const uint8_t *src, int src_stride, uint8_t *dst8,

 void av1_jnt_convolve_2d_copy_neon(const uint8_t *src, int src_stride,
                                   uint8_t *dst8, int dst8_stride, int w, int h,
-                                   InterpFilterParams *filter_params_x,
-                                   InterpFilterParams *filter_params_y,
+                                   const InterpFilterParams *filter_params_x,
+                                   const InterpFilterParams *filter_params_y,
                                   const int subpel_x_q4, const int subpel_y_q4,
                                   ConvolveParams *conv_params) {
  uint8x8_t res0_8, res1_8, res2_8, res3_8, tmp_shift0, tmp_shift1, tmp_shift2,
@ -679,8 +679,8 @@ void av1_jnt_convolve_2d_copy_neon(const uint8_t *src, int src_stride,

 void av1_jnt_convolve_x_neon(const uint8_t *src, int src_stride, uint8_t *dst8,
                             int dst8_stride, int w, int h,
-                             InterpFilterParams *filter_params_x,
-                             InterpFilterParams *filter_params_y,
+                             const InterpFilterParams *filter_params_x,
+                             const InterpFilterParams *filter_params_y,
                             const int subpel_x_q4, const int subpel_y_q4,
                             ConvolveParams *conv_params) {
  assert(!(w % 4));
@ -705,7 +705,7 @@ void av1_jnt_convolve_x_neon(const uint8_t *src, int src_stride, uint8_t *dst8,

  // horizontal filter
  const int16_t *x_filter = av1_get_interp_filter_subpel_kernel(
-      *filter_params_x, subpel_x_q4 & SUBPEL_MASK);
+      filter_params_x, subpel_x_q4 & SUBPEL_MASK);

  const uint8_t *src_ptr = src - horiz_offset;

@ -1013,8 +1013,8 @@ void av1_jnt_convolve_x_neon(const uint8_t *src, int src_stride, uint8_t *dst8,

 void av1_jnt_convolve_y_neon(const uint8_t *src, int src_stride, uint8_t *dst8,
                             int dst8_stride, int w, int h,
-                             InterpFilterParams *filter_params_x,
-                             InterpFilterParams *filter_params_y,
+                             const InterpFilterParams *filter_params_x,
+                             const InterpFilterParams *filter_params_y,
                             const int subpel_x_q4, const int subpel_y_q4,
                             ConvolveParams *conv_params) {
  assert(!(w % 4));
@ -1040,7 +1040,7 @@ void av1_jnt_convolve_y_neon(const uint8_t *src, int src_stride, uint8_t *dst8,

  // vertical filter
  const int16_t *y_filter = av1_get_interp_filter_subpel_kernel(
-      *filter_params_y, subpel_y_q4 & SUBPEL_MASK);
+      filter_params_y, subpel_y_q4 & SUBPEL_MASK);

  const uint8_t *src_ptr = src - (vert_offset * src_stride);

--- a/third_party/aom/av1/common/arm/mem_neon.h
+++ b/third_party/aom/av1/common/arm/mem_neon.h
@ -22,6 +22,14 @@ static INLINE void store_row2_u8_8x8(uint8_t *s, int p, const uint8x8_t s0,
  s += p;
 }

+/* Loads 4 bytes from |s| into 32-bit lane |lane| of the uint8x8_t pointed to
+   by |s0|; the other lane keeps its previous contents. This is a macro
+   because the lane index must be an immediate value. The source is only
+   read, so it is accessed through a pointer-to-const. */
+#define load_u8_4x1(s, s0, lane)                                      \
+  do {                                                                \
+    *(s0) = vreinterpret_u8_u32(vld1_lane_u32(                        \
+        (const uint32_t *)(s), vreinterpret_u32_u8(*(s0)), lane));    \
+  } while (0)
+
 static INLINE void load_u8_8x8(const uint8_t *s, ptrdiff_t p,
                               uint8x8_t *const s0, uint8x8_t *const s1,
                               uint8x8_t *const s2, uint8x8_t *const s3,
@ -128,6 +136,13 @@ static INLINE void load_s16_4x4(const int16_t *s, ptrdiff_t p,
  *s3 = vld1_s16(s);
 }

+/* Stores 32-bit lane |lane| of the uint8x8_t |s0| (4 bytes) to |s|. This is
+   a macro because the lane index must be an immediate value. */
+#define store_u8_4x1(s, s0, lane)                                  \
+  do {                                                             \
+    vst1_lane_u32((uint32_t *)(s), vreinterpret_u32_u8(s0), lane); \
+  } while (0)
+
 static INLINE void store_u8_8x8(uint8_t *s, ptrdiff_t p, const uint8x8_t s0,
                                const uint8x8_t s1, const uint8x8_t s2,
                                const uint8x8_t s3, const uint8x8_t s4,
@ -242,6 +257,30 @@ static INLINE void store_s16_8x8(int16_t *s, ptrdiff_t dst_stride,
  vst1q_s16(s, s7);
 }

+/* Writes four int16x4_t rows starting at |s|, with consecutive rows
+   |dst_stride| elements apart. */
+static INLINE void store_s16_4x4(int16_t *s, ptrdiff_t dst_stride,
+                                 const int16x4_t s0, const int16x4_t s1,
+                                 const int16x4_t s2, const int16x4_t s3) {
+  int16_t *const row1 = s + dst_stride;
+  int16_t *const row2 = row1 + dst_stride;
+  int16_t *const row3 = row2 + dst_stride;
+  vst1_s16(s, s0);
+  vst1_s16(row1, s1);
+  vst1_s16(row2, s2);
+  vst1_s16(row3, s3);
+}
+
+/* Writes four int16x8_t rows starting at |s|, with consecutive rows
+   |dst_stride| elements apart. */
+static INLINE void store_s16_8x4(int16_t *s, ptrdiff_t dst_stride,
+                                 const int16x8_t s0, const int16x8_t s1,
+                                 const int16x8_t s2, const int16x8_t s3) {
+  int16_t *const row1 = s + dst_stride;
+  int16_t *const row2 = row1 + dst_stride;
+  int16_t *const row3 = row2 + dst_stride;
+  vst1q_s16(s, s0);
+  vst1q_s16(row1, s1);
+  vst1q_s16(row2, s2);
+  vst1q_s16(row3, s3);
+}
+
 static INLINE void load_s16_8x8(const int16_t *s, ptrdiff_t p,
                                int16x8_t *const s0, int16x8_t *const s1,
                                int16x8_t *const s2, int16x8_t *const s3,
@ -398,4 +437,49 @@ static INLINE void load_unaligned_u16_4x4(const uint16_t *buf, uint32_t stride,
  *tu1 = vsetq_lane_u64(a, *tu1, 1);
 }

+/* Loads four int32x4_t rows starting at |s|, with consecutive rows |p|
+   elements apart, into *s1..*s4. The source is only read, so it is taken as
+   pointer-to-const like the other loaders in this header. */
+static INLINE void load_s32_4x4(const int32_t *s, int32_t p, int32x4_t *s1,
+                                int32x4_t *s2, int32x4_t *s3, int32x4_t *s4) {
+  *s1 = vld1q_s32(s);
+  s += p;
+  *s2 = vld1q_s32(s);
+  s += p;
+  *s3 = vld1q_s32(s);
+  s += p;
+  *s4 = vld1q_s32(s);
+}
+
+/* Writes the four int32x4_t rows s1..s4 starting at |s|, with consecutive
+   rows |p| elements apart. */
+static INLINE void store_s32_4x4(int32_t *s, int32_t p, int32x4_t s1,
+                                 int32x4_t s2, int32x4_t s3, int32x4_t s4) {
+  int32_t *const row1 = s + p;
+  int32_t *const row2 = row1 + p;
+  int32_t *const row3 = row2 + p;
+  vst1q_s32(s, s1);
+  vst1q_s32(row1, s2);
+  vst1q_s32(row2, s3);
+  vst1q_s32(row3, s4);
+}
+
+/* Loads four uint32x4_t rows starting at |s|, with consecutive rows |p|
+   elements apart, into *s1..*s4. The source is only read, so it is taken as
+   pointer-to-const like the other loaders in this header. */
+static INLINE void load_u32_4x4(const uint32_t *s, int32_t p, uint32x4_t *s1,
+                                uint32x4_t *s2, uint32x4_t *s3,
+                                uint32x4_t *s4) {
+  *s1 = vld1q_u32(s);
+  s += p;
+  *s2 = vld1q_u32(s);
+  s += p;
+  *s3 = vld1q_u32(s);
+  s += p;
+  *s4 = vld1q_u32(s);
+}
+
+/* Writes the four uint32x4_t rows s1..s4 starting at |s|, with consecutive
+   rows |p| elements apart. */
+static INLINE void store_u32_4x4(uint32_t *s, int32_t p, uint32x4_t s1,
+                                 uint32x4_t s2, uint32x4_t s3, uint32x4_t s4) {
+  uint32_t *const row1 = s + p;
+  uint32_t *const row2 = row1 + p;
+  uint32_t *const row3 = row2 + p;
+  vst1q_u32(s, s1);
+  vst1q_u32(row1, s2);
+  vst1q_u32(row2, s3);
+  vst1q_u32(row3, s4);
+}
+
 #endif  // AV1_COMMON_ARM_MEM_NEON_H_
--- a/third_party/aom/av1/common/arm/selfguided_neon.c
+++ b/third_party/aom/av1/common/arm/selfguided_neon.c
--- a/third_party/aom/av1/common/arm/transpose_neon.h
+++ b/third_party/aom/av1/common/arm/transpose_neon.h
@ -419,4 +419,42 @@ static INLINE void transpose_s16_4x4d(int16x4_t *a0, int16x4_t *a1,
  *a3 = vreinterpret_s16_s32(c1.val[1]);
 }

+// Returns a pair of vectors built from the 64-bit halves of |a0| and |a1|:
+// val[0] = { low(a0), low(a1) }, val[1] = { high(a0), high(a1) }.
+static INLINE int32x4x2_t aom_vtrnq_s64_to_s32(int32x4_t a0, int32x4_t a1) {
+  const int32x2_t a0_lo = vget_low_s32(a0);
+  const int32x2_t a1_lo = vget_low_s32(a1);
+  const int32x2_t a0_hi = vget_high_s32(a0);
+  const int32x2_t a1_hi = vget_high_s32(a1);
+  int32x4x2_t halves;
+  halves.val[0] = vcombine_s32(a0_lo, a1_lo);
+  halves.val[1] = vcombine_s32(a0_hi, a1_hi);
+  return halves;
+}
+
+// Transposes, in place, the 4x4 matrix of 32-bit elements whose rows are
+// *a0..*a3.
+static INLINE void transpose_s32_4x4(int32x4_t *a0, int32x4_t *a1,
+                                     int32x4_t *a2, int32x4_t *a3) {
+  // Step 1: interleave 32-bit elements of each row pair.
+  //   rows in:         top.val[0]: 00 10 02 12
+  //   a0: 00 01 02 03  top.val[1]: 01 11 03 13
+  //   a1: 10 11 12 13  bot.val[0]: 20 30 22 32
+  //   a2: 20 21 22 23  bot.val[1]: 21 31 23 33
+  //   a3: 30 31 32 33
+  const int32x4x2_t top = vtrnq_s32(*a0, *a1);
+  const int32x4x2_t bot = vtrnq_s32(*a2, *a3);
+
+  // Step 2: exchange 64-bit halves between the top and bottom pairs.
+  //   even.val[0]: 00 10 20 30   even.val[1]: 02 12 22 32
+  //   odd.val[0]:  01 11 21 31   odd.val[1]:  03 13 23 33
+  const int32x4x2_t even = aom_vtrnq_s64_to_s32(top.val[0], bot.val[0]);
+  const int32x4x2_t odd = aom_vtrnq_s64_to_s32(top.val[1], bot.val[1]);
+
+  // Write back the columns in order 0, 1, 2, 3.
+  *a0 = even.val[0];
+  *a1 = odd.val[0];
+  *a2 = even.val[1];
+  *a3 = odd.val[1];
+}
+
 #endif  // AV1_COMMON_ARM_TRANSPOSE_NEON_H_
--- a/third_party/aom/av1/common/av1_loopfilter.c
+++ b/third_party/aom/av1/common/av1_loopfilter.c
@ -1308,7 +1308,7 @@ static int compare_ref_dst(AV1_COMMON *const cm, uint8_t *ref_buf,
  end <<= MI_SIZE_LOG2;
  uint8_t *ref0 = ref_buf;
  uint8_t *dst0 = dst_buf;
-  if (cm->use_highbitdepth) {
+  if (cm->seq_params.use_highbitdepth) {
    const uint16_t *ref16 = CONVERT_TO_SHORTPTR(ref_buf);
    const uint16_t *dst16 = CONVERT_TO_SHORTPTR(dst_buf);
    for (int j = 0; j < 4; ++j) {
@ -1404,11 +1404,11 @@ void av1_filter_block_plane_ver(AV1_COMMON *const cm,
      uint64_t mask_8x8_1 = (mask_8x8 >> shift_next) & mask_cutoff;
      uint64_t mask_4x4_1 = (mask_4x4 >> shift_next) & mask_cutoff;

-      if (cm->use_highbitdepth)
+      if (cm->seq_params.use_highbitdepth)
        highbd_filter_selectively_vert_row2(
            ssx, CONVERT_TO_SHORTPTR(dst->buf), dst->stride, pl, mask_16x16_0,
            mask_8x8_0, mask_4x4_0, mask_16x16_1, mask_8x8_1, mask_4x4_1,
-            &cm->lf_info, lfl, lfl2, (int)cm->bit_depth);
+            &cm->lf_info, lfl, lfl2, (int)cm->seq_params.bit_depth);
      else
        filter_selectively_vert_row2(ssx, dst->buf, dst->stride, pl,
                                     mask_16x16_0, mask_8x8_0, mask_4x4_0,
@ -1474,10 +1474,11 @@ void av1_filter_block_plane_hor(AV1_COMMON *const cm,
      mask_8x8 = (mask_8x8 >> shift) & mask_cutoff;
      mask_4x4 = (mask_4x4 >> shift) & mask_cutoff;

-      if (cm->use_highbitdepth)
-        highbd_filter_selectively_horiz(
-            CONVERT_TO_SHORTPTR(dst->buf), dst->stride, pl, ssx, mask_16x16,
-            mask_8x8, mask_4x4, &cm->lf_info, lfl, (int)cm->bit_depth);
+      if (cm->seq_params.use_highbitdepth)
+        highbd_filter_selectively_horiz(CONVERT_TO_SHORTPTR(dst->buf),
+                                        dst->stride, pl, ssx, mask_16x16,
+                                        mask_8x8, mask_4x4, &cm->lf_info, lfl,
+                                        (int)cm->seq_params.bit_depth);
      else
        filter_selectively_horiz(dst->buf, dst->stride, pl, ssx, mask_16x16,
                                 mask_8x8, mask_4x4, &cm->lf_info, lfl);
@ -1652,6 +1653,8 @@ void av1_filter_block_plane_vert(const AV1_COMMON *const cm,
  const int dst_stride = plane_ptr->dst.stride;
  const int y_range = (MAX_MIB_SIZE >> scale_vert);
  const int x_range = (MAX_MIB_SIZE >> scale_horz);
+  const int use_highbitdepth = cm->seq_params.use_highbitdepth;
+  const aom_bit_depth_t bit_depth = cm->seq_params.bit_depth;
  for (int y = 0; y < y_range; y += row_step) {
    uint8_t *p = dst_ptr + y * MI_SIZE * dst_stride;
    for (int x = 0; x < x_range;) {
@ -1677,40 +1680,40 @@ void av1_filter_block_plane_vert(const AV1_COMMON *const cm,
      switch (params.filter_length) {
        // apply 4-tap filtering
        case 4:
-          if (cm->use_highbitdepth)
+          if (use_highbitdepth)
            aom_highbd_lpf_vertical_4(CONVERT_TO_SHORTPTR(p), dst_stride,
                                      params.mblim, params.lim, params.hev_thr,
-                                      cm->bit_depth);
+                                      bit_depth);
          else
            aom_lpf_vertical_4(p, dst_stride, params.mblim, params.lim,
                               params.hev_thr);
          break;
        case 6:  // apply 6-tap filter for chroma plane only
          assert(plane != 0);
-          if (cm->use_highbitdepth)
+          if (use_highbitdepth)
            aom_highbd_lpf_vertical_6(CONVERT_TO_SHORTPTR(p), dst_stride,
                                      params.mblim, params.lim, params.hev_thr,
-                                      cm->bit_depth);
+                                      bit_depth);
          else
            aom_lpf_vertical_6(p, dst_stride, params.mblim, params.lim,
                               params.hev_thr);
          break;
        // apply 8-tap filtering
        case 8:
-          if (cm->use_highbitdepth)
+          if (use_highbitdepth)
            aom_highbd_lpf_vertical_8(CONVERT_TO_SHORTPTR(p), dst_stride,
                                      params.mblim, params.lim, params.hev_thr,
-                                      cm->bit_depth);
+                                      bit_depth);
          else
            aom_lpf_vertical_8(p, dst_stride, params.mblim, params.lim,
                               params.hev_thr);
          break;
        // apply 14-tap filtering
        case 14:
-          if (cm->use_highbitdepth)
+          if (use_highbitdepth)
            aom_highbd_lpf_vertical_14(CONVERT_TO_SHORTPTR(p), dst_stride,
                                       params.mblim, params.lim, params.hev_thr,
-                                       cm->bit_depth);
+                                       bit_depth);
          else
            aom_lpf_vertical_14(p, dst_stride, params.mblim, params.lim,
                                params.hev_thr);
@ -1737,6 +1740,8 @@ void av1_filter_block_plane_horz(const AV1_COMMON *const cm,
  const int dst_stride = plane_ptr->dst.stride;
  const int y_range = (MAX_MIB_SIZE >> scale_vert);
  const int x_range = (MAX_MIB_SIZE >> scale_horz);
+  const int use_highbitdepth = cm->seq_params.use_highbitdepth;
+  const aom_bit_depth_t bit_depth = cm->seq_params.bit_depth;
  for (int x = 0; x < x_range; x += col_step) {
    uint8_t *p = dst_ptr + x * MI_SIZE;
    for (int y = 0; y < y_range;) {
@ -1762,10 +1767,10 @@ void av1_filter_block_plane_horz(const AV1_COMMON *const cm,
      switch (params.filter_length) {
        // apply 4-tap filtering
        case 4:
-          if (cm->use_highbitdepth)
+          if (use_highbitdepth)
            aom_highbd_lpf_horizontal_4(CONVERT_TO_SHORTPTR(p), dst_stride,
                                        params.mblim, params.lim,
-                                        params.hev_thr, cm->bit_depth);
+                                        params.hev_thr, bit_depth);
          else
            aom_lpf_horizontal_4(p, dst_stride, params.mblim, params.lim,
                                 params.hev_thr);
@ -1773,30 +1778,30 @@ void av1_filter_block_plane_horz(const AV1_COMMON *const cm,
        // apply 6-tap filtering
        case 6:
          assert(plane != 0);
-          if (cm->use_highbitdepth)
+          if (use_highbitdepth)
            aom_highbd_lpf_horizontal_6(CONVERT_TO_SHORTPTR(p), dst_stride,
                                        params.mblim, params.lim,
-                                        params.hev_thr, cm->bit_depth);
+                                        params.hev_thr, bit_depth);
          else
            aom_lpf_horizontal_6(p, dst_stride, params.mblim, params.lim,
                                 params.hev_thr);
          break;
        // apply 8-tap filtering
        case 8:
-          if (cm->use_highbitdepth)
+          if (use_highbitdepth)
            aom_highbd_lpf_horizontal_8(CONVERT_TO_SHORTPTR(p), dst_stride,
                                        params.mblim, params.lim,
-                                        params.hev_thr, cm->bit_depth);
+                                        params.hev_thr, bit_depth);
          else
            aom_lpf_horizontal_8(p, dst_stride, params.mblim, params.lim,
                                 params.hev_thr);
          break;
        // apply 14-tap filtering
        case 14:
-          if (cm->use_highbitdepth)
+          if (use_highbitdepth)
            aom_highbd_lpf_horizontal_14(CONVERT_TO_SHORTPTR(p), dst_stride,
                                         params.mblim, params.lim,
-                                         params.hev_thr, cm->bit_depth);
+                                         params.hev_thr, bit_depth);
          else
            aom_lpf_horizontal_14(p, dst_stride, params.mblim, params.lim,
                                  params.hev_thr);
--- a/third_party/aom/av1/common/av1_rtcd.c
+++ b/third_party/aom/av1/common/av1_rtcd.c
@ -16,7 +16,7 @@
 #include "aom_ports/aom_once.h"

 void av1_rtcd() {
-  // TODO(JBB): Remove this once, by insuring that both the encoder and
-  // decoder setup functions are protected by once();
-  once(setup_rtcd_internal);
+  // TODO(JBB): Remove this aom_once, by ensuring that both the encoder and
+  // decoder setup functions are protected by aom_once();
+  aom_once(setup_rtcd_internal);
 }
--- a/third_party/aom/av1/common/av1_rtcd_defs.pl
+++ b/third_party/aom/av1/common/av1_rtcd_defs.pl
@ -106,7 +106,7 @@ specialize qw/av1_highbd_convolve8_vert/, "$sse2_x86_64";

 #inv txfm
 add_proto qw/void av1_inv_txfm_add/, "const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param";
-specialize qw/av1_inv_txfm_add ssse3 avx2/;
+specialize qw/av1_inv_txfm_add ssse3 avx2 neon/;

 add_proto qw/void av1_highbd_iwht4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
 add_proto qw/void av1_highbd_iwht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
@ -181,7 +181,7 @@ if (aom_config("CONFIG_AV1_ENCODER") eq "yes") {

  #fwd txfm
  add_proto qw/void av1_lowbd_fwd_txfm/, "const int16_t *src_diff, tran_low_t *coeff, int diff_stride, TxfmParam *txfm_param";
-  specialize qw/av1_lowbd_fwd_txfm sse2 sse4_1/;
+  specialize qw/av1_lowbd_fwd_txfm sse2 sse4_1 avx2/;

  add_proto qw/void av1_fwd_txfm2d_4x8/, "const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd";
  add_proto qw/void av1_fwd_txfm2d_8x4/, "const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd";
@ -241,11 +241,11 @@ if (aom_config("CONFIG_AV1_ENCODER") eq "yes") {
  specialize qw/av1_txb_init_levels sse4_1/;

  add_proto qw/uint64_t av1_wedge_sse_from_residuals/, "const int16_t *r1, const int16_t *d, const uint8_t *m, int N";
-  specialize qw/av1_wedge_sse_from_residuals sse2/;
+  specialize qw/av1_wedge_sse_from_residuals sse2 avx2/;
  add_proto qw/int av1_wedge_sign_from_residuals/, "const int16_t *ds, const uint8_t *m, int N, int64_t limit";
-  specialize qw/av1_wedge_sign_from_residuals sse2/;
+  specialize qw/av1_wedge_sign_from_residuals sse2 avx2/;
  add_proto qw/void av1_wedge_compute_delta_squares/, "int16_t *d, const int16_t *a, const int16_t *b, int N";
-  specialize qw/av1_wedge_compute_delta_squares sse2/;
+  specialize qw/av1_wedge_compute_delta_squares sse2 avx2/;

  # hash
  add_proto qw/uint32_t av1_get_crc32c_value/, "void *crc_calculator, uint8_t *p, int length";
@ -288,34 +288,34 @@ if (aom_config("CONFIG_AV1_ENCODER") eq "yes") {
 # LOOP_RESTORATION functions

 add_proto qw/void apply_selfguided_restoration/, "const uint8_t *dat, int width, int height, int stride, int eps, const int *xqd, uint8_t *dst, int dst_stride, int32_t *tmpbuf, int bit_depth, int highbd";
-specialize qw/apply_selfguided_restoration sse4_1 avx2/;
+specialize qw/apply_selfguided_restoration sse4_1 avx2 neon/;

 add_proto qw/void av1_selfguided_restoration/, "const uint8_t *dgd8, int width, int height,
                                  int dgd_stride, int32_t *flt0, int32_t *flt1, int flt_stride,
                                  int sgr_params_idx, int bit_depth, int highbd";
-specialize qw/av1_selfguided_restoration sse4_1 avx2/;
+specialize qw/av1_selfguided_restoration sse4_1 avx2 neon/;

 # CONVOLVE_ROUND/COMPOUND_ROUND functions

-add_proto qw/void av1_convolve_2d_sr/, "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params";
-add_proto qw/void av1_convolve_2d_copy_sr/, "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params";
-add_proto qw/void av1_convolve_x_sr/, "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params";
-add_proto qw/void av1_convolve_y_sr/, "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params";
-add_proto qw/void av1_jnt_convolve_2d/, "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params";
-add_proto qw/void av1_jnt_convolve_2d_copy/, "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params";
-add_proto qw/void av1_jnt_convolve_x/, "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params";
-add_proto qw/void av1_jnt_convolve_y/, "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params";
-add_proto qw/void av1_highbd_convolve_2d_copy_sr/, "const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd";
-add_proto qw/void av1_highbd_convolve_2d_sr/, "const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd";
-add_proto qw/void av1_highbd_convolve_x_sr/, "const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd";
-add_proto qw/void av1_highbd_convolve_y_sr/, "const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd";
-add_proto qw/void av1_highbd_jnt_convolve_2d/, "const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd";
-add_proto qw/void av1_highbd_jnt_convolve_x/, "const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd";
-add_proto qw/void av1_highbd_jnt_convolve_y/, "const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd";
-add_proto qw/void av1_highbd_jnt_convolve_2d_copy/, "const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd";
+add_proto qw/void av1_convolve_2d_sr/, "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params";
+add_proto qw/void av1_convolve_2d_copy_sr/, "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params";
+add_proto qw/void av1_convolve_x_sr/, "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params";
+add_proto qw/void av1_convolve_y_sr/, "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params";
+add_proto qw/void av1_jnt_convolve_2d/, "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params";
+add_proto qw/void av1_jnt_convolve_2d_copy/, "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params";
+add_proto qw/void av1_jnt_convolve_x/, "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params";
+add_proto qw/void av1_jnt_convolve_y/, "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params";
+add_proto qw/void av1_highbd_convolve_2d_copy_sr/, "const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd";
+add_proto qw/void av1_highbd_convolve_2d_sr/, "const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd";
+add_proto qw/void av1_highbd_convolve_x_sr/, "const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd";
+add_proto qw/void av1_highbd_convolve_y_sr/, "const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd";
+add_proto qw/void av1_highbd_jnt_convolve_2d/, "const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd";
+add_proto qw/void av1_highbd_jnt_convolve_x/, "const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd";
+add_proto qw/void av1_highbd_jnt_convolve_y/, "const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd";
+add_proto qw/void av1_highbd_jnt_convolve_2d_copy/, "const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd";

-  add_proto qw/void av1_convolve_2d_scale/, "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_qn, const int x_step_qn, const int subpel_y_q4, const int y_step_qn, ConvolveParams *conv_params";
-  add_proto qw/void av1_highbd_convolve_2d_scale/, "const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int x_step_qn, const int subpel_y_q4, const int y_step_qn, ConvolveParams *conv_params, int bd";
+  add_proto qw/void av1_convolve_2d_scale/, "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int x_step_qn, const int subpel_y_q4, const int y_step_qn, ConvolveParams *conv_params";
+  add_proto qw/void av1_highbd_convolve_2d_scale/, "const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int x_step_qn, const int subpel_y_q4, const int y_step_qn, ConvolveParams *conv_params, int bd";

  specialize qw/av1_convolve_2d_sr sse2 avx2 neon/;
  specialize qw/av1_convolve_2d_copy_sr sse2 avx2 neon/;
--- a/third_party/aom/av1/common/av1_txfm.h
+++ b/third_party/aom/av1/common/av1_txfm.h
@ -171,53 +171,6 @@ static INLINE void set_flip_cfg(TX_TYPE tx_type, TXFM_2D_FLIP_CFG *cfg) {
  get_flip_cfg(tx_type, &cfg->ud_flip, &cfg->lr_flip);
 }

-static INLINE TX_SIZE av1_rotate_tx_size(TX_SIZE tx_size) {
-  switch (tx_size) {
-    case TX_4X4: return TX_4X4;
-    case TX_8X8: return TX_8X8;
-    case TX_16X16: return TX_16X16;
-    case TX_32X32: return TX_32X32;
-    case TX_64X64: return TX_64X64;
-    case TX_32X64: return TX_64X32;
-    case TX_64X32: return TX_32X64;
-    case TX_4X8: return TX_8X4;
-    case TX_8X4: return TX_4X8;
-    case TX_8X16: return TX_16X8;
-    case TX_16X8: return TX_8X16;
-    case TX_16X32: return TX_32X16;
-    case TX_32X16: return TX_16X32;
-    case TX_4X16: return TX_16X4;
-    case TX_16X4: return TX_4X16;
-    case TX_8X32: return TX_32X8;
-    case TX_32X8: return TX_8X32;
-    case TX_16X64: return TX_64X16;
-    case TX_64X16: return TX_16X64;
-    default: assert(0); return TX_INVALID;
-  }
-}
-
-static INLINE TX_TYPE av1_rotate_tx_type(TX_TYPE tx_type) {
-  switch (tx_type) {
-    case DCT_DCT: return DCT_DCT;
-    case ADST_DCT: return DCT_ADST;
-    case DCT_ADST: return ADST_DCT;
-    case ADST_ADST: return ADST_ADST;
-    case FLIPADST_DCT: return DCT_FLIPADST;
-    case DCT_FLIPADST: return FLIPADST_DCT;
-    case FLIPADST_FLIPADST: return FLIPADST_FLIPADST;
-    case ADST_FLIPADST: return FLIPADST_ADST;
-    case FLIPADST_ADST: return ADST_FLIPADST;
-    case IDTX: return IDTX;
-    case V_DCT: return H_DCT;
-    case H_DCT: return V_DCT;
-    case V_ADST: return H_ADST;
-    case H_ADST: return V_ADST;
-    case V_FLIPADST: return H_FLIPADST;
-    case H_FLIPADST: return V_FLIPADST;
-    default: assert(0); return TX_TYPES;
-  }
-}
-
 // Utility function that returns the log of the ratio of the col and row
 // sizes.
 static INLINE int get_rect_tx_log_ratio(int col, int row) {
--- a/third_party/aom/av1/common/blockd.h
+++ b/third_party/aom/av1/common/blockd.h
@ -605,6 +605,12 @@ static INLINE int get_bitdepth_data_path_index(const MACROBLOCKD *xd) {
  return xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH ? 1 : 0;
 }

+static INLINE uint8_t *get_buf_by_bd(const MACROBLOCKD *xd, uint8_t *buf16) {
+  return (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH)
+             ? CONVERT_TO_BYTEPTR(buf16)  // cur_buf flagged high bitdepth
+             : buf16;                     // flag clear: pointer used as-is
+}
+
 static INLINE int get_sqr_bsize_idx(BLOCK_SIZE bsize) {
  switch (bsize) {
    case BLOCK_4X4: return 0;
@ -674,6 +680,15 @@ static const int av1_ext_tx_used[EXT_TX_SET_TYPES][TX_TYPES] = {
  { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 },
 };

+static const uint16_t av1_ext_tx_used_flag[EXT_TX_SET_TYPES] = {
+  0x0001,  // 0000 0000 0000 0001
+  0x0201,  // 0000 0010 0000 0001
+  0x020F,  // 0000 0010 0000 1111
+  0x0E0F,  // 0000 1110 0000 1111
+  0x0FFF,  // 0000 1111 1111 1111
+  0xFFFF,  // 1111 1111 1111 1111
+};  // NOTE(review): bit i presumably mirrors av1_ext_tx_used[set][i]; confirm
+
 static INLINE TxSetType av1_get_ext_tx_set_type(TX_SIZE tx_size, int is_inter,
                                                int use_reduced_set) {
  const TX_SIZE tx_size_sqr_up = txsize_sqr_up_map[tx_size];
@ -1145,38 +1160,6 @@ static INLINE PLANE_TYPE get_plane_type(int plane) {
  return (plane == 0) ? PLANE_TYPE_Y : PLANE_TYPE_UV;
 }

-static INLINE void transpose_uint8(uint8_t *dst, int dst_stride,
-                                   const uint8_t *src, int src_stride, int w,
-                                   int h) {
-  int r, c;
-  for (r = 0; r < h; ++r)
-    for (c = 0; c < w; ++c) dst[c * dst_stride + r] = src[r * src_stride + c];
-}
-
-static INLINE void transpose_uint16(uint16_t *dst, int dst_stride,
-                                    const uint16_t *src, int src_stride, int w,
-                                    int h) {
-  int r, c;
-  for (r = 0; r < h; ++r)
-    for (c = 0; c < w; ++c) dst[c * dst_stride + r] = src[r * src_stride + c];
-}
-
-static INLINE void transpose_int16(int16_t *dst, int dst_stride,
-                                   const int16_t *src, int src_stride, int w,
-                                   int h) {
-  int r, c;
-  for (r = 0; r < h; ++r)
-    for (c = 0; c < w; ++c) dst[c * dst_stride + r] = src[r * src_stride + c];
-}
-
-static INLINE void transpose_int32(int32_t *dst, int dst_stride,
-                                   const int32_t *src, int src_stride, int w,
-                                   int h) {
-  int r, c;
-  for (r = 0; r < h; ++r)
-    for (c = 0; c < w; ++c) dst[c * dst_stride + r] = src[r * src_stride + c];
-}
-
 static INLINE int av1_get_max_eob(TX_SIZE tx_size) {
  if (tx_size == TX_64X64 || tx_size == TX_64X32 || tx_size == TX_32X64) {
    return 1024;
--- a/third_party/aom/av1/common/cdef.c
+++ b/third_party/aom/av1/common/cdef.c
@ -110,7 +110,7 @@ void copy_rect8_16bit_to_16bit_c(uint16_t *dst, int dstride,
 static void copy_sb8_16(AOM_UNUSED AV1_COMMON *cm, uint16_t *dst, int dstride,
                        const uint8_t *src, int src_voffset, int src_hoffset,
                        int sstride, int vsize, int hsize) {
-  if (cm->use_highbitdepth) {
+  if (cm->seq_params.use_highbitdepth) {
    const uint16_t *base =
        &CONVERT_TO_SHORTPTR(src)[src_voffset * sstride + src_hoffset];
    copy_rect8_16bit_to_16bit(dst, dstride, base, sstride, vsize, hsize);
@ -153,7 +153,7 @@ void av1_cdef_frame(YV12_BUFFER_CONFIG *frame, AV1_COMMON *cm,
  int mi_high_l2[3];
  int xdec[3];
  int ydec[3];
-  int coeff_shift = AOMMAX(cm->bit_depth - 8, 0);
+  int coeff_shift = AOMMAX(cm->seq_params.bit_depth - 8, 0);
  const int nvfb = (cm->mi_rows + MI_SIZE_64X64 - 1) / MI_SIZE_64X64;
  const int nhfb = (cm->mi_cols + MI_SIZE_64X64 - 1) / MI_SIZE_64X64;
  av1_setup_dst_planes(xd->plane, cm->seq_params.sb_size, frame, 0, 0, 0,
@ -363,7 +363,7 @@ void av1_cdef_frame(YV12_BUFFER_CONFIG *frame, AV1_COMMON *cm,
                    vsize + 2 * CDEF_VBORDER, CDEF_HBORDER, CDEF_VERY_LARGE);
        }

-        if (cm->use_highbitdepth) {
+        if (cm->seq_params.use_highbitdepth) {
          cdef_filter_fb(
              NULL,
              &CONVERT_TO_SHORTPTR(
--- a/third_party/aom/av1/common/cfl.c
+++ b/third_party/aom/av1/common/cfl.c
@ -15,21 +15,14 @@

 #include "config/av1_rtcd.h"

-void cfl_init(CFL_CTX *cfl, AV1_COMMON *cm) {
+void cfl_init(CFL_CTX *cfl, const SequenceHeader *seq_params) {
  assert(block_size_wide[CFL_MAX_BLOCK_SIZE] == CFL_BUF_LINE);
  assert(block_size_high[CFL_MAX_BLOCK_SIZE] == CFL_BUF_LINE);
-  if (!(cm->subsampling_x == 0 && cm->subsampling_y == 0) &&
-      !(cm->subsampling_x == 1 && cm->subsampling_y == 1) &&
-      !(cm->subsampling_x == 1 && cm->subsampling_y == 0)) {
-    aom_internal_error(&cm->error, AOM_CODEC_UNSUP_BITSTREAM,
-                       "Only 4:4:4, 4:2:2 and 4:2:0 are currently supported by "
-                       "CfL, %d %d subsampling is not supported.\n",
-                       cm->subsampling_x, cm->subsampling_y);
-  }
+
  memset(&cfl->recon_buf_q3, 0, sizeof(cfl->recon_buf_q3));
  memset(&cfl->ac_buf_q3, 0, sizeof(cfl->ac_buf_q3));
-  cfl->subsampling_x = cm->subsampling_x;
-  cfl->subsampling_y = cm->subsampling_y;
+  cfl->subsampling_x = seq_params->subsampling_x;
+  cfl->subsampling_y = seq_params->subsampling_y;
  cfl->are_parameters_computed = 0;
  cfl->store_y = 0;
  // The DC_PRED cache is disabled by default and is only enabled in
--- a/third_party/aom/av1/common/convolve.c
+++ b/third_party/aom/av1/common/convolve.c
@ -75,8 +75,8 @@ void av1_highbd_convolve_horiz_rs_c(const uint16_t *src, int src_stride,

 void av1_convolve_2d_sr_c(const uint8_t *src, int src_stride, uint8_t *dst,
                          int dst_stride, int w, int h,
-                          InterpFilterParams *filter_params_x,
-                          InterpFilterParams *filter_params_y,
+                          const InterpFilterParams *filter_params_x,
+                          const InterpFilterParams *filter_params_y,
                          const int subpel_x_q4, const int subpel_y_q4,
                          ConvolveParams *conv_params) {
  int16_t im_block[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * MAX_SB_SIZE];
@ -91,7 +91,7 @@ void av1_convolve_2d_sr_c(const uint8_t *src, int src_stride, uint8_t *dst,
  // horizontal filter
  const uint8_t *src_horiz = src - fo_vert * src_stride;
  const int16_t *x_filter = av1_get_interp_filter_subpel_kernel(
-      *filter_params_x, subpel_x_q4 & SUBPEL_MASK);
+      filter_params_x, subpel_x_q4 & SUBPEL_MASK);
  for (int y = 0; y < im_h; ++y) {
    for (int x = 0; x < w; ++x) {
      int32_t sum = (1 << (bd + FILTER_BITS - 1));
@ -107,7 +107,7 @@ void av1_convolve_2d_sr_c(const uint8_t *src, int src_stride, uint8_t *dst,
  // vertical filter
  int16_t *src_vert = im_block + fo_vert * im_stride;
  const int16_t *y_filter = av1_get_interp_filter_subpel_kernel(
-      *filter_params_y, subpel_y_q4 & SUBPEL_MASK);
+      filter_params_y, subpel_y_q4 & SUBPEL_MASK);
  const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
  for (int y = 0; y < h; ++y) {
    for (int x = 0; x < w; ++x) {
@ -126,8 +126,8 @@ void av1_convolve_2d_sr_c(const uint8_t *src, int src_stride, uint8_t *dst,

 void av1_convolve_y_sr_c(const uint8_t *src, int src_stride, uint8_t *dst,
                         int dst_stride, int w, int h,
-                         InterpFilterParams *filter_params_x,
-                         InterpFilterParams *filter_params_y,
+                         const InterpFilterParams *filter_params_x,
+                         const InterpFilterParams *filter_params_y,
                         const int subpel_x_q4, const int subpel_y_q4,
                         ConvolveParams *conv_params) {
  const int fo_vert = filter_params_y->taps / 2 - 1;
@ -141,7 +141,7 @@ void av1_convolve_y_sr_c(const uint8_t *src, int src_stride, uint8_t *dst,

  // vertical filter
  const int16_t *y_filter = av1_get_interp_filter_subpel_kernel(
-      *filter_params_y, subpel_y_q4 & SUBPEL_MASK);
+      filter_params_y, subpel_y_q4 & SUBPEL_MASK);
  for (int y = 0; y < h; ++y) {
    for (int x = 0; x < w; ++x) {
      int32_t res = 0;
@ -156,8 +156,8 @@ void av1_convolve_y_sr_c(const uint8_t *src, int src_stride, uint8_t *dst,

 void av1_convolve_x_sr_c(const uint8_t *src, int src_stride, uint8_t *dst,
                         int dst_stride, int w, int h,
-                         InterpFilterParams *filter_params_x,
-                         InterpFilterParams *filter_params_y,
+                         const InterpFilterParams *filter_params_x,
+                         const InterpFilterParams *filter_params_y,
                         const int subpel_x_q4, const int subpel_y_q4,
                         ConvolveParams *conv_params) {
  const int fo_horiz = filter_params_x->taps / 2 - 1;
@ -172,7 +172,7 @@ void av1_convolve_x_sr_c(const uint8_t *src, int src_stride, uint8_t *dst,

  // horizontal filter
  const int16_t *x_filter = av1_get_interp_filter_subpel_kernel(
-      *filter_params_x, subpel_x_q4 & SUBPEL_MASK);
+      filter_params_x, subpel_x_q4 & SUBPEL_MASK);
  for (int y = 0; y < h; ++y) {
    for (int x = 0; x < w; ++x) {
      int32_t res = 0;
@ -187,8 +187,8 @@ void av1_convolve_x_sr_c(const uint8_t *src, int src_stride, uint8_t *dst,

 void av1_convolve_2d_copy_sr_c(const uint8_t *src, int src_stride, uint8_t *dst,
                               int dst_stride, int w, int h,
-                               InterpFilterParams *filter_params_x,
-                               InterpFilterParams *filter_params_y,
+                               const InterpFilterParams *filter_params_x,
+                               const InterpFilterParams *filter_params_y,
                               const int subpel_x_q4, const int subpel_y_q4,
                               ConvolveParams *conv_params) {
  (void)filter_params_x;
@ -204,8 +204,8 @@ void av1_convolve_2d_copy_sr_c(const uint8_t *src, int src_stride, uint8_t *dst,

 void av1_jnt_convolve_2d_c(const uint8_t *src, int src_stride, uint8_t *dst8,
                           int dst8_stride, int w, int h,
-                           InterpFilterParams *filter_params_x,
-                           InterpFilterParams *filter_params_y,
+                           const InterpFilterParams *filter_params_x,
+                           const InterpFilterParams *filter_params_y,
                           const int subpel_x_q4, const int subpel_y_q4,
                           ConvolveParams *conv_params) {
  CONV_BUF_TYPE *dst = conv_params->dst;
@ -222,7 +222,7 @@ void av1_jnt_convolve_2d_c(const uint8_t *src, int src_stride, uint8_t *dst8,
  // horizontal filter
  const uint8_t *src_horiz = src - fo_vert * src_stride;
  const int16_t *x_filter = av1_get_interp_filter_subpel_kernel(
-      *filter_params_x, subpel_x_q4 & SUBPEL_MASK);
+      filter_params_x, subpel_x_q4 & SUBPEL_MASK);
  for (int y = 0; y < im_h; ++y) {
    for (int x = 0; x < w; ++x) {
      int32_t sum = (1 << (bd + FILTER_BITS - 1));
@ -238,7 +238,7 @@ void av1_jnt_convolve_2d_c(const uint8_t *src, int src_stride, uint8_t *dst8,
  // vertical filter
  int16_t *src_vert = im_block + fo_vert * im_stride;
  const int16_t *y_filter = av1_get_interp_filter_subpel_kernel(
-      *filter_params_y, subpel_y_q4 & SUBPEL_MASK);
+      filter_params_y, subpel_y_q4 & SUBPEL_MASK);
  const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
  for (int y = 0; y < h; ++y) {
    for (int x = 0; x < w; ++x) {
@ -270,8 +270,8 @@ void av1_jnt_convolve_2d_c(const uint8_t *src, int src_stride, uint8_t *dst8,

 void av1_jnt_convolve_y_c(const uint8_t *src, int src_stride, uint8_t *dst8,
                          int dst8_stride, int w, int h,
-                          InterpFilterParams *filter_params_x,
-                          InterpFilterParams *filter_params_y,
+                          const InterpFilterParams *filter_params_x,
+                          const InterpFilterParams *filter_params_y,
                          const int subpel_x_q4, const int subpel_y_q4,
                          ConvolveParams *conv_params) {
  CONV_BUF_TYPE *dst = conv_params->dst;
@ -289,7 +289,7 @@ void av1_jnt_convolve_y_c(const uint8_t *src, int src_stride, uint8_t *dst8,

  // vertical filter
  const int16_t *y_filter = av1_get_interp_filter_subpel_kernel(
-      *filter_params_y, subpel_y_q4 & SUBPEL_MASK);
+      filter_params_y, subpel_y_q4 & SUBPEL_MASK);
  for (int y = 0; y < h; ++y) {
    for (int x = 0; x < w; ++x) {
      int32_t res = 0;
@ -320,8 +320,8 @@ void av1_jnt_convolve_y_c(const uint8_t *src, int src_stride, uint8_t *dst8,

 void av1_jnt_convolve_x_c(const uint8_t *src, int src_stride, uint8_t *dst8,
                          int dst8_stride, int w, int h,
-                          InterpFilterParams *filter_params_x,
-                          InterpFilterParams *filter_params_y,
+                          const InterpFilterParams *filter_params_x,
+                          const InterpFilterParams *filter_params_y,
                          const int subpel_x_q4, const int subpel_y_q4,
                          ConvolveParams *conv_params) {
  CONV_BUF_TYPE *dst = conv_params->dst;
@ -339,7 +339,7 @@ void av1_jnt_convolve_x_c(const uint8_t *src, int src_stride, uint8_t *dst8,

  // horizontal filter
  const int16_t *x_filter = av1_get_interp_filter_subpel_kernel(
-      *filter_params_x, subpel_x_q4 & SUBPEL_MASK);
+      filter_params_x, subpel_x_q4 & SUBPEL_MASK);
  for (int y = 0; y < h; ++y) {
    for (int x = 0; x < w; ++x) {
      int32_t res = 0;
@ -370,8 +370,8 @@ void av1_jnt_convolve_x_c(const uint8_t *src, int src_stride, uint8_t *dst8,

 void av1_jnt_convolve_2d_copy_c(const uint8_t *src, int src_stride,
                                uint8_t *dst8, int dst8_stride, int w, int h,
-                                InterpFilterParams *filter_params_x,
-                                InterpFilterParams *filter_params_y,
+                                const InterpFilterParams *filter_params_x,
+                                const InterpFilterParams *filter_params_y,
                                const int subpel_x_q4, const int subpel_y_q4,
                                ConvolveParams *conv_params) {
  CONV_BUF_TYPE *dst = conv_params->dst;
@ -412,8 +412,8 @@ void av1_jnt_convolve_2d_copy_c(const uint8_t *src, int src_stride,

 void av1_convolve_2d_scale_c(const uint8_t *src, int src_stride, uint8_t *dst8,
                             int dst8_stride, int w, int h,
-                             InterpFilterParams *filter_params_x,
-                             InterpFilterParams *filter_params_y,
+                             const InterpFilterParams *filter_params_x,
+                             const InterpFilterParams *filter_params_y,
                             const int subpel_x_qn, const int x_step_qn,
                             const int subpel_y_qn, const int y_step_qn,
                             ConvolveParams *conv_params) {
@ -439,7 +439,7 @@ void av1_convolve_2d_scale_c(const uint8_t *src, int src_stride, uint8_t *dst8,
      const int x_filter_idx = (x_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS;
      assert(x_filter_idx < SUBPEL_SHIFTS);
      const int16_t *x_filter =
-          av1_get_interp_filter_subpel_kernel(*filter_params_x, x_filter_idx);
+          av1_get_interp_filter_subpel_kernel(filter_params_x, x_filter_idx);
      int32_t sum = (1 << (bd + FILTER_BITS - 1));
      for (int k = 0; k < filter_params_x->taps; ++k) {
        sum += x_filter[k] * src_x[k - fo_horiz];
@ -461,7 +461,7 @@ void av1_convolve_2d_scale_c(const uint8_t *src, int src_stride, uint8_t *dst8,
      const int y_filter_idx = (y_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS;
      assert(y_filter_idx < SUBPEL_SHIFTS);
      const int16_t *y_filter =
-          av1_get_interp_filter_subpel_kernel(*filter_params_y, y_filter_idx);
+          av1_get_interp_filter_subpel_kernel(filter_params_y, y_filter_idx);
      int32_t sum = 1 << offset_bits;
      for (int k = 0; k < filter_params_y->taps; ++k) {
        sum += y_filter[k] * src_y[(k - fo_vert) * im_stride];
@ -498,8 +498,8 @@ void av1_convolve_2d_scale_c(const uint8_t *src, int src_stride, uint8_t *dst8,

 static void convolve_2d_scale_wrapper(
    const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w,
-    int h, InterpFilterParams *filter_params_x,
-    InterpFilterParams *filter_params_y, const int subpel_x_qn,
+    int h, const InterpFilterParams *filter_params_x,
+    const InterpFilterParams *filter_params_y, const int subpel_x_qn,
    const int x_step_qn, const int subpel_y_qn, const int y_step_qn,
    ConvolveParams *conv_params) {
  if (conv_params->is_compound) {
@ -520,25 +520,27 @@ void av1_convolve_2d_facade(const uint8_t *src, int src_stride, uint8_t *dst,
  (void)y_step_q4;
  (void)dst;
  (void)dst_stride;
-
-  InterpFilterParams filter_params_x, filter_params_y;
-  av1_get_convolve_filter_params(interp_filters, &filter_params_x,
-                                 &filter_params_y, w, h);
+  InterpFilter filter_x = av1_extract_interp_filter(interp_filters, 1);
+  InterpFilter filter_y = av1_extract_interp_filter(interp_filters, 0);
+  const InterpFilterParams *filter_params_x =
+      av1_get_interp_filter_params_with_block_size(filter_x, w);
+  const InterpFilterParams *filter_params_y =
+      av1_get_interp_filter_params_with_block_size(filter_y, h);

  if (scaled)
    convolve_2d_scale_wrapper(src, src_stride, dst, dst_stride, w, h,
-                              &filter_params_x, &filter_params_y, subpel_x_q4,
+                              filter_params_x, filter_params_y, subpel_x_q4,
                              x_step_q4, subpel_y_q4, y_step_q4, conv_params);
  else
    sf->convolve[subpel_x_q4 != 0][subpel_y_q4 != 0][conv_params->is_compound](
-        src, src_stride, dst, dst_stride, w, h, &filter_params_x,
-        &filter_params_y, subpel_x_q4, subpel_y_q4, conv_params);
+        src, src_stride, dst, dst_stride, w, h, filter_params_x,
+        filter_params_y, subpel_x_q4, subpel_y_q4, conv_params);
 }

 void av1_highbd_convolve_2d_copy_sr_c(
    const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w,
-    int h, InterpFilterParams *filter_params_x,
-    InterpFilterParams *filter_params_y, const int subpel_x_q4,
+    int h, const InterpFilterParams *filter_params_x,
+    const InterpFilterParams *filter_params_y, const int subpel_x_q4,
    const int subpel_y_q4, ConvolveParams *conv_params, int bd) {
  (void)filter_params_x;
  (void)filter_params_y;
@ -554,8 +556,8 @@ void av1_highbd_convolve_2d_copy_sr_c(

 void av1_highbd_convolve_x_sr_c(const uint16_t *src, int src_stride,
                                uint16_t *dst, int dst_stride, int w, int h,
-                                InterpFilterParams *filter_params_x,
-                                InterpFilterParams *filter_params_y,
+                                const InterpFilterParams *filter_params_x,
+                                const InterpFilterParams *filter_params_y,
                                const int subpel_x_q4, const int subpel_y_q4,
                                ConvolveParams *conv_params, int bd) {
  const int fo_horiz = filter_params_x->taps / 2 - 1;
@ -569,7 +571,7 @@ void av1_highbd_convolve_x_sr_c(const uint16_t *src, int src_stride,

  // horizontal filter
  const int16_t *x_filter = av1_get_interp_filter_subpel_kernel(
-      *filter_params_x, subpel_x_q4 & SUBPEL_MASK);
+      filter_params_x, subpel_x_q4 & SUBPEL_MASK);
  for (int y = 0; y < h; ++y) {
    for (int x = 0; x < w; ++x) {
      int32_t res = 0;
@ -585,8 +587,8 @@ void av1_highbd_convolve_x_sr_c(const uint16_t *src, int src_stride,

 void av1_highbd_convolve_y_sr_c(const uint16_t *src, int src_stride,
                                uint16_t *dst, int dst_stride, int w, int h,
-                                InterpFilterParams *filter_params_x,
-                                InterpFilterParams *filter_params_y,
+                                const InterpFilterParams *filter_params_x,
+                                const InterpFilterParams *filter_params_y,
                                const int subpel_x_q4, const int subpel_y_q4,
                                ConvolveParams *conv_params, int bd) {
  const int fo_vert = filter_params_y->taps / 2 - 1;
@ -599,7 +601,7 @@ void av1_highbd_convolve_y_sr_c(const uint16_t *src, int src_stride,
         ((conv_params->round_0 + conv_params->round_1) == (2 * FILTER_BITS)));
  // vertical filter
  const int16_t *y_filter = av1_get_interp_filter_subpel_kernel(
-      *filter_params_y, subpel_y_q4 & SUBPEL_MASK);
+      filter_params_y, subpel_y_q4 & SUBPEL_MASK);
  for (int y = 0; y < h; ++y) {
    for (int x = 0; x < w; ++x) {
      int32_t res = 0;
@ -614,8 +616,8 @@ void av1_highbd_convolve_y_sr_c(const uint16_t *src, int src_stride,

 void av1_highbd_convolve_2d_sr_c(const uint16_t *src, int src_stride,
                                 uint16_t *dst, int dst_stride, int w, int h,
-                                 InterpFilterParams *filter_params_x,
-                                 InterpFilterParams *filter_params_y,
+                                 const InterpFilterParams *filter_params_x,
+                                 const InterpFilterParams *filter_params_y,
                                 const int subpel_x_q4, const int subpel_y_q4,
                                 ConvolveParams *conv_params, int bd) {
  int16_t im_block[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * MAX_SB_SIZE];
@ -630,7 +632,7 @@ void av1_highbd_convolve_2d_sr_c(const uint16_t *src, int src_stride,
  // horizontal filter
  const uint16_t *src_horiz = src - fo_vert * src_stride;
  const int16_t *x_filter = av1_get_interp_filter_subpel_kernel(
-      *filter_params_x, subpel_x_q4 & SUBPEL_MASK);
+      filter_params_x, subpel_x_q4 & SUBPEL_MASK);
  for (int y = 0; y < im_h; ++y) {
    for (int x = 0; x < w; ++x) {
      int32_t sum = (1 << (bd + FILTER_BITS - 1));
@ -646,7 +648,7 @@ void av1_highbd_convolve_2d_sr_c(const uint16_t *src, int src_stride,
  // vertical filter
  int16_t *src_vert = im_block + fo_vert * im_stride;
  const int16_t *y_filter = av1_get_interp_filter_subpel_kernel(
-      *filter_params_y, subpel_y_q4 & SUBPEL_MASK);
+      filter_params_y, subpel_y_q4 & SUBPEL_MASK);
  const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
  for (int y = 0; y < h; ++y) {
    for (int x = 0; x < w; ++x) {
@ -666,8 +668,9 @@ void av1_highbd_convolve_2d_sr_c(const uint16_t *src, int src_stride,

 void av1_highbd_jnt_convolve_2d_c(const uint16_t *src, int src_stride,
                                  uint16_t *dst16, int dst16_stride, int w,
-                                  int h, InterpFilterParams *filter_params_x,
-                                  InterpFilterParams *filter_params_y,
+                                  int h,
+                                  const InterpFilterParams *filter_params_x,
+                                  const InterpFilterParams *filter_params_y,
                                  const int subpel_x_q4, const int subpel_y_q4,
                                  ConvolveParams *conv_params, int bd) {
  int x, y, k;
@ -685,7 +688,7 @@ void av1_highbd_jnt_convolve_2d_c(const uint16_t *src, int src_stride,
  // horizontal filter
  const uint16_t *src_horiz = src - fo_vert * src_stride;
  const int16_t *x_filter = av1_get_interp_filter_subpel_kernel(
-      *filter_params_x, subpel_x_q4 & SUBPEL_MASK);
+      filter_params_x, subpel_x_q4 & SUBPEL_MASK);
  for (y = 0; y < im_h; ++y) {
    for (x = 0; x < w; ++x) {
      int32_t sum = (1 << (bd + FILTER_BITS - 1));
@ -703,7 +706,7 @@ void av1_highbd_jnt_convolve_2d_c(const uint16_t *src, int src_stride,
  int16_t *src_vert = im_block + fo_vert * im_stride;
  const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
  const int16_t *y_filter = av1_get_interp_filter_subpel_kernel(
-      *filter_params_y, subpel_y_q4 & SUBPEL_MASK);
+      filter_params_y, subpel_y_q4 & SUBPEL_MASK);
  for (y = 0; y < h; ++y) {
    for (x = 0; x < w; ++x) {
      int32_t sum = 1 << offset_bits;
@ -734,8 +737,9 @@ void av1_highbd_jnt_convolve_2d_c(const uint16_t *src, int src_stride,

 void av1_highbd_jnt_convolve_x_c(const uint16_t *src, int src_stride,
                                 uint16_t *dst16, int dst16_stride, int w,
-                                 int h, InterpFilterParams *filter_params_x,
-                                 InterpFilterParams *filter_params_y,
+                                 int h,
+                                 const InterpFilterParams *filter_params_x,
+                                 const InterpFilterParams *filter_params_y,
                                 const int subpel_x_q4, const int subpel_y_q4,
                                 ConvolveParams *conv_params, int bd) {
  CONV_BUF_TYPE *dst = conv_params->dst;
@ -753,7 +757,7 @@ void av1_highbd_jnt_convolve_x_c(const uint16_t *src, int src_stride,
  assert(bits >= 0);
  // horizontal filter
  const int16_t *x_filter = av1_get_interp_filter_subpel_kernel(
-      *filter_params_x, subpel_x_q4 & SUBPEL_MASK);
+      filter_params_x, subpel_x_q4 & SUBPEL_MASK);
  for (int y = 0; y < h; ++y) {
    for (int x = 0; x < w; ++x) {
      int32_t res = 0;
@ -784,8 +788,9 @@ void av1_highbd_jnt_convolve_x_c(const uint16_t *src, int src_stride,

 void av1_highbd_jnt_convolve_y_c(const uint16_t *src, int src_stride,
                                 uint16_t *dst16, int dst16_stride, int w,
-                                 int h, InterpFilterParams *filter_params_x,
-                                 InterpFilterParams *filter_params_y,
+                                 int h,
+                                 const InterpFilterParams *filter_params_x,
+                                 const InterpFilterParams *filter_params_y,
                                 const int subpel_x_q4, const int subpel_y_q4,
                                 ConvolveParams *conv_params, int bd) {
  CONV_BUF_TYPE *dst = conv_params->dst;
@ -803,7 +808,7 @@ void av1_highbd_jnt_convolve_y_c(const uint16_t *src, int src_stride,
  assert(bits >= 0);
  // vertical filter
  const int16_t *y_filter = av1_get_interp_filter_subpel_kernel(
-      *filter_params_y, subpel_y_q4 & SUBPEL_MASK);
+      filter_params_y, subpel_y_q4 & SUBPEL_MASK);
  for (int y = 0; y < h; ++y) {
    for (int x = 0; x < w; ++x) {
      int32_t res = 0;
@ -834,8 +839,8 @@ void av1_highbd_jnt_convolve_y_c(const uint16_t *src, int src_stride,

 void av1_highbd_jnt_convolve_2d_copy_c(
    const uint16_t *src, int src_stride, uint16_t *dst16, int dst16_stride,
-    int w, int h, InterpFilterParams *filter_params_x,
-    InterpFilterParams *filter_params_y, const int subpel_x_q4,
+    int w, int h, const InterpFilterParams *filter_params_x,
+    const InterpFilterParams *filter_params_y, const int subpel_x_q4,
    const int subpel_y_q4, ConvolveParams *conv_params, int bd) {
  CONV_BUF_TYPE *dst = conv_params->dst;
  int dst_stride = conv_params->dst_stride;
@ -875,8 +880,8 @@ void av1_highbd_jnt_convolve_2d_copy_c(

 void av1_highbd_convolve_2d_scale_c(const uint16_t *src, int src_stride,
                                    uint16_t *dst, int dst_stride, int w, int h,
-                                    InterpFilterParams *filter_params_x,
-                                    InterpFilterParams *filter_params_y,
+                                    const InterpFilterParams *filter_params_x,
+                                    const InterpFilterParams *filter_params_y,
                                    const int subpel_x_qn, const int x_step_qn,
                                    const int subpel_y_qn, const int y_step_qn,
                                    ConvolveParams *conv_params, int bd) {
@ -900,7 +905,7 @@ void av1_highbd_convolve_2d_scale_c(const uint16_t *src, int src_stride,
      const int x_filter_idx = (x_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS;
      assert(x_filter_idx < SUBPEL_SHIFTS);
      const int16_t *x_filter =
-          av1_get_interp_filter_subpel_kernel(*filter_params_x, x_filter_idx);
+          av1_get_interp_filter_subpel_kernel(filter_params_x, x_filter_idx);
      int32_t sum = (1 << (bd + FILTER_BITS - 1));
      for (int k = 0; k < filter_params_x->taps; ++k) {
        sum += x_filter[k] * src_x[k - fo_horiz];
@ -922,7 +927,7 @@ void av1_highbd_convolve_2d_scale_c(const uint16_t *src, int src_stride,
      const int y_filter_idx = (y_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS;
      assert(y_filter_idx < SUBPEL_SHIFTS);
      const int16_t *y_filter =
-          av1_get_interp_filter_subpel_kernel(*filter_params_y, y_filter_idx);
+          av1_get_interp_filter_subpel_kernel(filter_params_y, y_filter_idx);
      int32_t sum = 1 << offset_bits;
      for (int k = 0; k < filter_params_y->taps; ++k) {
        sum += y_filter[k] * src_y[(k - fo_vert) * im_stride];
@ -971,9 +976,12 @@ void av1_highbd_convolve_2d_facade(const uint8_t *src8, int src_stride,
  (void)dst_stride;

  const uint16_t *src = CONVERT_TO_SHORTPTR(src8);
-  InterpFilterParams filter_params_x, filter_params_y;
-  av1_get_convolve_filter_params(interp_filters, &filter_params_x,
-                                 &filter_params_y, w, h);
+  InterpFilter filter_x = av1_extract_interp_filter(interp_filters, 1);
+  InterpFilter filter_y = av1_extract_interp_filter(interp_filters, 0);
+  const InterpFilterParams *filter_params_x =
+      av1_get_interp_filter_params_with_block_size(filter_x, w);
+  const InterpFilterParams *filter_params_y =
+      av1_get_interp_filter_params_with_block_size(filter_y, h);

  if (scaled) {
    uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
@ -981,16 +989,16 @@ void av1_highbd_convolve_2d_facade(const uint8_t *src8, int src_stride,
      assert(conv_params->dst != NULL);
    }
    av1_highbd_convolve_2d_scale(src, src_stride, dst, dst_stride, w, h,
-                                 &filter_params_x, &filter_params_y,
-                                 subpel_x_q4, x_step_q4, subpel_y_q4, y_step_q4,
-                                 conv_params, bd);
+                                 filter_params_x, filter_params_y, subpel_x_q4,
+                                 x_step_q4, subpel_y_q4, y_step_q4, conv_params,
+                                 bd);
  } else {
    uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);

    sf->highbd_convolve[subpel_x_q4 != 0][subpel_y_q4 !=
                                          0][conv_params->is_compound](
-        src, src_stride, dst, dst_stride, w, h, &filter_params_x,
-        &filter_params_y, subpel_x_q4, subpel_y_q4, conv_params, bd);
+        src, src_stride, dst, dst_stride, w, h, filter_params_x,
+        filter_params_y, subpel_x_q4, subpel_y_q4, conv_params, bd);
  }
 }

--- a/third_party/aom/av1/common/convolve.h
+++ b/third_party/aom/av1/common/convolve.h
@ -40,27 +40,17 @@ typedef struct ConvolveParams {

 typedef void (*aom_convolve_fn_t)(const uint8_t *src, int src_stride,
                                  uint8_t *dst, int dst_stride, int w, int h,
-                                  InterpFilterParams *filter_params_x,
-                                  InterpFilterParams *filter_params_y,
+                                  const InterpFilterParams *filter_params_x,
+                                  const InterpFilterParams *filter_params_y,
                                  const int subpel_x_q4, const int subpel_y_q4,
                                  ConvolveParams *conv_params);

 typedef void (*aom_highbd_convolve_fn_t)(
    const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w,
-    int h, InterpFilterParams *filter_params_x,
-    InterpFilterParams *filter_params_y, const int subpel_x_q4,
+    int h, const InterpFilterParams *filter_params_x,
+    const InterpFilterParams *filter_params_y, const int subpel_x_q4,
    const int subpel_y_q4, ConvolveParams *conv_params, int bd);

-static INLINE void av1_get_convolve_filter_params(InterpFilters interp_filters,
-                                                  InterpFilterParams *params_x,
-                                                  InterpFilterParams *params_y,
-                                                  int w, int h) {
-  InterpFilter filter_x = av1_extract_interp_filter(interp_filters, 1);
-  InterpFilter filter_y = av1_extract_interp_filter(interp_filters, 0);
-  *params_x = av1_get_interp_filter_params_with_block_size(filter_x, w);
-  *params_y = av1_get_interp_filter_params_with_block_size(filter_y, h);
-}
-
 struct AV1Common;
 struct scale_factors;

--- a/third_party/aom/av1/common/enums.h
+++ b/third_party/aom/av1/common/enums.h
@ -557,6 +557,7 @@ typedef uint8_t TXFM_CONTEXT;
 #define BWDREF_FRAME 5
 #define ALTREF2_FRAME 6
 #define ALTREF_FRAME 7
+#define EXTREF_FRAME REF_FRAMES
 #define LAST_REF_FRAMES (LAST3_FRAME - LAST_FRAME + 1)

 #define INTER_REFS_PER_FRAME (ALTREF_FRAME - LAST_FRAME + 1)
@ -607,6 +608,7 @@ typedef enum ATTRIBUTE_PACKED {

 // In large_scale_tile coding, external references are used.
 #define MAX_EXTERNAL_REFERENCES 128
+#define MAX_TILES 512

 #ifdef __cplusplus
 }  // extern "C"
--- a/third_party/aom/av1/common/filter.c
+++ b/third_party/aom/av1/common/filter.c
@ -1,120 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <assert.h>
-
-#include "av1/common/filter.h"
-
-DECLARE_ALIGNED(256, static const InterpKernel,
-                bilinear_filters[SUBPEL_SHIFTS]) = {
-  { 0, 0, 0, 128, 0, 0, 0, 0 },  { 0, 0, 0, 120, 8, 0, 0, 0 },
-  { 0, 0, 0, 112, 16, 0, 0, 0 }, { 0, 0, 0, 104, 24, 0, 0, 0 },
-  { 0, 0, 0, 96, 32, 0, 0, 0 },  { 0, 0, 0, 88, 40, 0, 0, 0 },
-  { 0, 0, 0, 80, 48, 0, 0, 0 },  { 0, 0, 0, 72, 56, 0, 0, 0 },
-  { 0, 0, 0, 64, 64, 0, 0, 0 },  { 0, 0, 0, 56, 72, 0, 0, 0 },
-  { 0, 0, 0, 48, 80, 0, 0, 0 },  { 0, 0, 0, 40, 88, 0, 0, 0 },
-  { 0, 0, 0, 32, 96, 0, 0, 0 },  { 0, 0, 0, 24, 104, 0, 0, 0 },
-  { 0, 0, 0, 16, 112, 0, 0, 0 }, { 0, 0, 0, 8, 120, 0, 0, 0 }
-};
-
-DECLARE_ALIGNED(256, static const InterpKernel,
-                sub_pel_filters_8[SUBPEL_SHIFTS]) = {
-  { 0, 0, 0, 128, 0, 0, 0, 0 },      { 0, 2, -6, 126, 8, -2, 0, 0 },
-  { 0, 2, -10, 122, 18, -4, 0, 0 },  { 0, 2, -12, 116, 28, -8, 2, 0 },
-  { 0, 2, -14, 110, 38, -10, 2, 0 }, { 0, 2, -14, 102, 48, -12, 2, 0 },
-  { 0, 2, -16, 94, 58, -12, 2, 0 },  { 0, 2, -14, 84, 66, -12, 2, 0 },
-  { 0, 2, -14, 76, 76, -14, 2, 0 },  { 0, 2, -12, 66, 84, -14, 2, 0 },
-  { 0, 2, -12, 58, 94, -16, 2, 0 },  { 0, 2, -12, 48, 102, -14, 2, 0 },
-  { 0, 2, -10, 38, 110, -14, 2, 0 }, { 0, 2, -8, 28, 116, -12, 2, 0 },
-  { 0, 0, -4, 18, 122, -10, 2, 0 },  { 0, 0, -2, 8, 126, -6, 2, 0 }
-};
-
-DECLARE_ALIGNED(256, static const InterpKernel,
-                sub_pel_filters_8sharp[SUBPEL_SHIFTS]) = {
-  { 0, 0, 0, 128, 0, 0, 0, 0 },         { -2, 2, -6, 126, 8, -2, 2, 0 },
-  { -2, 6, -12, 124, 16, -6, 4, -2 },   { -2, 8, -18, 120, 26, -10, 6, -2 },
-  { -4, 10, -22, 116, 38, -14, 6, -2 }, { -4, 10, -22, 108, 48, -18, 8, -2 },
-  { -4, 10, -24, 100, 60, -20, 8, -2 }, { -4, 10, -24, 90, 70, -22, 10, -2 },
-  { -4, 12, -24, 80, 80, -24, 12, -4 }, { -2, 10, -22, 70, 90, -24, 10, -4 },
-  { -2, 8, -20, 60, 100, -24, 10, -4 }, { -2, 8, -18, 48, 108, -22, 10, -4 },
-  { -2, 6, -14, 38, 116, -22, 10, -4 }, { -2, 6, -10, 26, 120, -18, 8, -2 },
-  { -2, 4, -6, 16, 124, -12, 6, -2 },   { 0, 2, -2, 8, 126, -6, 2, -2 }
-};
-
-DECLARE_ALIGNED(256, static const InterpKernel,
-                sub_pel_filters_8smooth[SUBPEL_SHIFTS]) = {
-  { 0, 0, 0, 128, 0, 0, 0, 0 },     { 0, 2, 28, 62, 34, 2, 0, 0 },
-  { 0, 0, 26, 62, 36, 4, 0, 0 },    { 0, 0, 22, 62, 40, 4, 0, 0 },
-  { 0, 0, 20, 60, 42, 6, 0, 0 },    { 0, 0, 18, 58, 44, 8, 0, 0 },
-  { 0, 0, 16, 56, 46, 10, 0, 0 },   { 0, -2, 16, 54, 48, 12, 0, 0 },
-  { 0, -2, 14, 52, 52, 14, -2, 0 }, { 0, 0, 12, 48, 54, 16, -2, 0 },
-  { 0, 0, 10, 46, 56, 16, 0, 0 },   { 0, 0, 8, 44, 58, 18, 0, 0 },
-  { 0, 0, 6, 42, 60, 20, 0, 0 },    { 0, 0, 4, 40, 62, 22, 0, 0 },
-  { 0, 0, 4, 36, 62, 26, 0, 0 },    { 0, 0, 2, 34, 62, 28, 2, 0 }
-};
-
-static const InterpFilterParams
-    av1_interp_filter_params_list[SWITCHABLE_FILTERS + 1] = {
-      { (const int16_t *)sub_pel_filters_8, SUBPEL_TAPS, SUBPEL_SHIFTS,
-        EIGHTTAP_REGULAR },
-      { (const int16_t *)sub_pel_filters_8smooth, SUBPEL_TAPS, SUBPEL_SHIFTS,
-        EIGHTTAP_SMOOTH },
-      { (const int16_t *)sub_pel_filters_8sharp, SUBPEL_TAPS, SUBPEL_SHIFTS,
-        MULTITAP_SHARP },
-      { (const int16_t *)bilinear_filters, SUBPEL_TAPS, SUBPEL_SHIFTS,
-        BILINEAR }
-    };
-
-DECLARE_ALIGNED(256, static const InterpKernel,
-                sub_pel_filters_4[SUBPEL_SHIFTS]) = {
-  { 0, 0, 0, 128, 0, 0, 0, 0 },     { 0, 0, -4, 126, 8, -2, 0, 0 },
-  { 0, 0, -8, 122, 18, -4, 0, 0 },  { 0, 0, -10, 116, 28, -6, 0, 0 },
-  { 0, 0, -12, 110, 38, -8, 0, 0 }, { 0, 0, -12, 102, 48, -10, 0, 0 },
-  { 0, 0, -14, 94, 58, -10, 0, 0 }, { 0, 0, -12, 84, 66, -10, 0, 0 },
-  { 0, 0, -12, 76, 76, -12, 0, 0 }, { 0, 0, -10, 66, 84, -12, 0, 0 },
-  { 0, 0, -10, 58, 94, -14, 0, 0 }, { 0, 0, -10, 48, 102, -12, 0, 0 },
-  { 0, 0, -8, 38, 110, -12, 0, 0 }, { 0, 0, -6, 28, 116, -10, 0, 0 },
-  { 0, 0, -4, 18, 122, -8, 0, 0 },  { 0, 0, -2, 8, 126, -4, 0, 0 }
-};
-DECLARE_ALIGNED(256, static const InterpKernel,
-                sub_pel_filters_4smooth[SUBPEL_SHIFTS]) = {
-  { 0, 0, 0, 128, 0, 0, 0, 0 },   { 0, 0, 30, 62, 34, 2, 0, 0 },
-  { 0, 0, 26, 62, 36, 4, 0, 0 },  { 0, 0, 22, 62, 40, 4, 0, 0 },
-  { 0, 0, 20, 60, 42, 6, 0, 0 },  { 0, 0, 18, 58, 44, 8, 0, 0 },
-  { 0, 0, 16, 56, 46, 10, 0, 0 }, { 0, 0, 14, 54, 48, 12, 0, 0 },
-  { 0, 0, 12, 52, 52, 12, 0, 0 }, { 0, 0, 12, 48, 54, 14, 0, 0 },
-  { 0, 0, 10, 46, 56, 16, 0, 0 }, { 0, 0, 8, 44, 58, 18, 0, 0 },
-  { 0, 0, 6, 42, 60, 20, 0, 0 },  { 0, 0, 4, 40, 62, 22, 0, 0 },
-  { 0, 0, 4, 36, 62, 26, 0, 0 },  { 0, 0, 2, 34, 62, 30, 0, 0 }
-};
-
-static const InterpFilterParams av1_interp_4tap[2] = {
-  { (const int16_t *)sub_pel_filters_4, SUBPEL_TAPS, SUBPEL_SHIFTS,
-    EIGHTTAP_REGULAR },
-  { (const int16_t *)sub_pel_filters_4smooth, SUBPEL_TAPS, SUBPEL_SHIFTS,
-    EIGHTTAP_SMOOTH },
-};
-
-InterpFilterParams av1_get_interp_filter_params_with_block_size(
-    const InterpFilter interp_filter, const int w) {
-  if (w <= 4 &&
-      (interp_filter == MULTITAP_SHARP || interp_filter == EIGHTTAP_REGULAR))
-    return av1_interp_4tap[0];
-  else if (w <= 4 && interp_filter == EIGHTTAP_SMOOTH)
-    return av1_interp_4tap[1];
-
-  return av1_interp_filter_params_list[interp_filter];
-}
-
-const int16_t *av1_get_interp_filter_kernel(const InterpFilter interp_filter) {
-  return (const int16_t *)av1_interp_filter_params_list[interp_filter]
-      .filter_ptr;
-}
--- a/third_party/aom/av1/common/filter.h
+++ b/third_party/aom/av1/common/filter.h
@ -64,8 +64,8 @@ static INLINE InterpFilter av1_unswitchable_filter(InterpFilter filter) {
  return filter == SWITCHABLE ? EIGHTTAP_REGULAR : filter;
 }

-#define LOG_SWITCHABLE_FILTERS \
-  2 /* (1 << LOG_SWITCHABLE_FILTERS) > SWITCHABLE_FILTERS */
+/* (1 << LOG_SWITCHABLE_FILTERS) > SWITCHABLE_FILTERS */
+#define LOG_SWITCHABLE_FILTERS 2

 #define MAX_SUBPEL_TAPS 12
 #define SWITCHABLE_FILTER_CONTEXTS ((SWITCHABLE_FILTERS + 1) * 4)
@ -79,14 +79,116 @@ typedef struct InterpFilterParams {
  InterpFilter interp_filter;
 } InterpFilterParams;

-const int16_t *av1_get_interp_filter_kernel(const InterpFilter interp_filter);
+DECLARE_ALIGNED(256, static const InterpKernel,
+                av1_bilinear_filters[SUBPEL_SHIFTS]) = {  // BILINEAR kernels: only taps 3 and 4 are ever nonzero; each row sums to 128
+  { 0, 0, 0, 128, 0, 0, 0, 0 },  { 0, 0, 0, 120, 8, 0, 0, 0 },
+  { 0, 0, 0, 112, 16, 0, 0, 0 }, { 0, 0, 0, 104, 24, 0, 0, 0 },
+  { 0, 0, 0, 96, 32, 0, 0, 0 },  { 0, 0, 0, 88, 40, 0, 0, 0 },
+  { 0, 0, 0, 80, 48, 0, 0, 0 },  { 0, 0, 0, 72, 56, 0, 0, 0 },
+  { 0, 0, 0, 64, 64, 0, 0, 0 },  { 0, 0, 0, 56, 72, 0, 0, 0 },
+  { 0, 0, 0, 48, 80, 0, 0, 0 },  { 0, 0, 0, 40, 88, 0, 0, 0 },
+  { 0, 0, 0, 32, 96, 0, 0, 0 },  { 0, 0, 0, 24, 104, 0, 0, 0 },
+  { 0, 0, 0, 16, 112, 0, 0, 0 }, { 0, 0, 0, 8, 120, 0, 0, 0 }
+};

-InterpFilterParams av1_get_interp_filter_params_with_block_size(
-    const InterpFilter interp_filter, const int w);
+DECLARE_ALIGNED(256, static const InterpKernel,
+                av1_sub_pel_filters_8[SUBPEL_SHIFTS]) = {  // kernels behind the EIGHTTAP_REGULAR entry of av1_interp_filter_params_list; each row sums to 128
+  { 0, 0, 0, 128, 0, 0, 0, 0 },      { 0, 2, -6, 126, 8, -2, 0, 0 },
+  { 0, 2, -10, 122, 18, -4, 0, 0 },  { 0, 2, -12, 116, 28, -8, 2, 0 },
+  { 0, 2, -14, 110, 38, -10, 2, 0 }, { 0, 2, -14, 102, 48, -12, 2, 0 },
+  { 0, 2, -16, 94, 58, -12, 2, 0 },  { 0, 2, -14, 84, 66, -12, 2, 0 },
+  { 0, 2, -14, 76, 76, -14, 2, 0 },  { 0, 2, -12, 66, 84, -14, 2, 0 },
+  { 0, 2, -12, 58, 94, -16, 2, 0 },  { 0, 2, -12, 48, 102, -14, 2, 0 },
+  { 0, 2, -10, 38, 110, -14, 2, 0 }, { 0, 2, -8, 28, 116, -12, 2, 0 },
+  { 0, 0, -4, 18, 122, -10, 2, 0 },  { 0, 0, -2, 8, 126, -6, 2, 0 }
+};
+
+DECLARE_ALIGNED(256, static const InterpKernel,
+                av1_sub_pel_filters_8sharp[SUBPEL_SHIFTS]) = {  // kernels behind the MULTITAP_SHARP entry of av1_interp_filter_params_list
+  { 0, 0, 0, 128, 0, 0, 0, 0 },         { -2, 2, -6, 126, 8, -2, 2, 0 },
+  { -2, 6, -12, 124, 16, -6, 4, -2 },   { -2, 8, -18, 120, 26, -10, 6, -2 },
+  { -4, 10, -22, 116, 38, -14, 6, -2 }, { -4, 10, -22, 108, 48, -18, 8, -2 },
+  { -4, 10, -24, 100, 60, -20, 8, -2 }, { -4, 10, -24, 90, 70, -22, 10, -2 },
+  { -4, 12, -24, 80, 80, -24, 12, -4 }, { -2, 10, -22, 70, 90, -24, 10, -4 },
+  { -2, 8, -20, 60, 100, -24, 10, -4 }, { -2, 8, -18, 48, 108, -22, 10, -4 },
+  { -2, 6, -14, 38, 116, -22, 10, -4 }, { -2, 6, -10, 26, 120, -18, 8, -2 },
+  { -2, 4, -6, 16, 124, -12, 6, -2 },   { 0, 2, -2, 8, 126, -6, 2, -2 }
+};
+
+DECLARE_ALIGNED(256, static const InterpKernel,
+                av1_sub_pel_filters_8smooth[SUBPEL_SHIFTS]) = {  // kernels behind the EIGHTTAP_SMOOTH entry of av1_interp_filter_params_list
+  { 0, 0, 0, 128, 0, 0, 0, 0 },     { 0, 2, 28, 62, 34, 2, 0, 0 },
+  { 0, 0, 26, 62, 36, 4, 0, 0 },    { 0, 0, 22, 62, 40, 4, 0, 0 },
+  { 0, 0, 20, 60, 42, 6, 0, 0 },    { 0, 0, 18, 58, 44, 8, 0, 0 },
+  { 0, 0, 16, 56, 46, 10, 0, 0 },   { 0, -2, 16, 54, 48, 12, 0, 0 },
+  { 0, -2, 14, 52, 52, 14, -2, 0 }, { 0, 0, 12, 48, 54, 16, -2, 0 },
+  { 0, 0, 10, 46, 56, 16, 0, 0 },   { 0, 0, 8, 44, 58, 18, 0, 0 },
+  { 0, 0, 6, 42, 60, 20, 0, 0 },    { 0, 0, 4, 40, 62, 22, 0, 0 },
+  { 0, 0, 4, 36, 62, 26, 0, 0 },    { 0, 0, 2, 34, 62, 28, 2, 0 }
+};
+
+static const InterpFilterParams
+    av1_interp_filter_params_list[SWITCHABLE_FILTERS + 1] = {  // default table; the accessors in this header index it by InterpFilter value
+      { (const int16_t *)av1_sub_pel_filters_8, SUBPEL_TAPS, SUBPEL_SHIFTS,
+        EIGHTTAP_REGULAR },
+      { (const int16_t *)av1_sub_pel_filters_8smooth, SUBPEL_TAPS,
+        SUBPEL_SHIFTS, EIGHTTAP_SMOOTH },
+      { (const int16_t *)av1_sub_pel_filters_8sharp, SUBPEL_TAPS, SUBPEL_SHIFTS,
+        MULTITAP_SHARP },
+      { (const int16_t *)av1_bilinear_filters, SUBPEL_TAPS, SUBPEL_SHIFTS,
+        BILINEAR }
+    };
+
+DECLARE_ALIGNED(256, static const InterpKernel,
+                av1_sub_pel_filters_4[SUBPEL_SHIFTS]) = {  // taps 0, 1, 6 and 7 are zero in every row; referenced from av1_interp_4tap
+  { 0, 0, 0, 128, 0, 0, 0, 0 },     { 0, 0, -4, 126, 8, -2, 0, 0 },
+  { 0, 0, -8, 122, 18, -4, 0, 0 },  { 0, 0, -10, 116, 28, -6, 0, 0 },
+  { 0, 0, -12, 110, 38, -8, 0, 0 }, { 0, 0, -12, 102, 48, -10, 0, 0 },
+  { 0, 0, -14, 94, 58, -10, 0, 0 }, { 0, 0, -12, 84, 66, -10, 0, 0 },
+  { 0, 0, -12, 76, 76, -12, 0, 0 }, { 0, 0, -10, 66, 84, -12, 0, 0 },
+  { 0, 0, -10, 58, 94, -14, 0, 0 }, { 0, 0, -10, 48, 102, -12, 0, 0 },
+  { 0, 0, -8, 38, 110, -12, 0, 0 }, { 0, 0, -6, 28, 116, -10, 0, 0 },
+  { 0, 0, -4, 18, 122, -8, 0, 0 },  { 0, 0, -2, 8, 126, -4, 0, 0 }
+};
+DECLARE_ALIGNED(256, static const InterpKernel,
+                av1_sub_pel_filters_4smooth[SUBPEL_SHIFTS]) = {  // taps 0, 1, 6 and 7 are zero in every row; referenced from av1_interp_4tap
+  { 0, 0, 0, 128, 0, 0, 0, 0 },   { 0, 0, 30, 62, 34, 2, 0, 0 },
+  { 0, 0, 26, 62, 36, 4, 0, 0 },  { 0, 0, 22, 62, 40, 4, 0, 0 },
+  { 0, 0, 20, 60, 42, 6, 0, 0 },  { 0, 0, 18, 58, 44, 8, 0, 0 },
+  { 0, 0, 16, 56, 46, 10, 0, 0 }, { 0, 0, 14, 54, 48, 12, 0, 0 },
+  { 0, 0, 12, 52, 52, 12, 0, 0 }, { 0, 0, 12, 48, 54, 14, 0, 0 },
+  { 0, 0, 10, 46, 56, 16, 0, 0 }, { 0, 0, 8, 44, 58, 18, 0, 0 },
+  { 0, 0, 6, 42, 60, 20, 0, 0 },  { 0, 0, 4, 40, 62, 22, 0, 0 },
+  { 0, 0, 4, 36, 62, 26, 0, 0 },  { 0, 0, 2, 34, 62, 30, 0, 0 }
+};
+
+// For w<=4, MULTITAP_SHARP is the same as EIGHTTAP_REGULAR
+static const InterpFilterParams av1_interp_4tap[SWITCHABLE_FILTERS + 1] = {  // same length as av1_interp_filter_params_list; selected when w <= 4
+  { (const int16_t *)av1_sub_pel_filters_4, SUBPEL_TAPS, SUBPEL_SHIFTS,
+    EIGHTTAP_REGULAR },
+  { (const int16_t *)av1_sub_pel_filters_4smooth, SUBPEL_TAPS, SUBPEL_SHIFTS,
+    EIGHTTAP_SMOOTH },
+  { (const int16_t *)av1_sub_pel_filters_4, SUBPEL_TAPS, SUBPEL_SHIFTS,  // third slot deliberately repeats the first entry (see comment above the table)
+    EIGHTTAP_REGULAR },
+  { (const int16_t *)av1_bilinear_filters, SUBPEL_TAPS, SUBPEL_SHIFTS,
+    BILINEAR },
+};
+
+static INLINE const InterpFilterParams *  // returns a pointer into a static const table, not a copy
+av1_get_interp_filter_params_with_block_size(const InterpFilter interp_filter,
+                                             const int w) {
+  if (w <= 4) return &av1_interp_4tap[interp_filter];  // w <= 4 selects the av1_interp_4tap table
+  return &av1_interp_filter_params_list[interp_filter];  // otherwise the default table; interp_filter is used as the index unchecked
+}
+
+static INLINE const int16_t *av1_get_interp_filter_kernel(
+    const InterpFilter interp_filter) {
+  return av1_interp_filter_params_list[interp_filter].filter_ptr;  // always reads the default table; av1_interp_4tap is never consulted here
+}

 static INLINE const int16_t *av1_get_interp_filter_subpel_kernel(
-    const InterpFilterParams filter_params, const int subpel) {
-  return filter_params.filter_ptr + filter_params.taps * subpel;
+    const InterpFilterParams *const filter_params, const int subpel) {
+  return filter_params->filter_ptr + filter_params->taps * subpel;  // advances past subpel * taps coefficients from the start of the table
 }

 #ifdef __cplusplus
--- a/third_party/aom/av1/common/mv.h
+++ b/third_party/aom/av1/common/mv.h
@ -294,9 +294,6 @@ static INLINE void clamp_mv(MV *mv, int min_col, int max_col, int min_row,
  mv->row = clamp(mv->row, min_row, max_row);
 }

-static INLINE int mv_has_subpel(const MV *mv) {
-  return (mv->row & SUBPEL_MASK) || (mv->col & SUBPEL_MASK);
-}
 #ifdef __cplusplus
 }  // extern "C"
 #endif
--- a/Показать больше
+++ b/Показать больше