Bug 1413734 - av1: Update decoder library. r=kinetik

Update to upstream commit id e87fb2378f01103d5d6e477a4ef6892dc714e614 from a couple of weeks ago to pick up changes.

MozReview-Commit-ID: H7H69A7qFXD
--HG--
extra : rebase_source : dee676da15b65e4eea612d20529c4fb312bbddfb
2017-11-01 15:36:09 -07:00 · 2017-11-01 15:36:09 -07:00 · 92f3d837b2
--- a/dom/media/platforms/agnostic/AOMDecoder.cpp
+++ b/dom/media/platforms/agnostic/AOMDecoder.cpp
@ -322,7 +322,7 @@ AOMDecoder::IsSupportedCodec(const nsAString& aCodecType)
  // for a specific aom commit hash so sites can check
  // compatibility.
  auto version = NS_ConvertASCIItoUTF16("av1.experimental.");
-  version.AppendLiteral("f5bdeac22930ff4c6b219be49c843db35970b918");
+  version.AppendLiteral("e87fb2378f01103d5d6e477a4ef6892dc714e614");
  return aCodecType.EqualsLiteral("av1") ||
         aCodecType.Equals(version);
 }
--- a/media/libaom/README_MOZILLA
+++ b/media/libaom/README_MOZILLA
@ -20,6 +20,6 @@ To update to a fork, use

  ./mach vendor aom --repo <repository url> [-r <commit>]

-The last update was pulled from https://aomedia.googlesource.com/aom
+The last update was pulled from https://aomedia.googlesource.com/aom/

-The git commit ID used was f5bdeac22930ff4c6b219be49c843db35970b918 (Thu Jul 27 18:23:37 2017 +0000).
+The git commit ID used was e87fb2378f01103d5d6e477a4ef6892dc714e614 (Tue Oct 10 19:20:52 2017 +0000).
--- a/media/libaom/config/generic/aom_config.asm
+++ b/media/libaom/config/generic/aom_config.asm
@ -49,8 +49,6 @@
 .equ CONFIG_STATIC_MSVCRT ,  0
 .equ CONFIG_SPATIAL_RESAMPLING ,  1
 .equ CONFIG_REALTIME_ONLY ,  0
-.equ CONFIG_ONTHEFLY_BITPACKING ,  0
-.equ CONFIG_ERROR_CONCEALMENT ,  0
 .equ CONFIG_SHARED ,  0
 .equ CONFIG_STATIC ,  1
 .equ CONFIG_SMALL ,  0
@ -63,73 +61,71 @@
 .equ CONFIG_INSPECTION ,  0
 .equ CONFIG_DECODE_PERF_TESTS ,  0
 .equ CONFIG_ENCODE_PERF_TESTS ,  0
+.equ CONFIG_BITSTREAM_DEBUG ,  0
+.equ CONFIG_SYMBOLRATE ,  0
 .equ CONFIG_COEFFICIENT_RANGE_CHECKING ,  0
 .equ CONFIG_LOWBITDEPTH ,  1
 .equ CONFIG_HIGHBITDEPTH ,  1
 .equ CONFIG_EXPERIMENTAL ,  0
 .equ CONFIG_SIZE_LIMIT ,  1
-.equ CONFIG_COLORSPACE_HEADERS ,  0
 .equ CONFIG_FP_MB_STATS ,  0
 .equ CONFIG_CDEF ,  1
+.equ CONFIG_CDEF_SINGLEPASS ,  1
 .equ CONFIG_VAR_TX ,  1
 .equ CONFIG_RECT_TX ,  1
 .equ CONFIG_RECT_TX_EXT ,  0
 .equ CONFIG_TPL_MV ,  0
 .equ CONFIG_DUAL_FILTER ,  1
-.equ CONFIG_CONVOLVE_ROUND ,  0
+.equ CONFIG_CONVOLVE_ROUND ,  1
 .equ CONFIG_COMPOUND_ROUND ,  0
 .equ CONFIG_EXT_TX ,  1
-.equ CONFIG_DPCM_INTRA ,  0
 .equ CONFIG_TX64X64 ,  0
 .equ CONFIG_EXT_INTRA ,  1
 .equ CONFIG_INTRA_INTERP ,  0
 .equ CONFIG_FILTER_INTRA ,  0
-.equ CONFIG_INTRA_EDGE ,  0
+.equ CONFIG_INTRA_EDGE ,  1
 .equ CONFIG_INTRABC ,  0
-.equ CONFIG_EXT_INTER ,  1
 .equ CONFIG_INTERINTRA ,  1
 .equ CONFIG_WEDGE ,  1
 .equ CONFIG_COMPOUND_SEGMENT ,  1
 .equ CONFIG_EXT_REFS ,  1
-.equ CONFIG_ALTREF2 ,  0
-.equ CONFIG_SPEED_REFS ,  0
-.equ CONFIG_GF_GROUPS ,  0
-.equ CONFIG_FLEX_REFS ,  0
 .equ CONFIG_GLOBAL_MOTION ,  1
 .equ CONFIG_NEW_QUANT ,  0
 .equ CONFIG_SUPERTX ,  0
 .equ CONFIG_ANS ,  0
-.equ CONFIG_LOOP_RESTORATION ,  0
+.equ CONFIG_LOOP_RESTORATION ,  1
+.equ CONFIG_STRIPED_LOOP_RESTORATION ,  0
 .equ CONFIG_EXT_PARTITION ,  0
 .equ CONFIG_EXT_PARTITION_TYPES ,  0
+.equ CONFIG_EXT_PARTITION_TYPES_AB ,  0
 .equ CONFIG_UNPOISON_PARTITION_CTX ,  0
 .equ CONFIG_EXT_TILE ,  0
 .equ CONFIG_MOTION_VAR ,  1
 .equ CONFIG_NCOBMC ,  0
 .equ CONFIG_WARPED_MOTION ,  1
 .equ CONFIG_Q_ADAPT_PROBS ,  0
-.equ CONFIG_BITSTREAM_DEBUG ,  0
 .equ CONFIG_INTER_STATS_ONLY ,  0
-.equ CONFIG_ALT_INTRA ,  1
-.equ CONFIG_PALETTE ,  1
 .equ CONFIG_PALETTE_DELTA_ENCODING ,  0
 .equ CONFIG_RAWBITS ,  0
-.equ CONFIG_EC_SMALLMUL ,  1
+.equ CONFIG_KF_CTX ,  0
 .equ CONFIG_PVQ ,  0
 .equ CONFIG_CFL ,  0
 .equ CONFIG_XIPHRC ,  0
 .equ CONFIG_DCT_ONLY ,  0
+.equ CONFIG_DAALA_TX ,  0
 .equ CONFIG_DAALA_DCT4 ,  0
 .equ CONFIG_DAALA_DCT8 ,  0
+.equ CONFIG_DAALA_DCT16 ,  0
+.equ CONFIG_DAALA_DCT32 ,  0
+.equ CONFIG_DAALA_DCT64 ,  0
 .equ CONFIG_CB4X4 ,  1
 .equ CONFIG_CHROMA_2X2 ,  0
 .equ CONFIG_CHROMA_SUB8X8 ,  1
 .equ CONFIG_FRAME_SIZE ,  0
-.equ CONFIG_DELTA_Q ,  1
 .equ CONFIG_EXT_DELTA_Q ,  1
 .equ CONFIG_ADAPT_SCAN ,  0
-.equ CONFIG_FILTER_7BIT ,  1
 .equ CONFIG_PARALLEL_DEBLOCKING ,  1
+.equ CONFIG_DEBLOCK_13TAP ,  0
 .equ CONFIG_LOOPFILTERING_ACROSS_TILES ,  1
 .equ CONFIG_TEMPMV_SIGNALING ,  1
 .equ CONFIG_RD_DEBUG ,  0
@ -138,30 +134,47 @@
 .equ CONFIG_ENTROPY_STATS ,  0
 .equ CONFIG_MASKED_TX ,  0
 .equ CONFIG_DEPENDENT_HORZTILES ,  0
-.equ CONFIG_DIST_8X8 ,  0
-.equ CONFIG_DAALA_DIST ,  0
-.equ CONFIG_TRIPRED ,  0
+.equ CONFIG_DIST_8X8 ,  1
 .equ CONFIG_PALETTE_THROUGHPUT ,  1
 .equ CONFIG_REF_ADAPT ,  0
 .equ CONFIG_LV_MAP ,  0
+.equ CONFIG_CTX1D ,  0
 .equ CONFIG_TXK_SEL ,  0
 .equ CONFIG_MV_COMPRESS ,  1
+.equ CONFIG_SEGMENT_ZEROMV ,  0
 .equ CONFIG_FRAME_SUPERRES ,  0
 .equ CONFIG_NEW_MULTISYMBOL ,  0
 .equ CONFIG_COMPOUND_SINGLEREF ,  0
-.equ CONFIG_AOM_QM ,  0
+.equ CONFIG_AOM_QM ,  1
 .equ CONFIG_ONE_SIDED_COMPOUND ,  1
-.equ CONFIG_EXT_COMP_REFS ,  0
+.equ CONFIG_EXT_COMP_REFS ,  1
 .equ CONFIG_SMOOTH_HV ,  1
 .equ CONFIG_VAR_REFS ,  0
-.equ CONFIG_RECT_INTRA_PRED ,  1
 .equ CONFIG_LGT ,  0
+.equ CONFIG_LGT_FROM_PRED ,  0
 .equ CONFIG_SBL_SYMBOL ,  0
 .equ CONFIG_NCOBMC_ADAPT_WEIGHT ,  0
 .equ CONFIG_BGSPRITE ,  0
 .equ CONFIG_VAR_TX_NO_TX_MODE ,  0
 .equ CONFIG_MRC_TX ,  0
 .equ CONFIG_LPF_DIRECT ,  0
-.equ CONFIG_UV_LVL ,  0
+.equ CONFIG_LOOPFILTER_LEVEL ,  0
+.equ CONFIG_NO_FRAME_CONTEXT_SIGNALING ,  0
+.equ CONFIG_TXMG ,  1
+.equ CONFIG_MAX_TILE ,  0
+.equ CONFIG_HASH_ME ,  0
+.equ CONFIG_COLORSPACE_HEADERS ,  0
+.equ CONFIG_MFMV ,  0
+.equ CONFIG_FRAME_MARKER ,  0
+.equ CONFIG_JNT_COMP ,  0
+.equ CONFIG_FRAME_SIGN_BIAS ,  0
+.equ CONFIG_EXT_SKIP ,  0
+.equ CONFIG_OBU ,  0
+.equ CONFIG_AMVR ,  0
+.equ CONFIG_LPF_SB ,  0
+.equ CONFIG_OPT_REF_MV ,  0
+.equ CONFIG_TMV ,  0
+.equ CONFIG_RESTRICT_COMPRESSED_HDR ,  0
+.equ CONFIG_HORZONLY_FRAME_SUPERRES ,  0
 .equ CONFIG_ANALYZER ,  0
 	.section	.note.GNU-stack,"",%progbits
--- a/media/libaom/config/generic/aom_config.h
+++ b/media/libaom/config/generic/aom_config.h
@ -59,8 +59,6 @@
 #define CONFIG_STATIC_MSVCRT 0
 #define CONFIG_SPATIAL_RESAMPLING 1
 #define CONFIG_REALTIME_ONLY 0
-#define CONFIG_ONTHEFLY_BITPACKING 0
-#define CONFIG_ERROR_CONCEALMENT 0
 #define CONFIG_SHARED 0
 #define CONFIG_STATIC 1
 #define CONFIG_SMALL 0
@ -73,73 +71,71 @@
 #define CONFIG_INSPECTION 0
 #define CONFIG_DECODE_PERF_TESTS 0
 #define CONFIG_ENCODE_PERF_TESTS 0
+#define CONFIG_BITSTREAM_DEBUG 0
+#define CONFIG_SYMBOLRATE 0
 #define CONFIG_COEFFICIENT_RANGE_CHECKING 0
 #define CONFIG_LOWBITDEPTH 1
 #define CONFIG_HIGHBITDEPTH 1
 #define CONFIG_EXPERIMENTAL 0
 #define CONFIG_SIZE_LIMIT 1
-#define CONFIG_COLORSPACE_HEADERS 0
 #define CONFIG_FP_MB_STATS 0
 #define CONFIG_CDEF 1
+#define CONFIG_CDEF_SINGLEPASS 1
 #define CONFIG_VAR_TX 1
 #define CONFIG_RECT_TX 1
 #define CONFIG_RECT_TX_EXT 0
 #define CONFIG_TPL_MV 0
 #define CONFIG_DUAL_FILTER 1
-#define CONFIG_CONVOLVE_ROUND 0
+#define CONFIG_CONVOLVE_ROUND 1
 #define CONFIG_COMPOUND_ROUND 0
 #define CONFIG_EXT_TX 1
-#define CONFIG_DPCM_INTRA 0
 #define CONFIG_TX64X64 0
 #define CONFIG_EXT_INTRA 1
 #define CONFIG_INTRA_INTERP 0
 #define CONFIG_FILTER_INTRA 0
-#define CONFIG_INTRA_EDGE 0
+#define CONFIG_INTRA_EDGE 1
 #define CONFIG_INTRABC 0
-#define CONFIG_EXT_INTER 1
 #define CONFIG_INTERINTRA 1
 #define CONFIG_WEDGE 1
 #define CONFIG_COMPOUND_SEGMENT 1
 #define CONFIG_EXT_REFS 1
-#define CONFIG_ALTREF2 0
-#define CONFIG_SPEED_REFS 0
-#define CONFIG_GF_GROUPS 0
-#define CONFIG_FLEX_REFS 0
 #define CONFIG_GLOBAL_MOTION 1
 #define CONFIG_NEW_QUANT 0
 #define CONFIG_SUPERTX 0
 #define CONFIG_ANS 0
-#define CONFIG_LOOP_RESTORATION 0
+#define CONFIG_LOOP_RESTORATION 1
+#define CONFIG_STRIPED_LOOP_RESTORATION 0
 #define CONFIG_EXT_PARTITION 0
 #define CONFIG_EXT_PARTITION_TYPES 0
+#define CONFIG_EXT_PARTITION_TYPES_AB 0
 #define CONFIG_UNPOISON_PARTITION_CTX 0
 #define CONFIG_EXT_TILE 0
 #define CONFIG_MOTION_VAR 1
 #define CONFIG_NCOBMC 0
 #define CONFIG_WARPED_MOTION 1
 #define CONFIG_Q_ADAPT_PROBS 0
-#define CONFIG_BITSTREAM_DEBUG 0
 #define CONFIG_INTER_STATS_ONLY 0
-#define CONFIG_ALT_INTRA 1
-#define CONFIG_PALETTE 1
 #define CONFIG_PALETTE_DELTA_ENCODING 0
 #define CONFIG_RAWBITS 0
-#define CONFIG_EC_SMALLMUL 1
+#define CONFIG_KF_CTX 0
 #define CONFIG_PVQ 0
 #define CONFIG_CFL 0
 #define CONFIG_XIPHRC 0
 #define CONFIG_DCT_ONLY 0
+#define CONFIG_DAALA_TX 0
 #define CONFIG_DAALA_DCT4 0
 #define CONFIG_DAALA_DCT8 0
+#define CONFIG_DAALA_DCT16 0
+#define CONFIG_DAALA_DCT32 0
+#define CONFIG_DAALA_DCT64 0
 #define CONFIG_CB4X4 1
 #define CONFIG_CHROMA_2X2 0
 #define CONFIG_CHROMA_SUB8X8 1
 #define CONFIG_FRAME_SIZE 0
-#define CONFIG_DELTA_Q 1
 #define CONFIG_EXT_DELTA_Q 1
 #define CONFIG_ADAPT_SCAN 0
-#define CONFIG_FILTER_7BIT 1
 #define CONFIG_PARALLEL_DEBLOCKING 1
+#define CONFIG_DEBLOCK_13TAP 0
 #define CONFIG_LOOPFILTERING_ACROSS_TILES 1
 #define CONFIG_TEMPMV_SIGNALING 1
 #define CONFIG_RD_DEBUG 0
@ -148,31 +144,48 @@
 #define CONFIG_ENTROPY_STATS 0
 #define CONFIG_MASKED_TX 0
 #define CONFIG_DEPENDENT_HORZTILES 0
-#define CONFIG_DIST_8X8 0
-#define CONFIG_DAALA_DIST 0
-#define CONFIG_TRIPRED 0
+#define CONFIG_DIST_8X8 1
 #define CONFIG_PALETTE_THROUGHPUT 1
 #define CONFIG_REF_ADAPT 0
 #define CONFIG_LV_MAP 0
+#define CONFIG_CTX1D 0
 #define CONFIG_TXK_SEL 0
 #define CONFIG_MV_COMPRESS 1
+#define CONFIG_SEGMENT_ZEROMV 0
 #define CONFIG_FRAME_SUPERRES 0
 #define CONFIG_NEW_MULTISYMBOL 0
 #define CONFIG_COMPOUND_SINGLEREF 0
-#define CONFIG_AOM_QM 0
+#define CONFIG_AOM_QM 1
 #define CONFIG_ONE_SIDED_COMPOUND 1
-#define CONFIG_EXT_COMP_REFS 0
+#define CONFIG_EXT_COMP_REFS 1
 #define CONFIG_SMOOTH_HV 1
 #define CONFIG_VAR_REFS 0
-#define CONFIG_RECT_INTRA_PRED 1
 #define CONFIG_LGT 0
+#define CONFIG_LGT_FROM_PRED 0
 #define CONFIG_SBL_SYMBOL 0
 #define CONFIG_NCOBMC_ADAPT_WEIGHT 0
 #define CONFIG_BGSPRITE 0
 #define CONFIG_VAR_TX_NO_TX_MODE 0
 #define CONFIG_MRC_TX 0
 #define CONFIG_LPF_DIRECT 0
-#define CONFIG_UV_LVL 0
+#define CONFIG_LOOPFILTER_LEVEL 0
+#define CONFIG_NO_FRAME_CONTEXT_SIGNALING 0
+#define CONFIG_TXMG 1
+#define CONFIG_MAX_TILE 0
+#define CONFIG_HASH_ME 0
+#define CONFIG_COLORSPACE_HEADERS 0
+#define CONFIG_MFMV 0
+#define CONFIG_FRAME_MARKER 0
+#define CONFIG_JNT_COMP 0
+#define CONFIG_FRAME_SIGN_BIAS 0
+#define CONFIG_EXT_SKIP 0
+#define CONFIG_OBU 0
+#define CONFIG_AMVR 0
+#define CONFIG_LPF_SB 0
+#define CONFIG_OPT_REF_MV 0
+#define CONFIG_TMV 0
+#define CONFIG_RESTRICT_COMPRESSED_HDR 0
+#define CONFIG_HORZONLY_FRAME_SUPERRES 0
 #define CONFIG_ANALYZER 0
 #define DECODE_WIDTH_LIMIT 8192
 #define DECODE_HEIGHT_LIMIT 4608
--- a/media/libaom/config/generic/aom_dsp_rtcd.h
+++ b/media/libaom/config/generic/aom_dsp_rtcd.h
@ -20,6 +20,9 @@
 extern "C" {
 #endif

+void aom_blend_a64_d32_mask_c(int32_t *dst, uint32_t dst_stride, const int32_t *src0, uint32_t src0_stride, const int32_t *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int h, int w, int suby, int subx);
+#define aom_blend_a64_d32_mask aom_blend_a64_d32_mask_c
+
 void aom_blend_a64_hmask_c(uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, int h, int w);
 #define aom_blend_a64_hmask aom_blend_a64_hmask_c

@ -44,6 +47,24 @@ void aom_comp_mask_upsampled_pred_c(uint8_t *comp_pred, const uint8_t *pred, int
 void aom_convolve8_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
 #define aom_convolve8 aom_convolve8_c

+void aom_convolve8_add_src_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+#define aom_convolve8_add_src aom_convolve8_add_src_c
+
+void aom_convolve8_add_src_hip_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+#define aom_convolve8_add_src_hip aom_convolve8_add_src_hip_c
+
+void aom_convolve8_add_src_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+#define aom_convolve8_add_src_horiz aom_convolve8_add_src_horiz_c
+
+void aom_convolve8_add_src_horiz_hip_c(const uint8_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+#define aom_convolve8_add_src_horiz_hip aom_convolve8_add_src_horiz_hip_c
+
+void aom_convolve8_add_src_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+#define aom_convolve8_add_src_vert aom_convolve8_add_src_vert_c
+
+void aom_convolve8_add_src_vert_hip_c(const uint16_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+#define aom_convolve8_add_src_vert_hip aom_convolve8_add_src_vert_hip_c
+
 void aom_convolve8_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
 #define aom_convolve8_avg aom_convolve8_avg_c

@ -1211,6 +1232,24 @@ void aom_highbd_comp_mask_upsampled_pred_c(uint16_t *comp_pred, const uint8_t *p
 void aom_highbd_convolve8_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
 #define aom_highbd_convolve8 aom_highbd_convolve8_c

+void aom_highbd_convolve8_add_src_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+#define aom_highbd_convolve8_add_src aom_highbd_convolve8_add_src_c
+
+void aom_highbd_convolve8_add_src_hip_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+#define aom_highbd_convolve8_add_src_hip aom_highbd_convolve8_add_src_hip_c
+
+void aom_highbd_convolve8_add_src_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+#define aom_highbd_convolve8_add_src_horiz aom_highbd_convolve8_add_src_horiz_c
+
+void aom_highbd_convolve8_add_src_horiz_hip_c(const uint8_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+#define aom_highbd_convolve8_add_src_horiz_hip aom_highbd_convolve8_add_src_horiz_hip_c
+
+void aom_highbd_convolve8_add_src_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+#define aom_highbd_convolve8_add_src_vert aom_highbd_convolve8_add_src_vert_c
+
+void aom_highbd_convolve8_add_src_vert_hip_c(const uint16_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+#define aom_highbd_convolve8_add_src_vert_hip aom_highbd_convolve8_add_src_vert_hip_c
+
 void aom_highbd_convolve8_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
 #define aom_highbd_convolve8_avg aom_highbd_convolve8_avg_c

--- a/media/libaom/config/generic/av1_rtcd.h
+++ b/media/libaom/config/generic/av1_rtcd.h
@ -31,39 +31,38 @@ struct search_site_config;
 struct mv;
 union int_mv;
 struct yv12_buffer_config;
-typedef uint16_t od_dering_in;

 #ifdef __cplusplus
 extern "C" {
 #endif

-void aom_clpf_block_c(uint8_t *dst, const uint16_t *src, int dstride, int sstride, int sizex, int sizey, unsigned int strength, unsigned int bd);
-#define aom_clpf_block aom_clpf_block_c
+void apply_selfguided_restoration_c(uint8_t *dat, int width, int height, int stride, int eps, int *xqd, uint8_t *dst, int dst_stride, int32_t *tmpbuf);
+#define apply_selfguided_restoration apply_selfguided_restoration_c

-void aom_clpf_block_hbd_c(uint16_t *dst, const uint16_t *src, int dstride, int sstride, int sizex, int sizey, unsigned int strength, unsigned int bd);
-#define aom_clpf_block_hbd aom_clpf_block_hbd_c
-
-void aom_clpf_hblock_c(uint8_t *dst, const uint16_t *src, int dstride, int sstride, int sizex, int sizey, unsigned int strength, unsigned int bd);
-#define aom_clpf_hblock aom_clpf_hblock_c
-
-void aom_clpf_hblock_hbd_c(uint16_t *dst, const uint16_t *src, int dstride, int sstride, int sizex, int sizey, unsigned int strength, unsigned int bd);
-#define aom_clpf_hblock_hbd aom_clpf_hblock_hbd_c
+void apply_selfguided_restoration_highbd_c(uint16_t *dat, int width, int height, int stride, int bit_depth, int eps, int *xqd, uint16_t *dst, int dst_stride, int32_t *tmpbuf);
+#define apply_selfguided_restoration_highbd apply_selfguided_restoration_highbd_c

 int64_t av1_block_error_c(const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz);
 #define av1_block_error av1_block_error_c

+void av1_convolve_2d_c(const uint8_t *src, int src_stride, CONV_BUF_TYPE *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+#define av1_convolve_2d av1_convolve_2d_c
+
+void av1_convolve_2d_scale_c(const uint8_t *src, int src_stride, CONV_BUF_TYPE *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_qn, const int x_step_qn, const int subpel_y_q4, const int y_step_qn, ConvolveParams *conv_params);
+#define av1_convolve_2d_scale av1_convolve_2d_scale_c
+
 void av1_convolve_horiz_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams fp, const int subpel_x_q4, int x_step_q4, ConvolveParams *conv_params);
 #define av1_convolve_horiz av1_convolve_horiz_c

+void av1_convolve_rounding_c(const int32_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, int bits);
+#define av1_convolve_rounding av1_convolve_rounding_c
+
 void av1_convolve_vert_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams fp, const int subpel_x_q4, int x_step_q4, ConvolveParams *conv_params);
 #define av1_convolve_vert av1_convolve_vert_c

 int av1_diamond_search_sad_c(struct macroblock *x, const struct search_site_config *cfg,  struct mv *ref_mv, struct mv *best_mv, int search_param, int sad_per_bit, int *num00, const struct aom_variance_vtable *fn_ptr, const struct mv *center_mv);
 #define av1_diamond_search_sad av1_diamond_search_sad_c

-void av1_fdct8x8_quant_c(const int16_t *input, int stride, tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
-#define av1_fdct8x8_quant av1_fdct8x8_quant_c
-
 void av1_fht16x16_c(const int16_t *input, tran_low_t *output, int stride, struct txfm_param *param);
 #define av1_fht16x16 av1_fht16x16_c

@ -106,46 +105,49 @@ void av1_fht8x4_c(const int16_t *input, tran_low_t *output, int stride, struct t
 void av1_fht8x8_c(const int16_t *input, tran_low_t *output, int stride, struct txfm_param *param);
 #define av1_fht8x8 av1_fht8x8_c

+void av1_filter_intra_edge_c(uint8_t *p, int sz, int strength);
+#define av1_filter_intra_edge av1_filter_intra_edge_c
+
+void av1_filter_intra_edge_high_c(uint16_t *p, int sz, int strength);
+#define av1_filter_intra_edge_high av1_filter_intra_edge_high_c
+
 int av1_full_range_search_c(const struct macroblock *x, const struct search_site_config *cfg, struct mv *ref_mv, struct mv *best_mv, int search_param, int sad_per_bit, int *num00, const struct aom_variance_vtable *fn_ptr, const struct mv *center_mv);
 #define av1_full_range_search av1_full_range_search_c

 int av1_full_search_sad_c(const struct macroblock *x, const struct mv *ref_mv, int sad_per_bit, int distance, const struct aom_variance_vtable *fn_ptr, const struct mv *center_mv, struct mv *best_mv);
 #define av1_full_search_sad av1_full_search_sad_c

-void av1_fwd_idtx_c(const int16_t *src_diff, tran_low_t *coeff, int stride, int bs, int tx_type);
+void av1_fwd_idtx_c(const int16_t *src_diff, tran_low_t *coeff, int stride, int bsx, int bsy, TX_TYPE tx_type);
 #define av1_fwd_idtx av1_fwd_idtx_c

-void av1_fwd_txfm2d_16x16_c(const int16_t *input, int32_t *output, int stride, int tx_type, int bd);
+void av1_fwd_txfm2d_16x16_c(const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd);
 #define av1_fwd_txfm2d_16x16 av1_fwd_txfm2d_16x16_c

-void av1_fwd_txfm2d_16x32_c(const int16_t *input, int32_t *output, int stride, int tx_type, int bd);
+void av1_fwd_txfm2d_16x32_c(const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd);
 #define av1_fwd_txfm2d_16x32 av1_fwd_txfm2d_16x32_c

-void av1_fwd_txfm2d_16x8_c(const int16_t *input, int32_t *output, int stride, int tx_type, int bd);
+void av1_fwd_txfm2d_16x8_c(const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd);
 #define av1_fwd_txfm2d_16x8 av1_fwd_txfm2d_16x8_c

-void av1_fwd_txfm2d_32x16_c(const int16_t *input, int32_t *output, int stride, int tx_type, int bd);
+void av1_fwd_txfm2d_32x16_c(const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd);
 #define av1_fwd_txfm2d_32x16 av1_fwd_txfm2d_32x16_c

-void av1_fwd_txfm2d_32x32_c(const int16_t *input, int32_t *output, int stride, int tx_type, int bd);
+void av1_fwd_txfm2d_32x32_c(const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd);
 #define av1_fwd_txfm2d_32x32 av1_fwd_txfm2d_32x32_c

-void av1_fwd_txfm2d_4x4_c(const int16_t *input, int32_t *output, int stride, int tx_type, int bd);
+void av1_fwd_txfm2d_4x4_c(const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd);
 #define av1_fwd_txfm2d_4x4 av1_fwd_txfm2d_4x4_c

-void av1_fwd_txfm2d_4x8_c(const int16_t *input, int32_t *output, int stride, int tx_type, int bd);
+void av1_fwd_txfm2d_4x8_c(const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd);
 #define av1_fwd_txfm2d_4x8 av1_fwd_txfm2d_4x8_c

-void av1_fwd_txfm2d_64x64_c(const int16_t *input, int32_t *output, int stride, int tx_type, int bd);
-#define av1_fwd_txfm2d_64x64 av1_fwd_txfm2d_64x64_c
-
-void av1_fwd_txfm2d_8x16_c(const int16_t *input, int32_t *output, int stride, int tx_type, int bd);
+void av1_fwd_txfm2d_8x16_c(const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd);
 #define av1_fwd_txfm2d_8x16 av1_fwd_txfm2d_8x16_c

-void av1_fwd_txfm2d_8x4_c(const int16_t *input, int32_t *output, int stride, int tx_type, int bd);
+void av1_fwd_txfm2d_8x4_c(const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd);
 #define av1_fwd_txfm2d_8x4 av1_fwd_txfm2d_8x4_c

-void av1_fwd_txfm2d_8x8_c(const int16_t *input, int32_t *output, int stride, int tx_type, int bd);
+void av1_fwd_txfm2d_8x8_c(const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd);
 #define av1_fwd_txfm2d_8x8 av1_fwd_txfm2d_8x8_c

 void av1_fwht4x4_c(const int16_t *input, tran_low_t *output, int stride);
@ -172,6 +174,12 @@ void av1_highbd_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint
 void av1_highbd_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
 #define av1_highbd_convolve8_vert av1_highbd_convolve8_vert_c

+void av1_highbd_convolve_2d_c(const uint16_t *src, int src_stride, CONV_BUF_TYPE *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+#define av1_highbd_convolve_2d av1_highbd_convolve_2d_c
+
+void av1_highbd_convolve_2d_scale_c(const uint16_t *src, int src_stride, CONV_BUF_TYPE *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int x_step_qn, const int subpel_y_q4, const int y_step_qn, ConvolveParams *conv_params, int bd);
+#define av1_highbd_convolve_2d_scale av1_highbd_convolve_2d_scale_c
+
 void av1_highbd_convolve_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
 #define av1_highbd_convolve_avg av1_highbd_convolve_avg_c

@ -184,6 +192,9 @@ void av1_highbd_convolve_horiz_c(const uint16_t *src, int src_stride, uint16_t *
 void av1_highbd_convolve_init_c(void);
 #define av1_highbd_convolve_init av1_highbd_convolve_init_c

+void av1_highbd_convolve_rounding_c(const int32_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, int bits, int bd);
+#define av1_highbd_convolve_rounding av1_highbd_convolve_rounding_c
+
 void av1_highbd_convolve_vert_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams fp, const int subpel_x_q4, int x_step_q4, int avg, int bd);
 #define av1_highbd_convolve_vert av1_highbd_convolve_vert_c

@ -229,9 +240,6 @@ void av1_highbd_iht8x4_32_add_c(const tran_low_t *input, uint8_t *dest, int dest
 void av1_highbd_iht8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, const struct txfm_param *param);
 #define av1_highbd_iht8x8_64_add av1_highbd_iht8x8_64_add_c

-void av1_highbd_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan, int log_scale);
-#define av1_highbd_quantize_b av1_highbd_quantize_b_c
-
 void av1_highbd_quantize_fp_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan, int log_scale);
 #define av1_highbd_quantize_fp av1_highbd_quantize_fp_c

@ -241,6 +249,12 @@ void av1_highbd_temporal_filter_apply_c(uint8_t *frame1, unsigned int stride, ui
 void av1_highbd_warp_affine_c(const int32_t *mat, const uint16_t *ref, int width, int height, int stride, uint16_t *pred, int p_col, int p_row, int p_width, int p_height, int p_stride, int subsampling_x, int subsampling_y, int bd, ConvolveParams *conv_params, int16_t alpha, int16_t beta, int16_t gamma, int16_t delta);
 #define av1_highbd_warp_affine av1_highbd_warp_affine_c

+void av1_highpass_filter_c(uint8_t *dgd, int width, int height, int stride, int32_t *dst, int dst_stride, int r, int eps);
+#define av1_highpass_filter av1_highpass_filter_c
+
+void av1_highpass_filter_highbd_c(uint16_t *dgd, int width, int height, int stride, int32_t *dst, int dst_stride, int r, int eps);
+#define av1_highpass_filter_highbd av1_highpass_filter_highbd_c
+
 void av1_iht16x16_256_add_c(const tran_low_t *input, uint8_t *output, int pitch, const struct txfm_param *param);
 #define av1_iht16x16_256_add av1_iht16x16_256_add_c

@ -283,43 +297,40 @@ void av1_iht8x4_32_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride
 void av1_iht8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, const struct txfm_param *param);
 #define av1_iht8x8_64_add av1_iht8x8_64_add_c

-void av1_inv_txfm2d_add_16x16_c(const int32_t *input, uint16_t *output, int stride, int tx_type, int bd);
+void av1_inv_txfm2d_add_16x16_c(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd);
 #define av1_inv_txfm2d_add_16x16 av1_inv_txfm2d_add_16x16_c

-void av1_inv_txfm2d_add_16x32_c(const int32_t *input, uint16_t *output, int stride, int tx_type, int bd);
+void av1_inv_txfm2d_add_16x32_c(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd);
 #define av1_inv_txfm2d_add_16x32 av1_inv_txfm2d_add_16x32_c

-void av1_inv_txfm2d_add_16x8_c(const int32_t *input, uint16_t *output, int stride, int tx_type, int bd);
+void av1_inv_txfm2d_add_16x8_c(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd);
 #define av1_inv_txfm2d_add_16x8 av1_inv_txfm2d_add_16x8_c

-void av1_inv_txfm2d_add_32x16_c(const int32_t *input, uint16_t *output, int stride, int tx_type, int bd);
+void av1_inv_txfm2d_add_32x16_c(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd);
 #define av1_inv_txfm2d_add_32x16 av1_inv_txfm2d_add_32x16_c

-void av1_inv_txfm2d_add_32x32_c(const int32_t *input, uint16_t *output, int stride, int tx_type, int bd);
+void av1_inv_txfm2d_add_32x32_c(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd);
 #define av1_inv_txfm2d_add_32x32 av1_inv_txfm2d_add_32x32_c

-void av1_inv_txfm2d_add_4x4_c(const int32_t *input, uint16_t *output, int stride, int tx_type, int bd);
+void av1_inv_txfm2d_add_4x4_c(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd);
 #define av1_inv_txfm2d_add_4x4 av1_inv_txfm2d_add_4x4_c

-void av1_inv_txfm2d_add_4x8_c(const int32_t *input, uint16_t *output, int stride, int tx_type, int bd);
+void av1_inv_txfm2d_add_4x8_c(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd);
 #define av1_inv_txfm2d_add_4x8 av1_inv_txfm2d_add_4x8_c

-void av1_inv_txfm2d_add_64x64_c(const int32_t *input, uint16_t *output, int stride, int tx_type, int bd);
-#define av1_inv_txfm2d_add_64x64 av1_inv_txfm2d_add_64x64_c
-
-void av1_inv_txfm2d_add_8x16_c(const int32_t *input, uint16_t *output, int stride, int tx_type, int bd);
+void av1_inv_txfm2d_add_8x16_c(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd);
 #define av1_inv_txfm2d_add_8x16 av1_inv_txfm2d_add_8x16_c

-void av1_inv_txfm2d_add_8x4_c(const int32_t *input, uint16_t *output, int stride, int tx_type, int bd);
+void av1_inv_txfm2d_add_8x4_c(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd);
 #define av1_inv_txfm2d_add_8x4 av1_inv_txfm2d_add_8x4_c

-void av1_inv_txfm2d_add_8x8_c(const int32_t *input, uint16_t *output, int stride, int tx_type, int bd);
+void av1_inv_txfm2d_add_8x8_c(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd);
 #define av1_inv_txfm2d_add_8x8 av1_inv_txfm2d_add_8x8_c

 void av1_lowbd_convolve_init_c(void);
 #define av1_lowbd_convolve_init av1_lowbd_convolve_init_c

-void av1_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan, int log_scale);
+void av1_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan, const qm_val_t * qm_ptr, const qm_val_t * iqm_ptr, int log_scale);
 #define av1_quantize_b av1_quantize_b_c

 void av1_quantize_fp_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
@ -328,9 +339,21 @@ void av1_quantize_fp_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_
 void av1_quantize_fp_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
 #define av1_quantize_fp_32x32 av1_quantize_fp_32x32_c

+void av1_selfguided_restoration_c(uint8_t *dgd, int width, int height, int stride, int32_t *dst, int dst_stride, int r, int eps);
+#define av1_selfguided_restoration av1_selfguided_restoration_c
+
+void av1_selfguided_restoration_highbd_c(uint16_t *dgd, int width, int height, int stride, int32_t *dst, int dst_stride, int bit_depth, int r, int eps);
+#define av1_selfguided_restoration_highbd av1_selfguided_restoration_highbd_c
+
 void av1_temporal_filter_apply_c(uint8_t *frame1, unsigned int stride, uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int filter_weight, unsigned int *accumulator, uint16_t *count);
 #define av1_temporal_filter_apply av1_temporal_filter_apply_c

+void av1_upsample_intra_edge_c(uint8_t *p, int sz);
+#define av1_upsample_intra_edge av1_upsample_intra_edge_c
+
+void av1_upsample_intra_edge_high_c(uint16_t *p, int sz, int bd);
+#define av1_upsample_intra_edge_high av1_upsample_intra_edge_high_c
+
 void av1_warp_affine_c(const int32_t *mat, const uint8_t *ref, int width, int height, int stride, uint8_t *pred, int p_col, int p_row, int p_width, int p_height, int p_stride, int subsampling_x, int subsampling_y, ConvolveParams *conv_params, int16_t alpha, int16_t beta, int16_t gamma, int16_t delta);
 #define av1_warp_affine av1_warp_affine_c

@ -343,36 +366,21 @@ int av1_wedge_sign_from_residuals_c(const int16_t *ds, const uint8_t *m, int N,
 uint64_t av1_wedge_sse_from_residuals_c(const int16_t *r1, const int16_t *d, const uint8_t *m, int N);
 #define av1_wedge_sse_from_residuals av1_wedge_sse_from_residuals_c

+void cdef_filter_block_c(uint8_t *dst8, uint16_t *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int bsize, int max);
+#define cdef_filter_block cdef_filter_block_c
+
+int cdef_find_dir_c(const uint16_t *img, int stride, int32_t *var, int coeff_shift);
+#define cdef_find_dir cdef_find_dir_c
+
 double compute_cross_correlation_c(unsigned char *im1, int stride1, int x1, int y1, unsigned char *im2, int stride2, int x2, int y2);
 #define compute_cross_correlation compute_cross_correlation_c

-void copy_4x4_16bit_to_16bit_c(uint16_t *dst, int dstride, const uint16_t *src, int sstride);
-#define copy_4x4_16bit_to_16bit copy_4x4_16bit_to_16bit_c
-
-void copy_4x4_16bit_to_8bit_c(uint8_t *dst, int dstride, const uint16_t *src, int sstride);
-#define copy_4x4_16bit_to_8bit copy_4x4_16bit_to_8bit_c
-
-void copy_8x8_16bit_to_16bit_c(uint16_t *dst, int dstride, const uint16_t *src, int sstride);
-#define copy_8x8_16bit_to_16bit copy_8x8_16bit_to_16bit_c
-
-void copy_8x8_16bit_to_8bit_c(uint8_t *dst, int dstride, const uint16_t *src, int sstride);
-#define copy_8x8_16bit_to_8bit copy_8x8_16bit_to_8bit_c
-
 void copy_rect8_16bit_to_16bit_c(uint16_t *dst, int dstride, const uint16_t *src, int sstride, int v, int h);
 #define copy_rect8_16bit_to_16bit copy_rect8_16bit_to_16bit_c

 void copy_rect8_8bit_to_16bit_c(uint16_t *dst, int dstride, const uint8_t *src, int sstride, int v, int h);
 #define copy_rect8_8bit_to_16bit copy_rect8_8bit_to_16bit_c

-int od_dir_find8_c(const od_dering_in *img, int stride, int32_t *var, int coeff_shift);
-#define od_dir_find8 od_dir_find8_c
-
-void od_filter_dering_direction_4x4_c(uint16_t *y, int ystride, const uint16_t *in, int threshold, int dir, int damping);
-#define od_filter_dering_direction_4x4 od_filter_dering_direction_4x4_c
-
-void od_filter_dering_direction_8x8_c(uint16_t *y, int ystride, const uint16_t *in, int threshold, int dir, int damping);
-#define od_filter_dering_direction_8x8 od_filter_dering_direction_8x8_c
-
 void aom_rtcd(void);

 #include "aom_config.h"
--- a/media/libaom/config/linux/arm/aom_config.asm
+++ b/media/libaom/config/linux/arm/aom_config.asm
@ -49,8 +49,6 @@
 .equ CONFIG_STATIC_MSVCRT ,  0
 .equ CONFIG_SPATIAL_RESAMPLING ,  1
 .equ CONFIG_REALTIME_ONLY ,  1
-.equ CONFIG_ONTHEFLY_BITPACKING ,  0
-.equ CONFIG_ERROR_CONCEALMENT ,  0
 .equ CONFIG_SHARED ,  0
 .equ CONFIG_STATIC ,  1
 .equ CONFIG_SMALL ,  0
@ -63,73 +61,71 @@
 .equ CONFIG_INSPECTION ,  0
 .equ CONFIG_DECODE_PERF_TESTS ,  0
 .equ CONFIG_ENCODE_PERF_TESTS ,  0
+.equ CONFIG_BITSTREAM_DEBUG ,  0
+.equ CONFIG_SYMBOLRATE ,  0
 .equ CONFIG_COEFFICIENT_RANGE_CHECKING ,  0
 .equ CONFIG_LOWBITDEPTH ,  1
 .equ CONFIG_HIGHBITDEPTH ,  1
 .equ CONFIG_EXPERIMENTAL ,  0
 .equ CONFIG_SIZE_LIMIT ,  1
-.equ CONFIG_COLORSPACE_HEADERS ,  0
 .equ CONFIG_FP_MB_STATS ,  0
 .equ CONFIG_CDEF ,  1
+.equ CONFIG_CDEF_SINGLEPASS ,  1
 .equ CONFIG_VAR_TX ,  1
 .equ CONFIG_RECT_TX ,  1
 .equ CONFIG_RECT_TX_EXT ,  0
 .equ CONFIG_TPL_MV ,  0
 .equ CONFIG_DUAL_FILTER ,  1
-.equ CONFIG_CONVOLVE_ROUND ,  0
+.equ CONFIG_CONVOLVE_ROUND ,  1
 .equ CONFIG_COMPOUND_ROUND ,  0
 .equ CONFIG_EXT_TX ,  1
-.equ CONFIG_DPCM_INTRA ,  0
 .equ CONFIG_TX64X64 ,  0
 .equ CONFIG_EXT_INTRA ,  1
 .equ CONFIG_INTRA_INTERP ,  0
 .equ CONFIG_FILTER_INTRA ,  0
-.equ CONFIG_INTRA_EDGE ,  0
+.equ CONFIG_INTRA_EDGE ,  1
 .equ CONFIG_INTRABC ,  0
-.equ CONFIG_EXT_INTER ,  1
 .equ CONFIG_INTERINTRA ,  1
 .equ CONFIG_WEDGE ,  1
 .equ CONFIG_COMPOUND_SEGMENT ,  1
 .equ CONFIG_EXT_REFS ,  1
-.equ CONFIG_ALTREF2 ,  0
-.equ CONFIG_SPEED_REFS ,  0
-.equ CONFIG_GF_GROUPS ,  0
-.equ CONFIG_FLEX_REFS ,  0
 .equ CONFIG_GLOBAL_MOTION ,  1
 .equ CONFIG_NEW_QUANT ,  0
 .equ CONFIG_SUPERTX ,  0
 .equ CONFIG_ANS ,  0
-.equ CONFIG_LOOP_RESTORATION ,  0
+.equ CONFIG_LOOP_RESTORATION ,  1
+.equ CONFIG_STRIPED_LOOP_RESTORATION ,  0
 .equ CONFIG_EXT_PARTITION ,  0
 .equ CONFIG_EXT_PARTITION_TYPES ,  0
+.equ CONFIG_EXT_PARTITION_TYPES_AB ,  0
 .equ CONFIG_UNPOISON_PARTITION_CTX ,  0
 .equ CONFIG_EXT_TILE ,  0
 .equ CONFIG_MOTION_VAR ,  1
 .equ CONFIG_NCOBMC ,  0
 .equ CONFIG_WARPED_MOTION ,  1
 .equ CONFIG_Q_ADAPT_PROBS ,  0
-.equ CONFIG_BITSTREAM_DEBUG ,  0
 .equ CONFIG_INTER_STATS_ONLY ,  0
-.equ CONFIG_ALT_INTRA ,  1
-.equ CONFIG_PALETTE ,  1
 .equ CONFIG_PALETTE_DELTA_ENCODING ,  0
 .equ CONFIG_RAWBITS ,  0
-.equ CONFIG_EC_SMALLMUL ,  1
+.equ CONFIG_KF_CTX ,  0
 .equ CONFIG_PVQ ,  0
 .equ CONFIG_CFL ,  0
 .equ CONFIG_XIPHRC ,  0
 .equ CONFIG_DCT_ONLY ,  0
+.equ CONFIG_DAALA_TX ,  0
 .equ CONFIG_DAALA_DCT4 ,  0
 .equ CONFIG_DAALA_DCT8 ,  0
+.equ CONFIG_DAALA_DCT16 ,  0
+.equ CONFIG_DAALA_DCT32 ,  0
+.equ CONFIG_DAALA_DCT64 ,  0
 .equ CONFIG_CB4X4 ,  1
 .equ CONFIG_CHROMA_2X2 ,  0
 .equ CONFIG_CHROMA_SUB8X8 ,  1
 .equ CONFIG_FRAME_SIZE ,  0
-.equ CONFIG_DELTA_Q ,  1
 .equ CONFIG_EXT_DELTA_Q ,  1
 .equ CONFIG_ADAPT_SCAN ,  0
-.equ CONFIG_FILTER_7BIT ,  1
 .equ CONFIG_PARALLEL_DEBLOCKING ,  1
+.equ CONFIG_DEBLOCK_13TAP ,  0
 .equ CONFIG_LOOPFILTERING_ACROSS_TILES ,  1
 .equ CONFIG_TEMPMV_SIGNALING ,  1
 .equ CONFIG_RD_DEBUG ,  0
@ -138,30 +134,47 @@
 .equ CONFIG_ENTROPY_STATS ,  0
 .equ CONFIG_MASKED_TX ,  0
 .equ CONFIG_DEPENDENT_HORZTILES ,  0
-.equ CONFIG_DIST_8X8 ,  0
-.equ CONFIG_DAALA_DIST ,  0
-.equ CONFIG_TRIPRED ,  0
+.equ CONFIG_DIST_8X8 ,  1
 .equ CONFIG_PALETTE_THROUGHPUT ,  1
 .equ CONFIG_REF_ADAPT ,  0
 .equ CONFIG_LV_MAP ,  0
+.equ CONFIG_CTX1D ,  0
 .equ CONFIG_TXK_SEL ,  0
 .equ CONFIG_MV_COMPRESS ,  1
+.equ CONFIG_SEGMENT_ZEROMV ,  0
 .equ CONFIG_FRAME_SUPERRES ,  0
 .equ CONFIG_NEW_MULTISYMBOL ,  0
 .equ CONFIG_COMPOUND_SINGLEREF ,  0
-.equ CONFIG_AOM_QM ,  0
+.equ CONFIG_AOM_QM ,  1
 .equ CONFIG_ONE_SIDED_COMPOUND ,  1
-.equ CONFIG_EXT_COMP_REFS ,  0
+.equ CONFIG_EXT_COMP_REFS ,  1
 .equ CONFIG_SMOOTH_HV ,  1
 .equ CONFIG_VAR_REFS ,  0
-.equ CONFIG_RECT_INTRA_PRED ,  1
 .equ CONFIG_LGT ,  0
+.equ CONFIG_LGT_FROM_PRED ,  0
 .equ CONFIG_SBL_SYMBOL ,  0
 .equ CONFIG_NCOBMC_ADAPT_WEIGHT ,  0
 .equ CONFIG_BGSPRITE ,  0
 .equ CONFIG_VAR_TX_NO_TX_MODE ,  0
 .equ CONFIG_MRC_TX ,  0
 .equ CONFIG_LPF_DIRECT ,  0
-.equ CONFIG_UV_LVL ,  0
+.equ CONFIG_LOOPFILTER_LEVEL ,  0
+.equ CONFIG_NO_FRAME_CONTEXT_SIGNALING ,  0
+.equ CONFIG_TXMG ,  1
+.equ CONFIG_MAX_TILE ,  0
+.equ CONFIG_HASH_ME ,  0
+.equ CONFIG_COLORSPACE_HEADERS ,  0
+.equ CONFIG_MFMV ,  0
+.equ CONFIG_FRAME_MARKER ,  0
+.equ CONFIG_JNT_COMP ,  0
+.equ CONFIG_FRAME_SIGN_BIAS ,  0
+.equ CONFIG_EXT_SKIP ,  0
+.equ CONFIG_OBU ,  0
+.equ CONFIG_AMVR ,  0
+.equ CONFIG_LPF_SB ,  0
+.equ CONFIG_OPT_REF_MV ,  0
+.equ CONFIG_TMV ,  0
+.equ CONFIG_RESTRICT_COMPRESSED_HDR ,  0
+.equ CONFIG_HORZONLY_FRAME_SUPERRES ,  0
 .equ CONFIG_ANALYZER ,  0
 	.section	.note.GNU-stack,"",%progbits
--- a/media/libaom/config/linux/arm/aom_config.h
+++ b/media/libaom/config/linux/arm/aom_config.h
@ -59,8 +59,6 @@
 #define CONFIG_STATIC_MSVCRT 0
 #define CONFIG_SPATIAL_RESAMPLING 1
 #define CONFIG_REALTIME_ONLY 1
-#define CONFIG_ONTHEFLY_BITPACKING 0
-#define CONFIG_ERROR_CONCEALMENT 0
 #define CONFIG_SHARED 0
 #define CONFIG_STATIC 1
 #define CONFIG_SMALL 0
@ -73,73 +71,71 @@
 #define CONFIG_INSPECTION 0
 #define CONFIG_DECODE_PERF_TESTS 0
 #define CONFIG_ENCODE_PERF_TESTS 0
+#define CONFIG_BITSTREAM_DEBUG 0
+#define CONFIG_SYMBOLRATE 0
 #define CONFIG_COEFFICIENT_RANGE_CHECKING 0
 #define CONFIG_LOWBITDEPTH 1
 #define CONFIG_HIGHBITDEPTH 1
 #define CONFIG_EXPERIMENTAL 0
 #define CONFIG_SIZE_LIMIT 1
-#define CONFIG_COLORSPACE_HEADERS 0
 #define CONFIG_FP_MB_STATS 0
 #define CONFIG_CDEF 1
+#define CONFIG_CDEF_SINGLEPASS 1
 #define CONFIG_VAR_TX 1
 #define CONFIG_RECT_TX 1
 #define CONFIG_RECT_TX_EXT 0
 #define CONFIG_TPL_MV 0
 #define CONFIG_DUAL_FILTER 1
-#define CONFIG_CONVOLVE_ROUND 0
+#define CONFIG_CONVOLVE_ROUND 1
 #define CONFIG_COMPOUND_ROUND 0
 #define CONFIG_EXT_TX 1
-#define CONFIG_DPCM_INTRA 0
 #define CONFIG_TX64X64 0
 #define CONFIG_EXT_INTRA 1
 #define CONFIG_INTRA_INTERP 0
 #define CONFIG_FILTER_INTRA 0
-#define CONFIG_INTRA_EDGE 0
+#define CONFIG_INTRA_EDGE 1
 #define CONFIG_INTRABC 0
-#define CONFIG_EXT_INTER 1
 #define CONFIG_INTERINTRA 1
 #define CONFIG_WEDGE 1
 #define CONFIG_COMPOUND_SEGMENT 1
 #define CONFIG_EXT_REFS 1
-#define CONFIG_ALTREF2 0
-#define CONFIG_SPEED_REFS 0
-#define CONFIG_GF_GROUPS 0
-#define CONFIG_FLEX_REFS 0
 #define CONFIG_GLOBAL_MOTION 1
 #define CONFIG_NEW_QUANT 0
 #define CONFIG_SUPERTX 0
 #define CONFIG_ANS 0
-#define CONFIG_LOOP_RESTORATION 0
+#define CONFIG_LOOP_RESTORATION 1
+#define CONFIG_STRIPED_LOOP_RESTORATION 0
 #define CONFIG_EXT_PARTITION 0
 #define CONFIG_EXT_PARTITION_TYPES 0
+#define CONFIG_EXT_PARTITION_TYPES_AB 0
 #define CONFIG_UNPOISON_PARTITION_CTX 0
 #define CONFIG_EXT_TILE 0
 #define CONFIG_MOTION_VAR 1
 #define CONFIG_NCOBMC 0
 #define CONFIG_WARPED_MOTION 1
 #define CONFIG_Q_ADAPT_PROBS 0
-#define CONFIG_BITSTREAM_DEBUG 0
 #define CONFIG_INTER_STATS_ONLY 0
-#define CONFIG_ALT_INTRA 1
-#define CONFIG_PALETTE 1
 #define CONFIG_PALETTE_DELTA_ENCODING 0
 #define CONFIG_RAWBITS 0
-#define CONFIG_EC_SMALLMUL 1
+#define CONFIG_KF_CTX 0
 #define CONFIG_PVQ 0
 #define CONFIG_CFL 0
 #define CONFIG_XIPHRC 0
 #define CONFIG_DCT_ONLY 0
+#define CONFIG_DAALA_TX 0
 #define CONFIG_DAALA_DCT4 0
 #define CONFIG_DAALA_DCT8 0
+#define CONFIG_DAALA_DCT16 0
+#define CONFIG_DAALA_DCT32 0
+#define CONFIG_DAALA_DCT64 0
 #define CONFIG_CB4X4 1
 #define CONFIG_CHROMA_2X2 0
 #define CONFIG_CHROMA_SUB8X8 1
 #define CONFIG_FRAME_SIZE 0
-#define CONFIG_DELTA_Q 1
 #define CONFIG_EXT_DELTA_Q 1
 #define CONFIG_ADAPT_SCAN 0
-#define CONFIG_FILTER_7BIT 1
 #define CONFIG_PARALLEL_DEBLOCKING 1
+#define CONFIG_DEBLOCK_13TAP 0
 #define CONFIG_LOOPFILTERING_ACROSS_TILES 1
 #define CONFIG_TEMPMV_SIGNALING 1
 #define CONFIG_RD_DEBUG 0
@ -148,31 +144,48 @@
 #define CONFIG_ENTROPY_STATS 0
 #define CONFIG_MASKED_TX 0
 #define CONFIG_DEPENDENT_HORZTILES 0
-#define CONFIG_DIST_8X8 0
-#define CONFIG_DAALA_DIST 0
-#define CONFIG_TRIPRED 0
+#define CONFIG_DIST_8X8 1
 #define CONFIG_PALETTE_THROUGHPUT 1
 #define CONFIG_REF_ADAPT 0
 #define CONFIG_LV_MAP 0
+#define CONFIG_CTX1D 0
 #define CONFIG_TXK_SEL 0
 #define CONFIG_MV_COMPRESS 1
+#define CONFIG_SEGMENT_ZEROMV 0
 #define CONFIG_FRAME_SUPERRES 0
 #define CONFIG_NEW_MULTISYMBOL 0
 #define CONFIG_COMPOUND_SINGLEREF 0
-#define CONFIG_AOM_QM 0
+#define CONFIG_AOM_QM 1
 #define CONFIG_ONE_SIDED_COMPOUND 1
-#define CONFIG_EXT_COMP_REFS 0
+#define CONFIG_EXT_COMP_REFS 1
 #define CONFIG_SMOOTH_HV 1
 #define CONFIG_VAR_REFS 0
-#define CONFIG_RECT_INTRA_PRED 1
 #define CONFIG_LGT 0
+#define CONFIG_LGT_FROM_PRED 0
 #define CONFIG_SBL_SYMBOL 0
 #define CONFIG_NCOBMC_ADAPT_WEIGHT 0
 #define CONFIG_BGSPRITE 0
 #define CONFIG_VAR_TX_NO_TX_MODE 0
 #define CONFIG_MRC_TX 0
 #define CONFIG_LPF_DIRECT 0
-#define CONFIG_UV_LVL 0
+#define CONFIG_LOOPFILTER_LEVEL 0
+#define CONFIG_NO_FRAME_CONTEXT_SIGNALING 0
+#define CONFIG_TXMG 1
+#define CONFIG_MAX_TILE 0
+#define CONFIG_HASH_ME 0
+#define CONFIG_COLORSPACE_HEADERS 0
+#define CONFIG_MFMV 0
+#define CONFIG_FRAME_MARKER 0
+#define CONFIG_JNT_COMP 0
+#define CONFIG_FRAME_SIGN_BIAS 0
+#define CONFIG_EXT_SKIP 0
+#define CONFIG_OBU 0
+#define CONFIG_AMVR 0
+#define CONFIG_LPF_SB 0
+#define CONFIG_OPT_REF_MV 0
+#define CONFIG_TMV 0
+#define CONFIG_RESTRICT_COMPRESSED_HDR 0
+#define CONFIG_HORZONLY_FRAME_SUPERRES 0
 #define CONFIG_ANALYZER 0
 #define DECODE_WIDTH_LIMIT 8192
 #define DECODE_HEIGHT_LIMIT 4608
--- a/media/libaom/config/linux/arm/aom_dsp_rtcd.h
+++ b/media/libaom/config/linux/arm/aom_dsp_rtcd.h
@ -20,6 +20,9 @@
 extern "C" {
 #endif

+void aom_blend_a64_d32_mask_c(int32_t *dst, uint32_t dst_stride, const int32_t *src0, uint32_t src0_stride, const int32_t *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int h, int w, int suby, int subx);
+#define aom_blend_a64_d32_mask aom_blend_a64_d32_mask_c
+
 void aom_blend_a64_hmask_c(uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, int h, int w);
 #define aom_blend_a64_hmask aom_blend_a64_hmask_c

@ -45,6 +48,24 @@ void aom_convolve8_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptr
 void aom_convolve8_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
 RTCD_EXTERN void (*aom_convolve8)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);

+void aom_convolve8_add_src_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+#define aom_convolve8_add_src aom_convolve8_add_src_c
+
+void aom_convolve8_add_src_hip_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+#define aom_convolve8_add_src_hip aom_convolve8_add_src_hip_c
+
+void aom_convolve8_add_src_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+#define aom_convolve8_add_src_horiz aom_convolve8_add_src_horiz_c
+
+void aom_convolve8_add_src_horiz_hip_c(const uint8_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+#define aom_convolve8_add_src_horiz_hip aom_convolve8_add_src_horiz_hip_c
+
+void aom_convolve8_add_src_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+#define aom_convolve8_add_src_vert aom_convolve8_add_src_vert_c
+
+void aom_convolve8_add_src_vert_hip_c(const uint16_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+#define aom_convolve8_add_src_vert_hip aom_convolve8_add_src_vert_hip_c
+
 void aom_convolve8_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
 void aom_convolve8_avg_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
 RTCD_EXTERN void (*aom_convolve8_avg)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
@ -1245,6 +1266,24 @@ void aom_highbd_comp_mask_upsampled_pred_c(uint16_t *comp_pred, const uint8_t *p
 void aom_highbd_convolve8_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
 #define aom_highbd_convolve8 aom_highbd_convolve8_c

+void aom_highbd_convolve8_add_src_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+#define aom_highbd_convolve8_add_src aom_highbd_convolve8_add_src_c
+
+void aom_highbd_convolve8_add_src_hip_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+#define aom_highbd_convolve8_add_src_hip aom_highbd_convolve8_add_src_hip_c
+
+void aom_highbd_convolve8_add_src_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+#define aom_highbd_convolve8_add_src_horiz aom_highbd_convolve8_add_src_horiz_c
+
+void aom_highbd_convolve8_add_src_horiz_hip_c(const uint8_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+#define aom_highbd_convolve8_add_src_horiz_hip aom_highbd_convolve8_add_src_horiz_hip_c
+
+void aom_highbd_convolve8_add_src_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+#define aom_highbd_convolve8_add_src_vert aom_highbd_convolve8_add_src_vert_c
+
+void aom_highbd_convolve8_add_src_vert_hip_c(const uint16_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+#define aom_highbd_convolve8_add_src_vert_hip aom_highbd_convolve8_add_src_vert_hip_c
+
 void aom_highbd_convolve8_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
 #define aom_highbd_convolve8_avg aom_highbd_convolve8_avg_c

@ -2244,52 +2283,40 @@ void aom_iwht4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride
 #define aom_iwht4x4_1_add aom_iwht4x4_1_add_c

 void aom_lpf_horizontal_4_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
-void aom_lpf_horizontal_4_neon(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
-RTCD_EXTERN void (*aom_lpf_horizontal_4)(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
+#define aom_lpf_horizontal_4 aom_lpf_horizontal_4_c

 void aom_lpf_horizontal_4_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
-void aom_lpf_horizontal_4_dual_neon(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
-RTCD_EXTERN void (*aom_lpf_horizontal_4_dual)(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+#define aom_lpf_horizontal_4_dual aom_lpf_horizontal_4_dual_c

 void aom_lpf_horizontal_8_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
-void aom_lpf_horizontal_8_neon(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
-RTCD_EXTERN void (*aom_lpf_horizontal_8)(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
+#define aom_lpf_horizontal_8 aom_lpf_horizontal_8_c

 void aom_lpf_horizontal_8_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
-void aom_lpf_horizontal_8_dual_neon(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
-RTCD_EXTERN void (*aom_lpf_horizontal_8_dual)(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+#define aom_lpf_horizontal_8_dual aom_lpf_horizontal_8_dual_c

 void aom_lpf_horizontal_edge_16_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
-void aom_lpf_horizontal_edge_16_neon(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
-RTCD_EXTERN void (*aom_lpf_horizontal_edge_16)(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
+#define aom_lpf_horizontal_edge_16 aom_lpf_horizontal_edge_16_c

 void aom_lpf_horizontal_edge_8_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
-void aom_lpf_horizontal_edge_8_neon(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
-RTCD_EXTERN void (*aom_lpf_horizontal_edge_8)(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
+#define aom_lpf_horizontal_edge_8 aom_lpf_horizontal_edge_8_c

 void aom_lpf_vertical_16_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
-void aom_lpf_vertical_16_neon(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
-RTCD_EXTERN void (*aom_lpf_vertical_16)(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
+#define aom_lpf_vertical_16 aom_lpf_vertical_16_c

 void aom_lpf_vertical_16_dual_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
-void aom_lpf_vertical_16_dual_neon(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
-RTCD_EXTERN void (*aom_lpf_vertical_16_dual)(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
+#define aom_lpf_vertical_16_dual aom_lpf_vertical_16_dual_c

 void aom_lpf_vertical_4_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
-void aom_lpf_vertical_4_neon(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
-RTCD_EXTERN void (*aom_lpf_vertical_4)(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
+#define aom_lpf_vertical_4 aom_lpf_vertical_4_c

 void aom_lpf_vertical_4_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
-void aom_lpf_vertical_4_dual_neon(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
-RTCD_EXTERN void (*aom_lpf_vertical_4_dual)(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+#define aom_lpf_vertical_4_dual aom_lpf_vertical_4_dual_c

 void aom_lpf_vertical_8_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
-void aom_lpf_vertical_8_neon(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
-RTCD_EXTERN void (*aom_lpf_vertical_8)(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
+#define aom_lpf_vertical_8 aom_lpf_vertical_8_c

 void aom_lpf_vertical_8_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
-void aom_lpf_vertical_8_dual_neon(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
-RTCD_EXTERN void (*aom_lpf_vertical_8_dual)(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+#define aom_lpf_vertical_8_dual aom_lpf_vertical_8_dual_c

 unsigned int aom_masked_sad16x16_c(const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, const uint8_t *second_pred, const uint8_t *msk, int msk_stride, int invert_mask);
 #define aom_masked_sad16x16 aom_masked_sad16x16_c
@ -3126,30 +3153,6 @@ static void setup_rtcd_internal(void)
    if (flags & HAS_NEON) aom_int_pro_col = aom_int_pro_col_neon;
    aom_int_pro_row = aom_int_pro_row_c;
    if (flags & HAS_NEON) aom_int_pro_row = aom_int_pro_row_neon;
-    aom_lpf_horizontal_4 = aom_lpf_horizontal_4_c;
-    if (flags & HAS_NEON) aom_lpf_horizontal_4 = aom_lpf_horizontal_4_neon;
-    aom_lpf_horizontal_4_dual = aom_lpf_horizontal_4_dual_c;
-    if (flags & HAS_NEON) aom_lpf_horizontal_4_dual = aom_lpf_horizontal_4_dual_neon;
-    aom_lpf_horizontal_8 = aom_lpf_horizontal_8_c;
-    if (flags & HAS_NEON) aom_lpf_horizontal_8 = aom_lpf_horizontal_8_neon;
-    aom_lpf_horizontal_8_dual = aom_lpf_horizontal_8_dual_c;
-    if (flags & HAS_NEON) aom_lpf_horizontal_8_dual = aom_lpf_horizontal_8_dual_neon;
-    aom_lpf_horizontal_edge_16 = aom_lpf_horizontal_edge_16_c;
-    if (flags & HAS_NEON) aom_lpf_horizontal_edge_16 = aom_lpf_horizontal_edge_16_neon;
-    aom_lpf_horizontal_edge_8 = aom_lpf_horizontal_edge_8_c;
-    if (flags & HAS_NEON) aom_lpf_horizontal_edge_8 = aom_lpf_horizontal_edge_8_neon;
-    aom_lpf_vertical_16 = aom_lpf_vertical_16_c;
-    if (flags & HAS_NEON) aom_lpf_vertical_16 = aom_lpf_vertical_16_neon;
-    aom_lpf_vertical_16_dual = aom_lpf_vertical_16_dual_c;
-    if (flags & HAS_NEON) aom_lpf_vertical_16_dual = aom_lpf_vertical_16_dual_neon;
-    aom_lpf_vertical_4 = aom_lpf_vertical_4_c;
-    if (flags & HAS_NEON) aom_lpf_vertical_4 = aom_lpf_vertical_4_neon;
-    aom_lpf_vertical_4_dual = aom_lpf_vertical_4_dual_c;
-    if (flags & HAS_NEON) aom_lpf_vertical_4_dual = aom_lpf_vertical_4_dual_neon;
-    aom_lpf_vertical_8 = aom_lpf_vertical_8_c;
-    if (flags & HAS_NEON) aom_lpf_vertical_8 = aom_lpf_vertical_8_neon;
-    aom_lpf_vertical_8_dual = aom_lpf_vertical_8_dual_c;
-    if (flags & HAS_NEON) aom_lpf_vertical_8_dual = aom_lpf_vertical_8_dual_neon;
    aom_minmax_8x8 = aom_minmax_8x8_c;
    if (flags & HAS_NEON) aom_minmax_8x8 = aom_minmax_8x8_neon;
    aom_mse16x16 = aom_mse16x16_c;
--- a/media/libaom/config/linux/arm/av1_rtcd.h
+++ b/media/libaom/config/linux/arm/av1_rtcd.h
@ -31,43 +31,38 @@ struct search_site_config;
 struct mv;
 union int_mv;
 struct yv12_buffer_config;
-typedef uint16_t od_dering_in;

 #ifdef __cplusplus
 extern "C" {
 #endif

-void aom_clpf_block_c(uint8_t *dst, const uint16_t *src, int dstride, int sstride, int sizex, int sizey, unsigned int strength, unsigned int bd);
-void aom_clpf_block_neon(uint8_t *dst, const uint16_t *src, int dstride, int sstride, int sizex, int sizey, unsigned int strength, unsigned int bd);
-RTCD_EXTERN void (*aom_clpf_block)(uint8_t *dst, const uint16_t *src, int dstride, int sstride, int sizex, int sizey, unsigned int strength, unsigned int bd);
+void apply_selfguided_restoration_c(uint8_t *dat, int width, int height, int stride, int eps, int *xqd, uint8_t *dst, int dst_stride, int32_t *tmpbuf);
+#define apply_selfguided_restoration apply_selfguided_restoration_c

-void aom_clpf_block_hbd_c(uint16_t *dst, const uint16_t *src, int dstride, int sstride, int sizex, int sizey, unsigned int strength, unsigned int bd);
-void aom_clpf_block_hbd_neon(uint16_t *dst, const uint16_t *src, int dstride, int sstride, int sizex, int sizey, unsigned int strength, unsigned int bd);
-RTCD_EXTERN void (*aom_clpf_block_hbd)(uint16_t *dst, const uint16_t *src, int dstride, int sstride, int sizex, int sizey, unsigned int strength, unsigned int bd);
-
-void aom_clpf_hblock_c(uint8_t *dst, const uint16_t *src, int dstride, int sstride, int sizex, int sizey, unsigned int strength, unsigned int bd);
-void aom_clpf_hblock_neon(uint8_t *dst, const uint16_t *src, int dstride, int sstride, int sizex, int sizey, unsigned int strength, unsigned int bd);
-RTCD_EXTERN void (*aom_clpf_hblock)(uint8_t *dst, const uint16_t *src, int dstride, int sstride, int sizex, int sizey, unsigned int strength, unsigned int bd);
-
-void aom_clpf_hblock_hbd_c(uint16_t *dst, const uint16_t *src, int dstride, int sstride, int sizex, int sizey, unsigned int strength, unsigned int bd);
-void aom_clpf_hblock_hbd_neon(uint16_t *dst, const uint16_t *src, int dstride, int sstride, int sizex, int sizey, unsigned int strength, unsigned int bd);
-RTCD_EXTERN void (*aom_clpf_hblock_hbd)(uint16_t *dst, const uint16_t *src, int dstride, int sstride, int sizex, int sizey, unsigned int strength, unsigned int bd);
+void apply_selfguided_restoration_highbd_c(uint16_t *dat, int width, int height, int stride, int bit_depth, int eps, int *xqd, uint16_t *dst, int dst_stride, int32_t *tmpbuf);
+#define apply_selfguided_restoration_highbd apply_selfguided_restoration_highbd_c

 int64_t av1_block_error_c(const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz);
 #define av1_block_error av1_block_error_c

+void av1_convolve_2d_c(const uint8_t *src, int src_stride, CONV_BUF_TYPE *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+#define av1_convolve_2d av1_convolve_2d_c
+
+void av1_convolve_2d_scale_c(const uint8_t *src, int src_stride, CONV_BUF_TYPE *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_qn, const int x_step_qn, const int subpel_y_q4, const int y_step_qn, ConvolveParams *conv_params);
+#define av1_convolve_2d_scale av1_convolve_2d_scale_c
+
 void av1_convolve_horiz_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams fp, const int subpel_x_q4, int x_step_q4, ConvolveParams *conv_params);
 #define av1_convolve_horiz av1_convolve_horiz_c

+void av1_convolve_rounding_c(const int32_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, int bits);
+#define av1_convolve_rounding av1_convolve_rounding_c
+
 void av1_convolve_vert_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams fp, const int subpel_x_q4, int x_step_q4, ConvolveParams *conv_params);
 #define av1_convolve_vert av1_convolve_vert_c

 int av1_diamond_search_sad_c(struct macroblock *x, const struct search_site_config *cfg,  struct mv *ref_mv, struct mv *best_mv, int search_param, int sad_per_bit, int *num00, const struct aom_variance_vtable *fn_ptr, const struct mv *center_mv);
 #define av1_diamond_search_sad av1_diamond_search_sad_c

-void av1_fdct8x8_quant_c(const int16_t *input, int stride, tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
-#define av1_fdct8x8_quant av1_fdct8x8_quant_c
-
 void av1_fht16x16_c(const int16_t *input, tran_low_t *output, int stride, struct txfm_param *param);
 #define av1_fht16x16 av1_fht16x16_c

@ -110,46 +105,49 @@ void av1_fht8x4_c(const int16_t *input, tran_low_t *output, int stride, struct t
 void av1_fht8x8_c(const int16_t *input, tran_low_t *output, int stride, struct txfm_param *param);
 #define av1_fht8x8 av1_fht8x8_c

+void av1_filter_intra_edge_c(uint8_t *p, int sz, int strength);
+#define av1_filter_intra_edge av1_filter_intra_edge_c
+
+void av1_filter_intra_edge_high_c(uint16_t *p, int sz, int strength);
+#define av1_filter_intra_edge_high av1_filter_intra_edge_high_c
+
 int av1_full_range_search_c(const struct macroblock *x, const struct search_site_config *cfg, struct mv *ref_mv, struct mv *best_mv, int search_param, int sad_per_bit, int *num00, const struct aom_variance_vtable *fn_ptr, const struct mv *center_mv);
 #define av1_full_range_search av1_full_range_search_c

 int av1_full_search_sad_c(const struct macroblock *x, const struct mv *ref_mv, int sad_per_bit, int distance, const struct aom_variance_vtable *fn_ptr, const struct mv *center_mv, struct mv *best_mv);
 #define av1_full_search_sad av1_full_search_sad_c

-void av1_fwd_idtx_c(const int16_t *src_diff, tran_low_t *coeff, int stride, int bs, int tx_type);
+void av1_fwd_idtx_c(const int16_t *src_diff, tran_low_t *coeff, int stride, int bsx, int bsy, TX_TYPE tx_type);
 #define av1_fwd_idtx av1_fwd_idtx_c

-void av1_fwd_txfm2d_16x16_c(const int16_t *input, int32_t *output, int stride, int tx_type, int bd);
+void av1_fwd_txfm2d_16x16_c(const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd);
 #define av1_fwd_txfm2d_16x16 av1_fwd_txfm2d_16x16_c

-void av1_fwd_txfm2d_16x32_c(const int16_t *input, int32_t *output, int stride, int tx_type, int bd);
+void av1_fwd_txfm2d_16x32_c(const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd);
 #define av1_fwd_txfm2d_16x32 av1_fwd_txfm2d_16x32_c

-void av1_fwd_txfm2d_16x8_c(const int16_t *input, int32_t *output, int stride, int tx_type, int bd);
+void av1_fwd_txfm2d_16x8_c(const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd);
 #define av1_fwd_txfm2d_16x8 av1_fwd_txfm2d_16x8_c

-void av1_fwd_txfm2d_32x16_c(const int16_t *input, int32_t *output, int stride, int tx_type, int bd);
+void av1_fwd_txfm2d_32x16_c(const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd);
 #define av1_fwd_txfm2d_32x16 av1_fwd_txfm2d_32x16_c

-void av1_fwd_txfm2d_32x32_c(const int16_t *input, int32_t *output, int stride, int tx_type, int bd);
+void av1_fwd_txfm2d_32x32_c(const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd);
 #define av1_fwd_txfm2d_32x32 av1_fwd_txfm2d_32x32_c

-void av1_fwd_txfm2d_4x4_c(const int16_t *input, int32_t *output, int stride, int tx_type, int bd);
+void av1_fwd_txfm2d_4x4_c(const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd);
 #define av1_fwd_txfm2d_4x4 av1_fwd_txfm2d_4x4_c

-void av1_fwd_txfm2d_4x8_c(const int16_t *input, int32_t *output, int stride, int tx_type, int bd);
+void av1_fwd_txfm2d_4x8_c(const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd);
 #define av1_fwd_txfm2d_4x8 av1_fwd_txfm2d_4x8_c

-void av1_fwd_txfm2d_64x64_c(const int16_t *input, int32_t *output, int stride, int tx_type, int bd);
-#define av1_fwd_txfm2d_64x64 av1_fwd_txfm2d_64x64_c
-
-void av1_fwd_txfm2d_8x16_c(const int16_t *input, int32_t *output, int stride, int tx_type, int bd);
+void av1_fwd_txfm2d_8x16_c(const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd);
 #define av1_fwd_txfm2d_8x16 av1_fwd_txfm2d_8x16_c

-void av1_fwd_txfm2d_8x4_c(const int16_t *input, int32_t *output, int stride, int tx_type, int bd);
+void av1_fwd_txfm2d_8x4_c(const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd);
 #define av1_fwd_txfm2d_8x4 av1_fwd_txfm2d_8x4_c

-void av1_fwd_txfm2d_8x8_c(const int16_t *input, int32_t *output, int stride, int tx_type, int bd);
+void av1_fwd_txfm2d_8x8_c(const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd);
 #define av1_fwd_txfm2d_8x8 av1_fwd_txfm2d_8x8_c

 void av1_fwht4x4_c(const int16_t *input, tran_low_t *output, int stride);
@ -176,6 +174,12 @@ void av1_highbd_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint
 void av1_highbd_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
 #define av1_highbd_convolve8_vert av1_highbd_convolve8_vert_c

+void av1_highbd_convolve_2d_c(const uint16_t *src, int src_stride, CONV_BUF_TYPE *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+#define av1_highbd_convolve_2d av1_highbd_convolve_2d_c
+
+void av1_highbd_convolve_2d_scale_c(const uint16_t *src, int src_stride, CONV_BUF_TYPE *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int x_step_qn, const int subpel_y_q4, const int y_step_qn, ConvolveParams *conv_params, int bd);
+#define av1_highbd_convolve_2d_scale av1_highbd_convolve_2d_scale_c
+
 void av1_highbd_convolve_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
 #define av1_highbd_convolve_avg av1_highbd_convolve_avg_c

@ -188,6 +192,9 @@ void av1_highbd_convolve_horiz_c(const uint16_t *src, int src_stride, uint16_t *
 void av1_highbd_convolve_init_c(void);
 #define av1_highbd_convolve_init av1_highbd_convolve_init_c

+void av1_highbd_convolve_rounding_c(const int32_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, int bits, int bd);
+#define av1_highbd_convolve_rounding av1_highbd_convolve_rounding_c
+
 void av1_highbd_convolve_vert_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams fp, const int subpel_x_q4, int x_step_q4, int avg, int bd);
 #define av1_highbd_convolve_vert av1_highbd_convolve_vert_c

@ -233,9 +240,6 @@ void av1_highbd_iht8x4_32_add_c(const tran_low_t *input, uint8_t *dest, int dest
 void av1_highbd_iht8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, const struct txfm_param *param);
 #define av1_highbd_iht8x8_64_add av1_highbd_iht8x8_64_add_c

-void av1_highbd_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan, int log_scale);
-#define av1_highbd_quantize_b av1_highbd_quantize_b_c
-
 void av1_highbd_quantize_fp_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan, int log_scale);
 #define av1_highbd_quantize_fp av1_highbd_quantize_fp_c

@ -245,6 +249,12 @@ void av1_highbd_temporal_filter_apply_c(uint8_t *frame1, unsigned int stride, ui
 void av1_highbd_warp_affine_c(const int32_t *mat, const uint16_t *ref, int width, int height, int stride, uint16_t *pred, int p_col, int p_row, int p_width, int p_height, int p_stride, int subsampling_x, int subsampling_y, int bd, ConvolveParams *conv_params, int16_t alpha, int16_t beta, int16_t gamma, int16_t delta);
 #define av1_highbd_warp_affine av1_highbd_warp_affine_c

+void av1_highpass_filter_c(uint8_t *dgd, int width, int height, int stride, int32_t *dst, int dst_stride, int r, int eps);
+#define av1_highpass_filter av1_highpass_filter_c
+
+void av1_highpass_filter_highbd_c(uint16_t *dgd, int width, int height, int stride, int32_t *dst, int dst_stride, int r, int eps);
+#define av1_highpass_filter_highbd av1_highpass_filter_highbd_c
+
 void av1_iht16x16_256_add_c(const tran_low_t *input, uint8_t *output, int pitch, const struct txfm_param *param);
 #define av1_iht16x16_256_add av1_iht16x16_256_add_c

@ -287,43 +297,40 @@ void av1_iht8x4_32_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride
 void av1_iht8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, const struct txfm_param *param);
 #define av1_iht8x8_64_add av1_iht8x8_64_add_c

-void av1_inv_txfm2d_add_16x16_c(const int32_t *input, uint16_t *output, int stride, int tx_type, int bd);
+void av1_inv_txfm2d_add_16x16_c(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd);
 #define av1_inv_txfm2d_add_16x16 av1_inv_txfm2d_add_16x16_c

-void av1_inv_txfm2d_add_16x32_c(const int32_t *input, uint16_t *output, int stride, int tx_type, int bd);
+void av1_inv_txfm2d_add_16x32_c(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd);
 #define av1_inv_txfm2d_add_16x32 av1_inv_txfm2d_add_16x32_c

-void av1_inv_txfm2d_add_16x8_c(const int32_t *input, uint16_t *output, int stride, int tx_type, int bd);
+void av1_inv_txfm2d_add_16x8_c(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd);
 #define av1_inv_txfm2d_add_16x8 av1_inv_txfm2d_add_16x8_c

-void av1_inv_txfm2d_add_32x16_c(const int32_t *input, uint16_t *output, int stride, int tx_type, int bd);
+void av1_inv_txfm2d_add_32x16_c(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd);
 #define av1_inv_txfm2d_add_32x16 av1_inv_txfm2d_add_32x16_c

-void av1_inv_txfm2d_add_32x32_c(const int32_t *input, uint16_t *output, int stride, int tx_type, int bd);
+void av1_inv_txfm2d_add_32x32_c(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd);
 #define av1_inv_txfm2d_add_32x32 av1_inv_txfm2d_add_32x32_c

-void av1_inv_txfm2d_add_4x4_c(const int32_t *input, uint16_t *output, int stride, int tx_type, int bd);
+void av1_inv_txfm2d_add_4x4_c(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd);
 #define av1_inv_txfm2d_add_4x4 av1_inv_txfm2d_add_4x4_c

-void av1_inv_txfm2d_add_4x8_c(const int32_t *input, uint16_t *output, int stride, int tx_type, int bd);
+void av1_inv_txfm2d_add_4x8_c(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd);
 #define av1_inv_txfm2d_add_4x8 av1_inv_txfm2d_add_4x8_c

-void av1_inv_txfm2d_add_64x64_c(const int32_t *input, uint16_t *output, int stride, int tx_type, int bd);
-#define av1_inv_txfm2d_add_64x64 av1_inv_txfm2d_add_64x64_c
-
-void av1_inv_txfm2d_add_8x16_c(const int32_t *input, uint16_t *output, int stride, int tx_type, int bd);
+void av1_inv_txfm2d_add_8x16_c(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd);
 #define av1_inv_txfm2d_add_8x16 av1_inv_txfm2d_add_8x16_c

-void av1_inv_txfm2d_add_8x4_c(const int32_t *input, uint16_t *output, int stride, int tx_type, int bd);
+void av1_inv_txfm2d_add_8x4_c(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd);
 #define av1_inv_txfm2d_add_8x4 av1_inv_txfm2d_add_8x4_c

-void av1_inv_txfm2d_add_8x8_c(const int32_t *input, uint16_t *output, int stride, int tx_type, int bd);
+void av1_inv_txfm2d_add_8x8_c(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd);
 #define av1_inv_txfm2d_add_8x8 av1_inv_txfm2d_add_8x8_c

 void av1_lowbd_convolve_init_c(void);
 #define av1_lowbd_convolve_init av1_lowbd_convolve_init_c

-void av1_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan, int log_scale);
+void av1_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan, const qm_val_t * qm_ptr, const qm_val_t * iqm_ptr, int log_scale);
 #define av1_quantize_b av1_quantize_b_c

 void av1_quantize_fp_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
@ -332,9 +339,21 @@ void av1_quantize_fp_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_
 void av1_quantize_fp_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
 #define av1_quantize_fp_32x32 av1_quantize_fp_32x32_c

+void av1_selfguided_restoration_c(uint8_t *dgd, int width, int height, int stride, int32_t *dst, int dst_stride, int r, int eps);
+#define av1_selfguided_restoration av1_selfguided_restoration_c
+
+void av1_selfguided_restoration_highbd_c(uint16_t *dgd, int width, int height, int stride, int32_t *dst, int dst_stride, int bit_depth, int r, int eps);
+#define av1_selfguided_restoration_highbd av1_selfguided_restoration_highbd_c
+
 void av1_temporal_filter_apply_c(uint8_t *frame1, unsigned int stride, uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int filter_weight, unsigned int *accumulator, uint16_t *count);
 #define av1_temporal_filter_apply av1_temporal_filter_apply_c

+void av1_upsample_intra_edge_c(uint8_t *p, int sz);
+#define av1_upsample_intra_edge av1_upsample_intra_edge_c
+
+void av1_upsample_intra_edge_high_c(uint16_t *p, int sz, int bd);
+#define av1_upsample_intra_edge_high av1_upsample_intra_edge_high_c
+
 void av1_warp_affine_c(const int32_t *mat, const uint8_t *ref, int width, int height, int stride, uint8_t *pred, int p_col, int p_row, int p_width, int p_height, int p_stride, int subsampling_x, int subsampling_y, ConvolveParams *conv_params, int16_t alpha, int16_t beta, int16_t gamma, int16_t delta);
 #define av1_warp_affine av1_warp_affine_c

@ -347,25 +366,17 @@ int av1_wedge_sign_from_residuals_c(const int16_t *ds, const uint8_t *m, int N,
 uint64_t av1_wedge_sse_from_residuals_c(const int16_t *r1, const int16_t *d, const uint8_t *m, int N);
 #define av1_wedge_sse_from_residuals av1_wedge_sse_from_residuals_c

+void cdef_filter_block_c(uint8_t *dst8, uint16_t *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int bsize, int max);
+void cdef_filter_block_neon(uint8_t *dst8, uint16_t *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int bsize, int max);
+RTCD_EXTERN void (*cdef_filter_block)(uint8_t *dst8, uint16_t *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int bsize, int max);
+
+int cdef_find_dir_c(const uint16_t *img, int stride, int32_t *var, int coeff_shift);
+int cdef_find_dir_neon(const uint16_t *img, int stride, int32_t *var, int coeff_shift);
+RTCD_EXTERN int (*cdef_find_dir)(const uint16_t *img, int stride, int32_t *var, int coeff_shift);
+
 double compute_cross_correlation_c(unsigned char *im1, int stride1, int x1, int y1, unsigned char *im2, int stride2, int x2, int y2);
 #define compute_cross_correlation compute_cross_correlation_c

-void copy_4x4_16bit_to_16bit_c(uint16_t *dst, int dstride, const uint16_t *src, int sstride);
-void copy_4x4_16bit_to_16bit_neon(uint16_t *dst, int dstride, const uint16_t *src, int sstride);
-RTCD_EXTERN void (*copy_4x4_16bit_to_16bit)(uint16_t *dst, int dstride, const uint16_t *src, int sstride);
-
-void copy_4x4_16bit_to_8bit_c(uint8_t *dst, int dstride, const uint16_t *src, int sstride);
-void copy_4x4_16bit_to_8bit_neon(uint8_t *dst, int dstride, const uint16_t *src, int sstride);
-RTCD_EXTERN void (*copy_4x4_16bit_to_8bit)(uint8_t *dst, int dstride, const uint16_t *src, int sstride);
-
-void copy_8x8_16bit_to_16bit_c(uint16_t *dst, int dstride, const uint16_t *src, int sstride);
-void copy_8x8_16bit_to_16bit_neon(uint16_t *dst, int dstride, const uint16_t *src, int sstride);
-RTCD_EXTERN void (*copy_8x8_16bit_to_16bit)(uint16_t *dst, int dstride, const uint16_t *src, int sstride);
-
-void copy_8x8_16bit_to_8bit_c(uint8_t *dst, int dstride, const uint16_t *src, int sstride);
-void copy_8x8_16bit_to_8bit_neon(uint8_t *dst, int dstride, const uint16_t *src, int sstride);
-RTCD_EXTERN void (*copy_8x8_16bit_to_8bit)(uint8_t *dst, int dstride, const uint16_t *src, int sstride);
-
 void copy_rect8_16bit_to_16bit_c(uint16_t *dst, int dstride, const uint16_t *src, int sstride, int v, int h);
 void copy_rect8_16bit_to_16bit_neon(uint16_t *dst, int dstride, const uint16_t *src, int sstride, int v, int h);
 RTCD_EXTERN void (*copy_rect8_16bit_to_16bit)(uint16_t *dst, int dstride, const uint16_t *src, int sstride, int v, int h);
@ -374,18 +385,6 @@ void copy_rect8_8bit_to_16bit_c(uint16_t *dst, int dstride, const uint8_t *src,
 void copy_rect8_8bit_to_16bit_neon(uint16_t *dst, int dstride, const uint8_t *src, int sstride, int v, int h);
 RTCD_EXTERN void (*copy_rect8_8bit_to_16bit)(uint16_t *dst, int dstride, const uint8_t *src, int sstride, int v, int h);

-int od_dir_find8_c(const od_dering_in *img, int stride, int32_t *var, int coeff_shift);
-int od_dir_find8_neon(const od_dering_in *img, int stride, int32_t *var, int coeff_shift);
-RTCD_EXTERN int (*od_dir_find8)(const od_dering_in *img, int stride, int32_t *var, int coeff_shift);
-
-void od_filter_dering_direction_4x4_c(uint16_t *y, int ystride, const uint16_t *in, int threshold, int dir, int damping);
-void od_filter_dering_direction_4x4_neon(uint16_t *y, int ystride, const uint16_t *in, int threshold, int dir, int damping);
-RTCD_EXTERN void (*od_filter_dering_direction_4x4)(uint16_t *y, int ystride, const uint16_t *in, int threshold, int dir, int damping);
-
-void od_filter_dering_direction_8x8_c(uint16_t *y, int ystride, const uint16_t *in, int threshold, int dir, int damping);
-void od_filter_dering_direction_8x8_neon(uint16_t *y, int ystride, const uint16_t *in, int threshold, int dir, int damping);
-RTCD_EXTERN void (*od_filter_dering_direction_8x8)(uint16_t *y, int ystride, const uint16_t *in, int threshold, int dir, int damping);
-
 void aom_rtcd(void);

 #include "aom_config.h"
@ -398,32 +397,14 @@ static void setup_rtcd_internal(void)

    (void)flags;

-    aom_clpf_block = aom_clpf_block_c;
-    if (flags & HAS_NEON) aom_clpf_block = aom_clpf_block_neon;
-    aom_clpf_block_hbd = aom_clpf_block_hbd_c;
-    if (flags & HAS_NEON) aom_clpf_block_hbd = aom_clpf_block_hbd_neon;
-    aom_clpf_hblock = aom_clpf_hblock_c;
-    if (flags & HAS_NEON) aom_clpf_hblock = aom_clpf_hblock_neon;
-    aom_clpf_hblock_hbd = aom_clpf_hblock_hbd_c;
-    if (flags & HAS_NEON) aom_clpf_hblock_hbd = aom_clpf_hblock_hbd_neon;
-    copy_4x4_16bit_to_16bit = copy_4x4_16bit_to_16bit_c;
-    if (flags & HAS_NEON) copy_4x4_16bit_to_16bit = copy_4x4_16bit_to_16bit_neon;
-    copy_4x4_16bit_to_8bit = copy_4x4_16bit_to_8bit_c;
-    if (flags & HAS_NEON) copy_4x4_16bit_to_8bit = copy_4x4_16bit_to_8bit_neon;
-    copy_8x8_16bit_to_16bit = copy_8x8_16bit_to_16bit_c;
-    if (flags & HAS_NEON) copy_8x8_16bit_to_16bit = copy_8x8_16bit_to_16bit_neon;
-    copy_8x8_16bit_to_8bit = copy_8x8_16bit_to_8bit_c;
-    if (flags & HAS_NEON) copy_8x8_16bit_to_8bit = copy_8x8_16bit_to_8bit_neon;
+    cdef_filter_block = cdef_filter_block_c;
+    if (flags & HAS_NEON) cdef_filter_block = cdef_filter_block_neon;
+    cdef_find_dir = cdef_find_dir_c;
+    if (flags & HAS_NEON) cdef_find_dir = cdef_find_dir_neon;
    copy_rect8_16bit_to_16bit = copy_rect8_16bit_to_16bit_c;
    if (flags & HAS_NEON) copy_rect8_16bit_to_16bit = copy_rect8_16bit_to_16bit_neon;
    copy_rect8_8bit_to_16bit = copy_rect8_8bit_to_16bit_c;
    if (flags & HAS_NEON) copy_rect8_8bit_to_16bit = copy_rect8_8bit_to_16bit_neon;
-    od_dir_find8 = od_dir_find8_c;
-    if (flags & HAS_NEON) od_dir_find8 = od_dir_find8_neon;
-    od_filter_dering_direction_4x4 = od_filter_dering_direction_4x4_c;
-    if (flags & HAS_NEON) od_filter_dering_direction_4x4 = od_filter_dering_direction_4x4_neon;
-    od_filter_dering_direction_8x8 = od_filter_dering_direction_8x8_c;
-    if (flags & HAS_NEON) od_filter_dering_direction_8x8 = od_filter_dering_direction_8x8_neon;
 }
 #endif

--- a/media/libaom/config/linux/ia32/aom_config.asm
+++ b/media/libaom/config/linux/ia32/aom_config.asm
@ -46,8 +46,6 @@ CONFIG_AV1 equ 1
 CONFIG_STATIC_MSVCRT equ 0
 CONFIG_SPATIAL_RESAMPLING equ 1
 CONFIG_REALTIME_ONLY equ 0
-CONFIG_ONTHEFLY_BITPACKING equ 0
-CONFIG_ERROR_CONCEALMENT equ 0
 CONFIG_SHARED equ 0
 CONFIG_STATIC equ 1
 CONFIG_SMALL equ 0
@ -60,73 +58,71 @@ CONFIG_ACCOUNTING equ 0
 CONFIG_INSPECTION equ 0
 CONFIG_DECODE_PERF_TESTS equ 0
 CONFIG_ENCODE_PERF_TESTS equ 0
+CONFIG_BITSTREAM_DEBUG equ 0
+CONFIG_SYMBOLRATE equ 0
 CONFIG_COEFFICIENT_RANGE_CHECKING equ 0
 CONFIG_LOWBITDEPTH equ 1
 CONFIG_HIGHBITDEPTH equ 1
 CONFIG_EXPERIMENTAL equ 0
 CONFIG_SIZE_LIMIT equ 1
-CONFIG_COLORSPACE_HEADERS equ 0
 CONFIG_FP_MB_STATS equ 0
 CONFIG_CDEF equ 1
+CONFIG_CDEF_SINGLEPASS equ 1
 CONFIG_VAR_TX equ 1
 CONFIG_RECT_TX equ 1
 CONFIG_RECT_TX_EXT equ 0
 CONFIG_TPL_MV equ 0
 CONFIG_DUAL_FILTER equ 1
-CONFIG_CONVOLVE_ROUND equ 0
+CONFIG_CONVOLVE_ROUND equ 1
 CONFIG_COMPOUND_ROUND equ 0
 CONFIG_EXT_TX equ 1
-CONFIG_DPCM_INTRA equ 0
 CONFIG_TX64X64 equ 0
 CONFIG_EXT_INTRA equ 1
 CONFIG_INTRA_INTERP equ 0
 CONFIG_FILTER_INTRA equ 0
-CONFIG_INTRA_EDGE equ 0
+CONFIG_INTRA_EDGE equ 1
 CONFIG_INTRABC equ 0
-CONFIG_EXT_INTER equ 1
 CONFIG_INTERINTRA equ 1
 CONFIG_WEDGE equ 1
 CONFIG_COMPOUND_SEGMENT equ 1
 CONFIG_EXT_REFS equ 1
-CONFIG_ALTREF2 equ 0
-CONFIG_SPEED_REFS equ 0
-CONFIG_GF_GROUPS equ 0
-CONFIG_FLEX_REFS equ 0
 CONFIG_GLOBAL_MOTION equ 1
 CONFIG_NEW_QUANT equ 0
 CONFIG_SUPERTX equ 0
 CONFIG_ANS equ 0
-CONFIG_LOOP_RESTORATION equ 0
+CONFIG_LOOP_RESTORATION equ 1
+CONFIG_STRIPED_LOOP_RESTORATION equ 0
 CONFIG_EXT_PARTITION equ 0
 CONFIG_EXT_PARTITION_TYPES equ 0
+CONFIG_EXT_PARTITION_TYPES_AB equ 0
 CONFIG_UNPOISON_PARTITION_CTX equ 0
 CONFIG_EXT_TILE equ 0
 CONFIG_MOTION_VAR equ 1
 CONFIG_NCOBMC equ 0
 CONFIG_WARPED_MOTION equ 1
 CONFIG_Q_ADAPT_PROBS equ 0
-CONFIG_BITSTREAM_DEBUG equ 0
 CONFIG_INTER_STATS_ONLY equ 0
-CONFIG_ALT_INTRA equ 1
-CONFIG_PALETTE equ 1
 CONFIG_PALETTE_DELTA_ENCODING equ 0
 CONFIG_RAWBITS equ 0
-CONFIG_EC_SMALLMUL equ 1
+CONFIG_KF_CTX equ 0
 CONFIG_PVQ equ 0
 CONFIG_CFL equ 0
 CONFIG_XIPHRC equ 0
 CONFIG_DCT_ONLY equ 0
+CONFIG_DAALA_TX equ 0
 CONFIG_DAALA_DCT4 equ 0
 CONFIG_DAALA_DCT8 equ 0
+CONFIG_DAALA_DCT16 equ 0
+CONFIG_DAALA_DCT32 equ 0
+CONFIG_DAALA_DCT64 equ 0
 CONFIG_CB4X4 equ 1
 CONFIG_CHROMA_2X2 equ 0
 CONFIG_CHROMA_SUB8X8 equ 1
 CONFIG_FRAME_SIZE equ 0
-CONFIG_DELTA_Q equ 1
 CONFIG_EXT_DELTA_Q equ 1
 CONFIG_ADAPT_SCAN equ 0
-CONFIG_FILTER_7BIT equ 1
 CONFIG_PARALLEL_DEBLOCKING equ 1
+CONFIG_DEBLOCK_13TAP equ 0
 CONFIG_LOOPFILTERING_ACROSS_TILES equ 1
 CONFIG_TEMPMV_SIGNALING equ 1
 CONFIG_RD_DEBUG equ 0
@ -135,29 +131,46 @@ CONFIG_COEF_INTERLEAVE equ 0
 CONFIG_ENTROPY_STATS equ 0
 CONFIG_MASKED_TX equ 0
 CONFIG_DEPENDENT_HORZTILES equ 0
-CONFIG_DIST_8X8 equ 0
-CONFIG_DAALA_DIST equ 0
-CONFIG_TRIPRED equ 0
+CONFIG_DIST_8X8 equ 1
 CONFIG_PALETTE_THROUGHPUT equ 1
 CONFIG_REF_ADAPT equ 0
 CONFIG_LV_MAP equ 0
+CONFIG_CTX1D equ 0
 CONFIG_TXK_SEL equ 0
 CONFIG_MV_COMPRESS equ 1
+CONFIG_SEGMENT_ZEROMV equ 0
 CONFIG_FRAME_SUPERRES equ 0
 CONFIG_NEW_MULTISYMBOL equ 0
 CONFIG_COMPOUND_SINGLEREF equ 0
-CONFIG_AOM_QM equ 0
+CONFIG_AOM_QM equ 1
 CONFIG_ONE_SIDED_COMPOUND equ 1
-CONFIG_EXT_COMP_REFS equ 0
+CONFIG_EXT_COMP_REFS equ 1
 CONFIG_SMOOTH_HV equ 1
 CONFIG_VAR_REFS equ 0
-CONFIG_RECT_INTRA_PRED equ 1
 CONFIG_LGT equ 0
+CONFIG_LGT_FROM_PRED equ 0
 CONFIG_SBL_SYMBOL equ 0
 CONFIG_NCOBMC_ADAPT_WEIGHT equ 0
 CONFIG_BGSPRITE equ 0
 CONFIG_VAR_TX_NO_TX_MODE equ 0
 CONFIG_MRC_TX equ 0
 CONFIG_LPF_DIRECT equ 0
-CONFIG_UV_LVL equ 0
+CONFIG_LOOPFILTER_LEVEL equ 0
+CONFIG_NO_FRAME_CONTEXT_SIGNALING equ 0
+CONFIG_TXMG equ 1
+CONFIG_MAX_TILE equ 0
+CONFIG_HASH_ME equ 0
+CONFIG_COLORSPACE_HEADERS equ 0
+CONFIG_MFMV equ 0
+CONFIG_FRAME_MARKER equ 0
+CONFIG_JNT_COMP equ 0
+CONFIG_FRAME_SIGN_BIAS equ 0
+CONFIG_EXT_SKIP equ 0
+CONFIG_OBU equ 0
+CONFIG_AMVR equ 0
+CONFIG_LPF_SB equ 0
+CONFIG_OPT_REF_MV equ 0
+CONFIG_TMV equ 0
+CONFIG_RESTRICT_COMPRESSED_HDR equ 0
+CONFIG_HORZONLY_FRAME_SUPERRES equ 0
 CONFIG_ANALYZER equ 0
--- a/media/libaom/config/linux/ia32/aom_config.h
+++ b/media/libaom/config/linux/ia32/aom_config.h
@ -59,8 +59,6 @@
 #define CONFIG_STATIC_MSVCRT 0
 #define CONFIG_SPATIAL_RESAMPLING 1
 #define CONFIG_REALTIME_ONLY 0
-#define CONFIG_ONTHEFLY_BITPACKING 0
-#define CONFIG_ERROR_CONCEALMENT 0
 #define CONFIG_SHARED 0
 #define CONFIG_STATIC 1
 #define CONFIG_SMALL 0
@ -73,73 +71,71 @@
 #define CONFIG_INSPECTION 0
 #define CONFIG_DECODE_PERF_TESTS 0
 #define CONFIG_ENCODE_PERF_TESTS 0
+#define CONFIG_BITSTREAM_DEBUG 0
+#define CONFIG_SYMBOLRATE 0
 #define CONFIG_COEFFICIENT_RANGE_CHECKING 0
 #define CONFIG_LOWBITDEPTH 1
 #define CONFIG_HIGHBITDEPTH 1
 #define CONFIG_EXPERIMENTAL 0
 #define CONFIG_SIZE_LIMIT 1
-#define CONFIG_COLORSPACE_HEADERS 0
 #define CONFIG_FP_MB_STATS 0
 #define CONFIG_CDEF 1
+#define CONFIG_CDEF_SINGLEPASS 1
 #define CONFIG_VAR_TX 1
 #define CONFIG_RECT_TX 1
 #define CONFIG_RECT_TX_EXT 0
 #define CONFIG_TPL_MV 0
 #define CONFIG_DUAL_FILTER 1
-#define CONFIG_CONVOLVE_ROUND 0
+#define CONFIG_CONVOLVE_ROUND 1
 #define CONFIG_COMPOUND_ROUND 0
 #define CONFIG_EXT_TX 1
-#define CONFIG_DPCM_INTRA 0
 #define CONFIG_TX64X64 0
 #define CONFIG_EXT_INTRA 1
 #define CONFIG_INTRA_INTERP 0
 #define CONFIG_FILTER_INTRA 0
-#define CONFIG_INTRA_EDGE 0
+#define CONFIG_INTRA_EDGE 1
 #define CONFIG_INTRABC 0
-#define CONFIG_EXT_INTER 1
 #define CONFIG_INTERINTRA 1
 #define CONFIG_WEDGE 1
 #define CONFIG_COMPOUND_SEGMENT 1
 #define CONFIG_EXT_REFS 1
-#define CONFIG_ALTREF2 0
-#define CONFIG_SPEED_REFS 0
-#define CONFIG_GF_GROUPS 0
-#define CONFIG_FLEX_REFS 0
 #define CONFIG_GLOBAL_MOTION 1
 #define CONFIG_NEW_QUANT 0
 #define CONFIG_SUPERTX 0
 #define CONFIG_ANS 0
-#define CONFIG_LOOP_RESTORATION 0
+#define CONFIG_LOOP_RESTORATION 1
+#define CONFIG_STRIPED_LOOP_RESTORATION 0
 #define CONFIG_EXT_PARTITION 0
 #define CONFIG_EXT_PARTITION_TYPES 0
+#define CONFIG_EXT_PARTITION_TYPES_AB 0
 #define CONFIG_UNPOISON_PARTITION_CTX 0
 #define CONFIG_EXT_TILE 0
 #define CONFIG_MOTION_VAR 1
 #define CONFIG_NCOBMC 0
 #define CONFIG_WARPED_MOTION 1
 #define CONFIG_Q_ADAPT_PROBS 0
-#define CONFIG_BITSTREAM_DEBUG 0
 #define CONFIG_INTER_STATS_ONLY 0
-#define CONFIG_ALT_INTRA 1
-#define CONFIG_PALETTE 1
 #define CONFIG_PALETTE_DELTA_ENCODING 0
 #define CONFIG_RAWBITS 0
-#define CONFIG_EC_SMALLMUL 1
+#define CONFIG_KF_CTX 0
 #define CONFIG_PVQ 0
 #define CONFIG_CFL 0
 #define CONFIG_XIPHRC 0
 #define CONFIG_DCT_ONLY 0
+#define CONFIG_DAALA_TX 0
 #define CONFIG_DAALA_DCT4 0
 #define CONFIG_DAALA_DCT8 0
+#define CONFIG_DAALA_DCT16 0
+#define CONFIG_DAALA_DCT32 0
+#define CONFIG_DAALA_DCT64 0
 #define CONFIG_CB4X4 1
 #define CONFIG_CHROMA_2X2 0
 #define CONFIG_CHROMA_SUB8X8 1
 #define CONFIG_FRAME_SIZE 0
-#define CONFIG_DELTA_Q 1
 #define CONFIG_EXT_DELTA_Q 1
 #define CONFIG_ADAPT_SCAN 0
-#define CONFIG_FILTER_7BIT 1
 #define CONFIG_PARALLEL_DEBLOCKING 1
+#define CONFIG_DEBLOCK_13TAP 0
 #define CONFIG_LOOPFILTERING_ACROSS_TILES 1
 #define CONFIG_TEMPMV_SIGNALING 1
 #define CONFIG_RD_DEBUG 0
@ -148,31 +144,48 @@
 #define CONFIG_ENTROPY_STATS 0
 #define CONFIG_MASKED_TX 0
 #define CONFIG_DEPENDENT_HORZTILES 0
-#define CONFIG_DIST_8X8 0
-#define CONFIG_DAALA_DIST 0
-#define CONFIG_TRIPRED 0
+#define CONFIG_DIST_8X8 1
 #define CONFIG_PALETTE_THROUGHPUT 1
 #define CONFIG_REF_ADAPT 0
 #define CONFIG_LV_MAP 0
+#define CONFIG_CTX1D 0
 #define CONFIG_TXK_SEL 0
 #define CONFIG_MV_COMPRESS 1
+#define CONFIG_SEGMENT_ZEROMV 0
 #define CONFIG_FRAME_SUPERRES 0
 #define CONFIG_NEW_MULTISYMBOL 0
 #define CONFIG_COMPOUND_SINGLEREF 0
-#define CONFIG_AOM_QM 0
+#define CONFIG_AOM_QM 1
 #define CONFIG_ONE_SIDED_COMPOUND 1
-#define CONFIG_EXT_COMP_REFS 0
+#define CONFIG_EXT_COMP_REFS 1
 #define CONFIG_SMOOTH_HV 1
 #define CONFIG_VAR_REFS 0
-#define CONFIG_RECT_INTRA_PRED 1
 #define CONFIG_LGT 0
+#define CONFIG_LGT_FROM_PRED 0
 #define CONFIG_SBL_SYMBOL 0
 #define CONFIG_NCOBMC_ADAPT_WEIGHT 0
 #define CONFIG_BGSPRITE 0
 #define CONFIG_VAR_TX_NO_TX_MODE 0
 #define CONFIG_MRC_TX 0
 #define CONFIG_LPF_DIRECT 0
-#define CONFIG_UV_LVL 0
+#define CONFIG_LOOPFILTER_LEVEL 0
+#define CONFIG_NO_FRAME_CONTEXT_SIGNALING 0
+#define CONFIG_TXMG 1
+#define CONFIG_MAX_TILE 0
+#define CONFIG_HASH_ME 0
+#define CONFIG_COLORSPACE_HEADERS 0
+#define CONFIG_MFMV 0
+#define CONFIG_FRAME_MARKER 0
+#define CONFIG_JNT_COMP 0
+#define CONFIG_FRAME_SIGN_BIAS 0
+#define CONFIG_EXT_SKIP 0
+#define CONFIG_OBU 0
+#define CONFIG_AMVR 0
+#define CONFIG_LPF_SB 0
+#define CONFIG_OPT_REF_MV 0
+#define CONFIG_TMV 0
+#define CONFIG_RESTRICT_COMPRESSED_HDR 0
+#define CONFIG_HORZONLY_FRAME_SUPERRES 0
 #define CONFIG_ANALYZER 0
 #define DECODE_WIDTH_LIMIT 8192
 #define DECODE_HEIGHT_LIMIT 4608
--- a/media/libaom/config/linux/ia32/aom_dsp_rtcd.h
+++ b/media/libaom/config/linux/ia32/aom_dsp_rtcd.h
--- a/media/libaom/config/linux/ia32/av1_rtcd.h
+++ b/media/libaom/config/linux/ia32/av1_rtcd.h
@ -31,44 +31,39 @@ struct search_site_config;
 struct mv;
 union int_mv;
 struct yv12_buffer_config;
-typedef uint16_t od_dering_in;

 #ifdef __cplusplus
 extern "C" {
 #endif

-void aom_clpf_block_c(uint8_t *dst, const uint16_t *src, int dstride, int sstride, int sizex, int sizey, unsigned int strength, unsigned int bd);
-void aom_clpf_block_sse2(uint8_t *dst, const uint16_t *src, int dstride, int sstride, int sizex, int sizey, unsigned int strength, unsigned int bd);
-void aom_clpf_block_ssse3(uint8_t *dst, const uint16_t *src, int dstride, int sstride, int sizex, int sizey, unsigned int strength, unsigned int bd);
-void aom_clpf_block_sse4_1(uint8_t *dst, const uint16_t *src, int dstride, int sstride, int sizex, int sizey, unsigned int strength, unsigned int bd);
-RTCD_EXTERN void (*aom_clpf_block)(uint8_t *dst, const uint16_t *src, int dstride, int sstride, int sizex, int sizey, unsigned int strength, unsigned int bd);
+void apply_selfguided_restoration_c(uint8_t *dat, int width, int height, int stride, int eps, int *xqd, uint8_t *dst, int dst_stride, int32_t *tmpbuf);
+void apply_selfguided_restoration_sse4_1(uint8_t *dat, int width, int height, int stride, int eps, int *xqd, uint8_t *dst, int dst_stride, int32_t *tmpbuf);
+RTCD_EXTERN void (*apply_selfguided_restoration)(uint8_t *dat, int width, int height, int stride, int eps, int *xqd, uint8_t *dst, int dst_stride, int32_t *tmpbuf);

-void aom_clpf_block_hbd_c(uint16_t *dst, const uint16_t *src, int dstride, int sstride, int sizex, int sizey, unsigned int strength, unsigned int bd);
-void aom_clpf_block_hbd_sse2(uint16_t *dst, const uint16_t *src, int dstride, int sstride, int sizex, int sizey, unsigned int strength, unsigned int bd);
-void aom_clpf_block_hbd_ssse3(uint16_t *dst, const uint16_t *src, int dstride, int sstride, int sizex, int sizey, unsigned int strength, unsigned int bd);
-void aom_clpf_block_hbd_sse4_1(uint16_t *dst, const uint16_t *src, int dstride, int sstride, int sizex, int sizey, unsigned int strength, unsigned int bd);
-RTCD_EXTERN void (*aom_clpf_block_hbd)(uint16_t *dst, const uint16_t *src, int dstride, int sstride, int sizex, int sizey, unsigned int strength, unsigned int bd);
-
-void aom_clpf_hblock_c(uint8_t *dst, const uint16_t *src, int dstride, int sstride, int sizex, int sizey, unsigned int strength, unsigned int bd);
-void aom_clpf_hblock_sse2(uint8_t *dst, const uint16_t *src, int dstride, int sstride, int sizex, int sizey, unsigned int strength, unsigned int bd);
-void aom_clpf_hblock_ssse3(uint8_t *dst, const uint16_t *src, int dstride, int sstride, int sizex, int sizey, unsigned int strength, unsigned int bd);
-void aom_clpf_hblock_sse4_1(uint8_t *dst, const uint16_t *src, int dstride, int sstride, int sizex, int sizey, unsigned int strength, unsigned int bd);
-RTCD_EXTERN void (*aom_clpf_hblock)(uint8_t *dst, const uint16_t *src, int dstride, int sstride, int sizex, int sizey, unsigned int strength, unsigned int bd);
-
-void aom_clpf_hblock_hbd_c(uint16_t *dst, const uint16_t *src, int dstride, int sstride, int sizex, int sizey, unsigned int strength, unsigned int bd);
-void aom_clpf_hblock_hbd_sse2(uint16_t *dst, const uint16_t *src, int dstride, int sstride, int sizex, int sizey, unsigned int strength, unsigned int bd);
-void aom_clpf_hblock_hbd_ssse3(uint16_t *dst, const uint16_t *src, int dstride, int sstride, int sizex, int sizey, unsigned int strength, unsigned int bd);
-void aom_clpf_hblock_hbd_sse4_1(uint16_t *dst, const uint16_t *src, int dstride, int sstride, int sizex, int sizey, unsigned int strength, unsigned int bd);
-RTCD_EXTERN void (*aom_clpf_hblock_hbd)(uint16_t *dst, const uint16_t *src, int dstride, int sstride, int sizex, int sizey, unsigned int strength, unsigned int bd);
+void apply_selfguided_restoration_highbd_c(uint16_t *dat, int width, int height, int stride, int bit_depth, int eps, int *xqd, uint16_t *dst, int dst_stride, int32_t *tmpbuf);
+void apply_selfguided_restoration_highbd_sse4_1(uint16_t *dat, int width, int height, int stride, int bit_depth, int eps, int *xqd, uint16_t *dst, int dst_stride, int32_t *tmpbuf);
+RTCD_EXTERN void (*apply_selfguided_restoration_highbd)(uint16_t *dat, int width, int height, int stride, int bit_depth, int eps, int *xqd, uint16_t *dst, int dst_stride, int32_t *tmpbuf);

 int64_t av1_block_error_c(const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz);
 int64_t av1_block_error_avx2(const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz);
 RTCD_EXTERN int64_t (*av1_block_error)(const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz);

+void av1_convolve_2d_c(const uint8_t *src, int src_stride, CONV_BUF_TYPE *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+void av1_convolve_2d_sse2(const uint8_t *src, int src_stride, CONV_BUF_TYPE *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+RTCD_EXTERN void (*av1_convolve_2d)(const uint8_t *src, int src_stride, CONV_BUF_TYPE *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+
+void av1_convolve_2d_scale_c(const uint8_t *src, int src_stride, CONV_BUF_TYPE *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_qn, const int x_step_qn, const int subpel_y_q4, const int y_step_qn, ConvolveParams *conv_params);
+void av1_convolve_2d_scale_sse4_1(const uint8_t *src, int src_stride, CONV_BUF_TYPE *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_qn, const int x_step_qn, const int subpel_y_q4, const int y_step_qn, ConvolveParams *conv_params);
+RTCD_EXTERN void (*av1_convolve_2d_scale)(const uint8_t *src, int src_stride, CONV_BUF_TYPE *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_qn, const int x_step_qn, const int subpel_y_q4, const int y_step_qn, ConvolveParams *conv_params);
+
 void av1_convolve_horiz_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams fp, const int subpel_x_q4, int x_step_q4, ConvolveParams *conv_params);
 void av1_convolve_horiz_ssse3(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams fp, const int subpel_x_q4, int x_step_q4, ConvolveParams *conv_params);
 RTCD_EXTERN void (*av1_convolve_horiz)(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams fp, const int subpel_x_q4, int x_step_q4, ConvolveParams *conv_params);

+void av1_convolve_rounding_c(const int32_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, int bits);
+void av1_convolve_rounding_avx2(const int32_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, int bits);
+RTCD_EXTERN void (*av1_convolve_rounding)(const int32_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, int bits);
+
 void av1_convolve_vert_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams fp, const int subpel_x_q4, int x_step_q4, ConvolveParams *conv_params);
 void av1_convolve_vert_ssse3(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams fp, const int subpel_x_q4, int x_step_q4, ConvolveParams *conv_params);
 RTCD_EXTERN void (*av1_convolve_vert)(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams fp, const int subpel_x_q4, int x_step_q4, ConvolveParams *conv_params);
@ -76,9 +71,6 @@ RTCD_EXTERN void (*av1_convolve_vert)(const uint8_t *src, int src_stride, uint8_
 int av1_diamond_search_sad_c(struct macroblock *x, const struct search_site_config *cfg,  struct mv *ref_mv, struct mv *best_mv, int search_param, int sad_per_bit, int *num00, const struct aom_variance_vtable *fn_ptr, const struct mv *center_mv);
 #define av1_diamond_search_sad av1_diamond_search_sad_c

-void av1_fdct8x8_quant_c(const int16_t *input, int stride, tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
-#define av1_fdct8x8_quant av1_fdct8x8_quant_c
-
 void av1_fht16x16_c(const int16_t *input, tran_low_t *output, int stride, struct txfm_param *param);
 void av1_fht16x16_sse2(const int16_t *input, tran_low_t *output, int stride, struct txfm_param *param);
 void av1_fht16x16_avx2(const int16_t *input, tran_low_t *output, int stride, struct txfm_param *param);
@ -133,6 +125,14 @@ void av1_fht8x8_c(const int16_t *input, tran_low_t *output, int stride, struct t
 void av1_fht8x8_sse2(const int16_t *input, tran_low_t *output, int stride, struct txfm_param *param);
 RTCD_EXTERN void (*av1_fht8x8)(const int16_t *input, tran_low_t *output, int stride, struct txfm_param *param);

+void av1_filter_intra_edge_c(uint8_t *p, int sz, int strength);
+void av1_filter_intra_edge_sse4_1(uint8_t *p, int sz, int strength);
+RTCD_EXTERN void (*av1_filter_intra_edge)(uint8_t *p, int sz, int strength);
+
+void av1_filter_intra_edge_high_c(uint16_t *p, int sz, int strength);
+void av1_filter_intra_edge_high_sse4_1(uint16_t *p, int sz, int strength);
+RTCD_EXTERN void (*av1_filter_intra_edge_high)(uint16_t *p, int sz, int strength);
+
 int av1_full_range_search_c(const struct macroblock *x, const struct search_site_config *cfg, struct mv *ref_mv, struct mv *best_mv, int search_param, int sad_per_bit, int *num00, const struct aom_variance_vtable *fn_ptr, const struct mv *center_mv);
 #define av1_full_range_search av1_full_range_search_c

@ -141,46 +141,42 @@ int av1_full_search_sadx3(const struct macroblock *x, const struct mv *ref_mv, i
 int av1_full_search_sadx8(const struct macroblock *x, const struct mv *ref_mv, int sad_per_bit, int distance, const struct aom_variance_vtable *fn_ptr, const struct mv *center_mv, struct mv *best_mv);
 RTCD_EXTERN int (*av1_full_search_sad)(const struct macroblock *x, const struct mv *ref_mv, int sad_per_bit, int distance, const struct aom_variance_vtable *fn_ptr, const struct mv *center_mv, struct mv *best_mv);

-void av1_fwd_idtx_c(const int16_t *src_diff, tran_low_t *coeff, int stride, int bs, int tx_type);
+void av1_fwd_idtx_c(const int16_t *src_diff, tran_low_t *coeff, int stride, int bsx, int bsy, TX_TYPE tx_type);
 #define av1_fwd_idtx av1_fwd_idtx_c

-void av1_fwd_txfm2d_16x16_c(const int16_t *input, int32_t *output, int stride, int tx_type, int bd);
-void av1_fwd_txfm2d_16x16_sse4_1(const int16_t *input, int32_t *output, int stride, int tx_type, int bd);
-RTCD_EXTERN void (*av1_fwd_txfm2d_16x16)(const int16_t *input, int32_t *output, int stride, int tx_type, int bd);
+void av1_fwd_txfm2d_16x16_c(const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd);
+void av1_fwd_txfm2d_16x16_sse4_1(const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd);
+RTCD_EXTERN void (*av1_fwd_txfm2d_16x16)(const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd);

-void av1_fwd_txfm2d_16x32_c(const int16_t *input, int32_t *output, int stride, int tx_type, int bd);
+void av1_fwd_txfm2d_16x32_c(const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd);
 #define av1_fwd_txfm2d_16x32 av1_fwd_txfm2d_16x32_c

-void av1_fwd_txfm2d_16x8_c(const int16_t *input, int32_t *output, int stride, int tx_type, int bd);
+void av1_fwd_txfm2d_16x8_c(const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd);
 #define av1_fwd_txfm2d_16x8 av1_fwd_txfm2d_16x8_c

-void av1_fwd_txfm2d_32x16_c(const int16_t *input, int32_t *output, int stride, int tx_type, int bd);
+void av1_fwd_txfm2d_32x16_c(const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd);
 #define av1_fwd_txfm2d_32x16 av1_fwd_txfm2d_32x16_c

-void av1_fwd_txfm2d_32x32_c(const int16_t *input, int32_t *output, int stride, int tx_type, int bd);
-void av1_fwd_txfm2d_32x32_sse4_1(const int16_t *input, int32_t *output, int stride, int tx_type, int bd);
-RTCD_EXTERN void (*av1_fwd_txfm2d_32x32)(const int16_t *input, int32_t *output, int stride, int tx_type, int bd);
+void av1_fwd_txfm2d_32x32_c(const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd);
+void av1_fwd_txfm2d_32x32_sse4_1(const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd);
+RTCD_EXTERN void (*av1_fwd_txfm2d_32x32)(const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd);

-void av1_fwd_txfm2d_4x4_c(const int16_t *input, int32_t *output, int stride, int tx_type, int bd);
-void av1_fwd_txfm2d_4x4_sse4_1(const int16_t *input, int32_t *output, int stride, int tx_type, int bd);
-RTCD_EXTERN void (*av1_fwd_txfm2d_4x4)(const int16_t *input, int32_t *output, int stride, int tx_type, int bd);
+void av1_fwd_txfm2d_4x4_c(const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd);
+void av1_fwd_txfm2d_4x4_sse4_1(const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd);
+RTCD_EXTERN void (*av1_fwd_txfm2d_4x4)(const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd);

-void av1_fwd_txfm2d_4x8_c(const int16_t *input, int32_t *output, int stride, int tx_type, int bd);
+void av1_fwd_txfm2d_4x8_c(const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd);
 #define av1_fwd_txfm2d_4x8 av1_fwd_txfm2d_4x8_c

-void av1_fwd_txfm2d_64x64_c(const int16_t *input, int32_t *output, int stride, int tx_type, int bd);
-void av1_fwd_txfm2d_64x64_sse4_1(const int16_t *input, int32_t *output, int stride, int tx_type, int bd);
-RTCD_EXTERN void (*av1_fwd_txfm2d_64x64)(const int16_t *input, int32_t *output, int stride, int tx_type, int bd);
-
-void av1_fwd_txfm2d_8x16_c(const int16_t *input, int32_t *output, int stride, int tx_type, int bd);
+void av1_fwd_txfm2d_8x16_c(const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd);
 #define av1_fwd_txfm2d_8x16 av1_fwd_txfm2d_8x16_c

-void av1_fwd_txfm2d_8x4_c(const int16_t *input, int32_t *output, int stride, int tx_type, int bd);
+void av1_fwd_txfm2d_8x4_c(const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd);
 #define av1_fwd_txfm2d_8x4 av1_fwd_txfm2d_8x4_c

-void av1_fwd_txfm2d_8x8_c(const int16_t *input, int32_t *output, int stride, int tx_type, int bd);
-void av1_fwd_txfm2d_8x8_sse4_1(const int16_t *input, int32_t *output, int stride, int tx_type, int bd);
-RTCD_EXTERN void (*av1_fwd_txfm2d_8x8)(const int16_t *input, int32_t *output, int stride, int tx_type, int bd);
+void av1_fwd_txfm2d_8x8_c(const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd);
+void av1_fwd_txfm2d_8x8_sse4_1(const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd);
+RTCD_EXTERN void (*av1_fwd_txfm2d_8x8)(const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd);

 void av1_fwht4x4_c(const int16_t *input, tran_low_t *output, int stride);
 #define av1_fwht4x4 av1_fwht4x4_c
@ -207,6 +203,14 @@ void av1_highbd_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint
 void av1_highbd_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
 #define av1_highbd_convolve8_vert av1_highbd_convolve8_vert_c

+void av1_highbd_convolve_2d_c(const uint16_t *src, int src_stride, CONV_BUF_TYPE *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+void av1_highbd_convolve_2d_ssse3(const uint16_t *src, int src_stride, CONV_BUF_TYPE *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+RTCD_EXTERN void (*av1_highbd_convolve_2d)(const uint16_t *src, int src_stride, CONV_BUF_TYPE *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+
+void av1_highbd_convolve_2d_scale_c(const uint16_t *src, int src_stride, CONV_BUF_TYPE *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int x_step_qn, const int subpel_y_q4, const int y_step_qn, ConvolveParams *conv_params, int bd);
+void av1_highbd_convolve_2d_scale_sse4_1(const uint16_t *src, int src_stride, CONV_BUF_TYPE *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int x_step_qn, const int subpel_y_q4, const int y_step_qn, ConvolveParams *conv_params, int bd);
+RTCD_EXTERN void (*av1_highbd_convolve_2d_scale)(const uint16_t *src, int src_stride, CONV_BUF_TYPE *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int x_step_qn, const int subpel_y_q4, const int y_step_qn, ConvolveParams *conv_params, int bd);
+
 void av1_highbd_convolve_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
 #define av1_highbd_convolve_avg av1_highbd_convolve_avg_c

@ -221,6 +225,10 @@ void av1_highbd_convolve_init_c(void);
 void av1_highbd_convolve_init_sse4_1(void);
 RTCD_EXTERN void (*av1_highbd_convolve_init)(void);

+void av1_highbd_convolve_rounding_c(const int32_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, int bits, int bd);
+void av1_highbd_convolve_rounding_avx2(const int32_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, int bits, int bd);
+RTCD_EXTERN void (*av1_highbd_convolve_rounding)(const int32_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, int bits, int bd);
+
 void av1_highbd_convolve_vert_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams fp, const int subpel_x_q4, int x_step_q4, int avg, int bd);
 void av1_highbd_convolve_vert_sse4_1(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams fp, const int subpel_x_q4, int x_step_q4, int avg, int bd);
 RTCD_EXTERN void (*av1_highbd_convolve_vert)(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams fp, const int subpel_x_q4, int x_step_q4, int avg, int bd);
@ -267,9 +275,6 @@ void av1_highbd_iht8x4_32_add_c(const tran_low_t *input, uint8_t *dest, int dest
 void av1_highbd_iht8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, const struct txfm_param *param);
 #define av1_highbd_iht8x8_64_add av1_highbd_iht8x8_64_add_c

-void av1_highbd_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan, int log_scale);
-#define av1_highbd_quantize_b av1_highbd_quantize_b_c
-
 void av1_highbd_quantize_fp_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan, int log_scale);
 void av1_highbd_quantize_fp_sse4_1(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan, int log_scale);
 void av1_highbd_quantize_fp_avx2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan, int log_scale);
@ -282,6 +287,14 @@ void av1_highbd_warp_affine_c(const int32_t *mat, const uint16_t *ref, int width
 void av1_highbd_warp_affine_ssse3(const int32_t *mat, const uint16_t *ref, int width, int height, int stride, uint16_t *pred, int p_col, int p_row, int p_width, int p_height, int p_stride, int subsampling_x, int subsampling_y, int bd, ConvolveParams *conv_params, int16_t alpha, int16_t beta, int16_t gamma, int16_t delta);
 RTCD_EXTERN void (*av1_highbd_warp_affine)(const int32_t *mat, const uint16_t *ref, int width, int height, int stride, uint16_t *pred, int p_col, int p_row, int p_width, int p_height, int p_stride, int subsampling_x, int subsampling_y, int bd, ConvolveParams *conv_params, int16_t alpha, int16_t beta, int16_t gamma, int16_t delta);

+void av1_highpass_filter_c(uint8_t *dgd, int width, int height, int stride, int32_t *dst, int dst_stride, int r, int eps);
+void av1_highpass_filter_sse4_1(uint8_t *dgd, int width, int height, int stride, int32_t *dst, int dst_stride, int r, int eps);
+RTCD_EXTERN void (*av1_highpass_filter)(uint8_t *dgd, int width, int height, int stride, int32_t *dst, int dst_stride, int r, int eps);
+
+void av1_highpass_filter_highbd_c(uint16_t *dgd, int width, int height, int stride, int32_t *dst, int dst_stride, int r, int eps);
+void av1_highpass_filter_highbd_sse4_1(uint16_t *dgd, int width, int height, int stride, int32_t *dst, int dst_stride, int r, int eps);
+RTCD_EXTERN void (*av1_highpass_filter_highbd)(uint16_t *dgd, int width, int height, int stride, int32_t *dst, int dst_stride, int r, int eps);
+
 void av1_iht16x16_256_add_c(const tran_low_t *input, uint8_t *output, int pitch, const struct txfm_param *param);
 void av1_iht16x16_256_add_sse2(const tran_low_t *input, uint8_t *output, int pitch, const struct txfm_param *param);
 void av1_iht16x16_256_add_avx2(const tran_low_t *input, uint8_t *output, int pitch, const struct txfm_param *param);
@ -334,48 +347,45 @@ void av1_iht8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride
 void av1_iht8x8_64_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride, const struct txfm_param *param);
 RTCD_EXTERN void (*av1_iht8x8_64_add)(const tran_low_t *input, uint8_t *dest, int dest_stride, const struct txfm_param *param);

-void av1_inv_txfm2d_add_16x16_c(const int32_t *input, uint16_t *output, int stride, int tx_type, int bd);
-void av1_inv_txfm2d_add_16x16_sse4_1(const int32_t *input, uint16_t *output, int stride, int tx_type, int bd);
-RTCD_EXTERN void (*av1_inv_txfm2d_add_16x16)(const int32_t *input, uint16_t *output, int stride, int tx_type, int bd);
+void av1_inv_txfm2d_add_16x16_c(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd);
+void av1_inv_txfm2d_add_16x16_sse4_1(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd);
+RTCD_EXTERN void (*av1_inv_txfm2d_add_16x16)(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd);

-void av1_inv_txfm2d_add_16x32_c(const int32_t *input, uint16_t *output, int stride, int tx_type, int bd);
+void av1_inv_txfm2d_add_16x32_c(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd);
 #define av1_inv_txfm2d_add_16x32 av1_inv_txfm2d_add_16x32_c

-void av1_inv_txfm2d_add_16x8_c(const int32_t *input, uint16_t *output, int stride, int tx_type, int bd);
+void av1_inv_txfm2d_add_16x8_c(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd);
 #define av1_inv_txfm2d_add_16x8 av1_inv_txfm2d_add_16x8_c

-void av1_inv_txfm2d_add_32x16_c(const int32_t *input, uint16_t *output, int stride, int tx_type, int bd);
+void av1_inv_txfm2d_add_32x16_c(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd);
 #define av1_inv_txfm2d_add_32x16 av1_inv_txfm2d_add_32x16_c

-void av1_inv_txfm2d_add_32x32_c(const int32_t *input, uint16_t *output, int stride, int tx_type, int bd);
-void av1_inv_txfm2d_add_32x32_avx2(const int32_t *input, uint16_t *output, int stride, int tx_type, int bd);
-RTCD_EXTERN void (*av1_inv_txfm2d_add_32x32)(const int32_t *input, uint16_t *output, int stride, int tx_type, int bd);
+void av1_inv_txfm2d_add_32x32_c(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd);
+void av1_inv_txfm2d_add_32x32_avx2(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd);
+RTCD_EXTERN void (*av1_inv_txfm2d_add_32x32)(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd);

-void av1_inv_txfm2d_add_4x4_c(const int32_t *input, uint16_t *output, int stride, int tx_type, int bd);
-void av1_inv_txfm2d_add_4x4_sse4_1(const int32_t *input, uint16_t *output, int stride, int tx_type, int bd);
-RTCD_EXTERN void (*av1_inv_txfm2d_add_4x4)(const int32_t *input, uint16_t *output, int stride, int tx_type, int bd);
+void av1_inv_txfm2d_add_4x4_c(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd);
+void av1_inv_txfm2d_add_4x4_sse4_1(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd);
+RTCD_EXTERN void (*av1_inv_txfm2d_add_4x4)(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd);

-void av1_inv_txfm2d_add_4x8_c(const int32_t *input, uint16_t *output, int stride, int tx_type, int bd);
+void av1_inv_txfm2d_add_4x8_c(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd);
 #define av1_inv_txfm2d_add_4x8 av1_inv_txfm2d_add_4x8_c

-void av1_inv_txfm2d_add_64x64_c(const int32_t *input, uint16_t *output, int stride, int tx_type, int bd);
-#define av1_inv_txfm2d_add_64x64 av1_inv_txfm2d_add_64x64_c
-
-void av1_inv_txfm2d_add_8x16_c(const int32_t *input, uint16_t *output, int stride, int tx_type, int bd);
+void av1_inv_txfm2d_add_8x16_c(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd);
 #define av1_inv_txfm2d_add_8x16 av1_inv_txfm2d_add_8x16_c

-void av1_inv_txfm2d_add_8x4_c(const int32_t *input, uint16_t *output, int stride, int tx_type, int bd);
+void av1_inv_txfm2d_add_8x4_c(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd);
 #define av1_inv_txfm2d_add_8x4 av1_inv_txfm2d_add_8x4_c

-void av1_inv_txfm2d_add_8x8_c(const int32_t *input, uint16_t *output, int stride, int tx_type, int bd);
-void av1_inv_txfm2d_add_8x8_sse4_1(const int32_t *input, uint16_t *output, int stride, int tx_type, int bd);
-RTCD_EXTERN void (*av1_inv_txfm2d_add_8x8)(const int32_t *input, uint16_t *output, int stride, int tx_type, int bd);
+void av1_inv_txfm2d_add_8x8_c(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd);
+void av1_inv_txfm2d_add_8x8_sse4_1(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd);
+RTCD_EXTERN void (*av1_inv_txfm2d_add_8x8)(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd);

 void av1_lowbd_convolve_init_c(void);
 void av1_lowbd_convolve_init_ssse3(void);
 RTCD_EXTERN void (*av1_lowbd_convolve_init)(void);

-void av1_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan, int log_scale);
+void av1_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan, const qm_val_t * qm_ptr, const qm_val_t * iqm_ptr, int log_scale);
 #define av1_quantize_b av1_quantize_b_c

 void av1_quantize_fp_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
@ -387,10 +397,26 @@ void av1_quantize_fp_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int
 void av1_quantize_fp_32x32_avx2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
 RTCD_EXTERN void (*av1_quantize_fp_32x32)(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);

+void av1_selfguided_restoration_c(uint8_t *dgd, int width, int height, int stride, int32_t *dst, int dst_stride, int r, int eps);
+void av1_selfguided_restoration_sse4_1(uint8_t *dgd, int width, int height, int stride, int32_t *dst, int dst_stride, int r, int eps);
+RTCD_EXTERN void (*av1_selfguided_restoration)(uint8_t *dgd, int width, int height, int stride, int32_t *dst, int dst_stride, int r, int eps);
+
+void av1_selfguided_restoration_highbd_c(uint16_t *dgd, int width, int height, int stride, int32_t *dst, int dst_stride, int bit_depth, int r, int eps);
+void av1_selfguided_restoration_highbd_sse4_1(uint16_t *dgd, int width, int height, int stride, int32_t *dst, int dst_stride, int bit_depth, int r, int eps);
+RTCD_EXTERN void (*av1_selfguided_restoration_highbd)(uint16_t *dgd, int width, int height, int stride, int32_t *dst, int dst_stride, int bit_depth, int r, int eps);
+
 void av1_temporal_filter_apply_c(uint8_t *frame1, unsigned int stride, uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int filter_weight, unsigned int *accumulator, uint16_t *count);
 void av1_temporal_filter_apply_sse2(uint8_t *frame1, unsigned int stride, uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int filter_weight, unsigned int *accumulator, uint16_t *count);
 RTCD_EXTERN void (*av1_temporal_filter_apply)(uint8_t *frame1, unsigned int stride, uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int filter_weight, unsigned int *accumulator, uint16_t *count);

+void av1_upsample_intra_edge_c(uint8_t *p, int sz);
+void av1_upsample_intra_edge_sse4_1(uint8_t *p, int sz);
+RTCD_EXTERN void (*av1_upsample_intra_edge)(uint8_t *p, int sz);
+
+void av1_upsample_intra_edge_high_c(uint16_t *p, int sz, int bd);
+void av1_upsample_intra_edge_high_sse4_1(uint16_t *p, int sz, int bd);
+RTCD_EXTERN void (*av1_upsample_intra_edge_high)(uint16_t *p, int sz, int bd);
+
 void av1_warp_affine_c(const int32_t *mat, const uint8_t *ref, int width, int height, int stride, uint8_t *pred, int p_col, int p_row, int p_width, int p_height, int p_stride, int subsampling_x, int subsampling_y, ConvolveParams *conv_params, int16_t alpha, int16_t beta, int16_t gamma, int16_t delta);
 void av1_warp_affine_sse2(const int32_t *mat, const uint8_t *ref, int width, int height, int stride, uint8_t *pred, int p_col, int p_row, int p_width, int p_height, int p_stride, int subsampling_x, int subsampling_y, ConvolveParams *conv_params, int16_t alpha, int16_t beta, int16_t gamma, int16_t delta);
 void av1_warp_affine_ssse3(const int32_t *mat, const uint8_t *ref, int width, int height, int stride, uint8_t *pred, int p_col, int p_row, int p_width, int p_height, int p_stride, int subsampling_x, int subsampling_y, ConvolveParams *conv_params, int16_t alpha, int16_t beta, int16_t gamma, int16_t delta);
@ -408,64 +434,38 @@ uint64_t av1_wedge_sse_from_residuals_c(const int16_t *r1, const int16_t *d, con
 uint64_t av1_wedge_sse_from_residuals_sse2(const int16_t *r1, const int16_t *d, const uint8_t *m, int N);
 RTCD_EXTERN uint64_t (*av1_wedge_sse_from_residuals)(const int16_t *r1, const int16_t *d, const uint8_t *m, int N);

+void cdef_filter_block_c(uint8_t *dst8, uint16_t *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int bsize, int max);
+void cdef_filter_block_sse2(uint8_t *dst8, uint16_t *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int bsize, int max);
+void cdef_filter_block_ssse3(uint8_t *dst8, uint16_t *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int bsize, int max);
+void cdef_filter_block_sse4_1(uint8_t *dst8, uint16_t *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int bsize, int max);
+void cdef_filter_block_avx2(uint8_t *dst8, uint16_t *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int bsize, int max);
+RTCD_EXTERN void (*cdef_filter_block)(uint8_t *dst8, uint16_t *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int bsize, int max);
+
+int cdef_find_dir_c(const uint16_t *img, int stride, int32_t *var, int coeff_shift);
+int cdef_find_dir_sse2(const uint16_t *img, int stride, int32_t *var, int coeff_shift);
+int cdef_find_dir_ssse3(const uint16_t *img, int stride, int32_t *var, int coeff_shift);
+int cdef_find_dir_sse4_1(const uint16_t *img, int stride, int32_t *var, int coeff_shift);
+int cdef_find_dir_avx2(const uint16_t *img, int stride, int32_t *var, int coeff_shift);
+RTCD_EXTERN int (*cdef_find_dir)(const uint16_t *img, int stride, int32_t *var, int coeff_shift);
+
 double compute_cross_correlation_c(unsigned char *im1, int stride1, int x1, int y1, unsigned char *im2, int stride2, int x2, int y2);
 double compute_cross_correlation_sse4_1(unsigned char *im1, int stride1, int x1, int y1, unsigned char *im2, int stride2, int x2, int y2);
 RTCD_EXTERN double (*compute_cross_correlation)(unsigned char *im1, int stride1, int x1, int y1, unsigned char *im2, int stride2, int x2, int y2);

-void copy_4x4_16bit_to_16bit_c(uint16_t *dst, int dstride, const uint16_t *src, int sstride);
-void copy_4x4_16bit_to_16bit_sse2(uint16_t *dst, int dstride, const uint16_t *src, int sstride);
-void copy_4x4_16bit_to_16bit_ssse3(uint16_t *dst, int dstride, const uint16_t *src, int sstride);
-void copy_4x4_16bit_to_16bit_sse4_1(uint16_t *dst, int dstride, const uint16_t *src, int sstride);
-RTCD_EXTERN void (*copy_4x4_16bit_to_16bit)(uint16_t *dst, int dstride, const uint16_t *src, int sstride);
-
-void copy_4x4_16bit_to_8bit_c(uint8_t *dst, int dstride, const uint16_t *src, int sstride);
-void copy_4x4_16bit_to_8bit_sse2(uint8_t *dst, int dstride, const uint16_t *src, int sstride);
-void copy_4x4_16bit_to_8bit_ssse3(uint8_t *dst, int dstride, const uint16_t *src, int sstride);
-void copy_4x4_16bit_to_8bit_sse4_1(uint8_t *dst, int dstride, const uint16_t *src, int sstride);
-RTCD_EXTERN void (*copy_4x4_16bit_to_8bit)(uint8_t *dst, int dstride, const uint16_t *src, int sstride);
-
-void copy_8x8_16bit_to_16bit_c(uint16_t *dst, int dstride, const uint16_t *src, int sstride);
-void copy_8x8_16bit_to_16bit_sse2(uint16_t *dst, int dstride, const uint16_t *src, int sstride);
-void copy_8x8_16bit_to_16bit_ssse3(uint16_t *dst, int dstride, const uint16_t *src, int sstride);
-void copy_8x8_16bit_to_16bit_sse4_1(uint16_t *dst, int dstride, const uint16_t *src, int sstride);
-RTCD_EXTERN void (*copy_8x8_16bit_to_16bit)(uint16_t *dst, int dstride, const uint16_t *src, int sstride);
-
-void copy_8x8_16bit_to_8bit_c(uint8_t *dst, int dstride, const uint16_t *src, int sstride);
-void copy_8x8_16bit_to_8bit_sse2(uint8_t *dst, int dstride, const uint16_t *src, int sstride);
-void copy_8x8_16bit_to_8bit_ssse3(uint8_t *dst, int dstride, const uint16_t *src, int sstride);
-void copy_8x8_16bit_to_8bit_sse4_1(uint8_t *dst, int dstride, const uint16_t *src, int sstride);
-RTCD_EXTERN void (*copy_8x8_16bit_to_8bit)(uint8_t *dst, int dstride, const uint16_t *src, int sstride);
-
 void copy_rect8_16bit_to_16bit_c(uint16_t *dst, int dstride, const uint16_t *src, int sstride, int v, int h);
 void copy_rect8_16bit_to_16bit_sse2(uint16_t *dst, int dstride, const uint16_t *src, int sstride, int v, int h);
 void copy_rect8_16bit_to_16bit_ssse3(uint16_t *dst, int dstride, const uint16_t *src, int sstride, int v, int h);
 void copy_rect8_16bit_to_16bit_sse4_1(uint16_t *dst, int dstride, const uint16_t *src, int sstride, int v, int h);
+void copy_rect8_16bit_to_16bit_avx2(uint16_t *dst, int dstride, const uint16_t *src, int sstride, int v, int h);
 RTCD_EXTERN void (*copy_rect8_16bit_to_16bit)(uint16_t *dst, int dstride, const uint16_t *src, int sstride, int v, int h);

 void copy_rect8_8bit_to_16bit_c(uint16_t *dst, int dstride, const uint8_t *src, int sstride, int v, int h);
 void copy_rect8_8bit_to_16bit_sse2(uint16_t *dst, int dstride, const uint8_t *src, int sstride, int v, int h);
 void copy_rect8_8bit_to_16bit_ssse3(uint16_t *dst, int dstride, const uint8_t *src, int sstride, int v, int h);
 void copy_rect8_8bit_to_16bit_sse4_1(uint16_t *dst, int dstride, const uint8_t *src, int sstride, int v, int h);
+void copy_rect8_8bit_to_16bit_avx2(uint16_t *dst, int dstride, const uint8_t *src, int sstride, int v, int h);
 RTCD_EXTERN void (*copy_rect8_8bit_to_16bit)(uint16_t *dst, int dstride, const uint8_t *src, int sstride, int v, int h);

-int od_dir_find8_c(const od_dering_in *img, int stride, int32_t *var, int coeff_shift);
-int od_dir_find8_sse2(const od_dering_in *img, int stride, int32_t *var, int coeff_shift);
-int od_dir_find8_ssse3(const od_dering_in *img, int stride, int32_t *var, int coeff_shift);
-int od_dir_find8_sse4_1(const od_dering_in *img, int stride, int32_t *var, int coeff_shift);
-RTCD_EXTERN int (*od_dir_find8)(const od_dering_in *img, int stride, int32_t *var, int coeff_shift);
-
-void od_filter_dering_direction_4x4_c(uint16_t *y, int ystride, const uint16_t *in, int threshold, int dir, int damping);
-void od_filter_dering_direction_4x4_sse2(uint16_t *y, int ystride, const uint16_t *in, int threshold, int dir, int damping);
-void od_filter_dering_direction_4x4_ssse3(uint16_t *y, int ystride, const uint16_t *in, int threshold, int dir, int damping);
-void od_filter_dering_direction_4x4_sse4_1(uint16_t *y, int ystride, const uint16_t *in, int threshold, int dir, int damping);
-RTCD_EXTERN void (*od_filter_dering_direction_4x4)(uint16_t *y, int ystride, const uint16_t *in, int threshold, int dir, int damping);
-
-void od_filter_dering_direction_8x8_c(uint16_t *y, int ystride, const uint16_t *in, int threshold, int dir, int damping);
-void od_filter_dering_direction_8x8_sse2(uint16_t *y, int ystride, const uint16_t *in, int threshold, int dir, int damping);
-void od_filter_dering_direction_8x8_ssse3(uint16_t *y, int ystride, const uint16_t *in, int threshold, int dir, int damping);
-void od_filter_dering_direction_8x8_sse4_1(uint16_t *y, int ystride, const uint16_t *in, int threshold, int dir, int damping);
-RTCD_EXTERN void (*od_filter_dering_direction_8x8)(uint16_t *y, int ystride, const uint16_t *in, int threshold, int dir, int damping);
-
 void aom_rtcd(void);

 #ifdef RTCD_C
@ -476,26 +476,20 @@ static void setup_rtcd_internal(void)

    (void)flags;

-    aom_clpf_block = aom_clpf_block_c;
-    if (flags & HAS_SSE2) aom_clpf_block = aom_clpf_block_sse2;
-    if (flags & HAS_SSSE3) aom_clpf_block = aom_clpf_block_ssse3;
-    if (flags & HAS_SSE4_1) aom_clpf_block = aom_clpf_block_sse4_1;
-    aom_clpf_block_hbd = aom_clpf_block_hbd_c;
-    if (flags & HAS_SSE2) aom_clpf_block_hbd = aom_clpf_block_hbd_sse2;
-    if (flags & HAS_SSSE3) aom_clpf_block_hbd = aom_clpf_block_hbd_ssse3;
-    if (flags & HAS_SSE4_1) aom_clpf_block_hbd = aom_clpf_block_hbd_sse4_1;
-    aom_clpf_hblock = aom_clpf_hblock_c;
-    if (flags & HAS_SSE2) aom_clpf_hblock = aom_clpf_hblock_sse2;
-    if (flags & HAS_SSSE3) aom_clpf_hblock = aom_clpf_hblock_ssse3;
-    if (flags & HAS_SSE4_1) aom_clpf_hblock = aom_clpf_hblock_sse4_1;
-    aom_clpf_hblock_hbd = aom_clpf_hblock_hbd_c;
-    if (flags & HAS_SSE2) aom_clpf_hblock_hbd = aom_clpf_hblock_hbd_sse2;
-    if (flags & HAS_SSSE3) aom_clpf_hblock_hbd = aom_clpf_hblock_hbd_ssse3;
-    if (flags & HAS_SSE4_1) aom_clpf_hblock_hbd = aom_clpf_hblock_hbd_sse4_1;
+    apply_selfguided_restoration = apply_selfguided_restoration_c;
+    if (flags & HAS_SSE4_1) apply_selfguided_restoration = apply_selfguided_restoration_sse4_1;
+    apply_selfguided_restoration_highbd = apply_selfguided_restoration_highbd_c;
+    if (flags & HAS_SSE4_1) apply_selfguided_restoration_highbd = apply_selfguided_restoration_highbd_sse4_1;
    av1_block_error = av1_block_error_c;
    if (flags & HAS_AVX2) av1_block_error = av1_block_error_avx2;
+    av1_convolve_2d = av1_convolve_2d_c;
+    if (flags & HAS_SSE2) av1_convolve_2d = av1_convolve_2d_sse2;
+    av1_convolve_2d_scale = av1_convolve_2d_scale_c;
+    if (flags & HAS_SSE4_1) av1_convolve_2d_scale = av1_convolve_2d_scale_sse4_1;
    av1_convolve_horiz = av1_convolve_horiz_c;
    if (flags & HAS_SSSE3) av1_convolve_horiz = av1_convolve_horiz_ssse3;
+    av1_convolve_rounding = av1_convolve_rounding_c;
+    if (flags & HAS_AVX2) av1_convolve_rounding = av1_convolve_rounding_avx2;
    av1_convolve_vert = av1_convolve_vert_c;
    if (flags & HAS_SSSE3) av1_convolve_vert = av1_convolve_vert_ssse3;
    av1_fht16x16 = av1_fht16x16_c;
@ -520,6 +514,10 @@ static void setup_rtcd_internal(void)
    if (flags & HAS_SSE2) av1_fht8x4 = av1_fht8x4_sse2;
    av1_fht8x8 = av1_fht8x8_c;
    if (flags & HAS_SSE2) av1_fht8x8 = av1_fht8x8_sse2;
+    av1_filter_intra_edge = av1_filter_intra_edge_c;
+    if (flags & HAS_SSE4_1) av1_filter_intra_edge = av1_filter_intra_edge_sse4_1;
+    av1_filter_intra_edge_high = av1_filter_intra_edge_high_c;
+    if (flags & HAS_SSE4_1) av1_filter_intra_edge_high = av1_filter_intra_edge_high_sse4_1;
    av1_full_search_sad = av1_full_search_sad_c;
    if (flags & HAS_SSE3) av1_full_search_sad = av1_full_search_sadx3;
    if (flags & HAS_SSE4_1) av1_full_search_sad = av1_full_search_sadx8;
@ -529,16 +527,20 @@ static void setup_rtcd_internal(void)
    if (flags & HAS_SSE4_1) av1_fwd_txfm2d_32x32 = av1_fwd_txfm2d_32x32_sse4_1;
    av1_fwd_txfm2d_4x4 = av1_fwd_txfm2d_4x4_c;
    if (flags & HAS_SSE4_1) av1_fwd_txfm2d_4x4 = av1_fwd_txfm2d_4x4_sse4_1;
-    av1_fwd_txfm2d_64x64 = av1_fwd_txfm2d_64x64_c;
-    if (flags & HAS_SSE4_1) av1_fwd_txfm2d_64x64 = av1_fwd_txfm2d_64x64_sse4_1;
    av1_fwd_txfm2d_8x8 = av1_fwd_txfm2d_8x8_c;
    if (flags & HAS_SSE4_1) av1_fwd_txfm2d_8x8 = av1_fwd_txfm2d_8x8_sse4_1;
    av1_highbd_block_error = av1_highbd_block_error_c;
    if (flags & HAS_SSE2) av1_highbd_block_error = av1_highbd_block_error_sse2;
+    av1_highbd_convolve_2d = av1_highbd_convolve_2d_c;
+    if (flags & HAS_SSSE3) av1_highbd_convolve_2d = av1_highbd_convolve_2d_ssse3;
+    av1_highbd_convolve_2d_scale = av1_highbd_convolve_2d_scale_c;
+    if (flags & HAS_SSE4_1) av1_highbd_convolve_2d_scale = av1_highbd_convolve_2d_scale_sse4_1;
    av1_highbd_convolve_horiz = av1_highbd_convolve_horiz_c;
    if (flags & HAS_SSE4_1) av1_highbd_convolve_horiz = av1_highbd_convolve_horiz_sse4_1;
    av1_highbd_convolve_init = av1_highbd_convolve_init_c;
    if (flags & HAS_SSE4_1) av1_highbd_convolve_init = av1_highbd_convolve_init_sse4_1;
+    av1_highbd_convolve_rounding = av1_highbd_convolve_rounding_c;
+    if (flags & HAS_AVX2) av1_highbd_convolve_rounding = av1_highbd_convolve_rounding_avx2;
    av1_highbd_convolve_vert = av1_highbd_convolve_vert_c;
    if (flags & HAS_SSE4_1) av1_highbd_convolve_vert = av1_highbd_convolve_vert_sse4_1;
    av1_highbd_quantize_fp = av1_highbd_quantize_fp_c;
@ -546,6 +548,10 @@ static void setup_rtcd_internal(void)
    if (flags & HAS_AVX2) av1_highbd_quantize_fp = av1_highbd_quantize_fp_avx2;
    av1_highbd_warp_affine = av1_highbd_warp_affine_c;
    if (flags & HAS_SSSE3) av1_highbd_warp_affine = av1_highbd_warp_affine_ssse3;
+    av1_highpass_filter = av1_highpass_filter_c;
+    if (flags & HAS_SSE4_1) av1_highpass_filter = av1_highpass_filter_sse4_1;
+    av1_highpass_filter_highbd = av1_highpass_filter_highbd_c;
+    if (flags & HAS_SSE4_1) av1_highpass_filter_highbd = av1_highpass_filter_highbd_sse4_1;
    av1_iht16x16_256_add = av1_iht16x16_256_add_c;
    if (flags & HAS_SSE2) av1_iht16x16_256_add = av1_iht16x16_256_add_sse2;
    if (flags & HAS_AVX2) av1_iht16x16_256_add = av1_iht16x16_256_add_avx2;
@ -580,8 +586,16 @@ static void setup_rtcd_internal(void)
    if (flags & HAS_AVX2) av1_quantize_fp = av1_quantize_fp_avx2;
    av1_quantize_fp_32x32 = av1_quantize_fp_32x32_c;
    if (flags & HAS_AVX2) av1_quantize_fp_32x32 = av1_quantize_fp_32x32_avx2;
+    av1_selfguided_restoration = av1_selfguided_restoration_c;
+    if (flags & HAS_SSE4_1) av1_selfguided_restoration = av1_selfguided_restoration_sse4_1;
+    av1_selfguided_restoration_highbd = av1_selfguided_restoration_highbd_c;
+    if (flags & HAS_SSE4_1) av1_selfguided_restoration_highbd = av1_selfguided_restoration_highbd_sse4_1;
    av1_temporal_filter_apply = av1_temporal_filter_apply_c;
    if (flags & HAS_SSE2) av1_temporal_filter_apply = av1_temporal_filter_apply_sse2;
+    av1_upsample_intra_edge = av1_upsample_intra_edge_c;
+    if (flags & HAS_SSE4_1) av1_upsample_intra_edge = av1_upsample_intra_edge_sse4_1;
+    av1_upsample_intra_edge_high = av1_upsample_intra_edge_high_c;
+    if (flags & HAS_SSE4_1) av1_upsample_intra_edge_high = av1_upsample_intra_edge_high_sse4_1;
    av1_warp_affine = av1_warp_affine_c;
    if (flags & HAS_SSE2) av1_warp_affine = av1_warp_affine_sse2;
    if (flags & HAS_SSSE3) av1_warp_affine = av1_warp_affine_ssse3;
@ -591,44 +605,28 @@ static void setup_rtcd_internal(void)
    if (flags & HAS_SSE2) av1_wedge_sign_from_residuals = av1_wedge_sign_from_residuals_sse2;
    av1_wedge_sse_from_residuals = av1_wedge_sse_from_residuals_c;
    if (flags & HAS_SSE2) av1_wedge_sse_from_residuals = av1_wedge_sse_from_residuals_sse2;
+    cdef_filter_block = cdef_filter_block_c;
+    if (flags & HAS_SSE2) cdef_filter_block = cdef_filter_block_sse2;
+    if (flags & HAS_SSSE3) cdef_filter_block = cdef_filter_block_ssse3;
+    if (flags & HAS_SSE4_1) cdef_filter_block = cdef_filter_block_sse4_1;
+    if (flags & HAS_AVX2) cdef_filter_block = cdef_filter_block_avx2;
+    cdef_find_dir = cdef_find_dir_c;
+    if (flags & HAS_SSE2) cdef_find_dir = cdef_find_dir_sse2;
+    if (flags & HAS_SSSE3) cdef_find_dir = cdef_find_dir_ssse3;
+    if (flags & HAS_SSE4_1) cdef_find_dir = cdef_find_dir_sse4_1;
+    if (flags & HAS_AVX2) cdef_find_dir = cdef_find_dir_avx2;
    compute_cross_correlation = compute_cross_correlation_c;
    if (flags & HAS_SSE4_1) compute_cross_correlation = compute_cross_correlation_sse4_1;
-    copy_4x4_16bit_to_16bit = copy_4x4_16bit_to_16bit_c;
-    if (flags & HAS_SSE2) copy_4x4_16bit_to_16bit = copy_4x4_16bit_to_16bit_sse2;
-    if (flags & HAS_SSSE3) copy_4x4_16bit_to_16bit = copy_4x4_16bit_to_16bit_ssse3;
-    if (flags & HAS_SSE4_1) copy_4x4_16bit_to_16bit = copy_4x4_16bit_to_16bit_sse4_1;
-    copy_4x4_16bit_to_8bit = copy_4x4_16bit_to_8bit_c;
-    if (flags & HAS_SSE2) copy_4x4_16bit_to_8bit = copy_4x4_16bit_to_8bit_sse2;
-    if (flags & HAS_SSSE3) copy_4x4_16bit_to_8bit = copy_4x4_16bit_to_8bit_ssse3;
-    if (flags & HAS_SSE4_1) copy_4x4_16bit_to_8bit = copy_4x4_16bit_to_8bit_sse4_1;
-    copy_8x8_16bit_to_16bit = copy_8x8_16bit_to_16bit_c;
-    if (flags & HAS_SSE2) copy_8x8_16bit_to_16bit = copy_8x8_16bit_to_16bit_sse2;
-    if (flags & HAS_SSSE3) copy_8x8_16bit_to_16bit = copy_8x8_16bit_to_16bit_ssse3;
-    if (flags & HAS_SSE4_1) copy_8x8_16bit_to_16bit = copy_8x8_16bit_to_16bit_sse4_1;
-    copy_8x8_16bit_to_8bit = copy_8x8_16bit_to_8bit_c;
-    if (flags & HAS_SSE2) copy_8x8_16bit_to_8bit = copy_8x8_16bit_to_8bit_sse2;
-    if (flags & HAS_SSSE3) copy_8x8_16bit_to_8bit = copy_8x8_16bit_to_8bit_ssse3;
-    if (flags & HAS_SSE4_1) copy_8x8_16bit_to_8bit = copy_8x8_16bit_to_8bit_sse4_1;
    copy_rect8_16bit_to_16bit = copy_rect8_16bit_to_16bit_c;
    if (flags & HAS_SSE2) copy_rect8_16bit_to_16bit = copy_rect8_16bit_to_16bit_sse2;
    if (flags & HAS_SSSE3) copy_rect8_16bit_to_16bit = copy_rect8_16bit_to_16bit_ssse3;
    if (flags & HAS_SSE4_1) copy_rect8_16bit_to_16bit = copy_rect8_16bit_to_16bit_sse4_1;
+    if (flags & HAS_AVX2) copy_rect8_16bit_to_16bit = copy_rect8_16bit_to_16bit_avx2;
    copy_rect8_8bit_to_16bit = copy_rect8_8bit_to_16bit_c;
    if (flags & HAS_SSE2) copy_rect8_8bit_to_16bit = copy_rect8_8bit_to_16bit_sse2;
    if (flags & HAS_SSSE3) copy_rect8_8bit_to_16bit = copy_rect8_8bit_to_16bit_ssse3;
    if (flags & HAS_SSE4_1) copy_rect8_8bit_to_16bit = copy_rect8_8bit_to_16bit_sse4_1;
-    od_dir_find8 = od_dir_find8_c;
-    if (flags & HAS_SSE2) od_dir_find8 = od_dir_find8_sse2;
-    if (flags & HAS_SSSE3) od_dir_find8 = od_dir_find8_ssse3;
-    if (flags & HAS_SSE4_1) od_dir_find8 = od_dir_find8_sse4_1;
-    od_filter_dering_direction_4x4 = od_filter_dering_direction_4x4_c;
-    if (flags & HAS_SSE2) od_filter_dering_direction_4x4 = od_filter_dering_direction_4x4_sse2;
-    if (flags & HAS_SSSE3) od_filter_dering_direction_4x4 = od_filter_dering_direction_4x4_ssse3;
-    if (flags & HAS_SSE4_1) od_filter_dering_direction_4x4 = od_filter_dering_direction_4x4_sse4_1;
-    od_filter_dering_direction_8x8 = od_filter_dering_direction_8x8_c;
-    if (flags & HAS_SSE2) od_filter_dering_direction_8x8 = od_filter_dering_direction_8x8_sse2;
-    if (flags & HAS_SSSE3) od_filter_dering_direction_8x8 = od_filter_dering_direction_8x8_ssse3;
-    if (flags & HAS_SSE4_1) od_filter_dering_direction_8x8 = od_filter_dering_direction_8x8_sse4_1;
+    if (flags & HAS_AVX2) copy_rect8_8bit_to_16bit = copy_rect8_8bit_to_16bit_avx2;
 }
 #endif

--- a/media/libaom/config/linux/x64/aom_config.asm
+++ b/media/libaom/config/linux/x64/aom_config.asm
@ -46,8 +46,6 @@ CONFIG_AV1 equ 1
 CONFIG_STATIC_MSVCRT equ 0
 CONFIG_SPATIAL_RESAMPLING equ 1
 CONFIG_REALTIME_ONLY equ 0
-CONFIG_ONTHEFLY_BITPACKING equ 0
-CONFIG_ERROR_CONCEALMENT equ 0
 CONFIG_SHARED equ 0
 CONFIG_STATIC equ 1
 CONFIG_SMALL equ 0
@ -60,73 +58,71 @@ CONFIG_ACCOUNTING equ 0
 CONFIG_INSPECTION equ 0
 CONFIG_DECODE_PERF_TESTS equ 0
 CONFIG_ENCODE_PERF_TESTS equ 0
+CONFIG_BITSTREAM_DEBUG equ 0
+CONFIG_SYMBOLRATE equ 0
 CONFIG_COEFFICIENT_RANGE_CHECKING equ 0
 CONFIG_LOWBITDEPTH equ 1
 CONFIG_HIGHBITDEPTH equ 1
 CONFIG_EXPERIMENTAL equ 0
 CONFIG_SIZE_LIMIT equ 1
-CONFIG_COLORSPACE_HEADERS equ 0
 CONFIG_FP_MB_STATS equ 0
 CONFIG_CDEF equ 1
+CONFIG_CDEF_SINGLEPASS equ 1
 CONFIG_VAR_TX equ 1
 CONFIG_RECT_TX equ 1
 CONFIG_RECT_TX_EXT equ 0
 CONFIG_TPL_MV equ 0
 CONFIG_DUAL_FILTER equ 1
-CONFIG_CONVOLVE_ROUND equ 0
+CONFIG_CONVOLVE_ROUND equ 1
 CONFIG_COMPOUND_ROUND equ 0
 CONFIG_EXT_TX equ 1
-CONFIG_DPCM_INTRA equ 0
 CONFIG_TX64X64 equ 0
 CONFIG_EXT_INTRA equ 1
 CONFIG_INTRA_INTERP equ 0
 CONFIG_FILTER_INTRA equ 0
-CONFIG_INTRA_EDGE equ 0
+CONFIG_INTRA_EDGE equ 1
 CONFIG_INTRABC equ 0
-CONFIG_EXT_INTER equ 1
 CONFIG_INTERINTRA equ 1
 CONFIG_WEDGE equ 1
 CONFIG_COMPOUND_SEGMENT equ 1
 CONFIG_EXT_REFS equ 1
-CONFIG_ALTREF2 equ 0
-CONFIG_SPEED_REFS equ 0
-CONFIG_GF_GROUPS equ 0
-CONFIG_FLEX_REFS equ 0
 CONFIG_GLOBAL_MOTION equ 1
 CONFIG_NEW_QUANT equ 0
 CONFIG_SUPERTX equ 0
 CONFIG_ANS equ 0
-CONFIG_LOOP_RESTORATION equ 0
+CONFIG_LOOP_RESTORATION equ 1
+CONFIG_STRIPED_LOOP_RESTORATION equ 0
 CONFIG_EXT_PARTITION equ 0
 CONFIG_EXT_PARTITION_TYPES equ 0
+CONFIG_EXT_PARTITION_TYPES_AB equ 0
 CONFIG_UNPOISON_PARTITION_CTX equ 0
 CONFIG_EXT_TILE equ 0
 CONFIG_MOTION_VAR equ 1
 CONFIG_NCOBMC equ 0
 CONFIG_WARPED_MOTION equ 1
 CONFIG_Q_ADAPT_PROBS equ 0
-CONFIG_BITSTREAM_DEBUG equ 0
 CONFIG_INTER_STATS_ONLY equ 0
-CONFIG_ALT_INTRA equ 1
-CONFIG_PALETTE equ 1
 CONFIG_PALETTE_DELTA_ENCODING equ 0
 CONFIG_RAWBITS equ 0
-CONFIG_EC_SMALLMUL equ 1
+CONFIG_KF_CTX equ 0
 CONFIG_PVQ equ 0
 CONFIG_CFL equ 0
 CONFIG_XIPHRC equ 0
 CONFIG_DCT_ONLY equ 0
+CONFIG_DAALA_TX equ 0
 CONFIG_DAALA_DCT4 equ 0
 CONFIG_DAALA_DCT8 equ 0
+CONFIG_DAALA_DCT16 equ 0
+CONFIG_DAALA_DCT32 equ 0
+CONFIG_DAALA_DCT64 equ 0
 CONFIG_CB4X4 equ 1
 CONFIG_CHROMA_2X2 equ 0
 CONFIG_CHROMA_SUB8X8 equ 1
 CONFIG_FRAME_SIZE equ 0
-CONFIG_DELTA_Q equ 1
 CONFIG_EXT_DELTA_Q equ 1
 CONFIG_ADAPT_SCAN equ 0
-CONFIG_FILTER_7BIT equ 1
 CONFIG_PARALLEL_DEBLOCKING equ 1
+CONFIG_DEBLOCK_13TAP equ 0
 CONFIG_LOOPFILTERING_ACROSS_TILES equ 1
 CONFIG_TEMPMV_SIGNALING equ 1
 CONFIG_RD_DEBUG equ 0
@ -135,29 +131,46 @@ CONFIG_COEF_INTERLEAVE equ 0
 CONFIG_ENTROPY_STATS equ 0
 CONFIG_MASKED_TX equ 0
 CONFIG_DEPENDENT_HORZTILES equ 0
-CONFIG_DIST_8X8 equ 0
-CONFIG_DAALA_DIST equ 0
-CONFIG_TRIPRED equ 0
+CONFIG_DIST_8X8 equ 1
 CONFIG_PALETTE_THROUGHPUT equ 1
 CONFIG_REF_ADAPT equ 0
 CONFIG_LV_MAP equ 0
+CONFIG_CTX1D equ 0
 CONFIG_TXK_SEL equ 0
 CONFIG_MV_COMPRESS equ 1
+CONFIG_SEGMENT_ZEROMV equ 0
 CONFIG_FRAME_SUPERRES equ 0
 CONFIG_NEW_MULTISYMBOL equ 0
 CONFIG_COMPOUND_SINGLEREF equ 0
-CONFIG_AOM_QM equ 0
+CONFIG_AOM_QM equ 1
 CONFIG_ONE_SIDED_COMPOUND equ 1
-CONFIG_EXT_COMP_REFS equ 0
+CONFIG_EXT_COMP_REFS equ 1
 CONFIG_SMOOTH_HV equ 1
 CONFIG_VAR_REFS equ 0
-CONFIG_RECT_INTRA_PRED equ 1
 CONFIG_LGT equ 0
+CONFIG_LGT_FROM_PRED equ 0
 CONFIG_SBL_SYMBOL equ 0
 CONFIG_NCOBMC_ADAPT_WEIGHT equ 0
 CONFIG_BGSPRITE equ 0
 CONFIG_VAR_TX_NO_TX_MODE equ 0
 CONFIG_MRC_TX equ 0
 CONFIG_LPF_DIRECT equ 0
-CONFIG_UV_LVL equ 0
+CONFIG_LOOPFILTER_LEVEL equ 0
+CONFIG_NO_FRAME_CONTEXT_SIGNALING equ 0
+CONFIG_TXMG equ 1
+CONFIG_MAX_TILE equ 0
+CONFIG_HASH_ME equ 0
+CONFIG_COLORSPACE_HEADERS equ 0
+CONFIG_MFMV equ 0
+CONFIG_FRAME_MARKER equ 0
+CONFIG_JNT_COMP equ 0
+CONFIG_FRAME_SIGN_BIAS equ 0
+CONFIG_EXT_SKIP equ 0
+CONFIG_OBU equ 0
+CONFIG_AMVR equ 0
+CONFIG_LPF_SB equ 0
+CONFIG_OPT_REF_MV equ 0
+CONFIG_TMV equ 0
+CONFIG_RESTRICT_COMPRESSED_HDR equ 0
+CONFIG_HORZONLY_FRAME_SUPERRES equ 0
 CONFIG_ANALYZER equ 0
--- a/media/libaom/config/linux/x64/aom_config.h
+++ b/media/libaom/config/linux/x64/aom_config.h
@ -59,8 +59,6 @@
 #define CONFIG_STATIC_MSVCRT 0
 #define CONFIG_SPATIAL_RESAMPLING 1
 #define CONFIG_REALTIME_ONLY 0
-#define CONFIG_ONTHEFLY_BITPACKING 0
-#define CONFIG_ERROR_CONCEALMENT 0
 #define CONFIG_SHARED 0
 #define CONFIG_STATIC 1
 #define CONFIG_SMALL 0
@ -73,73 +71,71 @@
 #define CONFIG_INSPECTION 0
 #define CONFIG_DECODE_PERF_TESTS 0
 #define CONFIG_ENCODE_PERF_TESTS 0
+#define CONFIG_BITSTREAM_DEBUG 0
+#define CONFIG_SYMBOLRATE 0
 #define CONFIG_COEFFICIENT_RANGE_CHECKING 0
 #define CONFIG_LOWBITDEPTH 1
 #define CONFIG_HIGHBITDEPTH 1
 #define CONFIG_EXPERIMENTAL 0
 #define CONFIG_SIZE_LIMIT 1
-#define CONFIG_COLORSPACE_HEADERS 0
 #define CONFIG_FP_MB_STATS 0
 #define CONFIG_CDEF 1
+#define CONFIG_CDEF_SINGLEPASS 1
 #define CONFIG_VAR_TX 1
 #define CONFIG_RECT_TX 1
 #define CONFIG_RECT_TX_EXT 0
 #define CONFIG_TPL_MV 0
 #define CONFIG_DUAL_FILTER 1
-#define CONFIG_CONVOLVE_ROUND 0
+#define CONFIG_CONVOLVE_ROUND 1
 #define CONFIG_COMPOUND_ROUND 0
 #define CONFIG_EXT_TX 1
-#define CONFIG_DPCM_INTRA 0
 #define CONFIG_TX64X64 0
 #define CONFIG_EXT_INTRA 1
 #define CONFIG_INTRA_INTERP 0
 #define CONFIG_FILTER_INTRA 0
-#define CONFIG_INTRA_EDGE 0
+#define CONFIG_INTRA_EDGE 1
 #define CONFIG_INTRABC 0
-#define CONFIG_EXT_INTER 1
 #define CONFIG_INTERINTRA 1
 #define CONFIG_WEDGE 1
 #define CONFIG_COMPOUND_SEGMENT 1
 #define CONFIG_EXT_REFS 1
-#define CONFIG_ALTREF2 0
-#define CONFIG_SPEED_REFS 0
-#define CONFIG_GF_GROUPS 0
-#define CONFIG_FLEX_REFS 0
 #define CONFIG_GLOBAL_MOTION 1
 #define CONFIG_NEW_QUANT 0
 #define CONFIG_SUPERTX 0
 #define CONFIG_ANS 0
-#define CONFIG_LOOP_RESTORATION 0
+#define CONFIG_LOOP_RESTORATION 1
+#define CONFIG_STRIPED_LOOP_RESTORATION 0
 #define CONFIG_EXT_PARTITION 0
 #define CONFIG_EXT_PARTITION_TYPES 0
+#define CONFIG_EXT_PARTITION_TYPES_AB 0
 #define CONFIG_UNPOISON_PARTITION_CTX 0
 #define CONFIG_EXT_TILE 0
 #define CONFIG_MOTION_VAR 1
 #define CONFIG_NCOBMC 0
 #define CONFIG_WARPED_MOTION 1
 #define CONFIG_Q_ADAPT_PROBS 0
-#define CONFIG_BITSTREAM_DEBUG 0
 #define CONFIG_INTER_STATS_ONLY 0
-#define CONFIG_ALT_INTRA 1
-#define CONFIG_PALETTE 1
 #define CONFIG_PALETTE_DELTA_ENCODING 0
 #define CONFIG_RAWBITS 0
-#define CONFIG_EC_SMALLMUL 1
+#define CONFIG_KF_CTX 0
 #define CONFIG_PVQ 0
 #define CONFIG_CFL 0
 #define CONFIG_XIPHRC 0
 #define CONFIG_DCT_ONLY 0
+#define CONFIG_DAALA_TX 0
 #define CONFIG_DAALA_DCT4 0
 #define CONFIG_DAALA_DCT8 0
+#define CONFIG_DAALA_DCT16 0
+#define CONFIG_DAALA_DCT32 0
+#define CONFIG_DAALA_DCT64 0
 #define CONFIG_CB4X4 1
 #define CONFIG_CHROMA_2X2 0
 #define CONFIG_CHROMA_SUB8X8 1
 #define CONFIG_FRAME_SIZE 0
-#define CONFIG_DELTA_Q 1
 #define CONFIG_EXT_DELTA_Q 1
 #define CONFIG_ADAPT_SCAN 0
-#define CONFIG_FILTER_7BIT 1
 #define CONFIG_PARALLEL_DEBLOCKING 1
+#define CONFIG_DEBLOCK_13TAP 0
 #define CONFIG_LOOPFILTERING_ACROSS_TILES 1
 #define CONFIG_TEMPMV_SIGNALING 1
 #define CONFIG_RD_DEBUG 0
@ -148,31 +144,48 @@
 #define CONFIG_ENTROPY_STATS 0
 #define CONFIG_MASKED_TX 0
 #define CONFIG_DEPENDENT_HORZTILES 0
-#define CONFIG_DIST_8X8 0
-#define CONFIG_DAALA_DIST 0
-#define CONFIG_TRIPRED 0
+#define CONFIG_DIST_8X8 1
 #define CONFIG_PALETTE_THROUGHPUT 1
 #define CONFIG_REF_ADAPT 0
 #define CONFIG_LV_MAP 0
+#define CONFIG_CTX1D 0
 #define CONFIG_TXK_SEL 0
 #define CONFIG_MV_COMPRESS 1
+#define CONFIG_SEGMENT_ZEROMV 0
 #define CONFIG_FRAME_SUPERRES 0
 #define CONFIG_NEW_MULTISYMBOL 0
 #define CONFIG_COMPOUND_SINGLEREF 0
-#define CONFIG_AOM_QM 0
+#define CONFIG_AOM_QM 1
 #define CONFIG_ONE_SIDED_COMPOUND 1
-#define CONFIG_EXT_COMP_REFS 0
+#define CONFIG_EXT_COMP_REFS 1
 #define CONFIG_SMOOTH_HV 1
 #define CONFIG_VAR_REFS 0
-#define CONFIG_RECT_INTRA_PRED 1
 #define CONFIG_LGT 0
+#define CONFIG_LGT_FROM_PRED 0
 #define CONFIG_SBL_SYMBOL 0
 #define CONFIG_NCOBMC_ADAPT_WEIGHT 0
 #define CONFIG_BGSPRITE 0
 #define CONFIG_VAR_TX_NO_TX_MODE 0
 #define CONFIG_MRC_TX 0
 #define CONFIG_LPF_DIRECT 0
-#define CONFIG_UV_LVL 0
+#define CONFIG_LOOPFILTER_LEVEL 0
+#define CONFIG_NO_FRAME_CONTEXT_SIGNALING 0
+#define CONFIG_TXMG 1
+#define CONFIG_MAX_TILE 0
+#define CONFIG_HASH_ME 0
+#define CONFIG_COLORSPACE_HEADERS 0
+#define CONFIG_MFMV 0
+#define CONFIG_FRAME_MARKER 0
+#define CONFIG_JNT_COMP 0
+#define CONFIG_FRAME_SIGN_BIAS 0
+#define CONFIG_EXT_SKIP 0
+#define CONFIG_OBU 0
+#define CONFIG_AMVR 0
+#define CONFIG_LPF_SB 0
+#define CONFIG_OPT_REF_MV 0
+#define CONFIG_TMV 0
+#define CONFIG_RESTRICT_COMPRESSED_HDR 0
+#define CONFIG_HORZONLY_FRAME_SUPERRES 0
 #define CONFIG_ANALYZER 0
 #define DECODE_WIDTH_LIMIT 8192
 #define DECODE_HEIGHT_LIMIT 4608
--- a/media/libaom/config/linux/x64/aom_dsp_rtcd.h
+++ b/media/libaom/config/linux/x64/aom_dsp_rtcd.h
--- a/media/libaom/config/linux/x64/av1_rtcd.h
+++ b/media/libaom/config/linux/x64/av1_rtcd.h
@ -31,44 +31,39 @@ struct search_site_config;
 struct mv;
 union int_mv;
 struct yv12_buffer_config;
-typedef uint16_t od_dering_in;

 #ifdef __cplusplus
 extern "C" {
 #endif

-void aom_clpf_block_c(uint8_t *dst, const uint16_t *src, int dstride, int sstride, int sizex, int sizey, unsigned int strength, unsigned int bd);
-void aom_clpf_block_sse2(uint8_t *dst, const uint16_t *src, int dstride, int sstride, int sizex, int sizey, unsigned int strength, unsigned int bd);
-void aom_clpf_block_ssse3(uint8_t *dst, const uint16_t *src, int dstride, int sstride, int sizex, int sizey, unsigned int strength, unsigned int bd);
-void aom_clpf_block_sse4_1(uint8_t *dst, const uint16_t *src, int dstride, int sstride, int sizex, int sizey, unsigned int strength, unsigned int bd);
-RTCD_EXTERN void (*aom_clpf_block)(uint8_t *dst, const uint16_t *src, int dstride, int sstride, int sizex, int sizey, unsigned int strength, unsigned int bd);
+void apply_selfguided_restoration_c(uint8_t *dat, int width, int height, int stride, int eps, int *xqd, uint8_t *dst, int dst_stride, int32_t *tmpbuf);
+void apply_selfguided_restoration_sse4_1(uint8_t *dat, int width, int height, int stride, int eps, int *xqd, uint8_t *dst, int dst_stride, int32_t *tmpbuf);
+RTCD_EXTERN void (*apply_selfguided_restoration)(uint8_t *dat, int width, int height, int stride, int eps, int *xqd, uint8_t *dst, int dst_stride, int32_t *tmpbuf);

-void aom_clpf_block_hbd_c(uint16_t *dst, const uint16_t *src, int dstride, int sstride, int sizex, int sizey, unsigned int strength, unsigned int bd);
-void aom_clpf_block_hbd_sse2(uint16_t *dst, const uint16_t *src, int dstride, int sstride, int sizex, int sizey, unsigned int strength, unsigned int bd);
-void aom_clpf_block_hbd_ssse3(uint16_t *dst, const uint16_t *src, int dstride, int sstride, int sizex, int sizey, unsigned int strength, unsigned int bd);
-void aom_clpf_block_hbd_sse4_1(uint16_t *dst, const uint16_t *src, int dstride, int sstride, int sizex, int sizey, unsigned int strength, unsigned int bd);
-RTCD_EXTERN void (*aom_clpf_block_hbd)(uint16_t *dst, const uint16_t *src, int dstride, int sstride, int sizex, int sizey, unsigned int strength, unsigned int bd);
-
-void aom_clpf_hblock_c(uint8_t *dst, const uint16_t *src, int dstride, int sstride, int sizex, int sizey, unsigned int strength, unsigned int bd);
-void aom_clpf_hblock_sse2(uint8_t *dst, const uint16_t *src, int dstride, int sstride, int sizex, int sizey, unsigned int strength, unsigned int bd);
-void aom_clpf_hblock_ssse3(uint8_t *dst, const uint16_t *src, int dstride, int sstride, int sizex, int sizey, unsigned int strength, unsigned int bd);
-void aom_clpf_hblock_sse4_1(uint8_t *dst, const uint16_t *src, int dstride, int sstride, int sizex, int sizey, unsigned int strength, unsigned int bd);
-RTCD_EXTERN void (*aom_clpf_hblock)(uint8_t *dst, const uint16_t *src, int dstride, int sstride, int sizex, int sizey, unsigned int strength, unsigned int bd);
-
-void aom_clpf_hblock_hbd_c(uint16_t *dst, const uint16_t *src, int dstride, int sstride, int sizex, int sizey, unsigned int strength, unsigned int bd);
-void aom_clpf_hblock_hbd_sse2(uint16_t *dst, const uint16_t *src, int dstride, int sstride, int sizex, int sizey, unsigned int strength, unsigned int bd);
-void aom_clpf_hblock_hbd_ssse3(uint16_t *dst, const uint16_t *src, int dstride, int sstride, int sizex, int sizey, unsigned int strength, unsigned int bd);
-void aom_clpf_hblock_hbd_sse4_1(uint16_t *dst, const uint16_t *src, int dstride, int sstride, int sizex, int sizey, unsigned int strength, unsigned int bd);
-RTCD_EXTERN void (*aom_clpf_hblock_hbd)(uint16_t *dst, const uint16_t *src, int dstride, int sstride, int sizex, int sizey, unsigned int strength, unsigned int bd);
+void apply_selfguided_restoration_highbd_c(uint16_t *dat, int width, int height, int stride, int bit_depth, int eps, int *xqd, uint16_t *dst, int dst_stride, int32_t *tmpbuf);
+void apply_selfguided_restoration_highbd_sse4_1(uint16_t *dat, int width, int height, int stride, int bit_depth, int eps, int *xqd, uint16_t *dst, int dst_stride, int32_t *tmpbuf);
+RTCD_EXTERN void (*apply_selfguided_restoration_highbd)(uint16_t *dat, int width, int height, int stride, int bit_depth, int eps, int *xqd, uint16_t *dst, int dst_stride, int32_t *tmpbuf);

 int64_t av1_block_error_c(const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz);
 int64_t av1_block_error_avx2(const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz);
 RTCD_EXTERN int64_t (*av1_block_error)(const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz);

+void av1_convolve_2d_c(const uint8_t *src, int src_stride, CONV_BUF_TYPE *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+void av1_convolve_2d_sse2(const uint8_t *src, int src_stride, CONV_BUF_TYPE *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+#define av1_convolve_2d av1_convolve_2d_sse2
+
+void av1_convolve_2d_scale_c(const uint8_t *src, int src_stride, CONV_BUF_TYPE *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_qn, const int x_step_qn, const int subpel_y_q4, const int y_step_qn, ConvolveParams *conv_params);
+void av1_convolve_2d_scale_sse4_1(const uint8_t *src, int src_stride, CONV_BUF_TYPE *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_qn, const int x_step_qn, const int subpel_y_q4, const int y_step_qn, ConvolveParams *conv_params);
+RTCD_EXTERN void (*av1_convolve_2d_scale)(const uint8_t *src, int src_stride, CONV_BUF_TYPE *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_qn, const int x_step_qn, const int subpel_y_q4, const int y_step_qn, ConvolveParams *conv_params);
+
 void av1_convolve_horiz_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams fp, const int subpel_x_q4, int x_step_q4, ConvolveParams *conv_params);
 void av1_convolve_horiz_ssse3(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams fp, const int subpel_x_q4, int x_step_q4, ConvolveParams *conv_params);
 RTCD_EXTERN void (*av1_convolve_horiz)(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams fp, const int subpel_x_q4, int x_step_q4, ConvolveParams *conv_params);

+void av1_convolve_rounding_c(const int32_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, int bits);
+void av1_convolve_rounding_avx2(const int32_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, int bits);
+RTCD_EXTERN void (*av1_convolve_rounding)(const int32_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, int bits);
+
 void av1_convolve_vert_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams fp, const int subpel_x_q4, int x_step_q4, ConvolveParams *conv_params);
 void av1_convolve_vert_ssse3(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams fp, const int subpel_x_q4, int x_step_q4, ConvolveParams *conv_params);
 RTCD_EXTERN void (*av1_convolve_vert)(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams fp, const int subpel_x_q4, int x_step_q4, ConvolveParams *conv_params);
@ -76,9 +71,6 @@ RTCD_EXTERN void (*av1_convolve_vert)(const uint8_t *src, int src_stride, uint8_
 int av1_diamond_search_sad_c(struct macroblock *x, const struct search_site_config *cfg,  struct mv *ref_mv, struct mv *best_mv, int search_param, int sad_per_bit, int *num00, const struct aom_variance_vtable *fn_ptr, const struct mv *center_mv);
 #define av1_diamond_search_sad av1_diamond_search_sad_c

-void av1_fdct8x8_quant_c(const int16_t *input, int stride, tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
-#define av1_fdct8x8_quant av1_fdct8x8_quant_c
-
 void av1_fht16x16_c(const int16_t *input, tran_low_t *output, int stride, struct txfm_param *param);
 void av1_fht16x16_sse2(const int16_t *input, tran_low_t *output, int stride, struct txfm_param *param);
 void av1_fht16x16_avx2(const int16_t *input, tran_low_t *output, int stride, struct txfm_param *param);
@ -133,6 +125,14 @@ void av1_fht8x8_c(const int16_t *input, tran_low_t *output, int stride, struct t
 void av1_fht8x8_sse2(const int16_t *input, tran_low_t *output, int stride, struct txfm_param *param);
 #define av1_fht8x8 av1_fht8x8_sse2

+void av1_filter_intra_edge_c(uint8_t *p, int sz, int strength);
+void av1_filter_intra_edge_sse4_1(uint8_t *p, int sz, int strength);
+RTCD_EXTERN void (*av1_filter_intra_edge)(uint8_t *p, int sz, int strength);
+
+void av1_filter_intra_edge_high_c(uint16_t *p, int sz, int strength);
+void av1_filter_intra_edge_high_sse4_1(uint16_t *p, int sz, int strength);
+RTCD_EXTERN void (*av1_filter_intra_edge_high)(uint16_t *p, int sz, int strength);
+
 int av1_full_range_search_c(const struct macroblock *x, const struct search_site_config *cfg, struct mv *ref_mv, struct mv *best_mv, int search_param, int sad_per_bit, int *num00, const struct aom_variance_vtable *fn_ptr, const struct mv *center_mv);
 #define av1_full_range_search av1_full_range_search_c

@ -141,46 +141,42 @@ int av1_full_search_sadx3(const struct macroblock *x, const struct mv *ref_mv, i
 int av1_full_search_sadx8(const struct macroblock *x, const struct mv *ref_mv, int sad_per_bit, int distance, const struct aom_variance_vtable *fn_ptr, const struct mv *center_mv, struct mv *best_mv);
 RTCD_EXTERN int (*av1_full_search_sad)(const struct macroblock *x, const struct mv *ref_mv, int sad_per_bit, int distance, const struct aom_variance_vtable *fn_ptr, const struct mv *center_mv, struct mv *best_mv);

-void av1_fwd_idtx_c(const int16_t *src_diff, tran_low_t *coeff, int stride, int bs, int tx_type);
+void av1_fwd_idtx_c(const int16_t *src_diff, tran_low_t *coeff, int stride, int bsx, int bsy, TX_TYPE tx_type);
 #define av1_fwd_idtx av1_fwd_idtx_c

-void av1_fwd_txfm2d_16x16_c(const int16_t *input, int32_t *output, int stride, int tx_type, int bd);
-void av1_fwd_txfm2d_16x16_sse4_1(const int16_t *input, int32_t *output, int stride, int tx_type, int bd);
-RTCD_EXTERN void (*av1_fwd_txfm2d_16x16)(const int16_t *input, int32_t *output, int stride, int tx_type, int bd);
+void av1_fwd_txfm2d_16x16_c(const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd);
+void av1_fwd_txfm2d_16x16_sse4_1(const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd);
+RTCD_EXTERN void (*av1_fwd_txfm2d_16x16)(const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd);

-void av1_fwd_txfm2d_16x32_c(const int16_t *input, int32_t *output, int stride, int tx_type, int bd);
+void av1_fwd_txfm2d_16x32_c(const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd);
 #define av1_fwd_txfm2d_16x32 av1_fwd_txfm2d_16x32_c

-void av1_fwd_txfm2d_16x8_c(const int16_t *input, int32_t *output, int stride, int tx_type, int bd);
+void av1_fwd_txfm2d_16x8_c(const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd);
 #define av1_fwd_txfm2d_16x8 av1_fwd_txfm2d_16x8_c

-void av1_fwd_txfm2d_32x16_c(const int16_t *input, int32_t *output, int stride, int tx_type, int bd);
+void av1_fwd_txfm2d_32x16_c(const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd);
 #define av1_fwd_txfm2d_32x16 av1_fwd_txfm2d_32x16_c

-void av1_fwd_txfm2d_32x32_c(const int16_t *input, int32_t *output, int stride, int tx_type, int bd);
-void av1_fwd_txfm2d_32x32_sse4_1(const int16_t *input, int32_t *output, int stride, int tx_type, int bd);
-RTCD_EXTERN void (*av1_fwd_txfm2d_32x32)(const int16_t *input, int32_t *output, int stride, int tx_type, int bd);
+void av1_fwd_txfm2d_32x32_c(const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd);
+void av1_fwd_txfm2d_32x32_sse4_1(const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd);
+RTCD_EXTERN void (*av1_fwd_txfm2d_32x32)(const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd);

-void av1_fwd_txfm2d_4x4_c(const int16_t *input, int32_t *output, int stride, int tx_type, int bd);
-void av1_fwd_txfm2d_4x4_sse4_1(const int16_t *input, int32_t *output, int stride, int tx_type, int bd);
-RTCD_EXTERN void (*av1_fwd_txfm2d_4x4)(const int16_t *input, int32_t *output, int stride, int tx_type, int bd);
+void av1_fwd_txfm2d_4x4_c(const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd);
+void av1_fwd_txfm2d_4x4_sse4_1(const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd);
+RTCD_EXTERN void (*av1_fwd_txfm2d_4x4)(const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd);

-void av1_fwd_txfm2d_4x8_c(const int16_t *input, int32_t *output, int stride, int tx_type, int bd);
+void av1_fwd_txfm2d_4x8_c(const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd);
 #define av1_fwd_txfm2d_4x8 av1_fwd_txfm2d_4x8_c

-void av1_fwd_txfm2d_64x64_c(const int16_t *input, int32_t *output, int stride, int tx_type, int bd);
-void av1_fwd_txfm2d_64x64_sse4_1(const int16_t *input, int32_t *output, int stride, int tx_type, int bd);
-RTCD_EXTERN void (*av1_fwd_txfm2d_64x64)(const int16_t *input, int32_t *output, int stride, int tx_type, int bd);
-
-void av1_fwd_txfm2d_8x16_c(const int16_t *input, int32_t *output, int stride, int tx_type, int bd);
+void av1_fwd_txfm2d_8x16_c(const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd);
 #define av1_fwd_txfm2d_8x16 av1_fwd_txfm2d_8x16_c

-void av1_fwd_txfm2d_8x4_c(const int16_t *input, int32_t *output, int stride, int tx_type, int bd);
+void av1_fwd_txfm2d_8x4_c(const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd);
 #define av1_fwd_txfm2d_8x4 av1_fwd_txfm2d_8x4_c

-void av1_fwd_txfm2d_8x8_c(const int16_t *input, int32_t *output, int stride, int tx_type, int bd);
-void av1_fwd_txfm2d_8x8_sse4_1(const int16_t *input, int32_t *output, int stride, int tx_type, int bd);
-RTCD_EXTERN void (*av1_fwd_txfm2d_8x8)(const int16_t *input, int32_t *output, int stride, int tx_type, int bd);
+void av1_fwd_txfm2d_8x8_c(const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd);
+void av1_fwd_txfm2d_8x8_sse4_1(const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd);
+RTCD_EXTERN void (*av1_fwd_txfm2d_8x8)(const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd);

 void av1_fwht4x4_c(const int16_t *input, tran_low_t *output, int stride);
 #define av1_fwht4x4 av1_fwht4x4_c
@ -213,6 +209,14 @@ void av1_highbd_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8
 void av1_highbd_convolve8_vert_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
 #define av1_highbd_convolve8_vert av1_highbd_convolve8_vert_sse2

+void av1_highbd_convolve_2d_c(const uint16_t *src, int src_stride, CONV_BUF_TYPE *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+void av1_highbd_convolve_2d_ssse3(const uint16_t *src, int src_stride, CONV_BUF_TYPE *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+RTCD_EXTERN void (*av1_highbd_convolve_2d)(const uint16_t *src, int src_stride, CONV_BUF_TYPE *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+
+void av1_highbd_convolve_2d_scale_c(const uint16_t *src, int src_stride, CONV_BUF_TYPE *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int x_step_qn, const int subpel_y_q4, const int y_step_qn, ConvolveParams *conv_params, int bd);
+void av1_highbd_convolve_2d_scale_sse4_1(const uint16_t *src, int src_stride, CONV_BUF_TYPE *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int x_step_qn, const int subpel_y_q4, const int y_step_qn, ConvolveParams *conv_params, int bd);
+RTCD_EXTERN void (*av1_highbd_convolve_2d_scale)(const uint16_t *src, int src_stride, CONV_BUF_TYPE *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int x_step_qn, const int subpel_y_q4, const int y_step_qn, ConvolveParams *conv_params, int bd);
+
 void av1_highbd_convolve_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
 #define av1_highbd_convolve_avg av1_highbd_convolve_avg_c

@ -227,6 +231,10 @@ void av1_highbd_convolve_init_c(void);
 void av1_highbd_convolve_init_sse4_1(void);
 RTCD_EXTERN void (*av1_highbd_convolve_init)(void);

+void av1_highbd_convolve_rounding_c(const int32_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, int bits, int bd);
+void av1_highbd_convolve_rounding_avx2(const int32_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, int bits, int bd);
+RTCD_EXTERN void (*av1_highbd_convolve_rounding)(const int32_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, int bits, int bd);
+
 void av1_highbd_convolve_vert_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams fp, const int subpel_x_q4, int x_step_q4, int avg, int bd);
 void av1_highbd_convolve_vert_sse4_1(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams fp, const int subpel_x_q4, int x_step_q4, int avg, int bd);
 RTCD_EXTERN void (*av1_highbd_convolve_vert)(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams fp, const int subpel_x_q4, int x_step_q4, int avg, int bd);
@ -273,9 +281,6 @@ void av1_highbd_iht8x4_32_add_c(const tran_low_t *input, uint8_t *dest, int dest
 void av1_highbd_iht8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, const struct txfm_param *param);
 #define av1_highbd_iht8x8_64_add av1_highbd_iht8x8_64_add_c

-void av1_highbd_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan, int log_scale);
-#define av1_highbd_quantize_b av1_highbd_quantize_b_c
-
 void av1_highbd_quantize_fp_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan, int log_scale);
 void av1_highbd_quantize_fp_sse4_1(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan, int log_scale);
 void av1_highbd_quantize_fp_avx2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan, int log_scale);
@ -288,6 +293,14 @@ void av1_highbd_warp_affine_c(const int32_t *mat, const uint16_t *ref, int width
 void av1_highbd_warp_affine_ssse3(const int32_t *mat, const uint16_t *ref, int width, int height, int stride, uint16_t *pred, int p_col, int p_row, int p_width, int p_height, int p_stride, int subsampling_x, int subsampling_y, int bd, ConvolveParams *conv_params, int16_t alpha, int16_t beta, int16_t gamma, int16_t delta);
 RTCD_EXTERN void (*av1_highbd_warp_affine)(const int32_t *mat, const uint16_t *ref, int width, int height, int stride, uint16_t *pred, int p_col, int p_row, int p_width, int p_height, int p_stride, int subsampling_x, int subsampling_y, int bd, ConvolveParams *conv_params, int16_t alpha, int16_t beta, int16_t gamma, int16_t delta);

+void av1_highpass_filter_c(uint8_t *dgd, int width, int height, int stride, int32_t *dst, int dst_stride, int r, int eps);
+void av1_highpass_filter_sse4_1(uint8_t *dgd, int width, int height, int stride, int32_t *dst, int dst_stride, int r, int eps);
+RTCD_EXTERN void (*av1_highpass_filter)(uint8_t *dgd, int width, int height, int stride, int32_t *dst, int dst_stride, int r, int eps);
+
+void av1_highpass_filter_highbd_c(uint16_t *dgd, int width, int height, int stride, int32_t *dst, int dst_stride, int r, int eps);
+void av1_highpass_filter_highbd_sse4_1(uint16_t *dgd, int width, int height, int stride, int32_t *dst, int dst_stride, int r, int eps);
+RTCD_EXTERN void (*av1_highpass_filter_highbd)(uint16_t *dgd, int width, int height, int stride, int32_t *dst, int dst_stride, int r, int eps);
+
 void av1_iht16x16_256_add_c(const tran_low_t *input, uint8_t *output, int pitch, const struct txfm_param *param);
 void av1_iht16x16_256_add_sse2(const tran_low_t *input, uint8_t *output, int pitch, const struct txfm_param *param);
 void av1_iht16x16_256_add_avx2(const tran_low_t *input, uint8_t *output, int pitch, const struct txfm_param *param);
@ -340,48 +353,45 @@ void av1_iht8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride
 void av1_iht8x8_64_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride, const struct txfm_param *param);
 #define av1_iht8x8_64_add av1_iht8x8_64_add_sse2

-void av1_inv_txfm2d_add_16x16_c(const int32_t *input, uint16_t *output, int stride, int tx_type, int bd);
-void av1_inv_txfm2d_add_16x16_sse4_1(const int32_t *input, uint16_t *output, int stride, int tx_type, int bd);
-RTCD_EXTERN void (*av1_inv_txfm2d_add_16x16)(const int32_t *input, uint16_t *output, int stride, int tx_type, int bd);
+void av1_inv_txfm2d_add_16x16_c(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd);
+void av1_inv_txfm2d_add_16x16_sse4_1(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd);
+RTCD_EXTERN void (*av1_inv_txfm2d_add_16x16)(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd);

-void av1_inv_txfm2d_add_16x32_c(const int32_t *input, uint16_t *output, int stride, int tx_type, int bd);
+void av1_inv_txfm2d_add_16x32_c(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd);
 #define av1_inv_txfm2d_add_16x32 av1_inv_txfm2d_add_16x32_c

-void av1_inv_txfm2d_add_16x8_c(const int32_t *input, uint16_t *output, int stride, int tx_type, int bd);
+void av1_inv_txfm2d_add_16x8_c(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd);
 #define av1_inv_txfm2d_add_16x8 av1_inv_txfm2d_add_16x8_c

-void av1_inv_txfm2d_add_32x16_c(const int32_t *input, uint16_t *output, int stride, int tx_type, int bd);
+void av1_inv_txfm2d_add_32x16_c(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd);
 #define av1_inv_txfm2d_add_32x16 av1_inv_txfm2d_add_32x16_c

-void av1_inv_txfm2d_add_32x32_c(const int32_t *input, uint16_t *output, int stride, int tx_type, int bd);
-void av1_inv_txfm2d_add_32x32_avx2(const int32_t *input, uint16_t *output, int stride, int tx_type, int bd);
-RTCD_EXTERN void (*av1_inv_txfm2d_add_32x32)(const int32_t *input, uint16_t *output, int stride, int tx_type, int bd);
+void av1_inv_txfm2d_add_32x32_c(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd);
+void av1_inv_txfm2d_add_32x32_avx2(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd);
+RTCD_EXTERN void (*av1_inv_txfm2d_add_32x32)(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd);

-void av1_inv_txfm2d_add_4x4_c(const int32_t *input, uint16_t *output, int stride, int tx_type, int bd);
-void av1_inv_txfm2d_add_4x4_sse4_1(const int32_t *input, uint16_t *output, int stride, int tx_type, int bd);
-RTCD_EXTERN void (*av1_inv_txfm2d_add_4x4)(const int32_t *input, uint16_t *output, int stride, int tx_type, int bd);
+void av1_inv_txfm2d_add_4x4_c(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd);
+void av1_inv_txfm2d_add_4x4_sse4_1(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd);
+RTCD_EXTERN void (*av1_inv_txfm2d_add_4x4)(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd);

-void av1_inv_txfm2d_add_4x8_c(const int32_t *input, uint16_t *output, int stride, int tx_type, int bd);
+void av1_inv_txfm2d_add_4x8_c(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd);
 #define av1_inv_txfm2d_add_4x8 av1_inv_txfm2d_add_4x8_c

-void av1_inv_txfm2d_add_64x64_c(const int32_t *input, uint16_t *output, int stride, int tx_type, int bd);
-#define av1_inv_txfm2d_add_64x64 av1_inv_txfm2d_add_64x64_c
-
-void av1_inv_txfm2d_add_8x16_c(const int32_t *input, uint16_t *output, int stride, int tx_type, int bd);
+void av1_inv_txfm2d_add_8x16_c(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd);
 #define av1_inv_txfm2d_add_8x16 av1_inv_txfm2d_add_8x16_c

-void av1_inv_txfm2d_add_8x4_c(const int32_t *input, uint16_t *output, int stride, int tx_type, int bd);
+void av1_inv_txfm2d_add_8x4_c(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd);
 #define av1_inv_txfm2d_add_8x4 av1_inv_txfm2d_add_8x4_c

-void av1_inv_txfm2d_add_8x8_c(const int32_t *input, uint16_t *output, int stride, int tx_type, int bd);
-void av1_inv_txfm2d_add_8x8_sse4_1(const int32_t *input, uint16_t *output, int stride, int tx_type, int bd);
-RTCD_EXTERN void (*av1_inv_txfm2d_add_8x8)(const int32_t *input, uint16_t *output, int stride, int tx_type, int bd);
+void av1_inv_txfm2d_add_8x8_c(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd);
+void av1_inv_txfm2d_add_8x8_sse4_1(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd);
+RTCD_EXTERN void (*av1_inv_txfm2d_add_8x8)(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd);

 void av1_lowbd_convolve_init_c(void);
 void av1_lowbd_convolve_init_ssse3(void);
 RTCD_EXTERN void (*av1_lowbd_convolve_init)(void);

-void av1_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan, int log_scale);
+void av1_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan, const qm_val_t * qm_ptr, const qm_val_t * iqm_ptr, int log_scale);
 #define av1_quantize_b av1_quantize_b_c

 void av1_quantize_fp_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
@ -393,10 +403,26 @@ void av1_quantize_fp_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int
 void av1_quantize_fp_32x32_avx2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
 RTCD_EXTERN void (*av1_quantize_fp_32x32)(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);

+void av1_selfguided_restoration_c(uint8_t *dgd, int width, int height, int stride, int32_t *dst, int dst_stride, int r, int eps);
+void av1_selfguided_restoration_sse4_1(uint8_t *dgd, int width, int height, int stride, int32_t *dst, int dst_stride, int r, int eps);
+RTCD_EXTERN void (*av1_selfguided_restoration)(uint8_t *dgd, int width, int height, int stride, int32_t *dst, int dst_stride, int r, int eps);
+
+void av1_selfguided_restoration_highbd_c(uint16_t *dgd, int width, int height, int stride, int32_t *dst, int dst_stride, int bit_depth, int r, int eps);
+void av1_selfguided_restoration_highbd_sse4_1(uint16_t *dgd, int width, int height, int stride, int32_t *dst, int dst_stride, int bit_depth, int r, int eps);
+RTCD_EXTERN void (*av1_selfguided_restoration_highbd)(uint16_t *dgd, int width, int height, int stride, int32_t *dst, int dst_stride, int bit_depth, int r, int eps);
+
 void av1_temporal_filter_apply_c(uint8_t *frame1, unsigned int stride, uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int filter_weight, unsigned int *accumulator, uint16_t *count);
 void av1_temporal_filter_apply_sse2(uint8_t *frame1, unsigned int stride, uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int filter_weight, unsigned int *accumulator, uint16_t *count);
 #define av1_temporal_filter_apply av1_temporal_filter_apply_sse2

+void av1_upsample_intra_edge_c(uint8_t *p, int sz);
+void av1_upsample_intra_edge_sse4_1(uint8_t *p, int sz);
+RTCD_EXTERN void (*av1_upsample_intra_edge)(uint8_t *p, int sz);
+
+void av1_upsample_intra_edge_high_c(uint16_t *p, int sz, int bd);
+void av1_upsample_intra_edge_high_sse4_1(uint16_t *p, int sz, int bd);
+RTCD_EXTERN void (*av1_upsample_intra_edge_high)(uint16_t *p, int sz, int bd);
+
 void av1_warp_affine_c(const int32_t *mat, const uint8_t *ref, int width, int height, int stride, uint8_t *pred, int p_col, int p_row, int p_width, int p_height, int p_stride, int subsampling_x, int subsampling_y, ConvolveParams *conv_params, int16_t alpha, int16_t beta, int16_t gamma, int16_t delta);
 void av1_warp_affine_sse2(const int32_t *mat, const uint8_t *ref, int width, int height, int stride, uint8_t *pred, int p_col, int p_row, int p_width, int p_height, int p_stride, int subsampling_x, int subsampling_y, ConvolveParams *conv_params, int16_t alpha, int16_t beta, int16_t gamma, int16_t delta);
 void av1_warp_affine_ssse3(const int32_t *mat, const uint8_t *ref, int width, int height, int stride, uint8_t *pred, int p_col, int p_row, int p_width, int p_height, int p_stride, int subsampling_x, int subsampling_y, ConvolveParams *conv_params, int16_t alpha, int16_t beta, int16_t gamma, int16_t delta);
@ -414,64 +440,38 @@ uint64_t av1_wedge_sse_from_residuals_c(const int16_t *r1, const int16_t *d, con
 uint64_t av1_wedge_sse_from_residuals_sse2(const int16_t *r1, const int16_t *d, const uint8_t *m, int N);
 #define av1_wedge_sse_from_residuals av1_wedge_sse_from_residuals_sse2

+void cdef_filter_block_c(uint8_t *dst8, uint16_t *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int bsize, int max);
+void cdef_filter_block_sse2(uint8_t *dst8, uint16_t *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int bsize, int max);
+void cdef_filter_block_ssse3(uint8_t *dst8, uint16_t *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int bsize, int max);
+void cdef_filter_block_sse4_1(uint8_t *dst8, uint16_t *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int bsize, int max);
+void cdef_filter_block_avx2(uint8_t *dst8, uint16_t *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int bsize, int max);
+RTCD_EXTERN void (*cdef_filter_block)(uint8_t *dst8, uint16_t *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int bsize, int max);
+
+int cdef_find_dir_c(const uint16_t *img, int stride, int32_t *var, int coeff_shift);
+int cdef_find_dir_sse2(const uint16_t *img, int stride, int32_t *var, int coeff_shift);
+int cdef_find_dir_ssse3(const uint16_t *img, int stride, int32_t *var, int coeff_shift);
+int cdef_find_dir_sse4_1(const uint16_t *img, int stride, int32_t *var, int coeff_shift);
+int cdef_find_dir_avx2(const uint16_t *img, int stride, int32_t *var, int coeff_shift);
+RTCD_EXTERN int (*cdef_find_dir)(const uint16_t *img, int stride, int32_t *var, int coeff_shift);
+
 double compute_cross_correlation_c(unsigned char *im1, int stride1, int x1, int y1, unsigned char *im2, int stride2, int x2, int y2);
 double compute_cross_correlation_sse4_1(unsigned char *im1, int stride1, int x1, int y1, unsigned char *im2, int stride2, int x2, int y2);
 RTCD_EXTERN double (*compute_cross_correlation)(unsigned char *im1, int stride1, int x1, int y1, unsigned char *im2, int stride2, int x2, int y2);

-void copy_4x4_16bit_to_16bit_c(uint16_t *dst, int dstride, const uint16_t *src, int sstride);
-void copy_4x4_16bit_to_16bit_sse2(uint16_t *dst, int dstride, const uint16_t *src, int sstride);
-void copy_4x4_16bit_to_16bit_ssse3(uint16_t *dst, int dstride, const uint16_t *src, int sstride);
-void copy_4x4_16bit_to_16bit_sse4_1(uint16_t *dst, int dstride, const uint16_t *src, int sstride);
-RTCD_EXTERN void (*copy_4x4_16bit_to_16bit)(uint16_t *dst, int dstride, const uint16_t *src, int sstride);
-
-void copy_4x4_16bit_to_8bit_c(uint8_t *dst, int dstride, const uint16_t *src, int sstride);
-void copy_4x4_16bit_to_8bit_sse2(uint8_t *dst, int dstride, const uint16_t *src, int sstride);
-void copy_4x4_16bit_to_8bit_ssse3(uint8_t *dst, int dstride, const uint16_t *src, int sstride);
-void copy_4x4_16bit_to_8bit_sse4_1(uint8_t *dst, int dstride, const uint16_t *src, int sstride);
-RTCD_EXTERN void (*copy_4x4_16bit_to_8bit)(uint8_t *dst, int dstride, const uint16_t *src, int sstride);
-
-void copy_8x8_16bit_to_16bit_c(uint16_t *dst, int dstride, const uint16_t *src, int sstride);
-void copy_8x8_16bit_to_16bit_sse2(uint16_t *dst, int dstride, const uint16_t *src, int sstride);
-void copy_8x8_16bit_to_16bit_ssse3(uint16_t *dst, int dstride, const uint16_t *src, int sstride);
-void copy_8x8_16bit_to_16bit_sse4_1(uint16_t *dst, int dstride, const uint16_t *src, int sstride);
-RTCD_EXTERN void (*copy_8x8_16bit_to_16bit)(uint16_t *dst, int dstride, const uint16_t *src, int sstride);
-
-void copy_8x8_16bit_to_8bit_c(uint8_t *dst, int dstride, const uint16_t *src, int sstride);
-void copy_8x8_16bit_to_8bit_sse2(uint8_t *dst, int dstride, const uint16_t *src, int sstride);
-void copy_8x8_16bit_to_8bit_ssse3(uint8_t *dst, int dstride, const uint16_t *src, int sstride);
-void copy_8x8_16bit_to_8bit_sse4_1(uint8_t *dst, int dstride, const uint16_t *src, int sstride);
-RTCD_EXTERN void (*copy_8x8_16bit_to_8bit)(uint8_t *dst, int dstride, const uint16_t *src, int sstride);
-
 void copy_rect8_16bit_to_16bit_c(uint16_t *dst, int dstride, const uint16_t *src, int sstride, int v, int h);
 void copy_rect8_16bit_to_16bit_sse2(uint16_t *dst, int dstride, const uint16_t *src, int sstride, int v, int h);
 void copy_rect8_16bit_to_16bit_ssse3(uint16_t *dst, int dstride, const uint16_t *src, int sstride, int v, int h);
 void copy_rect8_16bit_to_16bit_sse4_1(uint16_t *dst, int dstride, const uint16_t *src, int sstride, int v, int h);
+void copy_rect8_16bit_to_16bit_avx2(uint16_t *dst, int dstride, const uint16_t *src, int sstride, int v, int h);
 RTCD_EXTERN void (*copy_rect8_16bit_to_16bit)(uint16_t *dst, int dstride, const uint16_t *src, int sstride, int v, int h);

 void copy_rect8_8bit_to_16bit_c(uint16_t *dst, int dstride, const uint8_t *src, int sstride, int v, int h);
 void copy_rect8_8bit_to_16bit_sse2(uint16_t *dst, int dstride, const uint8_t *src, int sstride, int v, int h);
 void copy_rect8_8bit_to_16bit_ssse3(uint16_t *dst, int dstride, const uint8_t *src, int sstride, int v, int h);
 void copy_rect8_8bit_to_16bit_sse4_1(uint16_t *dst, int dstride, const uint8_t *src, int sstride, int v, int h);
+void copy_rect8_8bit_to_16bit_avx2(uint16_t *dst, int dstride, const uint8_t *src, int sstride, int v, int h);
 RTCD_EXTERN void (*copy_rect8_8bit_to_16bit)(uint16_t *dst, int dstride, const uint8_t *src, int sstride, int v, int h);

-int od_dir_find8_c(const od_dering_in *img, int stride, int32_t *var, int coeff_shift);
-int od_dir_find8_sse2(const od_dering_in *img, int stride, int32_t *var, int coeff_shift);
-int od_dir_find8_ssse3(const od_dering_in *img, int stride, int32_t *var, int coeff_shift);
-int od_dir_find8_sse4_1(const od_dering_in *img, int stride, int32_t *var, int coeff_shift);
-RTCD_EXTERN int (*od_dir_find8)(const od_dering_in *img, int stride, int32_t *var, int coeff_shift);
-
-void od_filter_dering_direction_4x4_c(uint16_t *y, int ystride, const uint16_t *in, int threshold, int dir, int damping);
-void od_filter_dering_direction_4x4_sse2(uint16_t *y, int ystride, const uint16_t *in, int threshold, int dir, int damping);
-void od_filter_dering_direction_4x4_ssse3(uint16_t *y, int ystride, const uint16_t *in, int threshold, int dir, int damping);
-void od_filter_dering_direction_4x4_sse4_1(uint16_t *y, int ystride, const uint16_t *in, int threshold, int dir, int damping);
-RTCD_EXTERN void (*od_filter_dering_direction_4x4)(uint16_t *y, int ystride, const uint16_t *in, int threshold, int dir, int damping);
-
-void od_filter_dering_direction_8x8_c(uint16_t *y, int ystride, const uint16_t *in, int threshold, int dir, int damping);
-void od_filter_dering_direction_8x8_sse2(uint16_t *y, int ystride, const uint16_t *in, int threshold, int dir, int damping);
-void od_filter_dering_direction_8x8_ssse3(uint16_t *y, int ystride, const uint16_t *in, int threshold, int dir, int damping);
-void od_filter_dering_direction_8x8_sse4_1(uint16_t *y, int ystride, const uint16_t *in, int threshold, int dir, int damping);
-RTCD_EXTERN void (*od_filter_dering_direction_8x8)(uint16_t *y, int ystride, const uint16_t *in, int threshold, int dir, int damping);
-
 void aom_rtcd(void);

 #ifdef RTCD_C
@ -482,28 +482,28 @@ static void setup_rtcd_internal(void)

    (void)flags;

-    aom_clpf_block = aom_clpf_block_sse2;
-    if (flags & HAS_SSSE3) aom_clpf_block = aom_clpf_block_ssse3;
-    if (flags & HAS_SSE4_1) aom_clpf_block = aom_clpf_block_sse4_1;
-    aom_clpf_block_hbd = aom_clpf_block_hbd_sse2;
-    if (flags & HAS_SSSE3) aom_clpf_block_hbd = aom_clpf_block_hbd_ssse3;
-    if (flags & HAS_SSE4_1) aom_clpf_block_hbd = aom_clpf_block_hbd_sse4_1;
-    aom_clpf_hblock = aom_clpf_hblock_sse2;
-    if (flags & HAS_SSSE3) aom_clpf_hblock = aom_clpf_hblock_ssse3;
-    if (flags & HAS_SSE4_1) aom_clpf_hblock = aom_clpf_hblock_sse4_1;
-    aom_clpf_hblock_hbd = aom_clpf_hblock_hbd_sse2;
-    if (flags & HAS_SSSE3) aom_clpf_hblock_hbd = aom_clpf_hblock_hbd_ssse3;
-    if (flags & HAS_SSE4_1) aom_clpf_hblock_hbd = aom_clpf_hblock_hbd_sse4_1;
+    apply_selfguided_restoration = apply_selfguided_restoration_c;
+    if (flags & HAS_SSE4_1) apply_selfguided_restoration = apply_selfguided_restoration_sse4_1;
+    apply_selfguided_restoration_highbd = apply_selfguided_restoration_highbd_c;
+    if (flags & HAS_SSE4_1) apply_selfguided_restoration_highbd = apply_selfguided_restoration_highbd_sse4_1;
    av1_block_error = av1_block_error_c;
    if (flags & HAS_AVX2) av1_block_error = av1_block_error_avx2;
+    av1_convolve_2d_scale = av1_convolve_2d_scale_c;
+    if (flags & HAS_SSE4_1) av1_convolve_2d_scale = av1_convolve_2d_scale_sse4_1;
    av1_convolve_horiz = av1_convolve_horiz_c;
    if (flags & HAS_SSSE3) av1_convolve_horiz = av1_convolve_horiz_ssse3;
+    av1_convolve_rounding = av1_convolve_rounding_c;
+    if (flags & HAS_AVX2) av1_convolve_rounding = av1_convolve_rounding_avx2;
    av1_convolve_vert = av1_convolve_vert_c;
    if (flags & HAS_SSSE3) av1_convolve_vert = av1_convolve_vert_ssse3;
    av1_fht16x16 = av1_fht16x16_sse2;
    if (flags & HAS_AVX2) av1_fht16x16 = av1_fht16x16_avx2;
    av1_fht32x32 = av1_fht32x32_sse2;
    if (flags & HAS_AVX2) av1_fht32x32 = av1_fht32x32_avx2;
+    av1_filter_intra_edge = av1_filter_intra_edge_c;
+    if (flags & HAS_SSE4_1) av1_filter_intra_edge = av1_filter_intra_edge_sse4_1;
+    av1_filter_intra_edge_high = av1_filter_intra_edge_high_c;
+    if (flags & HAS_SSE4_1) av1_filter_intra_edge_high = av1_filter_intra_edge_high_sse4_1;
    av1_full_search_sad = av1_full_search_sad_c;
    if (flags & HAS_SSE3) av1_full_search_sad = av1_full_search_sadx3;
    if (flags & HAS_SSE4_1) av1_full_search_sad = av1_full_search_sadx8;
@ -513,14 +513,18 @@ static void setup_rtcd_internal(void)
    if (flags & HAS_SSE4_1) av1_fwd_txfm2d_32x32 = av1_fwd_txfm2d_32x32_sse4_1;
    av1_fwd_txfm2d_4x4 = av1_fwd_txfm2d_4x4_c;
    if (flags & HAS_SSE4_1) av1_fwd_txfm2d_4x4 = av1_fwd_txfm2d_4x4_sse4_1;
-    av1_fwd_txfm2d_64x64 = av1_fwd_txfm2d_64x64_c;
-    if (flags & HAS_SSE4_1) av1_fwd_txfm2d_64x64 = av1_fwd_txfm2d_64x64_sse4_1;
    av1_fwd_txfm2d_8x8 = av1_fwd_txfm2d_8x8_c;
    if (flags & HAS_SSE4_1) av1_fwd_txfm2d_8x8 = av1_fwd_txfm2d_8x8_sse4_1;
+    av1_highbd_convolve_2d = av1_highbd_convolve_2d_c;
+    if (flags & HAS_SSSE3) av1_highbd_convolve_2d = av1_highbd_convolve_2d_ssse3;
+    av1_highbd_convolve_2d_scale = av1_highbd_convolve_2d_scale_c;
+    if (flags & HAS_SSE4_1) av1_highbd_convolve_2d_scale = av1_highbd_convolve_2d_scale_sse4_1;
    av1_highbd_convolve_horiz = av1_highbd_convolve_horiz_c;
    if (flags & HAS_SSE4_1) av1_highbd_convolve_horiz = av1_highbd_convolve_horiz_sse4_1;
    av1_highbd_convolve_init = av1_highbd_convolve_init_c;
    if (flags & HAS_SSE4_1) av1_highbd_convolve_init = av1_highbd_convolve_init_sse4_1;
+    av1_highbd_convolve_rounding = av1_highbd_convolve_rounding_c;
+    if (flags & HAS_AVX2) av1_highbd_convolve_rounding = av1_highbd_convolve_rounding_avx2;
    av1_highbd_convolve_vert = av1_highbd_convolve_vert_c;
    if (flags & HAS_SSE4_1) av1_highbd_convolve_vert = av1_highbd_convolve_vert_sse4_1;
    av1_highbd_quantize_fp = av1_highbd_quantize_fp_c;
@ -528,6 +532,10 @@ static void setup_rtcd_internal(void)
    if (flags & HAS_AVX2) av1_highbd_quantize_fp = av1_highbd_quantize_fp_avx2;
    av1_highbd_warp_affine = av1_highbd_warp_affine_c;
    if (flags & HAS_SSSE3) av1_highbd_warp_affine = av1_highbd_warp_affine_ssse3;
+    av1_highpass_filter = av1_highpass_filter_c;
+    if (flags & HAS_SSE4_1) av1_highpass_filter = av1_highpass_filter_sse4_1;
+    av1_highpass_filter_highbd = av1_highpass_filter_highbd_c;
+    if (flags & HAS_SSE4_1) av1_highpass_filter_highbd = av1_highpass_filter_highbd_sse4_1;
    av1_iht16x16_256_add = av1_iht16x16_256_add_sse2;
    if (flags & HAS_AVX2) av1_iht16x16_256_add = av1_iht16x16_256_add_avx2;
    av1_inv_txfm2d_add_16x16 = av1_inv_txfm2d_add_16x16_c;
@ -544,37 +552,34 @@ static void setup_rtcd_internal(void)
    if (flags & HAS_AVX2) av1_quantize_fp = av1_quantize_fp_avx2;
    av1_quantize_fp_32x32 = av1_quantize_fp_32x32_c;
    if (flags & HAS_AVX2) av1_quantize_fp_32x32 = av1_quantize_fp_32x32_avx2;
+    av1_selfguided_restoration = av1_selfguided_restoration_c;
+    if (flags & HAS_SSE4_1) av1_selfguided_restoration = av1_selfguided_restoration_sse4_1;
+    av1_selfguided_restoration_highbd = av1_selfguided_restoration_highbd_c;
+    if (flags & HAS_SSE4_1) av1_selfguided_restoration_highbd = av1_selfguided_restoration_highbd_sse4_1;
+    av1_upsample_intra_edge = av1_upsample_intra_edge_c;
+    if (flags & HAS_SSE4_1) av1_upsample_intra_edge = av1_upsample_intra_edge_sse4_1;
+    av1_upsample_intra_edge_high = av1_upsample_intra_edge_high_c;
+    if (flags & HAS_SSE4_1) av1_upsample_intra_edge_high = av1_upsample_intra_edge_high_sse4_1;
    av1_warp_affine = av1_warp_affine_sse2;
    if (flags & HAS_SSSE3) av1_warp_affine = av1_warp_affine_ssse3;
+    cdef_filter_block = cdef_filter_block_sse2;
+    if (flags & HAS_SSSE3) cdef_filter_block = cdef_filter_block_ssse3;
+    if (flags & HAS_SSE4_1) cdef_filter_block = cdef_filter_block_sse4_1;
+    if (flags & HAS_AVX2) cdef_filter_block = cdef_filter_block_avx2;
+    cdef_find_dir = cdef_find_dir_sse2;
+    if (flags & HAS_SSSE3) cdef_find_dir = cdef_find_dir_ssse3;
+    if (flags & HAS_SSE4_1) cdef_find_dir = cdef_find_dir_sse4_1;
+    if (flags & HAS_AVX2) cdef_find_dir = cdef_find_dir_avx2;
    compute_cross_correlation = compute_cross_correlation_c;
    if (flags & HAS_SSE4_1) compute_cross_correlation = compute_cross_correlation_sse4_1;
-    copy_4x4_16bit_to_16bit = copy_4x4_16bit_to_16bit_sse2;
-    if (flags & HAS_SSSE3) copy_4x4_16bit_to_16bit = copy_4x4_16bit_to_16bit_ssse3;
-    if (flags & HAS_SSE4_1) copy_4x4_16bit_to_16bit = copy_4x4_16bit_to_16bit_sse4_1;
-    copy_4x4_16bit_to_8bit = copy_4x4_16bit_to_8bit_sse2;
-    if (flags & HAS_SSSE3) copy_4x4_16bit_to_8bit = copy_4x4_16bit_to_8bit_ssse3;
-    if (flags & HAS_SSE4_1) copy_4x4_16bit_to_8bit = copy_4x4_16bit_to_8bit_sse4_1;
-    copy_8x8_16bit_to_16bit = copy_8x8_16bit_to_16bit_sse2;
-    if (flags & HAS_SSSE3) copy_8x8_16bit_to_16bit = copy_8x8_16bit_to_16bit_ssse3;
-    if (flags & HAS_SSE4_1) copy_8x8_16bit_to_16bit = copy_8x8_16bit_to_16bit_sse4_1;
-    copy_8x8_16bit_to_8bit = copy_8x8_16bit_to_8bit_sse2;
-    if (flags & HAS_SSSE3) copy_8x8_16bit_to_8bit = copy_8x8_16bit_to_8bit_ssse3;
-    if (flags & HAS_SSE4_1) copy_8x8_16bit_to_8bit = copy_8x8_16bit_to_8bit_sse4_1;
    copy_rect8_16bit_to_16bit = copy_rect8_16bit_to_16bit_sse2;
    if (flags & HAS_SSSE3) copy_rect8_16bit_to_16bit = copy_rect8_16bit_to_16bit_ssse3;
    if (flags & HAS_SSE4_1) copy_rect8_16bit_to_16bit = copy_rect8_16bit_to_16bit_sse4_1;
+    if (flags & HAS_AVX2) copy_rect8_16bit_to_16bit = copy_rect8_16bit_to_16bit_avx2;
    copy_rect8_8bit_to_16bit = copy_rect8_8bit_to_16bit_sse2;
    if (flags & HAS_SSSE3) copy_rect8_8bit_to_16bit = copy_rect8_8bit_to_16bit_ssse3;
    if (flags & HAS_SSE4_1) copy_rect8_8bit_to_16bit = copy_rect8_8bit_to_16bit_sse4_1;
-    od_dir_find8 = od_dir_find8_sse2;
-    if (flags & HAS_SSSE3) od_dir_find8 = od_dir_find8_ssse3;
-    if (flags & HAS_SSE4_1) od_dir_find8 = od_dir_find8_sse4_1;
-    od_filter_dering_direction_4x4 = od_filter_dering_direction_4x4_sse2;
-    if (flags & HAS_SSSE3) od_filter_dering_direction_4x4 = od_filter_dering_direction_4x4_ssse3;
-    if (flags & HAS_SSE4_1) od_filter_dering_direction_4x4 = od_filter_dering_direction_4x4_sse4_1;
-    od_filter_dering_direction_8x8 = od_filter_dering_direction_8x8_sse2;
-    if (flags & HAS_SSSE3) od_filter_dering_direction_8x8 = od_filter_dering_direction_8x8_ssse3;
-    if (flags & HAS_SSE4_1) od_filter_dering_direction_8x8 = od_filter_dering_direction_8x8_sse4_1;
+    if (flags & HAS_AVX2) copy_rect8_8bit_to_16bit = copy_rect8_8bit_to_16bit_avx2;
 }
 #endif

--- a/media/libaom/config/mac/x64/aom_config.asm
+++ b/media/libaom/config/mac/x64/aom_config.asm
@ -46,8 +46,6 @@ CONFIG_AV1 equ 1
 CONFIG_STATIC_MSVCRT equ 0
 CONFIG_SPATIAL_RESAMPLING equ 1
 CONFIG_REALTIME_ONLY equ 0
-CONFIG_ONTHEFLY_BITPACKING equ 0
-CONFIG_ERROR_CONCEALMENT equ 0
 CONFIG_SHARED equ 0
 CONFIG_STATIC equ 1
 CONFIG_SMALL equ 0
@ -60,73 +58,71 @@ CONFIG_ACCOUNTING equ 0
 CONFIG_INSPECTION equ 0
 CONFIG_DECODE_PERF_TESTS equ 0
 CONFIG_ENCODE_PERF_TESTS equ 0
+CONFIG_BITSTREAM_DEBUG equ 0
+CONFIG_SYMBOLRATE equ 0
 CONFIG_COEFFICIENT_RANGE_CHECKING equ 0
 CONFIG_LOWBITDEPTH equ 1
 CONFIG_HIGHBITDEPTH equ 1
 CONFIG_EXPERIMENTAL equ 0
 CONFIG_SIZE_LIMIT equ 1
-CONFIG_COLORSPACE_HEADERS equ 0
 CONFIG_FP_MB_STATS equ 0
 CONFIG_CDEF equ 1
+CONFIG_CDEF_SINGLEPASS equ 1
 CONFIG_VAR_TX equ 1
 CONFIG_RECT_TX equ 1
 CONFIG_RECT_TX_EXT equ 0
 CONFIG_TPL_MV equ 0
 CONFIG_DUAL_FILTER equ 1
-CONFIG_CONVOLVE_ROUND equ 0
+CONFIG_CONVOLVE_ROUND equ 1
 CONFIG_COMPOUND_ROUND equ 0
 CONFIG_EXT_TX equ 1
-CONFIG_DPCM_INTRA equ 0
 CONFIG_TX64X64 equ 0
 CONFIG_EXT_INTRA equ 1
 CONFIG_INTRA_INTERP equ 0
 CONFIG_FILTER_INTRA equ 0
-CONFIG_INTRA_EDGE equ 0
+CONFIG_INTRA_EDGE equ 1
 CONFIG_INTRABC equ 0
-CONFIG_EXT_INTER equ 1
 CONFIG_INTERINTRA equ 1
 CONFIG_WEDGE equ 1
 CONFIG_COMPOUND_SEGMENT equ 1
 CONFIG_EXT_REFS equ 1
-CONFIG_ALTREF2 equ 0
-CONFIG_SPEED_REFS equ 0
-CONFIG_GF_GROUPS equ 0
-CONFIG_FLEX_REFS equ 0
 CONFIG_GLOBAL_MOTION equ 1
 CONFIG_NEW_QUANT equ 0
 CONFIG_SUPERTX equ 0
 CONFIG_ANS equ 0
-CONFIG_LOOP_RESTORATION equ 0
+CONFIG_LOOP_RESTORATION equ 1
+CONFIG_STRIPED_LOOP_RESTORATION equ 0
 CONFIG_EXT_PARTITION equ 0
 CONFIG_EXT_PARTITION_TYPES equ 0
+CONFIG_EXT_PARTITION_TYPES_AB equ 0
 CONFIG_UNPOISON_PARTITION_CTX equ 0
 CONFIG_EXT_TILE equ 0
 CONFIG_MOTION_VAR equ 1
 CONFIG_NCOBMC equ 0
 CONFIG_WARPED_MOTION equ 1
 CONFIG_Q_ADAPT_PROBS equ 0
-CONFIG_BITSTREAM_DEBUG equ 0
 CONFIG_INTER_STATS_ONLY equ 0
-CONFIG_ALT_INTRA equ 1
-CONFIG_PALETTE equ 1
 CONFIG_PALETTE_DELTA_ENCODING equ 0
 CONFIG_RAWBITS equ 0
-CONFIG_EC_SMALLMUL equ 1
+CONFIG_KF_CTX equ 0
 CONFIG_PVQ equ 0
 CONFIG_CFL equ 0
 CONFIG_XIPHRC equ 0
 CONFIG_DCT_ONLY equ 0
+CONFIG_DAALA_TX equ 0
 CONFIG_DAALA_DCT4 equ 0
 CONFIG_DAALA_DCT8 equ 0
+CONFIG_DAALA_DCT16 equ 0
+CONFIG_DAALA_DCT32 equ 0
+CONFIG_DAALA_DCT64 equ 0
 CONFIG_CB4X4 equ 1
 CONFIG_CHROMA_2X2 equ 0
 CONFIG_CHROMA_SUB8X8 equ 1
 CONFIG_FRAME_SIZE equ 0
-CONFIG_DELTA_Q equ 1
 CONFIG_EXT_DELTA_Q equ 1
 CONFIG_ADAPT_SCAN equ 0
-CONFIG_FILTER_7BIT equ 1
 CONFIG_PARALLEL_DEBLOCKING equ 1
+CONFIG_DEBLOCK_13TAP equ 0
 CONFIG_LOOPFILTERING_ACROSS_TILES equ 1
 CONFIG_TEMPMV_SIGNALING equ 1
 CONFIG_RD_DEBUG equ 0
@ -135,29 +131,46 @@ CONFIG_COEF_INTERLEAVE equ 0
 CONFIG_ENTROPY_STATS equ 0
 CONFIG_MASKED_TX equ 0
 CONFIG_DEPENDENT_HORZTILES equ 0
-CONFIG_DIST_8X8 equ 0
-CONFIG_DAALA_DIST equ 0
-CONFIG_TRIPRED equ 0
+CONFIG_DIST_8X8 equ 1
 CONFIG_PALETTE_THROUGHPUT equ 1
 CONFIG_REF_ADAPT equ 0
 CONFIG_LV_MAP equ 0
+CONFIG_CTX1D equ 0
 CONFIG_TXK_SEL equ 0
 CONFIG_MV_COMPRESS equ 1
+CONFIG_SEGMENT_ZEROMV equ 0
 CONFIG_FRAME_SUPERRES equ 0
 CONFIG_NEW_MULTISYMBOL equ 0
 CONFIG_COMPOUND_SINGLEREF equ 0
-CONFIG_AOM_QM equ 0
+CONFIG_AOM_QM equ 1
 CONFIG_ONE_SIDED_COMPOUND equ 1
-CONFIG_EXT_COMP_REFS equ 0
+CONFIG_EXT_COMP_REFS equ 1
 CONFIG_SMOOTH_HV equ 1
 CONFIG_VAR_REFS equ 0
-CONFIG_RECT_INTRA_PRED equ 1
 CONFIG_LGT equ 0
+CONFIG_LGT_FROM_PRED equ 0
 CONFIG_SBL_SYMBOL equ 0
 CONFIG_NCOBMC_ADAPT_WEIGHT equ 0
 CONFIG_BGSPRITE equ 0
 CONFIG_VAR_TX_NO_TX_MODE equ 0
 CONFIG_MRC_TX equ 0
 CONFIG_LPF_DIRECT equ 0
-CONFIG_UV_LVL equ 0
+CONFIG_LOOPFILTER_LEVEL equ 0
+CONFIG_NO_FRAME_CONTEXT_SIGNALING equ 0
+CONFIG_TXMG equ 1
+CONFIG_MAX_TILE equ 0
+CONFIG_HASH_ME equ 0
+CONFIG_COLORSPACE_HEADERS equ 0
+CONFIG_MFMV equ 0
+CONFIG_FRAME_MARKER equ 0
+CONFIG_JNT_COMP equ 0
+CONFIG_FRAME_SIGN_BIAS equ 0
+CONFIG_EXT_SKIP equ 0
+CONFIG_OBU equ 0
+CONFIG_AMVR equ 0
+CONFIG_LPF_SB equ 0
+CONFIG_OPT_REF_MV equ 0
+CONFIG_TMV equ 0
+CONFIG_RESTRICT_COMPRESSED_HDR equ 0
+CONFIG_HORZONLY_FRAME_SUPERRES equ 0
 CONFIG_ANALYZER equ 0
--- a/media/libaom/config/mac/x64/aom_config.h
+++ b/media/libaom/config/mac/x64/aom_config.h
@ -59,8 +59,6 @@
 #define CONFIG_STATIC_MSVCRT 0
 #define CONFIG_SPATIAL_RESAMPLING 1
 #define CONFIG_REALTIME_ONLY 0
-#define CONFIG_ONTHEFLY_BITPACKING 0
-#define CONFIG_ERROR_CONCEALMENT 0
 #define CONFIG_SHARED 0
 #define CONFIG_STATIC 1
 #define CONFIG_SMALL 0
@ -73,73 +71,71 @@
 #define CONFIG_INSPECTION 0
 #define CONFIG_DECODE_PERF_TESTS 0
 #define CONFIG_ENCODE_PERF_TESTS 0
+#define CONFIG_BITSTREAM_DEBUG 0
+#define CONFIG_SYMBOLRATE 0
 #define CONFIG_COEFFICIENT_RANGE_CHECKING 0
 #define CONFIG_LOWBITDEPTH 1
 #define CONFIG_HIGHBITDEPTH 1
 #define CONFIG_EXPERIMENTAL 0
 #define CONFIG_SIZE_LIMIT 1
-#define CONFIG_COLORSPACE_HEADERS 0
 #define CONFIG_FP_MB_STATS 0
 #define CONFIG_CDEF 1
+#define CONFIG_CDEF_SINGLEPASS 1
 #define CONFIG_VAR_TX 1
 #define CONFIG_RECT_TX 1
 #define CONFIG_RECT_TX_EXT 0
 #define CONFIG_TPL_MV 0
 #define CONFIG_DUAL_FILTER 1
-#define CONFIG_CONVOLVE_ROUND 0
+#define CONFIG_CONVOLVE_ROUND 1
 #define CONFIG_COMPOUND_ROUND 0
 #define CONFIG_EXT_TX 1
-#define CONFIG_DPCM_INTRA 0
 #define CONFIG_TX64X64 0
 #define CONFIG_EXT_INTRA 1
 #define CONFIG_INTRA_INTERP 0
 #define CONFIG_FILTER_INTRA 0
-#define CONFIG_INTRA_EDGE 0
+#define CONFIG_INTRA_EDGE 1
 #define CONFIG_INTRABC 0
-#define CONFIG_EXT_INTER 1
 #define CONFIG_INTERINTRA 1
 #define CONFIG_WEDGE 1
 #define CONFIG_COMPOUND_SEGMENT 1
 #define CONFIG_EXT_REFS 1
-#define CONFIG_ALTREF2 0
-#define CONFIG_SPEED_REFS 0
-#define CONFIG_GF_GROUPS 0
-#define CONFIG_FLEX_REFS 0
 #define CONFIG_GLOBAL_MOTION 1
 #define CONFIG_NEW_QUANT 0
 #define CONFIG_SUPERTX 0
 #define CONFIG_ANS 0
-#define CONFIG_LOOP_RESTORATION 0
+#define CONFIG_LOOP_RESTORATION 1
+#define CONFIG_STRIPED_LOOP_RESTORATION 0
 #define CONFIG_EXT_PARTITION 0
 #define CONFIG_EXT_PARTITION_TYPES 0
+#define CONFIG_EXT_PARTITION_TYPES_AB 0
 #define CONFIG_UNPOISON_PARTITION_CTX 0
 #define CONFIG_EXT_TILE 0
 #define CONFIG_MOTION_VAR 1
 #define CONFIG_NCOBMC 0
 #define CONFIG_WARPED_MOTION 1
 #define CONFIG_Q_ADAPT_PROBS 0
-#define CONFIG_BITSTREAM_DEBUG 0
 #define CONFIG_INTER_STATS_ONLY 0
-#define CONFIG_ALT_INTRA 1
-#define CONFIG_PALETTE 1
 #define CONFIG_PALETTE_DELTA_ENCODING 0
 #define CONFIG_RAWBITS 0
-#define CONFIG_EC_SMALLMUL 1
+#define CONFIG_KF_CTX 0
 #define CONFIG_PVQ 0
 #define CONFIG_CFL 0
 #define CONFIG_XIPHRC 0
 #define CONFIG_DCT_ONLY 0
+#define CONFIG_DAALA_TX 0
 #define CONFIG_DAALA_DCT4 0
 #define CONFIG_DAALA_DCT8 0
+#define CONFIG_DAALA_DCT16 0
+#define CONFIG_DAALA_DCT32 0
+#define CONFIG_DAALA_DCT64 0
 #define CONFIG_CB4X4 1
 #define CONFIG_CHROMA_2X2 0
 #define CONFIG_CHROMA_SUB8X8 1
 #define CONFIG_FRAME_SIZE 0
-#define CONFIG_DELTA_Q 1
 #define CONFIG_EXT_DELTA_Q 1
 #define CONFIG_ADAPT_SCAN 0
-#define CONFIG_FILTER_7BIT 1
 #define CONFIG_PARALLEL_DEBLOCKING 1
+#define CONFIG_DEBLOCK_13TAP 0
 #define CONFIG_LOOPFILTERING_ACROSS_TILES 1
 #define CONFIG_TEMPMV_SIGNALING 1
 #define CONFIG_RD_DEBUG 0
@ -148,31 +144,48 @@
 #define CONFIG_ENTROPY_STATS 0
 #define CONFIG_MASKED_TX 0
 #define CONFIG_DEPENDENT_HORZTILES 0
-#define CONFIG_DIST_8X8 0
-#define CONFIG_DAALA_DIST 0
-#define CONFIG_TRIPRED 0
+#define CONFIG_DIST_8X8 1
 #define CONFIG_PALETTE_THROUGHPUT 1
 #define CONFIG_REF_ADAPT 0
 #define CONFIG_LV_MAP 0
+#define CONFIG_CTX1D 0
 #define CONFIG_TXK_SEL 0
 #define CONFIG_MV_COMPRESS 1
+#define CONFIG_SEGMENT_ZEROMV 0
 #define CONFIG_FRAME_SUPERRES 0
 #define CONFIG_NEW_MULTISYMBOL 0
 #define CONFIG_COMPOUND_SINGLEREF 0
-#define CONFIG_AOM_QM 0
+#define CONFIG_AOM_QM 1
 #define CONFIG_ONE_SIDED_COMPOUND 1
-#define CONFIG_EXT_COMP_REFS 0
+#define CONFIG_EXT_COMP_REFS 1
 #define CONFIG_SMOOTH_HV 1
 #define CONFIG_VAR_REFS 0
-#define CONFIG_RECT_INTRA_PRED 1
 #define CONFIG_LGT 0
+#define CONFIG_LGT_FROM_PRED 0
 #define CONFIG_SBL_SYMBOL 0
 #define CONFIG_NCOBMC_ADAPT_WEIGHT 0
 #define CONFIG_BGSPRITE 0
 #define CONFIG_VAR_TX_NO_TX_MODE 0
 #define CONFIG_MRC_TX 0
 #define CONFIG_LPF_DIRECT 0
-#define CONFIG_UV_LVL 0
+#define CONFIG_LOOPFILTER_LEVEL 0
+#define CONFIG_NO_FRAME_CONTEXT_SIGNALING 0
+#define CONFIG_TXMG 1
+#define CONFIG_MAX_TILE 0
+#define CONFIG_HASH_ME 0
+#define CONFIG_COLORSPACE_HEADERS 0
+#define CONFIG_MFMV 0
+#define CONFIG_FRAME_MARKER 0
+#define CONFIG_JNT_COMP 0
+#define CONFIG_FRAME_SIGN_BIAS 0
+#define CONFIG_EXT_SKIP 0
+#define CONFIG_OBU 0
+#define CONFIG_AMVR 0
+#define CONFIG_LPF_SB 0
+#define CONFIG_OPT_REF_MV 0
+#define CONFIG_TMV 0
+#define CONFIG_RESTRICT_COMPRESSED_HDR 0
+#define CONFIG_HORZONLY_FRAME_SUPERRES 0
 #define CONFIG_ANALYZER 0
 #define DECODE_WIDTH_LIMIT 8192
 #define DECODE_HEIGHT_LIMIT 4608
--- a/media/libaom/config/mac/x64/aom_dsp_rtcd.h
+++ b/media/libaom/config/mac/x64/aom_dsp_rtcd.h
--- a/media/libaom/config/mac/x64/av1_rtcd.h
+++ b/media/libaom/config/mac/x64/av1_rtcd.h
@ -31,44 +31,39 @@ struct search_site_config;
 struct mv;
 union int_mv;
 struct yv12_buffer_config;
-typedef uint16_t od_dering_in;

 #ifdef __cplusplus
 extern "C" {
 #endif

-void aom_clpf_block_c(uint8_t *dst, const uint16_t *src, int dstride, int sstride, int sizex, int sizey, unsigned int strength, unsigned int bd);
-void aom_clpf_block_sse2(uint8_t *dst, const uint16_t *src, int dstride, int sstride, int sizex, int sizey, unsigned int strength, unsigned int bd);
-void aom_clpf_block_ssse3(uint8_t *dst, const uint16_t *src, int dstride, int sstride, int sizex, int sizey, unsigned int strength, unsigned int bd);
-void aom_clpf_block_sse4_1(uint8_t *dst, const uint16_t *src, int dstride, int sstride, int sizex, int sizey, unsigned int strength, unsigned int bd);
-RTCD_EXTERN void (*aom_clpf_block)(uint8_t *dst, const uint16_t *src, int dstride, int sstride, int sizex, int sizey, unsigned int strength, unsigned int bd);
+void apply_selfguided_restoration_c(uint8_t *dat, int width, int height, int stride, int eps, int *xqd, uint8_t *dst, int dst_stride, int32_t *tmpbuf);
+void apply_selfguided_restoration_sse4_1(uint8_t *dat, int width, int height, int stride, int eps, int *xqd, uint8_t *dst, int dst_stride, int32_t *tmpbuf);
+RTCD_EXTERN void (*apply_selfguided_restoration)(uint8_t *dat, int width, int height, int stride, int eps, int *xqd, uint8_t *dst, int dst_stride, int32_t *tmpbuf);

-void aom_clpf_block_hbd_c(uint16_t *dst, const uint16_t *src, int dstride, int sstride, int sizex, int sizey, unsigned int strength, unsigned int bd);
-void aom_clpf_block_hbd_sse2(uint16_t *dst, const uint16_t *src, int dstride, int sstride, int sizex, int sizey, unsigned int strength, unsigned int bd);
-void aom_clpf_block_hbd_ssse3(uint16_t *dst, const uint16_t *src, int dstride, int sstride, int sizex, int sizey, unsigned int strength, unsigned int bd);
-void aom_clpf_block_hbd_sse4_1(uint16_t *dst, const uint16_t *src, int dstride, int sstride, int sizex, int sizey, unsigned int strength, unsigned int bd);
-RTCD_EXTERN void (*aom_clpf_block_hbd)(uint16_t *dst, const uint16_t *src, int dstride, int sstride, int sizex, int sizey, unsigned int strength, unsigned int bd);
-
-void aom_clpf_hblock_c(uint8_t *dst, const uint16_t *src, int dstride, int sstride, int sizex, int sizey, unsigned int strength, unsigned int bd);
-void aom_clpf_hblock_sse2(uint8_t *dst, const uint16_t *src, int dstride, int sstride, int sizex, int sizey, unsigned int strength, unsigned int bd);
-void aom_clpf_hblock_ssse3(uint8_t *dst, const uint16_t *src, int dstride, int sstride, int sizex, int sizey, unsigned int strength, unsigned int bd);
-void aom_clpf_hblock_sse4_1(uint8_t *dst, const uint16_t *src, int dstride, int sstride, int sizex, int sizey, unsigned int strength, unsigned int bd);
-RTCD_EXTERN void (*aom_clpf_hblock)(uint8_t *dst, const uint16_t *src, int dstride, int sstride, int sizex, int sizey, unsigned int strength, unsigned int bd);
-
-void aom_clpf_hblock_hbd_c(uint16_t *dst, const uint16_t *src, int dstride, int sstride, int sizex, int sizey, unsigned int strength, unsigned int bd);
-void aom_clpf_hblock_hbd_sse2(uint16_t *dst, const uint16_t *src, int dstride, int sstride, int sizex, int sizey, unsigned int strength, unsigned int bd);
-void aom_clpf_hblock_hbd_ssse3(uint16_t *dst, const uint16_t *src, int dstride, int sstride, int sizex, int sizey, unsigned int strength, unsigned int bd);
-void aom_clpf_hblock_hbd_sse4_1(uint16_t *dst, const uint16_t *src, int dstride, int sstride, int sizex, int sizey, unsigned int strength, unsigned int bd);
-RTCD_EXTERN void (*aom_clpf_hblock_hbd)(uint16_t *dst, const uint16_t *src, int dstride, int sstride, int sizex, int sizey, unsigned int strength, unsigned int bd);
+void apply_selfguided_restoration_highbd_c(uint16_t *dat, int width, int height, int stride, int bit_depth, int eps, int *xqd, uint16_t *dst, int dst_stride, int32_t *tmpbuf);
+void apply_selfguided_restoration_highbd_sse4_1(uint16_t *dat, int width, int height, int stride, int bit_depth, int eps, int *xqd, uint16_t *dst, int dst_stride, int32_t *tmpbuf);
+RTCD_EXTERN void (*apply_selfguided_restoration_highbd)(uint16_t *dat, int width, int height, int stride, int bit_depth, int eps, int *xqd, uint16_t *dst, int dst_stride, int32_t *tmpbuf);

 int64_t av1_block_error_c(const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz);
 int64_t av1_block_error_avx2(const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz);
 RTCD_EXTERN int64_t (*av1_block_error)(const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz);

+void av1_convolve_2d_c(const uint8_t *src, int src_stride, CONV_BUF_TYPE *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+void av1_convolve_2d_sse2(const uint8_t *src, int src_stride, CONV_BUF_TYPE *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+#define av1_convolve_2d av1_convolve_2d_sse2
+
+void av1_convolve_2d_scale_c(const uint8_t *src, int src_stride, CONV_BUF_TYPE *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_qn, const int x_step_qn, const int subpel_y_q4, const int y_step_qn, ConvolveParams *conv_params);
+void av1_convolve_2d_scale_sse4_1(const uint8_t *src, int src_stride, CONV_BUF_TYPE *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_qn, const int x_step_qn, const int subpel_y_q4, const int y_step_qn, ConvolveParams *conv_params);
+RTCD_EXTERN void (*av1_convolve_2d_scale)(const uint8_t *src, int src_stride, CONV_BUF_TYPE *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_qn, const int x_step_qn, const int subpel_y_q4, const int y_step_qn, ConvolveParams *conv_params);
+
 void av1_convolve_horiz_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams fp, const int subpel_x_q4, int x_step_q4, ConvolveParams *conv_params);
 void av1_convolve_horiz_ssse3(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams fp, const int subpel_x_q4, int x_step_q4, ConvolveParams *conv_params);
 RTCD_EXTERN void (*av1_convolve_horiz)(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams fp, const int subpel_x_q4, int x_step_q4, ConvolveParams *conv_params);

+void av1_convolve_rounding_c(const int32_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, int bits);
+void av1_convolve_rounding_avx2(const int32_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, int bits);
+RTCD_EXTERN void (*av1_convolve_rounding)(const int32_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, int bits);
+
 void av1_convolve_vert_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams fp, const int subpel_x_q4, int x_step_q4, ConvolveParams *conv_params);
 void av1_convolve_vert_ssse3(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams fp, const int subpel_x_q4, int x_step_q4, ConvolveParams *conv_params);
 RTCD_EXTERN void (*av1_convolve_vert)(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams fp, const int subpel_x_q4, int x_step_q4, ConvolveParams *conv_params);
@ -76,9 +71,6 @@ RTCD_EXTERN void (*av1_convolve_vert)(const uint8_t *src, int src_stride, uint8_
 int av1_diamond_search_sad_c(struct macroblock *x, const struct search_site_config *cfg,  struct mv *ref_mv, struct mv *best_mv, int search_param, int sad_per_bit, int *num00, const struct aom_variance_vtable *fn_ptr, const struct mv *center_mv);
 #define av1_diamond_search_sad av1_diamond_search_sad_c

-void av1_fdct8x8_quant_c(const int16_t *input, int stride, tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
-#define av1_fdct8x8_quant av1_fdct8x8_quant_c
-
 void av1_fht16x16_c(const int16_t *input, tran_low_t *output, int stride, struct txfm_param *param);
 void av1_fht16x16_sse2(const int16_t *input, tran_low_t *output, int stride, struct txfm_param *param);
 void av1_fht16x16_avx2(const int16_t *input, tran_low_t *output, int stride, struct txfm_param *param);
@ -133,6 +125,14 @@ void av1_fht8x8_c(const int16_t *input, tran_low_t *output, int stride, struct t
 void av1_fht8x8_sse2(const int16_t *input, tran_low_t *output, int stride, struct txfm_param *param);
 #define av1_fht8x8 av1_fht8x8_sse2

+void av1_filter_intra_edge_c(uint8_t *p, int sz, int strength);
+void av1_filter_intra_edge_sse4_1(uint8_t *p, int sz, int strength);
+RTCD_EXTERN void (*av1_filter_intra_edge)(uint8_t *p, int sz, int strength);
+
+void av1_filter_intra_edge_high_c(uint16_t *p, int sz, int strength);
+void av1_filter_intra_edge_high_sse4_1(uint16_t *p, int sz, int strength);
+RTCD_EXTERN void (*av1_filter_intra_edge_high)(uint16_t *p, int sz, int strength);
+
 int av1_full_range_search_c(const struct macroblock *x, const struct search_site_config *cfg, struct mv *ref_mv, struct mv *best_mv, int search_param, int sad_per_bit, int *num00, const struct aom_variance_vtable *fn_ptr, const struct mv *center_mv);
 #define av1_full_range_search av1_full_range_search_c

@ -141,46 +141,42 @@ int av1_full_search_sadx3(const struct macroblock *x, const struct mv *ref_mv, i
 int av1_full_search_sadx8(const struct macroblock *x, const struct mv *ref_mv, int sad_per_bit, int distance, const struct aom_variance_vtable *fn_ptr, const struct mv *center_mv, struct mv *best_mv);
 RTCD_EXTERN int (*av1_full_search_sad)(const struct macroblock *x, const struct mv *ref_mv, int sad_per_bit, int distance, const struct aom_variance_vtable *fn_ptr, const struct mv *center_mv, struct mv *best_mv);

-void av1_fwd_idtx_c(const int16_t *src_diff, tran_low_t *coeff, int stride, int bs, int tx_type);
+void av1_fwd_idtx_c(const int16_t *src_diff, tran_low_t *coeff, int stride, int bsx, int bsy, TX_TYPE tx_type);
 #define av1_fwd_idtx av1_fwd_idtx_c

-void av1_fwd_txfm2d_16x16_c(const int16_t *input, int32_t *output, int stride, int tx_type, int bd);
-void av1_fwd_txfm2d_16x16_sse4_1(const int16_t *input, int32_t *output, int stride, int tx_type, int bd);
-RTCD_EXTERN void (*av1_fwd_txfm2d_16x16)(const int16_t *input, int32_t *output, int stride, int tx_type, int bd);
+void av1_fwd_txfm2d_16x16_c(const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd);
+void av1_fwd_txfm2d_16x16_sse4_1(const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd);
+RTCD_EXTERN void (*av1_fwd_txfm2d_16x16)(const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd);

-void av1_fwd_txfm2d_16x32_c(const int16_t *input, int32_t *output, int stride, int tx_type, int bd);
+void av1_fwd_txfm2d_16x32_c(const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd);
 #define av1_fwd_txfm2d_16x32 av1_fwd_txfm2d_16x32_c

-void av1_fwd_txfm2d_16x8_c(const int16_t *input, int32_t *output, int stride, int tx_type, int bd);
+void av1_fwd_txfm2d_16x8_c(const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd);
 #define av1_fwd_txfm2d_16x8 av1_fwd_txfm2d_16x8_c

-void av1_fwd_txfm2d_32x16_c(const int16_t *input, int32_t *output, int stride, int tx_type, int bd);
+void av1_fwd_txfm2d_32x16_c(const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd);
 #define av1_fwd_txfm2d_32x16 av1_fwd_txfm2d_32x16_c

-void av1_fwd_txfm2d_32x32_c(const int16_t *input, int32_t *output, int stride, int tx_type, int bd);
-void av1_fwd_txfm2d_32x32_sse4_1(const int16_t *input, int32_t *output, int stride, int tx_type, int bd);
-RTCD_EXTERN void (*av1_fwd_txfm2d_32x32)(const int16_t *input, int32_t *output, int stride, int tx_type, int bd);
+void av1_fwd_txfm2d_32x32_c(const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd);
+void av1_fwd_txfm2d_32x32_sse4_1(const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd);
+RTCD_EXTERN void (*av1_fwd_txfm2d_32x32)(const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd);

-void av1_fwd_txfm2d_4x4_c(const int16_t *input, int32_t *output, int stride, int tx_type, int bd);
-void av1_fwd_txfm2d_4x4_sse4_1(const int16_t *input, int32_t *output, int stride, int tx_type, int bd);
-RTCD_EXTERN void (*av1_fwd_txfm2d_4x4)(const int16_t *input, int32_t *output, int stride, int tx_type, int bd);
+void av1_fwd_txfm2d_4x4_c(const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd);
+void av1_fwd_txfm2d_4x4_sse4_1(const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd);
+RTCD_EXTERN void (*av1_fwd_txfm2d_4x4)(const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd);

-void av1_fwd_txfm2d_4x8_c(const int16_t *input, int32_t *output, int stride, int tx_type, int bd);
+void av1_fwd_txfm2d_4x8_c(const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd);
 #define av1_fwd_txfm2d_4x8 av1_fwd_txfm2d_4x8_c

-void av1_fwd_txfm2d_64x64_c(const int16_t *input, int32_t *output, int stride, int tx_type, int bd);
-void av1_fwd_txfm2d_64x64_sse4_1(const int16_t *input, int32_t *output, int stride, int tx_type, int bd);
-RTCD_EXTERN void (*av1_fwd_txfm2d_64x64)(const int16_t *input, int32_t *output, int stride, int tx_type, int bd);
-
-void av1_fwd_txfm2d_8x16_c(const int16_t *input, int32_t *output, int stride, int tx_type, int bd);
+void av1_fwd_txfm2d_8x16_c(const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd);
 #define av1_fwd_txfm2d_8x16 av1_fwd_txfm2d_8x16_c

-void av1_fwd_txfm2d_8x4_c(const int16_t *input, int32_t *output, int stride, int tx_type, int bd);
+void av1_fwd_txfm2d_8x4_c(const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd);
 #define av1_fwd_txfm2d_8x4 av1_fwd_txfm2d_8x4_c

-void av1_fwd_txfm2d_8x8_c(const int16_t *input, int32_t *output, int stride, int tx_type, int bd);
-void av1_fwd_txfm2d_8x8_sse4_1(const int16_t *input, int32_t *output, int stride, int tx_type, int bd);
-RTCD_EXTERN void (*av1_fwd_txfm2d_8x8)(const int16_t *input, int32_t *output, int stride, int tx_type, int bd);
+void av1_fwd_txfm2d_8x8_c(const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd);
+void av1_fwd_txfm2d_8x8_sse4_1(const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd);
+RTCD_EXTERN void (*av1_fwd_txfm2d_8x8)(const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd);

 void av1_fwht4x4_c(const int16_t *input, tran_low_t *output, int stride);
 #define av1_fwht4x4 av1_fwht4x4_c
@ -213,6 +209,14 @@ void av1_highbd_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8
 void av1_highbd_convolve8_vert_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
 #define av1_highbd_convolve8_vert av1_highbd_convolve8_vert_sse2

+void av1_highbd_convolve_2d_c(const uint16_t *src, int src_stride, CONV_BUF_TYPE *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+void av1_highbd_convolve_2d_ssse3(const uint16_t *src, int src_stride, CONV_BUF_TYPE *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+RTCD_EXTERN void (*av1_highbd_convolve_2d)(const uint16_t *src, int src_stride, CONV_BUF_TYPE *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+
+void av1_highbd_convolve_2d_scale_c(const uint16_t *src, int src_stride, CONV_BUF_TYPE *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int x_step_qn, const int subpel_y_q4, const int y_step_qn, ConvolveParams *conv_params, int bd);
+void av1_highbd_convolve_2d_scale_sse4_1(const uint16_t *src, int src_stride, CONV_BUF_TYPE *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int x_step_qn, const int subpel_y_q4, const int y_step_qn, ConvolveParams *conv_params, int bd);
+RTCD_EXTERN void (*av1_highbd_convolve_2d_scale)(const uint16_t *src, int src_stride, CONV_BUF_TYPE *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int x_step_qn, const int subpel_y_q4, const int y_step_qn, ConvolveParams *conv_params, int bd);
+
 void av1_highbd_convolve_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
 #define av1_highbd_convolve_avg av1_highbd_convolve_avg_c

@ -227,6 +231,10 @@ void av1_highbd_convolve_init_c(void);
 void av1_highbd_convolve_init_sse4_1(void);
 RTCD_EXTERN void (*av1_highbd_convolve_init)(void);

+void av1_highbd_convolve_rounding_c(const int32_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, int bits, int bd);
+void av1_highbd_convolve_rounding_avx2(const int32_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, int bits, int bd);
+RTCD_EXTERN void (*av1_highbd_convolve_rounding)(const int32_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, int bits, int bd);
+
 void av1_highbd_convolve_vert_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams fp, const int subpel_x_q4, int x_step_q4, int avg, int bd);
 void av1_highbd_convolve_vert_sse4_1(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams fp, const int subpel_x_q4, int x_step_q4, int avg, int bd);
 RTCD_EXTERN void (*av1_highbd_convolve_vert)(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams fp, const int subpel_x_q4, int x_step_q4, int avg, int bd);
@ -273,9 +281,6 @@ void av1_highbd_iht8x4_32_add_c(const tran_low_t *input, uint8_t *dest, int dest
 void av1_highbd_iht8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, const struct txfm_param *param);
 #define av1_highbd_iht8x8_64_add av1_highbd_iht8x8_64_add_c

-void av1_highbd_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan, int log_scale);
-#define av1_highbd_quantize_b av1_highbd_quantize_b_c
-
 void av1_highbd_quantize_fp_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan, int log_scale);
 void av1_highbd_quantize_fp_sse4_1(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan, int log_scale);
 void av1_highbd_quantize_fp_avx2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan, int log_scale);
@ -288,6 +293,14 @@ void av1_highbd_warp_affine_c(const int32_t *mat, const uint16_t *ref, int width
 void av1_highbd_warp_affine_ssse3(const int32_t *mat, const uint16_t *ref, int width, int height, int stride, uint16_t *pred, int p_col, int p_row, int p_width, int p_height, int p_stride, int subsampling_x, int subsampling_y, int bd, ConvolveParams *conv_params, int16_t alpha, int16_t beta, int16_t gamma, int16_t delta);
 RTCD_EXTERN void (*av1_highbd_warp_affine)(const int32_t *mat, const uint16_t *ref, int width, int height, int stride, uint16_t *pred, int p_col, int p_row, int p_width, int p_height, int p_stride, int subsampling_x, int subsampling_y, int bd, ConvolveParams *conv_params, int16_t alpha, int16_t beta, int16_t gamma, int16_t delta);

+void av1_highpass_filter_c(uint8_t *dgd, int width, int height, int stride, int32_t *dst, int dst_stride, int r, int eps);
+void av1_highpass_filter_sse4_1(uint8_t *dgd, int width, int height, int stride, int32_t *dst, int dst_stride, int r, int eps);
+RTCD_EXTERN void (*av1_highpass_filter)(uint8_t *dgd, int width, int height, int stride, int32_t *dst, int dst_stride, int r, int eps);
+
+void av1_highpass_filter_highbd_c(uint16_t *dgd, int width, int height, int stride, int32_t *dst, int dst_stride, int r, int eps);
+void av1_highpass_filter_highbd_sse4_1(uint16_t *dgd, int width, int height, int stride, int32_t *dst, int dst_stride, int r, int eps);
+RTCD_EXTERN void (*av1_highpass_filter_highbd)(uint16_t *dgd, int width, int height, int stride, int32_t *dst, int dst_stride, int r, int eps);
+
 void av1_iht16x16_256_add_c(const tran_low_t *input, uint8_t *output, int pitch, const struct txfm_param *param);
 void av1_iht16x16_256_add_sse2(const tran_low_t *input, uint8_t *output, int pitch, const struct txfm_param *param);
 void av1_iht16x16_256_add_avx2(const tran_low_t *input, uint8_t *output, int pitch, const struct txfm_param *param);
@ -340,48 +353,45 @@ void av1_iht8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride
 void av1_iht8x8_64_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride, const struct txfm_param *param);
 #define av1_iht8x8_64_add av1_iht8x8_64_add_sse2

-void av1_inv_txfm2d_add_16x16_c(const int32_t *input, uint16_t *output, int stride, int tx_type, int bd);
-void av1_inv_txfm2d_add_16x16_sse4_1(const int32_t *input, uint16_t *output, int stride, int tx_type, int bd);
-RTCD_EXTERN void (*av1_inv_txfm2d_add_16x16)(const int32_t *input, uint16_t *output, int stride, int tx_type, int bd);
+void av1_inv_txfm2d_add_16x16_c(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd);
+void av1_inv_txfm2d_add_16x16_sse4_1(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd);
+RTCD_EXTERN void (*av1_inv_txfm2d_add_16x16)(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd);

-void av1_inv_txfm2d_add_16x32_c(const int32_t *input, uint16_t *output, int stride, int tx_type, int bd);
+void av1_inv_txfm2d_add_16x32_c(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd);
 #define av1_inv_txfm2d_add_16x32 av1_inv_txfm2d_add_16x32_c

-void av1_inv_txfm2d_add_16x8_c(const int32_t *input, uint16_t *output, int stride, int tx_type, int bd);
+void av1_inv_txfm2d_add_16x8_c(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd);
 #define av1_inv_txfm2d_add_16x8 av1_inv_txfm2d_add_16x8_c

-void av1_inv_txfm2d_add_32x16_c(const int32_t *input, uint16_t *output, int stride, int tx_type, int bd);
+void av1_inv_txfm2d_add_32x16_c(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd);
 #define av1_inv_txfm2d_add_32x16 av1_inv_txfm2d_add_32x16_c

-void av1_inv_txfm2d_add_32x32_c(const int32_t *input, uint16_t *output, int stride, int tx_type, int bd);
-void av1_inv_txfm2d_add_32x32_avx2(const int32_t *input, uint16_t *output, int stride, int tx_type, int bd);
-RTCD_EXTERN void (*av1_inv_txfm2d_add_32x32)(const int32_t *input, uint16_t *output, int stride, int tx_type, int bd);
+void av1_inv_txfm2d_add_32x32_c(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd);
+void av1_inv_txfm2d_add_32x32_avx2(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd);
+RTCD_EXTERN void (*av1_inv_txfm2d_add_32x32)(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd);

-void av1_inv_txfm2d_add_4x4_c(const int32_t *input, uint16_t *output, int stride, int tx_type, int bd);
-void av1_inv_txfm2d_add_4x4_sse4_1(const int32_t *input, uint16_t *output, int stride, int tx_type, int bd);
-RTCD_EXTERN void (*av1_inv_txfm2d_add_4x4)(const int32_t *input, uint16_t *output, int stride, int tx_type, int bd);
+void av1_inv_txfm2d_add_4x4_c(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd);
+void av1_inv_txfm2d_add_4x4_sse4_1(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd);
+RTCD_EXTERN void (*av1_inv_txfm2d_add_4x4)(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd);

-void av1_inv_txfm2d_add_4x8_c(const int32_t *input, uint16_t *output, int stride, int tx_type, int bd);
+void av1_inv_txfm2d_add_4x8_c(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd);
 #define av1_inv_txfm2d_add_4x8 av1_inv_txfm2d_add_4x8_c

-void av1_inv_txfm2d_add_64x64_c(const int32_t *input, uint16_t *output, int stride, int tx_type, int bd);
-#define av1_inv_txfm2d_add_64x64 av1_inv_txfm2d_add_64x64_c
-
-void av1_inv_txfm2d_add_8x16_c(const int32_t *input, uint16_t *output, int stride, int tx_type, int bd);
+void av1_inv_txfm2d_add_8x16_c(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd);
 #define av1_inv_txfm2d_add_8x16 av1_inv_txfm2d_add_8x16_c

-void av1_inv_txfm2d_add_8x4_c(const int32_t *input, uint16_t *output, int stride, int tx_type, int bd);
+void av1_inv_txfm2d_add_8x4_c(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd);
 #define av1_inv_txfm2d_add_8x4 av1_inv_txfm2d_add_8x4_c

-void av1_inv_txfm2d_add_8x8_c(const int32_t *input, uint16_t *output, int stride, int tx_type, int bd);
-void av1_inv_txfm2d_add_8x8_sse4_1(const int32_t *input, uint16_t *output, int stride, int tx_type, int bd);
-RTCD_EXTERN void (*av1_inv_txfm2d_add_8x8)(const int32_t *input, uint16_t *output, int stride, int tx_type, int bd);
+void av1_inv_txfm2d_add_8x8_c(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd);
+void av1_inv_txfm2d_add_8x8_sse4_1(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd);
+RTCD_EXTERN void (*av1_inv_txfm2d_add_8x8)(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd);

 void av1_lowbd_convolve_init_c(void);
 void av1_lowbd_convolve_init_ssse3(void);
 RTCD_EXTERN void (*av1_lowbd_convolve_init)(void);

-void av1_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan, int log_scale);
+void av1_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan, const qm_val_t * qm_ptr, const qm_val_t * iqm_ptr, int log_scale);
 #define av1_quantize_b av1_quantize_b_c

 void av1_quantize_fp_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
@ -393,10 +403,26 @@ void av1_quantize_fp_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int
 void av1_quantize_fp_32x32_avx2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
 RTCD_EXTERN void (*av1_quantize_fp_32x32)(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);

+void av1_selfguided_restoration_c(uint8_t *dgd, int width, int height, int stride, int32_t *dst, int dst_stride, int r, int eps);
+void av1_selfguided_restoration_sse4_1(uint8_t *dgd, int width, int height, int stride, int32_t *dst, int dst_stride, int r, int eps);
+RTCD_EXTERN void (*av1_selfguided_restoration)(uint8_t *dgd, int width, int height, int stride, int32_t *dst, int dst_stride, int r, int eps);
+
+void av1_selfguided_restoration_highbd_c(uint16_t *dgd, int width, int height, int stride, int32_t *dst, int dst_stride, int bit_depth, int r, int eps);
+void av1_selfguided_restoration_highbd_sse4_1(uint16_t *dgd, int width, int height, int stride, int32_t *dst, int dst_stride, int bit_depth, int r, int eps);
+RTCD_EXTERN void (*av1_selfguided_restoration_highbd)(uint16_t *dgd, int width, int height, int stride, int32_t *dst, int dst_stride, int bit_depth, int r, int eps);
+
 void av1_temporal_filter_apply_c(uint8_t *frame1, unsigned int stride, uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int filter_weight, unsigned int *accumulator, uint16_t *count);
 void av1_temporal_filter_apply_sse2(uint8_t *frame1, unsigned int stride, uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int filter_weight, unsigned int *accumulator, uint16_t *count);
 #define av1_temporal_filter_apply av1_temporal_filter_apply_sse2

+void av1_upsample_intra_edge_c(uint8_t *p, int sz);
+void av1_upsample_intra_edge_sse4_1(uint8_t *p, int sz);
+RTCD_EXTERN void (*av1_upsample_intra_edge)(uint8_t *p, int sz);
+
+void av1_upsample_intra_edge_high_c(uint16_t *p, int sz, int bd);
+void av1_upsample_intra_edge_high_sse4_1(uint16_t *p, int sz, int bd);
+RTCD_EXTERN void (*av1_upsample_intra_edge_high)(uint16_t *p, int sz, int bd);
+
 void av1_warp_affine_c(const int32_t *mat, const uint8_t *ref, int width, int height, int stride, uint8_t *pred, int p_col, int p_row, int p_width, int p_height, int p_stride, int subsampling_x, int subsampling_y, ConvolveParams *conv_params, int16_t alpha, int16_t beta, int16_t gamma, int16_t delta);
 void av1_warp_affine_sse2(const int32_t *mat, const uint8_t *ref, int width, int height, int stride, uint8_t *pred, int p_col, int p_row, int p_width, int p_height, int p_stride, int subsampling_x, int subsampling_y, ConvolveParams *conv_params, int16_t alpha, int16_t beta, int16_t gamma, int16_t delta);
 void av1_warp_affine_ssse3(const int32_t *mat, const uint8_t *ref, int width, int height, int stride, uint8_t *pred, int p_col, int p_row, int p_width, int p_height, int p_stride, int subsampling_x, int subsampling_y, ConvolveParams *conv_params, int16_t alpha, int16_t beta, int16_t gamma, int16_t delta);
@ -414,64 +440,38 @@ uint64_t av1_wedge_sse_from_residuals_c(const int16_t *r1, const int16_t *d, con
 uint64_t av1_wedge_sse_from_residuals_sse2(const int16_t *r1, const int16_t *d, const uint8_t *m, int N);
 #define av1_wedge_sse_from_residuals av1_wedge_sse_from_residuals_sse2

+void cdef_filter_block_c(uint8_t *dst8, uint16_t *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int bsize, int max);
+void cdef_filter_block_sse2(uint8_t *dst8, uint16_t *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int bsize, int max);
+void cdef_filter_block_ssse3(uint8_t *dst8, uint16_t *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int bsize, int max);
+void cdef_filter_block_sse4_1(uint8_t *dst8, uint16_t *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int bsize, int max);
+void cdef_filter_block_avx2(uint8_t *dst8, uint16_t *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int bsize, int max);
+RTCD_EXTERN void (*cdef_filter_block)(uint8_t *dst8, uint16_t *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int bsize, int max);
+
+int cdef_find_dir_c(const uint16_t *img, int stride, int32_t *var, int coeff_shift);
+int cdef_find_dir_sse2(const uint16_t *img, int stride, int32_t *var, int coeff_shift);
+int cdef_find_dir_ssse3(const uint16_t *img, int stride, int32_t *var, int coeff_shift);
+int cdef_find_dir_sse4_1(const uint16_t *img, int stride, int32_t *var, int coeff_shift);
+int cdef_find_dir_avx2(const uint16_t *img, int stride, int32_t *var, int coeff_shift);
+RTCD_EXTERN int (*cdef_find_dir)(const uint16_t *img, int stride, int32_t *var, int coeff_shift);
+
 double compute_cross_correlation_c(unsigned char *im1, int stride1, int x1, int y1, unsigned char *im2, int stride2, int x2, int y2);
 double compute_cross_correlation_sse4_1(unsigned char *im1, int stride1, int x1, int y1, unsigned char *im2, int stride2, int x2, int y2);
 RTCD_EXTERN double (*compute_cross_correlation)(unsigned char *im1, int stride1, int x1, int y1, unsigned char *im2, int stride2, int x2, int y2);

-void copy_4x4_16bit_to_16bit_c(uint16_t *dst, int dstride, const uint16_t *src, int sstride);
-void copy_4x4_16bit_to_16bit_sse2(uint16_t *dst, int dstride, const uint16_t *src, int sstride);
-void copy_4x4_16bit_to_16bit_ssse3(uint16_t *dst, int dstride, const uint16_t *src, int sstride);
-void copy_4x4_16bit_to_16bit_sse4_1(uint16_t *dst, int dstride, const uint16_t *src, int sstride);
-RTCD_EXTERN void (*copy_4x4_16bit_to_16bit)(uint16_t *dst, int dstride, const uint16_t *src, int sstride);
-
-void copy_4x4_16bit_to_8bit_c(uint8_t *dst, int dstride, const uint16_t *src, int sstride);
-void copy_4x4_16bit_to_8bit_sse2(uint8_t *dst, int dstride, const uint16_t *src, int sstride);
-void copy_4x4_16bit_to_8bit_ssse3(uint8_t *dst, int dstride, const uint16_t *src, int sstride);
-void copy_4x4_16bit_to_8bit_sse4_1(uint8_t *dst, int dstride, const uint16_t *src, int sstride);
-RTCD_EXTERN void (*copy_4x4_16bit_to_8bit)(uint8_t *dst, int dstride, const uint16_t *src, int sstride);
-
-void copy_8x8_16bit_to_16bit_c(uint16_t *dst, int dstride, const uint16_t *src, int sstride);
-void copy_8x8_16bit_to_16bit_sse2(uint16_t *dst, int dstride, const uint16_t *src, int sstride);
-void copy_8x8_16bit_to_16bit_ssse3(uint16_t *dst, int dstride, const uint16_t *src, int sstride);
-void copy_8x8_16bit_to_16bit_sse4_1(uint16_t *dst, int dstride, const uint16_t *src, int sstride);
-RTCD_EXTERN void (*copy_8x8_16bit_to_16bit)(uint16_t *dst, int dstride, const uint16_t *src, int sstride);
-
-void copy_8x8_16bit_to_8bit_c(uint8_t *dst, int dstride, const uint16_t *src, int sstride);
-void copy_8x8_16bit_to_8bit_sse2(uint8_t *dst, int dstride, const uint16_t *src, int sstride);
-void copy_8x8_16bit_to_8bit_ssse3(uint8_t *dst, int dstride, const uint16_t *src, int sstride);
-void copy_8x8_16bit_to_8bit_sse4_1(uint8_t *dst, int dstride, const uint16_t *src, int sstride);
-RTCD_EXTERN void (*copy_8x8_16bit_to_8bit)(uint8_t *dst, int dstride, const uint16_t *src, int sstride);
-
 void copy_rect8_16bit_to_16bit_c(uint16_t *dst, int dstride, const uint16_t *src, int sstride, int v, int h);
 void copy_rect8_16bit_to_16bit_sse2(uint16_t *dst, int dstride, const uint16_t *src, int sstride, int v, int h);
 void copy_rect8_16bit_to_16bit_ssse3(uint16_t *dst, int dstride, const uint16_t *src, int sstride, int v, int h);
 void copy_rect8_16bit_to_16bit_sse4_1(uint16_t *dst, int dstride, const uint16_t *src, int sstride, int v, int h);
+void copy_rect8_16bit_to_16bit_avx2(uint16_t *dst, int dstride, const uint16_t *src, int sstride, int v, int h);
 RTCD_EXTERN void (*copy_rect8_16bit_to_16bit)(uint16_t *dst, int dstride, const uint16_t *src, int sstride, int v, int h);

 void copy_rect8_8bit_to_16bit_c(uint16_t *dst, int dstride, const uint8_t *src, int sstride, int v, int h);
 void copy_rect8_8bit_to_16bit_sse2(uint16_t *dst, int dstride, const uint8_t *src, int sstride, int v, int h);
 void copy_rect8_8bit_to_16bit_ssse3(uint16_t *dst, int dstride, const uint8_t *src, int sstride, int v, int h);
 void copy_rect8_8bit_to_16bit_sse4_1(uint16_t *dst, int dstride, const uint8_t *src, int sstride, int v, int h);
+void copy_rect8_8bit_to_16bit_avx2(uint16_t *dst, int dstride, const uint8_t *src, int sstride, int v, int h);
 RTCD_EXTERN void (*copy_rect8_8bit_to_16bit)(uint16_t *dst, int dstride, const uint8_t *src, int sstride, int v, int h);

-int od_dir_find8_c(const od_dering_in *img, int stride, int32_t *var, int coeff_shift);
-int od_dir_find8_sse2(const od_dering_in *img, int stride, int32_t *var, int coeff_shift);
-int od_dir_find8_ssse3(const od_dering_in *img, int stride, int32_t *var, int coeff_shift);
-int od_dir_find8_sse4_1(const od_dering_in *img, int stride, int32_t *var, int coeff_shift);
-RTCD_EXTERN int (*od_dir_find8)(const od_dering_in *img, int stride, int32_t *var, int coeff_shift);
-
-void od_filter_dering_direction_4x4_c(uint16_t *y, int ystride, const uint16_t *in, int threshold, int dir, int damping);
-void od_filter_dering_direction_4x4_sse2(uint16_t *y, int ystride, const uint16_t *in, int threshold, int dir, int damping);
-void od_filter_dering_direction_4x4_ssse3(uint16_t *y, int ystride, const uint16_t *in, int threshold, int dir, int damping);
-void od_filter_dering_direction_4x4_sse4_1(uint16_t *y, int ystride, const uint16_t *in, int threshold, int dir, int damping);
-RTCD_EXTERN void (*od_filter_dering_direction_4x4)(uint16_t *y, int ystride, const uint16_t *in, int threshold, int dir, int damping);
-
-void od_filter_dering_direction_8x8_c(uint16_t *y, int ystride, const uint16_t *in, int threshold, int dir, int damping);
-void od_filter_dering_direction_8x8_sse2(uint16_t *y, int ystride, const uint16_t *in, int threshold, int dir, int damping);
-void od_filter_dering_direction_8x8_ssse3(uint16_t *y, int ystride, const uint16_t *in, int threshold, int dir, int damping);
-void od_filter_dering_direction_8x8_sse4_1(uint16_t *y, int ystride, const uint16_t *in, int threshold, int dir, int damping);
-RTCD_EXTERN void (*od_filter_dering_direction_8x8)(uint16_t *y, int ystride, const uint16_t *in, int threshold, int dir, int damping);
-
 void aom_rtcd(void);

 #ifdef RTCD_C
@ -482,28 +482,28 @@ static void setup_rtcd_internal(void)

    (void)flags;

-    aom_clpf_block = aom_clpf_block_sse2;
-    if (flags & HAS_SSSE3) aom_clpf_block = aom_clpf_block_ssse3;
-    if (flags & HAS_SSE4_1) aom_clpf_block = aom_clpf_block_sse4_1;
-    aom_clpf_block_hbd = aom_clpf_block_hbd_sse2;
-    if (flags & HAS_SSSE3) aom_clpf_block_hbd = aom_clpf_block_hbd_ssse3;
-    if (flags & HAS_SSE4_1) aom_clpf_block_hbd = aom_clpf_block_hbd_sse4_1;
-    aom_clpf_hblock = aom_clpf_hblock_sse2;
-    if (flags & HAS_SSSE3) aom_clpf_hblock = aom_clpf_hblock_ssse3;
-    if (flags & HAS_SSE4_1) aom_clpf_hblock = aom_clpf_hblock_sse4_1;
-    aom_clpf_hblock_hbd = aom_clpf_hblock_hbd_sse2;
-    if (flags & HAS_SSSE3) aom_clpf_hblock_hbd = aom_clpf_hblock_hbd_ssse3;
-    if (flags & HAS_SSE4_1) aom_clpf_hblock_hbd = aom_clpf_hblock_hbd_sse4_1;
+    apply_selfguided_restoration = apply_selfguided_restoration_c;
+    if (flags & HAS_SSE4_1) apply_selfguided_restoration = apply_selfguided_restoration_sse4_1;
+    apply_selfguided_restoration_highbd = apply_selfguided_restoration_highbd_c;
+    if (flags & HAS_SSE4_1) apply_selfguided_restoration_highbd = apply_selfguided_restoration_highbd_sse4_1;
    av1_block_error = av1_block_error_c;
    if (flags & HAS_AVX2) av1_block_error = av1_block_error_avx2;
+    av1_convolve_2d_scale = av1_convolve_2d_scale_c;
+    if (flags & HAS_SSE4_1) av1_convolve_2d_scale = av1_convolve_2d_scale_sse4_1;
    av1_convolve_horiz = av1_convolve_horiz_c;
    if (flags & HAS_SSSE3) av1_convolve_horiz = av1_convolve_horiz_ssse3;
+    av1_convolve_rounding = av1_convolve_rounding_c;
+    if (flags & HAS_AVX2) av1_convolve_rounding = av1_convolve_rounding_avx2;
    av1_convolve_vert = av1_convolve_vert_c;
    if (flags & HAS_SSSE3) av1_convolve_vert = av1_convolve_vert_ssse3;
    av1_fht16x16 = av1_fht16x16_sse2;
    if (flags & HAS_AVX2) av1_fht16x16 = av1_fht16x16_avx2;
    av1_fht32x32 = av1_fht32x32_sse2;
    if (flags & HAS_AVX2) av1_fht32x32 = av1_fht32x32_avx2;
+    av1_filter_intra_edge = av1_filter_intra_edge_c;
+    if (flags & HAS_SSE4_1) av1_filter_intra_edge = av1_filter_intra_edge_sse4_1;
+    av1_filter_intra_edge_high = av1_filter_intra_edge_high_c;
+    if (flags & HAS_SSE4_1) av1_filter_intra_edge_high = av1_filter_intra_edge_high_sse4_1;
    av1_full_search_sad = av1_full_search_sad_c;
    if (flags & HAS_SSE3) av1_full_search_sad = av1_full_search_sadx3;
    if (flags & HAS_SSE4_1) av1_full_search_sad = av1_full_search_sadx8;
@ -513,14 +513,18 @@ static void setup_rtcd_internal(void)
    if (flags & HAS_SSE4_1) av1_fwd_txfm2d_32x32 = av1_fwd_txfm2d_32x32_sse4_1;
    av1_fwd_txfm2d_4x4 = av1_fwd_txfm2d_4x4_c;
    if (flags & HAS_SSE4_1) av1_fwd_txfm2d_4x4 = av1_fwd_txfm2d_4x4_sse4_1;
-    av1_fwd_txfm2d_64x64 = av1_fwd_txfm2d_64x64_c;
-    if (flags & HAS_SSE4_1) av1_fwd_txfm2d_64x64 = av1_fwd_txfm2d_64x64_sse4_1;
    av1_fwd_txfm2d_8x8 = av1_fwd_txfm2d_8x8_c;
    if (flags & HAS_SSE4_1) av1_fwd_txfm2d_8x8 = av1_fwd_txfm2d_8x8_sse4_1;
+    av1_highbd_convolve_2d = av1_highbd_convolve_2d_c;
+    if (flags & HAS_SSSE3) av1_highbd_convolve_2d = av1_highbd_convolve_2d_ssse3;
+    av1_highbd_convolve_2d_scale = av1_highbd_convolve_2d_scale_c;
+    if (flags & HAS_SSE4_1) av1_highbd_convolve_2d_scale = av1_highbd_convolve_2d_scale_sse4_1;
    av1_highbd_convolve_horiz = av1_highbd_convolve_horiz_c;
    if (flags & HAS_SSE4_1) av1_highbd_convolve_horiz = av1_highbd_convolve_horiz_sse4_1;
    av1_highbd_convolve_init = av1_highbd_convolve_init_c;
    if (flags & HAS_SSE4_1) av1_highbd_convolve_init = av1_highbd_convolve_init_sse4_1;
+    av1_highbd_convolve_rounding = av1_highbd_convolve_rounding_c;
+    if (flags & HAS_AVX2) av1_highbd_convolve_rounding = av1_highbd_convolve_rounding_avx2;
    av1_highbd_convolve_vert = av1_highbd_convolve_vert_c;
    if (flags & HAS_SSE4_1) av1_highbd_convolve_vert = av1_highbd_convolve_vert_sse4_1;
    av1_highbd_quantize_fp = av1_highbd_quantize_fp_c;
@ -528,6 +532,10 @@ static void setup_rtcd_internal(void)
    if (flags & HAS_AVX2) av1_highbd_quantize_fp = av1_highbd_quantize_fp_avx2;
    av1_highbd_warp_affine = av1_highbd_warp_affine_c;
    if (flags & HAS_SSSE3) av1_highbd_warp_affine = av1_highbd_warp_affine_ssse3;
+    av1_highpass_filter = av1_highpass_filter_c;
+    if (flags & HAS_SSE4_1) av1_highpass_filter = av1_highpass_filter_sse4_1;
+    av1_highpass_filter_highbd = av1_highpass_filter_highbd_c;
+    if (flags & HAS_SSE4_1) av1_highpass_filter_highbd = av1_highpass_filter_highbd_sse4_1;
    av1_iht16x16_256_add = av1_iht16x16_256_add_sse2;
    if (flags & HAS_AVX2) av1_iht16x16_256_add = av1_iht16x16_256_add_avx2;
    av1_inv_txfm2d_add_16x16 = av1_inv_txfm2d_add_16x16_c;
@ -544,37 +552,34 @@ static void setup_rtcd_internal(void)
    if (flags & HAS_AVX2) av1_quantize_fp = av1_quantize_fp_avx2;
    av1_quantize_fp_32x32 = av1_quantize_fp_32x32_c;
    if (flags & HAS_AVX2) av1_quantize_fp_32x32 = av1_quantize_fp_32x32_avx2;
+    av1_selfguided_restoration = av1_selfguided_restoration_c;
+    if (flags & HAS_SSE4_1) av1_selfguided_restoration = av1_selfguided_restoration_sse4_1;
+    av1_selfguided_restoration_highbd = av1_selfguided_restoration_highbd_c;
+    if (flags & HAS_SSE4_1) av1_selfguided_restoration_highbd = av1_selfguided_restoration_highbd_sse4_1;
+    av1_upsample_intra_edge = av1_upsample_intra_edge_c;
+    if (flags & HAS_SSE4_1) av1_upsample_intra_edge = av1_upsample_intra_edge_sse4_1;
+    av1_upsample_intra_edge_high = av1_upsample_intra_edge_high_c;
+    if (flags & HAS_SSE4_1) av1_upsample_intra_edge_high = av1_upsample_intra_edge_high_sse4_1;
    av1_warp_affine = av1_warp_affine_sse2;
    if (flags & HAS_SSSE3) av1_warp_affine = av1_warp_affine_ssse3;
+    cdef_filter_block = cdef_filter_block_sse2;
+    if (flags & HAS_SSSE3) cdef_filter_block = cdef_filter_block_ssse3;
+    if (flags & HAS_SSE4_1) cdef_filter_block = cdef_filter_block_sse4_1;
+    if (flags & HAS_AVX2) cdef_filter_block = cdef_filter_block_avx2;
+    cdef_find_dir = cdef_find_dir_sse2;
+    if (flags & HAS_SSSE3) cdef_find_dir = cdef_find_dir_ssse3;
+    if (flags & HAS_SSE4_1) cdef_find_dir = cdef_find_dir_sse4_1;
+    if (flags & HAS_AVX2) cdef_find_dir = cdef_find_dir_avx2;
    compute_cross_correlation = compute_cross_correlation_c;
    if (flags & HAS_SSE4_1) compute_cross_correlation = compute_cross_correlation_sse4_1;
-    copy_4x4_16bit_to_16bit = copy_4x4_16bit_to_16bit_sse2;
-    if (flags & HAS_SSSE3) copy_4x4_16bit_to_16bit = copy_4x4_16bit_to_16bit_ssse3;
-    if (flags & HAS_SSE4_1) copy_4x4_16bit_to_16bit = copy_4x4_16bit_to_16bit_sse4_1;
-    copy_4x4_16bit_to_8bit = copy_4x4_16bit_to_8bit_sse2;
-    if (flags & HAS_SSSE3) copy_4x4_16bit_to_8bit = copy_4x4_16bit_to_8bit_ssse3;
-    if (flags & HAS_SSE4_1) copy_4x4_16bit_to_8bit = copy_4x4_16bit_to_8bit_sse4_1;
-    copy_8x8_16bit_to_16bit = copy_8x8_16bit_to_16bit_sse2;
-    if (flags & HAS_SSSE3) copy_8x8_16bit_to_16bit = copy_8x8_16bit_to_16bit_ssse3;
-    if (flags & HAS_SSE4_1) copy_8x8_16bit_to_16bit = copy_8x8_16bit_to_16bit_sse4_1;
-    copy_8x8_16bit_to_8bit = copy_8x8_16bit_to_8bit_sse2;
-    if (flags & HAS_SSSE3) copy_8x8_16bit_to_8bit = copy_8x8_16bit_to_8bit_ssse3;
-    if (flags & HAS_SSE4_1) copy_8x8_16bit_to_8bit = copy_8x8_16bit_to_8bit_sse4_1;
    copy_rect8_16bit_to_16bit = copy_rect8_16bit_to_16bit_sse2;
    if (flags & HAS_SSSE3) copy_rect8_16bit_to_16bit = copy_rect8_16bit_to_16bit_ssse3;
    if (flags & HAS_SSE4_1) copy_rect8_16bit_to_16bit = copy_rect8_16bit_to_16bit_sse4_1;
+    if (flags & HAS_AVX2) copy_rect8_16bit_to_16bit = copy_rect8_16bit_to_16bit_avx2;
    copy_rect8_8bit_to_16bit = copy_rect8_8bit_to_16bit_sse2;
    if (flags & HAS_SSSE3) copy_rect8_8bit_to_16bit = copy_rect8_8bit_to_16bit_ssse3;
    if (flags & HAS_SSE4_1) copy_rect8_8bit_to_16bit = copy_rect8_8bit_to_16bit_sse4_1;
-    od_dir_find8 = od_dir_find8_sse2;
-    if (flags & HAS_SSSE3) od_dir_find8 = od_dir_find8_ssse3;
-    if (flags & HAS_SSE4_1) od_dir_find8 = od_dir_find8_sse4_1;
-    od_filter_dering_direction_4x4 = od_filter_dering_direction_4x4_sse2;
-    if (flags & HAS_SSSE3) od_filter_dering_direction_4x4 = od_filter_dering_direction_4x4_ssse3;
-    if (flags & HAS_SSE4_1) od_filter_dering_direction_4x4 = od_filter_dering_direction_4x4_sse4_1;
-    od_filter_dering_direction_8x8 = od_filter_dering_direction_8x8_sse2;
-    if (flags & HAS_SSSE3) od_filter_dering_direction_8x8 = od_filter_dering_direction_8x8_ssse3;
-    if (flags & HAS_SSE4_1) od_filter_dering_direction_8x8 = od_filter_dering_direction_8x8_sse4_1;
+    if (flags & HAS_AVX2) copy_rect8_8bit_to_16bit = copy_rect8_8bit_to_16bit_avx2;
 }
 #endif

--- a/media/libaom/config/win/ia32/aom_config.asm
+++ b/media/libaom/config/win/ia32/aom_config.asm
@ -46,8 +46,6 @@ CONFIG_AV1 equ 1
 CONFIG_STATIC_MSVCRT equ 0
 CONFIG_SPATIAL_RESAMPLING equ 1
 CONFIG_REALTIME_ONLY equ 0
-CONFIG_ONTHEFLY_BITPACKING equ 0
-CONFIG_ERROR_CONCEALMENT equ 0
 CONFIG_SHARED equ 0
 CONFIG_STATIC equ 1
 CONFIG_SMALL equ 0
@ -60,73 +58,71 @@ CONFIG_ACCOUNTING equ 0
 CONFIG_INSPECTION equ 0
 CONFIG_DECODE_PERF_TESTS equ 0
 CONFIG_ENCODE_PERF_TESTS equ 0
+CONFIG_BITSTREAM_DEBUG equ 0
+CONFIG_SYMBOLRATE equ 0
 CONFIG_COEFFICIENT_RANGE_CHECKING equ 0
 CONFIG_LOWBITDEPTH equ 1
 CONFIG_HIGHBITDEPTH equ 1
 CONFIG_EXPERIMENTAL equ 0
 CONFIG_SIZE_LIMIT equ 1
-CONFIG_COLORSPACE_HEADERS equ 0
 CONFIG_FP_MB_STATS equ 0
 CONFIG_CDEF equ 1
+CONFIG_CDEF_SINGLEPASS equ 1
 CONFIG_VAR_TX equ 1
 CONFIG_RECT_TX equ 1
 CONFIG_RECT_TX_EXT equ 0
 CONFIG_TPL_MV equ 0
 CONFIG_DUAL_FILTER equ 1
-CONFIG_CONVOLVE_ROUND equ 0
+CONFIG_CONVOLVE_ROUND equ 1
 CONFIG_COMPOUND_ROUND equ 0
 CONFIG_EXT_TX equ 1
-CONFIG_DPCM_INTRA equ 0
 CONFIG_TX64X64 equ 0
 CONFIG_EXT_INTRA equ 1
 CONFIG_INTRA_INTERP equ 0
 CONFIG_FILTER_INTRA equ 0
-CONFIG_INTRA_EDGE equ 0
+CONFIG_INTRA_EDGE equ 1
 CONFIG_INTRABC equ 0
-CONFIG_EXT_INTER equ 1
 CONFIG_INTERINTRA equ 1
 CONFIG_WEDGE equ 1
 CONFIG_COMPOUND_SEGMENT equ 1
 CONFIG_EXT_REFS equ 1
-CONFIG_ALTREF2 equ 0
-CONFIG_SPEED_REFS equ 0
-CONFIG_GF_GROUPS equ 0
-CONFIG_FLEX_REFS equ 0
 CONFIG_GLOBAL_MOTION equ 1
 CONFIG_NEW_QUANT equ 0
 CONFIG_SUPERTX equ 0
 CONFIG_ANS equ 0
-CONFIG_LOOP_RESTORATION equ 0
+CONFIG_LOOP_RESTORATION equ 1
+CONFIG_STRIPED_LOOP_RESTORATION equ 0
 CONFIG_EXT_PARTITION equ 0
 CONFIG_EXT_PARTITION_TYPES equ 0
+CONFIG_EXT_PARTITION_TYPES_AB equ 0
 CONFIG_UNPOISON_PARTITION_CTX equ 0
 CONFIG_EXT_TILE equ 0
 CONFIG_MOTION_VAR equ 1
 CONFIG_NCOBMC equ 0
 CONFIG_WARPED_MOTION equ 1
 CONFIG_Q_ADAPT_PROBS equ 0
-CONFIG_BITSTREAM_DEBUG equ 0
 CONFIG_INTER_STATS_ONLY equ 0
-CONFIG_ALT_INTRA equ 1
-CONFIG_PALETTE equ 1
 CONFIG_PALETTE_DELTA_ENCODING equ 0
 CONFIG_RAWBITS equ 0
-CONFIG_EC_SMALLMUL equ 1
+CONFIG_KF_CTX equ 0
 CONFIG_PVQ equ 0
 CONFIG_CFL equ 0
 CONFIG_XIPHRC equ 0
 CONFIG_DCT_ONLY equ 0
+CONFIG_DAALA_TX equ 0
 CONFIG_DAALA_DCT4 equ 0
 CONFIG_DAALA_DCT8 equ 0
+CONFIG_DAALA_DCT16 equ 0
+CONFIG_DAALA_DCT32 equ 0
+CONFIG_DAALA_DCT64 equ 0
 CONFIG_CB4X4 equ 1
 CONFIG_CHROMA_2X2 equ 0
 CONFIG_CHROMA_SUB8X8 equ 1
 CONFIG_FRAME_SIZE equ 0
-CONFIG_DELTA_Q equ 1
 CONFIG_EXT_DELTA_Q equ 1
 CONFIG_ADAPT_SCAN equ 0
-CONFIG_FILTER_7BIT equ 1
 CONFIG_PARALLEL_DEBLOCKING equ 1
+CONFIG_DEBLOCK_13TAP equ 0
 CONFIG_LOOPFILTERING_ACROSS_TILES equ 1
 CONFIG_TEMPMV_SIGNALING equ 1
 CONFIG_RD_DEBUG equ 0
@ -135,29 +131,46 @@ CONFIG_COEF_INTERLEAVE equ 0
 CONFIG_ENTROPY_STATS equ 0
 CONFIG_MASKED_TX equ 0
 CONFIG_DEPENDENT_HORZTILES equ 0
-CONFIG_DIST_8X8 equ 0
-CONFIG_DAALA_DIST equ 0
-CONFIG_TRIPRED equ 0
+CONFIG_DIST_8X8 equ 1
 CONFIG_PALETTE_THROUGHPUT equ 1
 CONFIG_REF_ADAPT equ 0
 CONFIG_LV_MAP equ 0
+CONFIG_CTX1D equ 0
 CONFIG_TXK_SEL equ 0
 CONFIG_MV_COMPRESS equ 1
+CONFIG_SEGMENT_ZEROMV equ 0
 CONFIG_FRAME_SUPERRES equ 0
 CONFIG_NEW_MULTISYMBOL equ 0
 CONFIG_COMPOUND_SINGLEREF equ 0
-CONFIG_AOM_QM equ 0
+CONFIG_AOM_QM equ 1
 CONFIG_ONE_SIDED_COMPOUND equ 1
-CONFIG_EXT_COMP_REFS equ 0
+CONFIG_EXT_COMP_REFS equ 1
 CONFIG_SMOOTH_HV equ 1
 CONFIG_VAR_REFS equ 0
-CONFIG_RECT_INTRA_PRED equ 1
 CONFIG_LGT equ 0
+CONFIG_LGT_FROM_PRED equ 0
 CONFIG_SBL_SYMBOL equ 0
 CONFIG_NCOBMC_ADAPT_WEIGHT equ 0
 CONFIG_BGSPRITE equ 0
 CONFIG_VAR_TX_NO_TX_MODE equ 0
 CONFIG_MRC_TX equ 0
 CONFIG_LPF_DIRECT equ 0
-CONFIG_UV_LVL equ 0
+CONFIG_LOOPFILTER_LEVEL equ 0
+CONFIG_NO_FRAME_CONTEXT_SIGNALING equ 0
+CONFIG_TXMG equ 1
+CONFIG_MAX_TILE equ 0
+CONFIG_HASH_ME equ 0
+CONFIG_COLORSPACE_HEADERS equ 0
+CONFIG_MFMV equ 0
+CONFIG_FRAME_MARKER equ 0
+CONFIG_JNT_COMP equ 0
+CONFIG_FRAME_SIGN_BIAS equ 0
+CONFIG_EXT_SKIP equ 0
+CONFIG_OBU equ 0
+CONFIG_AMVR equ 0
+CONFIG_LPF_SB equ 0
+CONFIG_OPT_REF_MV equ 0
+CONFIG_TMV equ 0
+CONFIG_RESTRICT_COMPRESSED_HDR equ 0
+CONFIG_HORZONLY_FRAME_SUPERRES equ 0
 CONFIG_ANALYZER equ 0
--- a/media/libaom/config/win/ia32/aom_config.h
+++ b/media/libaom/config/win/ia32/aom_config.h
@ -59,8 +59,6 @@
 #define CONFIG_STATIC_MSVCRT 0
 #define CONFIG_SPATIAL_RESAMPLING 1
 #define CONFIG_REALTIME_ONLY 0
-#define CONFIG_ONTHEFLY_BITPACKING 0
-#define CONFIG_ERROR_CONCEALMENT 0
 #define CONFIG_SHARED 0
 #define CONFIG_STATIC 1
 #define CONFIG_SMALL 0
@ -73,73 +71,71 @@
 #define CONFIG_INSPECTION 0
 #define CONFIG_DECODE_PERF_TESTS 0
 #define CONFIG_ENCODE_PERF_TESTS 0
+#define CONFIG_BITSTREAM_DEBUG 0
+#define CONFIG_SYMBOLRATE 0
 #define CONFIG_COEFFICIENT_RANGE_CHECKING 0
 #define CONFIG_LOWBITDEPTH 1
 #define CONFIG_HIGHBITDEPTH 1
 #define CONFIG_EXPERIMENTAL 0
 #define CONFIG_SIZE_LIMIT 1
-#define CONFIG_COLORSPACE_HEADERS 0
 #define CONFIG_FP_MB_STATS 0
 #define CONFIG_CDEF 1
+#define CONFIG_CDEF_SINGLEPASS 1
 #define CONFIG_VAR_TX 1
 #define CONFIG_RECT_TX 1
 #define CONFIG_RECT_TX_EXT 0
 #define CONFIG_TPL_MV 0
 #define CONFIG_DUAL_FILTER 1
-#define CONFIG_CONVOLVE_ROUND 0
+#define CONFIG_CONVOLVE_ROUND 1
 #define CONFIG_COMPOUND_ROUND 0
 #define CONFIG_EXT_TX 1
-#define CONFIG_DPCM_INTRA 0
 #define CONFIG_TX64X64 0
 #define CONFIG_EXT_INTRA 1
 #define CONFIG_INTRA_INTERP 0
 #define CONFIG_FILTER_INTRA 0
-#define CONFIG_INTRA_EDGE 0
+#define CONFIG_INTRA_EDGE 1
 #define CONFIG_INTRABC 0
-#define CONFIG_EXT_INTER 1
 #define CONFIG_INTERINTRA 1
 #define CONFIG_WEDGE 1
 #define CONFIG_COMPOUND_SEGMENT 1
 #define CONFIG_EXT_REFS 1
-#define CONFIG_ALTREF2 0
-#define CONFIG_SPEED_REFS 0
-#define CONFIG_GF_GROUPS 0
-#define CONFIG_FLEX_REFS 0
 #define CONFIG_GLOBAL_MOTION 1
 #define CONFIG_NEW_QUANT 0
 #define CONFIG_SUPERTX 0
 #define CONFIG_ANS 0
-#define CONFIG_LOOP_RESTORATION 0
+#define CONFIG_LOOP_RESTORATION 1
+#define CONFIG_STRIPED_LOOP_RESTORATION 0
 #define CONFIG_EXT_PARTITION 0
 #define CONFIG_EXT_PARTITION_TYPES 0
+#define CONFIG_EXT_PARTITION_TYPES_AB 0
 #define CONFIG_UNPOISON_PARTITION_CTX 0
 #define CONFIG_EXT_TILE 0
 #define CONFIG_MOTION_VAR 1
 #define CONFIG_NCOBMC 0
 #define CONFIG_WARPED_MOTION 1
 #define CONFIG_Q_ADAPT_PROBS 0
-#define CONFIG_BITSTREAM_DEBUG 0
 #define CONFIG_INTER_STATS_ONLY 0
-#define CONFIG_ALT_INTRA 1
-#define CONFIG_PALETTE 1
 #define CONFIG_PALETTE_DELTA_ENCODING 0
 #define CONFIG_RAWBITS 0
-#define CONFIG_EC_SMALLMUL 1
+#define CONFIG_KF_CTX 0
 #define CONFIG_PVQ 0
 #define CONFIG_CFL 0
 #define CONFIG_XIPHRC 0
 #define CONFIG_DCT_ONLY 0
+#define CONFIG_DAALA_TX 0
 #define CONFIG_DAALA_DCT4 0
 #define CONFIG_DAALA_DCT8 0
+#define CONFIG_DAALA_DCT16 0
+#define CONFIG_DAALA_DCT32 0
+#define CONFIG_DAALA_DCT64 0
 #define CONFIG_CB4X4 1
 #define CONFIG_CHROMA_2X2 0
 #define CONFIG_CHROMA_SUB8X8 1
 #define CONFIG_FRAME_SIZE 0
-#define CONFIG_DELTA_Q 1
 #define CONFIG_EXT_DELTA_Q 1
 #define CONFIG_ADAPT_SCAN 0
-#define CONFIG_FILTER_7BIT 1
 #define CONFIG_PARALLEL_DEBLOCKING 1
+#define CONFIG_DEBLOCK_13TAP 0
 #define CONFIG_LOOPFILTERING_ACROSS_TILES 1
 #define CONFIG_TEMPMV_SIGNALING 1
 #define CONFIG_RD_DEBUG 0
@ -148,31 +144,48 @@
 #define CONFIG_ENTROPY_STATS 0
 #define CONFIG_MASKED_TX 0
 #define CONFIG_DEPENDENT_HORZTILES 0
-#define CONFIG_DIST_8X8 0
-#define CONFIG_DAALA_DIST 0
-#define CONFIG_TRIPRED 0
+#define CONFIG_DIST_8X8 1
 #define CONFIG_PALETTE_THROUGHPUT 1
 #define CONFIG_REF_ADAPT 0
 #define CONFIG_LV_MAP 0
+#define CONFIG_CTX1D 0
 #define CONFIG_TXK_SEL 0
 #define CONFIG_MV_COMPRESS 1
+#define CONFIG_SEGMENT_ZEROMV 0
 #define CONFIG_FRAME_SUPERRES 0
 #define CONFIG_NEW_MULTISYMBOL 0
 #define CONFIG_COMPOUND_SINGLEREF 0
-#define CONFIG_AOM_QM 0
+#define CONFIG_AOM_QM 1
 #define CONFIG_ONE_SIDED_COMPOUND 1
-#define CONFIG_EXT_COMP_REFS 0
+#define CONFIG_EXT_COMP_REFS 1
 #define CONFIG_SMOOTH_HV 1
 #define CONFIG_VAR_REFS 0
-#define CONFIG_RECT_INTRA_PRED 1
 #define CONFIG_LGT 0
+#define CONFIG_LGT_FROM_PRED 0
 #define CONFIG_SBL_SYMBOL 0
 #define CONFIG_NCOBMC_ADAPT_WEIGHT 0
 #define CONFIG_BGSPRITE 0
 #define CONFIG_VAR_TX_NO_TX_MODE 0
 #define CONFIG_MRC_TX 0
 #define CONFIG_LPF_DIRECT 0
-#define CONFIG_UV_LVL 0
+#define CONFIG_LOOPFILTER_LEVEL 0
+#define CONFIG_NO_FRAME_CONTEXT_SIGNALING 0
+#define CONFIG_TXMG 1
+#define CONFIG_MAX_TILE 0
+#define CONFIG_HASH_ME 0
+#define CONFIG_COLORSPACE_HEADERS 0
+#define CONFIG_MFMV 0
+#define CONFIG_FRAME_MARKER 0
+#define CONFIG_JNT_COMP 0
+#define CONFIG_FRAME_SIGN_BIAS 0
+#define CONFIG_EXT_SKIP 0
+#define CONFIG_OBU 0
+#define CONFIG_AMVR 0
+#define CONFIG_LPF_SB 0
+#define CONFIG_OPT_REF_MV 0
+#define CONFIG_TMV 0
+#define CONFIG_RESTRICT_COMPRESSED_HDR 0
+#define CONFIG_HORZONLY_FRAME_SUPERRES 0
 #define CONFIG_ANALYZER 0
 #define DECODE_WIDTH_LIMIT 8192
 #define DECODE_HEIGHT_LIMIT 4608
--- a/media/libaom/config/win/ia32/aom_dsp_rtcd.h
+++ b/media/libaom/config/win/ia32/aom_dsp_rtcd.h
--- a/media/libaom/config/win/ia32/av1_rtcd.h
+++ b/media/libaom/config/win/ia32/av1_rtcd.h
@ -31,44 +31,39 @@ struct search_site_config;
 struct mv;
 union int_mv;
 struct yv12_buffer_config;
-typedef uint16_t od_dering_in;

 #ifdef __cplusplus
 extern "C" {
 #endif

-void aom_clpf_block_c(uint8_t *dst, const uint16_t *src, int dstride, int sstride, int sizex, int sizey, unsigned int strength, unsigned int bd);
-void aom_clpf_block_sse2(uint8_t *dst, const uint16_t *src, int dstride, int sstride, int sizex, int sizey, unsigned int strength, unsigned int bd);
-void aom_clpf_block_ssse3(uint8_t *dst, const uint16_t *src, int dstride, int sstride, int sizex, int sizey, unsigned int strength, unsigned int bd);
-void aom_clpf_block_sse4_1(uint8_t *dst, const uint16_t *src, int dstride, int sstride, int sizex, int sizey, unsigned int strength, unsigned int bd);
-RTCD_EXTERN void (*aom_clpf_block)(uint8_t *dst, const uint16_t *src, int dstride, int sstride, int sizex, int sizey, unsigned int strength, unsigned int bd);
+void apply_selfguided_restoration_c(uint8_t *dat, int width, int height, int stride, int eps, int *xqd, uint8_t *dst, int dst_stride, int32_t *tmpbuf);
+void apply_selfguided_restoration_sse4_1(uint8_t *dat, int width, int height, int stride, int eps, int *xqd, uint8_t *dst, int dst_stride, int32_t *tmpbuf);
+RTCD_EXTERN void (*apply_selfguided_restoration)(uint8_t *dat, int width, int height, int stride, int eps, int *xqd, uint8_t *dst, int dst_stride, int32_t *tmpbuf);

-void aom_clpf_block_hbd_c(uint16_t *dst, const uint16_t *src, int dstride, int sstride, int sizex, int sizey, unsigned int strength, unsigned int bd);
-void aom_clpf_block_hbd_sse2(uint16_t *dst, const uint16_t *src, int dstride, int sstride, int sizex, int sizey, unsigned int strength, unsigned int bd);
-void aom_clpf_block_hbd_ssse3(uint16_t *dst, const uint16_t *src, int dstride, int sstride, int sizex, int sizey, unsigned int strength, unsigned int bd);
-void aom_clpf_block_hbd_sse4_1(uint16_t *dst, const uint16_t *src, int dstride, int sstride, int sizex, int sizey, unsigned int strength, unsigned int bd);
-RTCD_EXTERN void (*aom_clpf_block_hbd)(uint16_t *dst, const uint16_t *src, int dstride, int sstride, int sizex, int sizey, unsigned int strength, unsigned int bd);
-
-void aom_clpf_hblock_c(uint8_t *dst, const uint16_t *src, int dstride, int sstride, int sizex, int sizey, unsigned int strength, unsigned int bd);
-void aom_clpf_hblock_sse2(uint8_t *dst, const uint16_t *src, int dstride, int sstride, int sizex, int sizey, unsigned int strength, unsigned int bd);
-void aom_clpf_hblock_ssse3(uint8_t *dst, const uint16_t *src, int dstride, int sstride, int sizex, int sizey, unsigned int strength, unsigned int bd);
-void aom_clpf_hblock_sse4_1(uint8_t *dst, const uint16_t *src, int dstride, int sstride, int sizex, int sizey, unsigned int strength, unsigned int bd);
-RTCD_EXTERN void (*aom_clpf_hblock)(uint8_t *dst, const uint16_t *src, int dstride, int sstride, int sizex, int sizey, unsigned int strength, unsigned int bd);
-
-void aom_clpf_hblock_hbd_c(uint16_t *dst, const uint16_t *src, int dstride, int sstride, int sizex, int sizey, unsigned int strength, unsigned int bd);
-void aom_clpf_hblock_hbd_sse2(uint16_t *dst, const uint16_t *src, int dstride, int sstride, int sizex, int sizey, unsigned int strength, unsigned int bd);
-void aom_clpf_hblock_hbd_ssse3(uint16_t *dst, const uint16_t *src, int dstride, int sstride, int sizex, int sizey, unsigned int strength, unsigned int bd);
-void aom_clpf_hblock_hbd_sse4_1(uint16_t *dst, const uint16_t *src, int dstride, int sstride, int sizex, int sizey, unsigned int strength, unsigned int bd);
-RTCD_EXTERN void (*aom_clpf_hblock_hbd)(uint16_t *dst, const uint16_t *src, int dstride, int sstride, int sizex, int sizey, unsigned int strength, unsigned int bd);
+void apply_selfguided_restoration_highbd_c(uint16_t *dat, int width, int height, int stride, int bit_depth, int eps, int *xqd, uint16_t *dst, int dst_stride, int32_t *tmpbuf);
+void apply_selfguided_restoration_highbd_sse4_1(uint16_t *dat, int width, int height, int stride, int bit_depth, int eps, int *xqd, uint16_t *dst, int dst_stride, int32_t *tmpbuf);
+RTCD_EXTERN void (*apply_selfguided_restoration_highbd)(uint16_t *dat, int width, int height, int stride, int bit_depth, int eps, int *xqd, uint16_t *dst, int dst_stride, int32_t *tmpbuf);

 int64_t av1_block_error_c(const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz);
 int64_t av1_block_error_avx2(const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz);
 RTCD_EXTERN int64_t (*av1_block_error)(const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz);

+void av1_convolve_2d_c(const uint8_t *src, int src_stride, CONV_BUF_TYPE *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+void av1_convolve_2d_sse2(const uint8_t *src, int src_stride, CONV_BUF_TYPE *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+RTCD_EXTERN void (*av1_convolve_2d)(const uint8_t *src, int src_stride, CONV_BUF_TYPE *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+
+void av1_convolve_2d_scale_c(const uint8_t *src, int src_stride, CONV_BUF_TYPE *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_qn, const int x_step_qn, const int subpel_y_q4, const int y_step_qn, ConvolveParams *conv_params);
+void av1_convolve_2d_scale_sse4_1(const uint8_t *src, int src_stride, CONV_BUF_TYPE *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_qn, const int x_step_qn, const int subpel_y_q4, const int y_step_qn, ConvolveParams *conv_params);
+RTCD_EXTERN void (*av1_convolve_2d_scale)(const uint8_t *src, int src_stride, CONV_BUF_TYPE *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_qn, const int x_step_qn, const int subpel_y_q4, const int y_step_qn, ConvolveParams *conv_params);
+
 void av1_convolve_horiz_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams fp, const int subpel_x_q4, int x_step_q4, ConvolveParams *conv_params);
 void av1_convolve_horiz_ssse3(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams fp, const int subpel_x_q4, int x_step_q4, ConvolveParams *conv_params);
 RTCD_EXTERN void (*av1_convolve_horiz)(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams fp, const int subpel_x_q4, int x_step_q4, ConvolveParams *conv_params);

+void av1_convolve_rounding_c(const int32_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, int bits);
+void av1_convolve_rounding_avx2(const int32_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, int bits);
+RTCD_EXTERN void (*av1_convolve_rounding)(const int32_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, int bits);
+
 void av1_convolve_vert_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams fp, const int subpel_x_q4, int x_step_q4, ConvolveParams *conv_params);
 void av1_convolve_vert_ssse3(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams fp, const int subpel_x_q4, int x_step_q4, ConvolveParams *conv_params);
 RTCD_EXTERN void (*av1_convolve_vert)(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams fp, const int subpel_x_q4, int x_step_q4, ConvolveParams *conv_params);
@ -76,9 +71,6 @@ RTCD_EXTERN void (*av1_convolve_vert)(const uint8_t *src, int src_stride, uint8_
 int av1_diamond_search_sad_c(struct macroblock *x, const struct search_site_config *cfg,  struct mv *ref_mv, struct mv *best_mv, int search_param, int sad_per_bit, int *num00, const struct aom_variance_vtable *fn_ptr, const struct mv *center_mv);
 #define av1_diamond_search_sad av1_diamond_search_sad_c

-void av1_fdct8x8_quant_c(const int16_t *input, int stride, tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
-#define av1_fdct8x8_quant av1_fdct8x8_quant_c
-
 void av1_fht16x16_c(const int16_t *input, tran_low_t *output, int stride, struct txfm_param *param);
 void av1_fht16x16_sse2(const int16_t *input, tran_low_t *output, int stride, struct txfm_param *param);
 void av1_fht16x16_avx2(const int16_t *input, tran_low_t *output, int stride, struct txfm_param *param);
@ -133,6 +125,14 @@ void av1_fht8x8_c(const int16_t *input, tran_low_t *output, int stride, struct t
 void av1_fht8x8_sse2(const int16_t *input, tran_low_t *output, int stride, struct txfm_param *param);
 RTCD_EXTERN void (*av1_fht8x8)(const int16_t *input, tran_low_t *output, int stride, struct txfm_param *param);

+void av1_filter_intra_edge_c(uint8_t *p, int sz, int strength);
+void av1_filter_intra_edge_sse4_1(uint8_t *p, int sz, int strength);
+RTCD_EXTERN void (*av1_filter_intra_edge)(uint8_t *p, int sz, int strength);
+
+void av1_filter_intra_edge_high_c(uint16_t *p, int sz, int strength);
+void av1_filter_intra_edge_high_sse4_1(uint16_t *p, int sz, int strength);
+RTCD_EXTERN void (*av1_filter_intra_edge_high)(uint16_t *p, int sz, int strength);
+
 int av1_full_range_search_c(const struct macroblock *x, const struct search_site_config *cfg, struct mv *ref_mv, struct mv *best_mv, int search_param, int sad_per_bit, int *num00, const struct aom_variance_vtable *fn_ptr, const struct mv *center_mv);
 #define av1_full_range_search av1_full_range_search_c

@ -141,46 +141,42 @@ int av1_full_search_sadx3(const struct macroblock *x, const struct mv *ref_mv, i
 int av1_full_search_sadx8(const struct macroblock *x, const struct mv *ref_mv, int sad_per_bit, int distance, const struct aom_variance_vtable *fn_ptr, const struct mv *center_mv, struct mv *best_mv);
 RTCD_EXTERN int (*av1_full_search_sad)(const struct macroblock *x, const struct mv *ref_mv, int sad_per_bit, int distance, const struct aom_variance_vtable *fn_ptr, const struct mv *center_mv, struct mv *best_mv);

-void av1_fwd_idtx_c(const int16_t *src_diff, tran_low_t *coeff, int stride, int bs, int tx_type);
+void av1_fwd_idtx_c(const int16_t *src_diff, tran_low_t *coeff, int stride, int bsx, int bsy, TX_TYPE tx_type);
 #define av1_fwd_idtx av1_fwd_idtx_c

-void av1_fwd_txfm2d_16x16_c(const int16_t *input, int32_t *output, int stride, int tx_type, int bd);
-void av1_fwd_txfm2d_16x16_sse4_1(const int16_t *input, int32_t *output, int stride, int tx_type, int bd);
-RTCD_EXTERN void (*av1_fwd_txfm2d_16x16)(const int16_t *input, int32_t *output, int stride, int tx_type, int bd);
+void av1_fwd_txfm2d_16x16_c(const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd);
+void av1_fwd_txfm2d_16x16_sse4_1(const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd);
+RTCD_EXTERN void (*av1_fwd_txfm2d_16x16)(const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd);

-void av1_fwd_txfm2d_16x32_c(const int16_t *input, int32_t *output, int stride, int tx_type, int bd);
+void av1_fwd_txfm2d_16x32_c(const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd);
 #define av1_fwd_txfm2d_16x32 av1_fwd_txfm2d_16x32_c

-void av1_fwd_txfm2d_16x8_c(const int16_t *input, int32_t *output, int stride, int tx_type, int bd);
+void av1_fwd_txfm2d_16x8_c(const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd);
 #define av1_fwd_txfm2d_16x8 av1_fwd_txfm2d_16x8_c

-void av1_fwd_txfm2d_32x16_c(const int16_t *input, int32_t *output, int stride, int tx_type, int bd);
+void av1_fwd_txfm2d_32x16_c(const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd);
 #define av1_fwd_txfm2d_32x16 av1_fwd_txfm2d_32x16_c

-void av1_fwd_txfm2d_32x32_c(const int16_t *input, int32_t *output, int stride, int tx_type, int bd);
-void av1_fwd_txfm2d_32x32_sse4_1(const int16_t *input, int32_t *output, int stride, int tx_type, int bd);
-RTCD_EXTERN void (*av1_fwd_txfm2d_32x32)(const int16_t *input, int32_t *output, int stride, int tx_type, int bd);
+void av1_fwd_txfm2d_32x32_c(const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd);
+void av1_fwd_txfm2d_32x32_sse4_1(const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd);
+RTCD_EXTERN void (*av1_fwd_txfm2d_32x32)(const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd);

-void av1_fwd_txfm2d_4x4_c(const int16_t *input, int32_t *output, int stride, int tx_type, int bd);
-void av1_fwd_txfm2d_4x4_sse4_1(const int16_t *input, int32_t *output, int stride, int tx_type, int bd);
-RTCD_EXTERN void (*av1_fwd_txfm2d_4x4)(const int16_t *input, int32_t *output, int stride, int tx_type, int bd);
+void av1_fwd_txfm2d_4x4_c(const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd);
+void av1_fwd_txfm2d_4x4_sse4_1(const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd);
+RTCD_EXTERN void (*av1_fwd_txfm2d_4x4)(const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd);

-void av1_fwd_txfm2d_4x8_c(const int16_t *input, int32_t *output, int stride, int tx_type, int bd);
+void av1_fwd_txfm2d_4x8_c(const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd);
 #define av1_fwd_txfm2d_4x8 av1_fwd_txfm2d_4x8_c

-void av1_fwd_txfm2d_64x64_c(const int16_t *input, int32_t *output, int stride, int tx_type, int bd);
-void av1_fwd_txfm2d_64x64_sse4_1(const int16_t *input, int32_t *output, int stride, int tx_type, int bd);
-RTCD_EXTERN void (*av1_fwd_txfm2d_64x64)(const int16_t *input, int32_t *output, int stride, int tx_type, int bd);
-
-void av1_fwd_txfm2d_8x16_c(const int16_t *input, int32_t *output, int stride, int tx_type, int bd);
+void av1_fwd_txfm2d_8x16_c(const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd);
 #define av1_fwd_txfm2d_8x16 av1_fwd_txfm2d_8x16_c

-void av1_fwd_txfm2d_8x4_c(const int16_t *input, int32_t *output, int stride, int tx_type, int bd);
+void av1_fwd_txfm2d_8x4_c(const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd);
 #define av1_fwd_txfm2d_8x4 av1_fwd_txfm2d_8x4_c

-void av1_fwd_txfm2d_8x8_c(const int16_t *input, int32_t *output, int stride, int tx_type, int bd);
-void av1_fwd_txfm2d_8x8_sse4_1(const int16_t *input, int32_t *output, int stride, int tx_type, int bd);
-RTCD_EXTERN void (*av1_fwd_txfm2d_8x8)(const int16_t *input, int32_t *output, int stride, int tx_type, int bd);
+void av1_fwd_txfm2d_8x8_c(const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd);
+void av1_fwd_txfm2d_8x8_sse4_1(const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd);
+RTCD_EXTERN void (*av1_fwd_txfm2d_8x8)(const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd);

 void av1_fwht4x4_c(const int16_t *input, tran_low_t *output, int stride);
 #define av1_fwht4x4 av1_fwht4x4_c
@ -207,6 +203,14 @@ void av1_highbd_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint
 void av1_highbd_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
 #define av1_highbd_convolve8_vert av1_highbd_convolve8_vert_c

+void av1_highbd_convolve_2d_c(const uint16_t *src, int src_stride, CONV_BUF_TYPE *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+void av1_highbd_convolve_2d_ssse3(const uint16_t *src, int src_stride, CONV_BUF_TYPE *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+RTCD_EXTERN void (*av1_highbd_convolve_2d)(const uint16_t *src, int src_stride, CONV_BUF_TYPE *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+
+void av1_highbd_convolve_2d_scale_c(const uint16_t *src, int src_stride, CONV_BUF_TYPE *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int x_step_qn, const int subpel_y_q4, const int y_step_qn, ConvolveParams *conv_params, int bd);
+void av1_highbd_convolve_2d_scale_sse4_1(const uint16_t *src, int src_stride, CONV_BUF_TYPE *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int x_step_qn, const int subpel_y_q4, const int y_step_qn, ConvolveParams *conv_params, int bd);
+RTCD_EXTERN void (*av1_highbd_convolve_2d_scale)(const uint16_t *src, int src_stride, CONV_BUF_TYPE *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int x_step_qn, const int subpel_y_q4, const int y_step_qn, ConvolveParams *conv_params, int bd);
+
 void av1_highbd_convolve_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
 #define av1_highbd_convolve_avg av1_highbd_convolve_avg_c

@ -221,6 +225,10 @@ void av1_highbd_convolve_init_c(void);
 void av1_highbd_convolve_init_sse4_1(void);
 RTCD_EXTERN void (*av1_highbd_convolve_init)(void);

+void av1_highbd_convolve_rounding_c(const int32_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, int bits, int bd);
+void av1_highbd_convolve_rounding_avx2(const int32_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, int bits, int bd);
+RTCD_EXTERN void (*av1_highbd_convolve_rounding)(const int32_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, int bits, int bd);
+
 void av1_highbd_convolve_vert_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams fp, const int subpel_x_q4, int x_step_q4, int avg, int bd);
 void av1_highbd_convolve_vert_sse4_1(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams fp, const int subpel_x_q4, int x_step_q4, int avg, int bd);
 RTCD_EXTERN void (*av1_highbd_convolve_vert)(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams fp, const int subpel_x_q4, int x_step_q4, int avg, int bd);
@ -267,9 +275,6 @@ void av1_highbd_iht8x4_32_add_c(const tran_low_t *input, uint8_t *dest, int dest
 void av1_highbd_iht8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, const struct txfm_param *param);
 #define av1_highbd_iht8x8_64_add av1_highbd_iht8x8_64_add_c

-void av1_highbd_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan, int log_scale);
-#define av1_highbd_quantize_b av1_highbd_quantize_b_c
-
 void av1_highbd_quantize_fp_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan, int log_scale);
 void av1_highbd_quantize_fp_sse4_1(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan, int log_scale);
 void av1_highbd_quantize_fp_avx2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan, int log_scale);
@ -282,6 +287,14 @@ void av1_highbd_warp_affine_c(const int32_t *mat, const uint16_t *ref, int width
 void av1_highbd_warp_affine_ssse3(const int32_t *mat, const uint16_t *ref, int width, int height, int stride, uint16_t *pred, int p_col, int p_row, int p_width, int p_height, int p_stride, int subsampling_x, int subsampling_y, int bd, ConvolveParams *conv_params, int16_t alpha, int16_t beta, int16_t gamma, int16_t delta);
 RTCD_EXTERN void (*av1_highbd_warp_affine)(const int32_t *mat, const uint16_t *ref, int width, int height, int stride, uint16_t *pred, int p_col, int p_row, int p_width, int p_height, int p_stride, int subsampling_x, int subsampling_y, int bd, ConvolveParams *conv_params, int16_t alpha, int16_t beta, int16_t gamma, int16_t delta);

+void av1_highpass_filter_c(uint8_t *dgd, int width, int height, int stride, int32_t *dst, int dst_stride, int r, int eps);
+void av1_highpass_filter_sse4_1(uint8_t *dgd, int width, int height, int stride, int32_t *dst, int dst_stride, int r, int eps);
+RTCD_EXTERN void (*av1_highpass_filter)(uint8_t *dgd, int width, int height, int stride, int32_t *dst, int dst_stride, int r, int eps);
+
+void av1_highpass_filter_highbd_c(uint16_t *dgd, int width, int height, int stride, int32_t *dst, int dst_stride, int r, int eps);
+void av1_highpass_filter_highbd_sse4_1(uint16_t *dgd, int width, int height, int stride, int32_t *dst, int dst_stride, int r, int eps);
+RTCD_EXTERN void (*av1_highpass_filter_highbd)(uint16_t *dgd, int width, int height, int stride, int32_t *dst, int dst_stride, int r, int eps);
+
 void av1_iht16x16_256_add_c(const tran_low_t *input, uint8_t *output, int pitch, const struct txfm_param *param);
 void av1_iht16x16_256_add_sse2(const tran_low_t *input, uint8_t *output, int pitch, const struct txfm_param *param);
 void av1_iht16x16_256_add_avx2(const tran_low_t *input, uint8_t *output, int pitch, const struct txfm_param *param);
@ -334,48 +347,45 @@ void av1_iht8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride
 void av1_iht8x8_64_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride, const struct txfm_param *param);
 RTCD_EXTERN void (*av1_iht8x8_64_add)(const tran_low_t *input, uint8_t *dest, int dest_stride, const struct txfm_param *param);

-void av1_inv_txfm2d_add_16x16_c(const int32_t *input, uint16_t *output, int stride, int tx_type, int bd);
-void av1_inv_txfm2d_add_16x16_sse4_1(const int32_t *input, uint16_t *output, int stride, int tx_type, int bd);
-RTCD_EXTERN void (*av1_inv_txfm2d_add_16x16)(const int32_t *input, uint16_t *output, int stride, int tx_type, int bd);
+void av1_inv_txfm2d_add_16x16_c(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd);
+void av1_inv_txfm2d_add_16x16_sse4_1(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd);
+RTCD_EXTERN void (*av1_inv_txfm2d_add_16x16)(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd);

-void av1_inv_txfm2d_add_16x32_c(const int32_t *input, uint16_t *output, int stride, int tx_type, int bd);
+void av1_inv_txfm2d_add_16x32_c(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd);
 #define av1_inv_txfm2d_add_16x32 av1_inv_txfm2d_add_16x32_c

-void av1_inv_txfm2d_add_16x8_c(const int32_t *input, uint16_t *output, int stride, int tx_type, int bd);
+void av1_inv_txfm2d_add_16x8_c(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd);
 #define av1_inv_txfm2d_add_16x8 av1_inv_txfm2d_add_16x8_c

-void av1_inv_txfm2d_add_32x16_c(const int32_t *input, uint16_t *output, int stride, int tx_type, int bd);
+void av1_inv_txfm2d_add_32x16_c(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd);
 #define av1_inv_txfm2d_add_32x16 av1_inv_txfm2d_add_32x16_c

-void av1_inv_txfm2d_add_32x32_c(const int32_t *input, uint16_t *output, int stride, int tx_type, int bd);
-void av1_inv_txfm2d_add_32x32_avx2(const int32_t *input, uint16_t *output, int stride, int tx_type, int bd);
-RTCD_EXTERN void (*av1_inv_txfm2d_add_32x32)(const int32_t *input, uint16_t *output, int stride, int tx_type, int bd);
+void av1_inv_txfm2d_add_32x32_c(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd);
+void av1_inv_txfm2d_add_32x32_avx2(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd);
+RTCD_EXTERN void (*av1_inv_txfm2d_add_32x32)(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd);

-void av1_inv_txfm2d_add_4x4_c(const int32_t *input, uint16_t *output, int stride, int tx_type, int bd);
-void av1_inv_txfm2d_add_4x4_sse4_1(const int32_t *input, uint16_t *output, int stride, int tx_type, int bd);
-RTCD_EXTERN void (*av1_inv_txfm2d_add_4x4)(const int32_t *input, uint16_t *output, int stride, int tx_type, int bd);
+void av1_inv_txfm2d_add_4x4_c(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd);
+void av1_inv_txfm2d_add_4x4_sse4_1(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd);
+RTCD_EXTERN void (*av1_inv_txfm2d_add_4x4)(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd);

-void av1_inv_txfm2d_add_4x8_c(const int32_t *input, uint16_t *output, int stride, int tx_type, int bd);
+void av1_inv_txfm2d_add_4x8_c(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd);
 #define av1_inv_txfm2d_add_4x8 av1_inv_txfm2d_add_4x8_c

-void av1_inv_txfm2d_add_64x64_c(const int32_t *input, uint16_t *output, int stride, int tx_type, int bd);
-#define av1_inv_txfm2d_add_64x64 av1_inv_txfm2d_add_64x64_c
-
-void av1_inv_txfm2d_add_8x16_c(const int32_t *input, uint16_t *output, int stride, int tx_type, int bd);
+void av1_inv_txfm2d_add_8x16_c(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd);
 #define av1_inv_txfm2d_add_8x16 av1_inv_txfm2d_add_8x16_c

-void av1_inv_txfm2d_add_8x4_c(const int32_t *input, uint16_t *output, int stride, int tx_type, int bd);
+void av1_inv_txfm2d_add_8x4_c(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd);
 #define av1_inv_txfm2d_add_8x4 av1_inv_txfm2d_add_8x4_c

-void av1_inv_txfm2d_add_8x8_c(const int32_t *input, uint16_t *output, int stride, int tx_type, int bd);
-void av1_inv_txfm2d_add_8x8_sse4_1(const int32_t *input, uint16_t *output, int stride, int tx_type, int bd);
-RTCD_EXTERN void (*av1_inv_txfm2d_add_8x8)(const int32_t *input, uint16_t *output, int stride, int tx_type, int bd);
+void av1_inv_txfm2d_add_8x8_c(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd);
+void av1_inv_txfm2d_add_8x8_sse4_1(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd);
+RTCD_EXTERN void (*av1_inv_txfm2d_add_8x8)(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd);

 void av1_lowbd_convolve_init_c(void);
 void av1_lowbd_convolve_init_ssse3(void);
 RTCD_EXTERN void (*av1_lowbd_convolve_init)(void);

-void av1_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan, int log_scale);
+void av1_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan, const qm_val_t * qm_ptr, const qm_val_t * iqm_ptr, int log_scale);
 #define av1_quantize_b av1_quantize_b_c

 void av1_quantize_fp_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
@ -387,10 +397,26 @@ void av1_quantize_fp_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int
 void av1_quantize_fp_32x32_avx2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
 RTCD_EXTERN void (*av1_quantize_fp_32x32)(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);

+void av1_selfguided_restoration_c(uint8_t *dgd, int width, int height, int stride, int32_t *dst, int dst_stride, int r, int eps);
+void av1_selfguided_restoration_sse4_1(uint8_t *dgd, int width, int height, int stride, int32_t *dst, int dst_stride, int r, int eps);
+RTCD_EXTERN void (*av1_selfguided_restoration)(uint8_t *dgd, int width, int height, int stride, int32_t *dst, int dst_stride, int r, int eps);
+
+void av1_selfguided_restoration_highbd_c(uint16_t *dgd, int width, int height, int stride, int32_t *dst, int dst_stride, int bit_depth, int r, int eps);
+void av1_selfguided_restoration_highbd_sse4_1(uint16_t *dgd, int width, int height, int stride, int32_t *dst, int dst_stride, int bit_depth, int r, int eps);
+RTCD_EXTERN void (*av1_selfguided_restoration_highbd)(uint16_t *dgd, int width, int height, int stride, int32_t *dst, int dst_stride, int bit_depth, int r, int eps);
+
 void av1_temporal_filter_apply_c(uint8_t *frame1, unsigned int stride, uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int filter_weight, unsigned int *accumulator, uint16_t *count);
 void av1_temporal_filter_apply_sse2(uint8_t *frame1, unsigned int stride, uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int filter_weight, unsigned int *accumulator, uint16_t *count);
 RTCD_EXTERN void (*av1_temporal_filter_apply)(uint8_t *frame1, unsigned int stride, uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int filter_weight, unsigned int *accumulator, uint16_t *count);

+void av1_upsample_intra_edge_c(uint8_t *p, int sz);
+void av1_upsample_intra_edge_sse4_1(uint8_t *p, int sz);
+RTCD_EXTERN void (*av1_upsample_intra_edge)(uint8_t *p, int sz);
+
+void av1_upsample_intra_edge_high_c(uint16_t *p, int sz, int bd);
+void av1_upsample_intra_edge_high_sse4_1(uint16_t *p, int sz, int bd);
+RTCD_EXTERN void (*av1_upsample_intra_edge_high)(uint16_t *p, int sz, int bd);
+
 void av1_warp_affine_c(const int32_t *mat, const uint8_t *ref, int width, int height, int stride, uint8_t *pred, int p_col, int p_row, int p_width, int p_height, int p_stride, int subsampling_x, int subsampling_y, ConvolveParams *conv_params, int16_t alpha, int16_t beta, int16_t gamma, int16_t delta);
 void av1_warp_affine_sse2(const int32_t *mat, const uint8_t *ref, int width, int height, int stride, uint8_t *pred, int p_col, int p_row, int p_width, int p_height, int p_stride, int subsampling_x, int subsampling_y, ConvolveParams *conv_params, int16_t alpha, int16_t beta, int16_t gamma, int16_t delta);
 void av1_warp_affine_ssse3(const int32_t *mat, const uint8_t *ref, int width, int height, int stride, uint8_t *pred, int p_col, int p_row, int p_width, int p_height, int p_stride, int subsampling_x, int subsampling_y, ConvolveParams *conv_params, int16_t alpha, int16_t beta, int16_t gamma, int16_t delta);
@ -408,64 +434,38 @@ uint64_t av1_wedge_sse_from_residuals_c(const int16_t *r1, const int16_t *d, con
 uint64_t av1_wedge_sse_from_residuals_sse2(const int16_t *r1, const int16_t *d, const uint8_t *m, int N);
 RTCD_EXTERN uint64_t (*av1_wedge_sse_from_residuals)(const int16_t *r1, const int16_t *d, const uint8_t *m, int N);

+void cdef_filter_block_c(uint8_t *dst8, uint16_t *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int bsize, int max);
+void cdef_filter_block_sse2(uint8_t *dst8, uint16_t *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int bsize, int max);
+void cdef_filter_block_ssse3(uint8_t *dst8, uint16_t *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int bsize, int max);
+void cdef_filter_block_sse4_1(uint8_t *dst8, uint16_t *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int bsize, int max);
+void cdef_filter_block_avx2(uint8_t *dst8, uint16_t *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int bsize, int max);
+RTCD_EXTERN void (*cdef_filter_block)(uint8_t *dst8, uint16_t *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int bsize, int max);
+
+int cdef_find_dir_c(const uint16_t *img, int stride, int32_t *var, int coeff_shift);
+int cdef_find_dir_sse2(const uint16_t *img, int stride, int32_t *var, int coeff_shift);
+int cdef_find_dir_ssse3(const uint16_t *img, int stride, int32_t *var, int coeff_shift);
+int cdef_find_dir_sse4_1(const uint16_t *img, int stride, int32_t *var, int coeff_shift);
+int cdef_find_dir_avx2(const uint16_t *img, int stride, int32_t *var, int coeff_shift);
+RTCD_EXTERN int (*cdef_find_dir)(const uint16_t *img, int stride, int32_t *var, int coeff_shift);
+
 double compute_cross_correlation_c(unsigned char *im1, int stride1, int x1, int y1, unsigned char *im2, int stride2, int x2, int y2);
 double compute_cross_correlation_sse4_1(unsigned char *im1, int stride1, int x1, int y1, unsigned char *im2, int stride2, int x2, int y2);
 RTCD_EXTERN double (*compute_cross_correlation)(unsigned char *im1, int stride1, int x1, int y1, unsigned char *im2, int stride2, int x2, int y2);

-void copy_4x4_16bit_to_16bit_c(uint16_t *dst, int dstride, const uint16_t *src, int sstride);
-void copy_4x4_16bit_to_16bit_sse2(uint16_t *dst, int dstride, const uint16_t *src, int sstride);
-void copy_4x4_16bit_to_16bit_ssse3(uint16_t *dst, int dstride, const uint16_t *src, int sstride);
-void copy_4x4_16bit_to_16bit_sse4_1(uint16_t *dst, int dstride, const uint16_t *src, int sstride);
-RTCD_EXTERN void (*copy_4x4_16bit_to_16bit)(uint16_t *dst, int dstride, const uint16_t *src, int sstride);
-
-void copy_4x4_16bit_to_8bit_c(uint8_t *dst, int dstride, const uint16_t *src, int sstride);
-void copy_4x4_16bit_to_8bit_sse2(uint8_t *dst, int dstride, const uint16_t *src, int sstride);
-void copy_4x4_16bit_to_8bit_ssse3(uint8_t *dst, int dstride, const uint16_t *src, int sstride);
-void copy_4x4_16bit_to_8bit_sse4_1(uint8_t *dst, int dstride, const uint16_t *src, int sstride);
-RTCD_EXTERN void (*copy_4x4_16bit_to_8bit)(uint8_t *dst, int dstride, const uint16_t *src, int sstride);
-
-void copy_8x8_16bit_to_16bit_c(uint16_t *dst, int dstride, const uint16_t *src, int sstride);
-void copy_8x8_16bit_to_16bit_sse2(uint16_t *dst, int dstride, const uint16_t *src, int sstride);
-void copy_8x8_16bit_to_16bit_ssse3(uint16_t *dst, int dstride, const uint16_t *src, int sstride);
-void copy_8x8_16bit_to_16bit_sse4_1(uint16_t *dst, int dstride, const uint16_t *src, int sstride);
-RTCD_EXTERN void (*copy_8x8_16bit_to_16bit)(uint16_t *dst, int dstride, const uint16_t *src, int sstride);
-
-void copy_8x8_16bit_to_8bit_c(uint8_t *dst, int dstride, const uint16_t *src, int sstride);
-void copy_8x8_16bit_to_8bit_sse2(uint8_t *dst, int dstride, const uint16_t *src, int sstride);
-void copy_8x8_16bit_to_8bit_ssse3(uint8_t *dst, int dstride, const uint16_t *src, int sstride);
-void copy_8x8_16bit_to_8bit_sse4_1(uint8_t *dst, int dstride, const uint16_t *src, int sstride);
-RTCD_EXTERN void (*copy_8x8_16bit_to_8bit)(uint8_t *dst, int dstride, const uint16_t *src, int sstride);
-
 void copy_rect8_16bit_to_16bit_c(uint16_t *dst, int dstride, const uint16_t *src, int sstride, int v, int h);
 void copy_rect8_16bit_to_16bit_sse2(uint16_t *dst, int dstride, const uint16_t *src, int sstride, int v, int h);
 void copy_rect8_16bit_to_16bit_ssse3(uint16_t *dst, int dstride, const uint16_t *src, int sstride, int v, int h);
 void copy_rect8_16bit_to_16bit_sse4_1(uint16_t *dst, int dstride, const uint16_t *src, int sstride, int v, int h);
+void copy_rect8_16bit_to_16bit_avx2(uint16_t *dst, int dstride, const uint16_t *src, int sstride, int v, int h);
 RTCD_EXTERN void (*copy_rect8_16bit_to_16bit)(uint16_t *dst, int dstride, const uint16_t *src, int sstride, int v, int h);

 void copy_rect8_8bit_to_16bit_c(uint16_t *dst, int dstride, const uint8_t *src, int sstride, int v, int h);
 void copy_rect8_8bit_to_16bit_sse2(uint16_t *dst, int dstride, const uint8_t *src, int sstride, int v, int h);
 void copy_rect8_8bit_to_16bit_ssse3(uint16_t *dst, int dstride, const uint8_t *src, int sstride, int v, int h);
 void copy_rect8_8bit_to_16bit_sse4_1(uint16_t *dst, int dstride, const uint8_t *src, int sstride, int v, int h);
+void copy_rect8_8bit_to_16bit_avx2(uint16_t *dst, int dstride, const uint8_t *src, int sstride, int v, int h);
 RTCD_EXTERN void (*copy_rect8_8bit_to_16bit)(uint16_t *dst, int dstride, const uint8_t *src, int sstride, int v, int h);

-int od_dir_find8_c(const od_dering_in *img, int stride, int32_t *var, int coeff_shift);
-int od_dir_find8_sse2(const od_dering_in *img, int stride, int32_t *var, int coeff_shift);
-int od_dir_find8_ssse3(const od_dering_in *img, int stride, int32_t *var, int coeff_shift);
-int od_dir_find8_sse4_1(const od_dering_in *img, int stride, int32_t *var, int coeff_shift);
-RTCD_EXTERN int (*od_dir_find8)(const od_dering_in *img, int stride, int32_t *var, int coeff_shift);
-
-void od_filter_dering_direction_4x4_c(uint16_t *y, int ystride, const uint16_t *in, int threshold, int dir, int damping);
-void od_filter_dering_direction_4x4_sse2(uint16_t *y, int ystride, const uint16_t *in, int threshold, int dir, int damping);
-void od_filter_dering_direction_4x4_ssse3(uint16_t *y, int ystride, const uint16_t *in, int threshold, int dir, int damping);
-void od_filter_dering_direction_4x4_sse4_1(uint16_t *y, int ystride, const uint16_t *in, int threshold, int dir, int damping);
-RTCD_EXTERN void (*od_filter_dering_direction_4x4)(uint16_t *y, int ystride, const uint16_t *in, int threshold, int dir, int damping);
-
-void od_filter_dering_direction_8x8_c(uint16_t *y, int ystride, const uint16_t *in, int threshold, int dir, int damping);
-void od_filter_dering_direction_8x8_sse2(uint16_t *y, int ystride, const uint16_t *in, int threshold, int dir, int damping);
-void od_filter_dering_direction_8x8_ssse3(uint16_t *y, int ystride, const uint16_t *in, int threshold, int dir, int damping);
-void od_filter_dering_direction_8x8_sse4_1(uint16_t *y, int ystride, const uint16_t *in, int threshold, int dir, int damping);
-RTCD_EXTERN void (*od_filter_dering_direction_8x8)(uint16_t *y, int ystride, const uint16_t *in, int threshold, int dir, int damping);
-
 void aom_rtcd(void);

 #ifdef RTCD_C
@ -476,26 +476,20 @@ static void setup_rtcd_internal(void)

    (void)flags;

-    aom_clpf_block = aom_clpf_block_c;
-    if (flags & HAS_SSE2) aom_clpf_block = aom_clpf_block_sse2;
-    if (flags & HAS_SSSE3) aom_clpf_block = aom_clpf_block_ssse3;
-    if (flags & HAS_SSE4_1) aom_clpf_block = aom_clpf_block_sse4_1;
-    aom_clpf_block_hbd = aom_clpf_block_hbd_c;
-    if (flags & HAS_SSE2) aom_clpf_block_hbd = aom_clpf_block_hbd_sse2;
-    if (flags & HAS_SSSE3) aom_clpf_block_hbd = aom_clpf_block_hbd_ssse3;
-    if (flags & HAS_SSE4_1) aom_clpf_block_hbd = aom_clpf_block_hbd_sse4_1;
-    aom_clpf_hblock = aom_clpf_hblock_c;
-    if (flags & HAS_SSE2) aom_clpf_hblock = aom_clpf_hblock_sse2;
-    if (flags & HAS_SSSE3) aom_clpf_hblock = aom_clpf_hblock_ssse3;
-    if (flags & HAS_SSE4_1) aom_clpf_hblock = aom_clpf_hblock_sse4_1;
-    aom_clpf_hblock_hbd = aom_clpf_hblock_hbd_c;
-    if (flags & HAS_SSE2) aom_clpf_hblock_hbd = aom_clpf_hblock_hbd_sse2;
-    if (flags & HAS_SSSE3) aom_clpf_hblock_hbd = aom_clpf_hblock_hbd_ssse3;
-    if (flags & HAS_SSE4_1) aom_clpf_hblock_hbd = aom_clpf_hblock_hbd_sse4_1;
+    apply_selfguided_restoration = apply_selfguided_restoration_c;
+    if (flags & HAS_SSE4_1) apply_selfguided_restoration = apply_selfguided_restoration_sse4_1;
+    apply_selfguided_restoration_highbd = apply_selfguided_restoration_highbd_c;
+    if (flags & HAS_SSE4_1) apply_selfguided_restoration_highbd = apply_selfguided_restoration_highbd_sse4_1;
    av1_block_error = av1_block_error_c;
    if (flags & HAS_AVX2) av1_block_error = av1_block_error_avx2;
+    av1_convolve_2d = av1_convolve_2d_c;
+    if (flags & HAS_SSE2) av1_convolve_2d = av1_convolve_2d_sse2;
+    av1_convolve_2d_scale = av1_convolve_2d_scale_c;
+    if (flags & HAS_SSE4_1) av1_convolve_2d_scale = av1_convolve_2d_scale_sse4_1;
    av1_convolve_horiz = av1_convolve_horiz_c;
    if (flags & HAS_SSSE3) av1_convolve_horiz = av1_convolve_horiz_ssse3;
+    av1_convolve_rounding = av1_convolve_rounding_c;
+    if (flags & HAS_AVX2) av1_convolve_rounding = av1_convolve_rounding_avx2;
    av1_convolve_vert = av1_convolve_vert_c;
    if (flags & HAS_SSSE3) av1_convolve_vert = av1_convolve_vert_ssse3;
    av1_fht16x16 = av1_fht16x16_c;
@ -520,6 +514,10 @@ static void setup_rtcd_internal(void)
    if (flags & HAS_SSE2) av1_fht8x4 = av1_fht8x4_sse2;
    av1_fht8x8 = av1_fht8x8_c;
    if (flags & HAS_SSE2) av1_fht8x8 = av1_fht8x8_sse2;
+    av1_filter_intra_edge = av1_filter_intra_edge_c;
+    if (flags & HAS_SSE4_1) av1_filter_intra_edge = av1_filter_intra_edge_sse4_1;
+    av1_filter_intra_edge_high = av1_filter_intra_edge_high_c;
+    if (flags & HAS_SSE4_1) av1_filter_intra_edge_high = av1_filter_intra_edge_high_sse4_1;
    av1_full_search_sad = av1_full_search_sad_c;
    if (flags & HAS_SSE3) av1_full_search_sad = av1_full_search_sadx3;
    if (flags & HAS_SSE4_1) av1_full_search_sad = av1_full_search_sadx8;
@ -529,16 +527,20 @@ static void setup_rtcd_internal(void)
    if (flags & HAS_SSE4_1) av1_fwd_txfm2d_32x32 = av1_fwd_txfm2d_32x32_sse4_1;
    av1_fwd_txfm2d_4x4 = av1_fwd_txfm2d_4x4_c;
    if (flags & HAS_SSE4_1) av1_fwd_txfm2d_4x4 = av1_fwd_txfm2d_4x4_sse4_1;
-    av1_fwd_txfm2d_64x64 = av1_fwd_txfm2d_64x64_c;
-    if (flags & HAS_SSE4_1) av1_fwd_txfm2d_64x64 = av1_fwd_txfm2d_64x64_sse4_1;
    av1_fwd_txfm2d_8x8 = av1_fwd_txfm2d_8x8_c;
    if (flags & HAS_SSE4_1) av1_fwd_txfm2d_8x8 = av1_fwd_txfm2d_8x8_sse4_1;
    av1_highbd_block_error = av1_highbd_block_error_c;
    if (flags & HAS_SSE2) av1_highbd_block_error = av1_highbd_block_error_sse2;
+    av1_highbd_convolve_2d = av1_highbd_convolve_2d_c;
+    if (flags & HAS_SSSE3) av1_highbd_convolve_2d = av1_highbd_convolve_2d_ssse3;
+    av1_highbd_convolve_2d_scale = av1_highbd_convolve_2d_scale_c;
+    if (flags & HAS_SSE4_1) av1_highbd_convolve_2d_scale = av1_highbd_convolve_2d_scale_sse4_1;
    av1_highbd_convolve_horiz = av1_highbd_convolve_horiz_c;
    if (flags & HAS_SSE4_1) av1_highbd_convolve_horiz = av1_highbd_convolve_horiz_sse4_1;
    av1_highbd_convolve_init = av1_highbd_convolve_init_c;
    if (flags & HAS_SSE4_1) av1_highbd_convolve_init = av1_highbd_convolve_init_sse4_1;
+    av1_highbd_convolve_rounding = av1_highbd_convolve_rounding_c;
+    if (flags & HAS_AVX2) av1_highbd_convolve_rounding = av1_highbd_convolve_rounding_avx2;
    av1_highbd_convolve_vert = av1_highbd_convolve_vert_c;
    if (flags & HAS_SSE4_1) av1_highbd_convolve_vert = av1_highbd_convolve_vert_sse4_1;
    av1_highbd_quantize_fp = av1_highbd_quantize_fp_c;
@ -546,6 +548,10 @@ static void setup_rtcd_internal(void)
    if (flags & HAS_AVX2) av1_highbd_quantize_fp = av1_highbd_quantize_fp_avx2;
    av1_highbd_warp_affine = av1_highbd_warp_affine_c;
    if (flags & HAS_SSSE3) av1_highbd_warp_affine = av1_highbd_warp_affine_ssse3;
+    av1_highpass_filter = av1_highpass_filter_c;
+    if (flags & HAS_SSE4_1) av1_highpass_filter = av1_highpass_filter_sse4_1;
+    av1_highpass_filter_highbd = av1_highpass_filter_highbd_c;
+    if (flags & HAS_SSE4_1) av1_highpass_filter_highbd = av1_highpass_filter_highbd_sse4_1;
    av1_iht16x16_256_add = av1_iht16x16_256_add_c;
    if (flags & HAS_SSE2) av1_iht16x16_256_add = av1_iht16x16_256_add_sse2;
    if (flags & HAS_AVX2) av1_iht16x16_256_add = av1_iht16x16_256_add_avx2;
@ -580,8 +586,16 @@ static void setup_rtcd_internal(void)
    if (flags & HAS_AVX2) av1_quantize_fp = av1_quantize_fp_avx2;
    av1_quantize_fp_32x32 = av1_quantize_fp_32x32_c;
    if (flags & HAS_AVX2) av1_quantize_fp_32x32 = av1_quantize_fp_32x32_avx2;
+    av1_selfguided_restoration = av1_selfguided_restoration_c;
+    if (flags & HAS_SSE4_1) av1_selfguided_restoration = av1_selfguided_restoration_sse4_1;
+    av1_selfguided_restoration_highbd = av1_selfguided_restoration_highbd_c;
+    if (flags & HAS_SSE4_1) av1_selfguided_restoration_highbd = av1_selfguided_restoration_highbd_sse4_1;
    av1_temporal_filter_apply = av1_temporal_filter_apply_c;
    if (flags & HAS_SSE2) av1_temporal_filter_apply = av1_temporal_filter_apply_sse2;
+    av1_upsample_intra_edge = av1_upsample_intra_edge_c;
+    if (flags & HAS_SSE4_1) av1_upsample_intra_edge = av1_upsample_intra_edge_sse4_1;
+    av1_upsample_intra_edge_high = av1_upsample_intra_edge_high_c;
+    if (flags & HAS_SSE4_1) av1_upsample_intra_edge_high = av1_upsample_intra_edge_high_sse4_1;
    av1_warp_affine = av1_warp_affine_c;
    if (flags & HAS_SSE2) av1_warp_affine = av1_warp_affine_sse2;
    if (flags & HAS_SSSE3) av1_warp_affine = av1_warp_affine_ssse3;
@ -591,44 +605,28 @@ static void setup_rtcd_internal(void)
    if (flags & HAS_SSE2) av1_wedge_sign_from_residuals = av1_wedge_sign_from_residuals_sse2;
    av1_wedge_sse_from_residuals = av1_wedge_sse_from_residuals_c;
    if (flags & HAS_SSE2) av1_wedge_sse_from_residuals = av1_wedge_sse_from_residuals_sse2;
+    cdef_filter_block = cdef_filter_block_c;
+    if (flags & HAS_SSE2) cdef_filter_block = cdef_filter_block_sse2;
+    if (flags & HAS_SSSE3) cdef_filter_block = cdef_filter_block_ssse3;
+    if (flags & HAS_SSE4_1) cdef_filter_block = cdef_filter_block_sse4_1;
+    if (flags & HAS_AVX2) cdef_filter_block = cdef_filter_block_avx2;
+    cdef_find_dir = cdef_find_dir_c;
+    if (flags & HAS_SSE2) cdef_find_dir = cdef_find_dir_sse2;
+    if (flags & HAS_SSSE3) cdef_find_dir = cdef_find_dir_ssse3;
+    if (flags & HAS_SSE4_1) cdef_find_dir = cdef_find_dir_sse4_1;
+    if (flags & HAS_AVX2) cdef_find_dir = cdef_find_dir_avx2;
    compute_cross_correlation = compute_cross_correlation_c;
    if (flags & HAS_SSE4_1) compute_cross_correlation = compute_cross_correlation_sse4_1;
-    copy_4x4_16bit_to_16bit = copy_4x4_16bit_to_16bit_c;
-    if (flags & HAS_SSE2) copy_4x4_16bit_to_16bit = copy_4x4_16bit_to_16bit_sse2;
-    if (flags & HAS_SSSE3) copy_4x4_16bit_to_16bit = copy_4x4_16bit_to_16bit_ssse3;
-    if (flags & HAS_SSE4_1) copy_4x4_16bit_to_16bit = copy_4x4_16bit_to_16bit_sse4_1;
-    copy_4x4_16bit_to_8bit = copy_4x4_16bit_to_8bit_c;
-    if (flags & HAS_SSE2) copy_4x4_16bit_to_8bit = copy_4x4_16bit_to_8bit_sse2;
-    if (flags & HAS_SSSE3) copy_4x4_16bit_to_8bit = copy_4x4_16bit_to_8bit_ssse3;
-    if (flags & HAS_SSE4_1) copy_4x4_16bit_to_8bit = copy_4x4_16bit_to_8bit_sse4_1;
-    copy_8x8_16bit_to_16bit = copy_8x8_16bit_to_16bit_c;
-    if (flags & HAS_SSE2) copy_8x8_16bit_to_16bit = copy_8x8_16bit_to_16bit_sse2;
-    if (flags & HAS_SSSE3) copy_8x8_16bit_to_16bit = copy_8x8_16bit_to_16bit_ssse3;
-    if (flags & HAS_SSE4_1) copy_8x8_16bit_to_16bit = copy_8x8_16bit_to_16bit_sse4_1;
-    copy_8x8_16bit_to_8bit = copy_8x8_16bit_to_8bit_c;
-    if (flags & HAS_SSE2) copy_8x8_16bit_to_8bit = copy_8x8_16bit_to_8bit_sse2;
-    if (flags & HAS_SSSE3) copy_8x8_16bit_to_8bit = copy_8x8_16bit_to_8bit_ssse3;
-    if (flags & HAS_SSE4_1) copy_8x8_16bit_to_8bit = copy_8x8_16bit_to_8bit_sse4_1;
    copy_rect8_16bit_to_16bit = copy_rect8_16bit_to_16bit_c;
    if (flags & HAS_SSE2) copy_rect8_16bit_to_16bit = copy_rect8_16bit_to_16bit_sse2;
    if (flags & HAS_SSSE3) copy_rect8_16bit_to_16bit = copy_rect8_16bit_to_16bit_ssse3;
    if (flags & HAS_SSE4_1) copy_rect8_16bit_to_16bit = copy_rect8_16bit_to_16bit_sse4_1;
+    if (flags & HAS_AVX2) copy_rect8_16bit_to_16bit = copy_rect8_16bit_to_16bit_avx2;
    copy_rect8_8bit_to_16bit = copy_rect8_8bit_to_16bit_c;
    if (flags & HAS_SSE2) copy_rect8_8bit_to_16bit = copy_rect8_8bit_to_16bit_sse2;
    if (flags & HAS_SSSE3) copy_rect8_8bit_to_16bit = copy_rect8_8bit_to_16bit_ssse3;
    if (flags & HAS_SSE4_1) copy_rect8_8bit_to_16bit = copy_rect8_8bit_to_16bit_sse4_1;
-    od_dir_find8 = od_dir_find8_c;
-    if (flags & HAS_SSE2) od_dir_find8 = od_dir_find8_sse2;
-    if (flags & HAS_SSSE3) od_dir_find8 = od_dir_find8_ssse3;
-    if (flags & HAS_SSE4_1) od_dir_find8 = od_dir_find8_sse4_1;
-    od_filter_dering_direction_4x4 = od_filter_dering_direction_4x4_c;
-    if (flags & HAS_SSE2) od_filter_dering_direction_4x4 = od_filter_dering_direction_4x4_sse2;
-    if (flags & HAS_SSSE3) od_filter_dering_direction_4x4 = od_filter_dering_direction_4x4_ssse3;
-    if (flags & HAS_SSE4_1) od_filter_dering_direction_4x4 = od_filter_dering_direction_4x4_sse4_1;
-    od_filter_dering_direction_8x8 = od_filter_dering_direction_8x8_c;
-    if (flags & HAS_SSE2) od_filter_dering_direction_8x8 = od_filter_dering_direction_8x8_sse2;
-    if (flags & HAS_SSSE3) od_filter_dering_direction_8x8 = od_filter_dering_direction_8x8_ssse3;
-    if (flags & HAS_SSE4_1) od_filter_dering_direction_8x8 = od_filter_dering_direction_8x8_sse4_1;
+    if (flags & HAS_AVX2) copy_rect8_8bit_to_16bit = copy_rect8_8bit_to_16bit_avx2;
 }
 #endif

--- a/media/libaom/config/win/mingw32/aom_config.asm
+++ b/media/libaom/config/win/mingw32/aom_config.asm
@ -46,8 +46,6 @@ CONFIG_AV1 equ 1
 CONFIG_STATIC_MSVCRT equ 0
 CONFIG_SPATIAL_RESAMPLING equ 1
 CONFIG_REALTIME_ONLY equ 0
-CONFIG_ONTHEFLY_BITPACKING equ 0
-CONFIG_ERROR_CONCEALMENT equ 0
 CONFIG_SHARED equ 0
 CONFIG_STATIC equ 1
 CONFIG_SMALL equ 0
@ -60,73 +58,71 @@ CONFIG_ACCOUNTING equ 0
 CONFIG_INSPECTION equ 0
 CONFIG_DECODE_PERF_TESTS equ 0
 CONFIG_ENCODE_PERF_TESTS equ 0
+CONFIG_BITSTREAM_DEBUG equ 0
+CONFIG_SYMBOLRATE equ 0
 CONFIG_COEFFICIENT_RANGE_CHECKING equ 0
 CONFIG_LOWBITDEPTH equ 1
 CONFIG_HIGHBITDEPTH equ 1
 CONFIG_EXPERIMENTAL equ 0
 CONFIG_SIZE_LIMIT equ 1
-CONFIG_COLORSPACE_HEADERS equ 0
 CONFIG_FP_MB_STATS equ 0
 CONFIG_CDEF equ 1
+CONFIG_CDEF_SINGLEPASS equ 1
 CONFIG_VAR_TX equ 1
 CONFIG_RECT_TX equ 1
 CONFIG_RECT_TX_EXT equ 0
 CONFIG_TPL_MV equ 0
 CONFIG_DUAL_FILTER equ 1
-CONFIG_CONVOLVE_ROUND equ 0
+CONFIG_CONVOLVE_ROUND equ 1
 CONFIG_COMPOUND_ROUND equ 0
 CONFIG_EXT_TX equ 1
-CONFIG_DPCM_INTRA equ 0
 CONFIG_TX64X64 equ 0
 CONFIG_EXT_INTRA equ 1
 CONFIG_INTRA_INTERP equ 0
 CONFIG_FILTER_INTRA equ 0
-CONFIG_INTRA_EDGE equ 0
+CONFIG_INTRA_EDGE equ 1
 CONFIG_INTRABC equ 0
-CONFIG_EXT_INTER equ 1
 CONFIG_INTERINTRA equ 1
 CONFIG_WEDGE equ 1
 CONFIG_COMPOUND_SEGMENT equ 1
 CONFIG_EXT_REFS equ 1
-CONFIG_ALTREF2 equ 0
-CONFIG_SPEED_REFS equ 0
-CONFIG_GF_GROUPS equ 0
-CONFIG_FLEX_REFS equ 0
 CONFIG_GLOBAL_MOTION equ 1
 CONFIG_NEW_QUANT equ 0
 CONFIG_SUPERTX equ 0
 CONFIG_ANS equ 0
-CONFIG_LOOP_RESTORATION equ 0
+CONFIG_LOOP_RESTORATION equ 1
+CONFIG_STRIPED_LOOP_RESTORATION equ 0
 CONFIG_EXT_PARTITION equ 0
 CONFIG_EXT_PARTITION_TYPES equ 0
+CONFIG_EXT_PARTITION_TYPES_AB equ 0
 CONFIG_UNPOISON_PARTITION_CTX equ 0
 CONFIG_EXT_TILE equ 0
 CONFIG_MOTION_VAR equ 1
 CONFIG_NCOBMC equ 0
 CONFIG_WARPED_MOTION equ 1
 CONFIG_Q_ADAPT_PROBS equ 0
-CONFIG_BITSTREAM_DEBUG equ 0
 CONFIG_INTER_STATS_ONLY equ 0
-CONFIG_ALT_INTRA equ 1
-CONFIG_PALETTE equ 1
 CONFIG_PALETTE_DELTA_ENCODING equ 0
 CONFIG_RAWBITS equ 0
-CONFIG_EC_SMALLMUL equ 1
+CONFIG_KF_CTX equ 0
 CONFIG_PVQ equ 0
 CONFIG_CFL equ 0
 CONFIG_XIPHRC equ 0
 CONFIG_DCT_ONLY equ 0
+CONFIG_DAALA_TX equ 0
 CONFIG_DAALA_DCT4 equ 0
 CONFIG_DAALA_DCT8 equ 0
+CONFIG_DAALA_DCT16 equ 0
+CONFIG_DAALA_DCT32 equ 0
+CONFIG_DAALA_DCT64 equ 0
 CONFIG_CB4X4 equ 1
 CONFIG_CHROMA_2X2 equ 0
 CONFIG_CHROMA_SUB8X8 equ 1
 CONFIG_FRAME_SIZE equ 0
-CONFIG_DELTA_Q equ 1
 CONFIG_EXT_DELTA_Q equ 1
 CONFIG_ADAPT_SCAN equ 0
-CONFIG_FILTER_7BIT equ 1
 CONFIG_PARALLEL_DEBLOCKING equ 1
+CONFIG_DEBLOCK_13TAP equ 0
 CONFIG_LOOPFILTERING_ACROSS_TILES equ 1
 CONFIG_TEMPMV_SIGNALING equ 1
 CONFIG_RD_DEBUG equ 0
@ -135,30 +131,46 @@ CONFIG_COEF_INTERLEAVE equ 0
 CONFIG_ENTROPY_STATS equ 0
 CONFIG_MASKED_TX equ 0
 CONFIG_DEPENDENT_HORZTILES equ 0
-CONFIG_DIST_8X8 equ 0
-CONFIG_DAALA_DIST equ 0
-CONFIG_TRIPRED equ 0
+CONFIG_DIST_8X8 equ 1
 CONFIG_PALETTE_THROUGHPUT equ 1
 CONFIG_REF_ADAPT equ 0
 CONFIG_LV_MAP equ 0
+CONFIG_CTX1D equ 0
 CONFIG_TXK_SEL equ 0
 CONFIG_MV_COMPRESS equ 1
+CONFIG_SEGMENT_ZEROMV equ 0
 CONFIG_FRAME_SUPERRES equ 0
 CONFIG_NEW_MULTISYMBOL equ 0
 CONFIG_COMPOUND_SINGLEREF equ 0
-CONFIG_AOM_QM equ 0
+CONFIG_AOM_QM equ 1
 CONFIG_ONE_SIDED_COMPOUND equ 1
-CONFIG_EXT_COMP_REFS equ 0
+CONFIG_EXT_COMP_REFS equ 1
 CONFIG_SMOOTH_HV equ 1
 CONFIG_VAR_REFS equ 0
-CONFIG_RECT_INTRA_PRED equ 1
 CONFIG_LGT equ 0
+CONFIG_LGT_FROM_PRED equ 0
 CONFIG_SBL_SYMBOL equ 0
 CONFIG_NCOBMC_ADAPT_WEIGHT equ 0
 CONFIG_BGSPRITE equ 0
 CONFIG_VAR_TX_NO_TX_MODE equ 0
 CONFIG_MRC_TX equ 0
 CONFIG_LPF_DIRECT equ 0
-CONFIG_UV_LVL equ 0
+CONFIG_LOOPFILTER_LEVEL equ 0
+CONFIG_NO_FRAME_CONTEXT_SIGNALING equ 0
+CONFIG_TXMG equ 1
+CONFIG_MAX_TILE equ 0
+CONFIG_HASH_ME equ 0
+CONFIG_COLORSPACE_HEADERS equ 0
+CONFIG_MFMV equ 0
+CONFIG_FRAME_MARKER equ 0
+CONFIG_JNT_COMP equ 0
+CONFIG_FRAME_SIGN_BIAS equ 0
+CONFIG_EXT_SKIP equ 0
+CONFIG_OBU equ 0
+CONFIG_AMVR equ 0
+CONFIG_LPF_SB equ 0
+CONFIG_OPT_REF_MV equ 0
+CONFIG_TMV equ 0
+CONFIG_RESTRICT_COMPRESSED_HDR equ 0
+CONFIG_HORZONLY_FRAME_SUPERRES equ 0
 CONFIG_ANALYZER equ 0
-
--- a/media/libaom/config/win/mingw32/aom_config.h
+++ b/media/libaom/config/win/mingw32/aom_config.h
@ -59,8 +59,6 @@
 #define CONFIG_STATIC_MSVCRT 0
 #define CONFIG_SPATIAL_RESAMPLING 1
 #define CONFIG_REALTIME_ONLY 0
-#define CONFIG_ONTHEFLY_BITPACKING 0
-#define CONFIG_ERROR_CONCEALMENT 0
 #define CONFIG_SHARED 0
 #define CONFIG_STATIC 1
 #define CONFIG_SMALL 0
@ -73,73 +71,71 @@
 #define CONFIG_INSPECTION 0
 #define CONFIG_DECODE_PERF_TESTS 0
 #define CONFIG_ENCODE_PERF_TESTS 0
+#define CONFIG_BITSTREAM_DEBUG 0
+#define CONFIG_SYMBOLRATE 0
 #define CONFIG_COEFFICIENT_RANGE_CHECKING 0
 #define CONFIG_LOWBITDEPTH 1
 #define CONFIG_HIGHBITDEPTH 1
 #define CONFIG_EXPERIMENTAL 0
 #define CONFIG_SIZE_LIMIT 1
-#define CONFIG_COLORSPACE_HEADERS 0
 #define CONFIG_FP_MB_STATS 0
 #define CONFIG_CDEF 1
+#define CONFIG_CDEF_SINGLEPASS 1
 #define CONFIG_VAR_TX 1
 #define CONFIG_RECT_TX 1
 #define CONFIG_RECT_TX_EXT 0
 #define CONFIG_TPL_MV 0
 #define CONFIG_DUAL_FILTER 1
-#define CONFIG_CONVOLVE_ROUND 0
+#define CONFIG_CONVOLVE_ROUND 1
 #define CONFIG_COMPOUND_ROUND 0
 #define CONFIG_EXT_TX 1
-#define CONFIG_DPCM_INTRA 0
 #define CONFIG_TX64X64 0
 #define CONFIG_EXT_INTRA 1
 #define CONFIG_INTRA_INTERP 0
 #define CONFIG_FILTER_INTRA 0
-#define CONFIG_INTRA_EDGE 0
+#define CONFIG_INTRA_EDGE 1
 #define CONFIG_INTRABC 0
-#define CONFIG_EXT_INTER 1
 #define CONFIG_INTERINTRA 1
 #define CONFIG_WEDGE 1
 #define CONFIG_COMPOUND_SEGMENT 1
 #define CONFIG_EXT_REFS 1
-#define CONFIG_ALTREF2 0
-#define CONFIG_SPEED_REFS 0
-#define CONFIG_GF_GROUPS 0
-#define CONFIG_FLEX_REFS 0
 #define CONFIG_GLOBAL_MOTION 1
 #define CONFIG_NEW_QUANT 0
 #define CONFIG_SUPERTX 0
 #define CONFIG_ANS 0
-#define CONFIG_LOOP_RESTORATION 0
+#define CONFIG_LOOP_RESTORATION 1
+#define CONFIG_STRIPED_LOOP_RESTORATION 0
 #define CONFIG_EXT_PARTITION 0
 #define CONFIG_EXT_PARTITION_TYPES 0
+#define CONFIG_EXT_PARTITION_TYPES_AB 0
 #define CONFIG_UNPOISON_PARTITION_CTX 0
 #define CONFIG_EXT_TILE 0
 #define CONFIG_MOTION_VAR 1
 #define CONFIG_NCOBMC 0
 #define CONFIG_WARPED_MOTION 1
 #define CONFIG_Q_ADAPT_PROBS 0
-#define CONFIG_BITSTREAM_DEBUG 0
 #define CONFIG_INTER_STATS_ONLY 0
-#define CONFIG_ALT_INTRA 1
-#define CONFIG_PALETTE 1
 #define CONFIG_PALETTE_DELTA_ENCODING 0
 #define CONFIG_RAWBITS 0
-#define CONFIG_EC_SMALLMUL 1
+#define CONFIG_KF_CTX 0
 #define CONFIG_PVQ 0
 #define CONFIG_CFL 0
 #define CONFIG_XIPHRC 0
 #define CONFIG_DCT_ONLY 0
+#define CONFIG_DAALA_TX 0
 #define CONFIG_DAALA_DCT4 0
 #define CONFIG_DAALA_DCT8 0
+#define CONFIG_DAALA_DCT16 0
+#define CONFIG_DAALA_DCT32 0
+#define CONFIG_DAALA_DCT64 0
 #define CONFIG_CB4X4 1
 #define CONFIG_CHROMA_2X2 0
 #define CONFIG_CHROMA_SUB8X8 1
 #define CONFIG_FRAME_SIZE 0
-#define CONFIG_DELTA_Q 1
 #define CONFIG_EXT_DELTA_Q 1
 #define CONFIG_ADAPT_SCAN 0
-#define CONFIG_FILTER_7BIT 1
 #define CONFIG_PARALLEL_DEBLOCKING 1
+#define CONFIG_DEBLOCK_13TAP 0
 #define CONFIG_LOOPFILTERING_ACROSS_TILES 1
 #define CONFIG_TEMPMV_SIGNALING 1
 #define CONFIG_RD_DEBUG 0
@ -148,33 +144,49 @@
 #define CONFIG_ENTROPY_STATS 0
 #define CONFIG_MASKED_TX 0
 #define CONFIG_DEPENDENT_HORZTILES 0
-#define CONFIG_DIST_8X8 0
-#define CONFIG_DAALA_DIST 0
-#define CONFIG_TRIPRED 0
+#define CONFIG_DIST_8X8 1
 #define CONFIG_PALETTE_THROUGHPUT 1
 #define CONFIG_REF_ADAPT 0
 #define CONFIG_LV_MAP 0
+#define CONFIG_CTX1D 0
 #define CONFIG_TXK_SEL 0
 #define CONFIG_MV_COMPRESS 1
+#define CONFIG_SEGMENT_ZEROMV 0
 #define CONFIG_FRAME_SUPERRES 0
 #define CONFIG_NEW_MULTISYMBOL 0
 #define CONFIG_COMPOUND_SINGLEREF 0
-#define CONFIG_AOM_QM 0
+#define CONFIG_AOM_QM 1
 #define CONFIG_ONE_SIDED_COMPOUND 1
-#define CONFIG_EXT_COMP_REFS 0
+#define CONFIG_EXT_COMP_REFS 1
 #define CONFIG_SMOOTH_HV 1
 #define CONFIG_VAR_REFS 0
-#define CONFIG_RECT_INTRA_PRED 1
 #define CONFIG_LGT 0
+#define CONFIG_LGT_FROM_PRED 0
 #define CONFIG_SBL_SYMBOL 0
 #define CONFIG_NCOBMC_ADAPT_WEIGHT 0
 #define CONFIG_BGSPRITE 0
 #define CONFIG_VAR_TX_NO_TX_MODE 0
 #define CONFIG_MRC_TX 0
 #define CONFIG_LPF_DIRECT 0
-#define CONFIG_UV_LVL 0
+#define CONFIG_LOOPFILTER_LEVEL 0
+#define CONFIG_NO_FRAME_CONTEXT_SIGNALING 0
+#define CONFIG_TXMG 1
+#define CONFIG_MAX_TILE 0
+#define CONFIG_HASH_ME 0
+#define CONFIG_COLORSPACE_HEADERS 0
+#define CONFIG_MFMV 0
+#define CONFIG_FRAME_MARKER 0
+#define CONFIG_JNT_COMP 0
+#define CONFIG_FRAME_SIGN_BIAS 0
+#define CONFIG_EXT_SKIP 0
+#define CONFIG_OBU 0
+#define CONFIG_AMVR 0
+#define CONFIG_LPF_SB 0
+#define CONFIG_OPT_REF_MV 0
+#define CONFIG_TMV 0
+#define CONFIG_RESTRICT_COMPRESSED_HDR 0
+#define CONFIG_HORZONLY_FRAME_SUPERRES 0
 #define CONFIG_ANALYZER 0
 #define DECODE_WIDTH_LIMIT 8192
 #define DECODE_HEIGHT_LIMIT 4608
 #endif /* AOM_CONFIG_H */
-
--- a/media/libaom/config/win/mingw32/aom_dsp_rtcd.h
+++ b/media/libaom/config/win/mingw32/aom_dsp_rtcd.h
--- a/media/libaom/config/win/mingw32/av1_rtcd.h
+++ b/media/libaom/config/win/mingw32/av1_rtcd.h
@ -31,44 +31,39 @@ struct search_site_config;
 struct mv;
 union int_mv;
 struct yv12_buffer_config;
-typedef uint16_t od_dering_in;

 #ifdef __cplusplus
 extern "C" {
 #endif

-void aom_clpf_block_c(uint8_t *dst, const uint16_t *src, int dstride, int sstride, int sizex, int sizey, unsigned int strength, unsigned int bd);
-void aom_clpf_block_sse2(uint8_t *dst, const uint16_t *src, int dstride, int sstride, int sizex, int sizey, unsigned int strength, unsigned int bd);
-void aom_clpf_block_ssse3(uint8_t *dst, const uint16_t *src, int dstride, int sstride, int sizex, int sizey, unsigned int strength, unsigned int bd);
-void aom_clpf_block_sse4_1(uint8_t *dst, const uint16_t *src, int dstride, int sstride, int sizex, int sizey, unsigned int strength, unsigned int bd);
-RTCD_EXTERN void (*aom_clpf_block)(uint8_t *dst, const uint16_t *src, int dstride, int sstride, int sizex, int sizey, unsigned int strength, unsigned int bd);
+void apply_selfguided_restoration_c(uint8_t *dat, int width, int height, int stride, int eps, int *xqd, uint8_t *dst, int dst_stride, int32_t *tmpbuf);
+void apply_selfguided_restoration_sse4_1(uint8_t *dat, int width, int height, int stride, int eps, int *xqd, uint8_t *dst, int dst_stride, int32_t *tmpbuf);
+RTCD_EXTERN void (*apply_selfguided_restoration)(uint8_t *dat, int width, int height, int stride, int eps, int *xqd, uint8_t *dst, int dst_stride, int32_t *tmpbuf);

-void aom_clpf_block_hbd_c(uint16_t *dst, const uint16_t *src, int dstride, int sstride, int sizex, int sizey, unsigned int strength, unsigned int bd);
-void aom_clpf_block_hbd_sse2(uint16_t *dst, const uint16_t *src, int dstride, int sstride, int sizex, int sizey, unsigned int strength, unsigned int bd);
-void aom_clpf_block_hbd_ssse3(uint16_t *dst, const uint16_t *src, int dstride, int sstride, int sizex, int sizey, unsigned int strength, unsigned int bd);
-void aom_clpf_block_hbd_sse4_1(uint16_t *dst, const uint16_t *src, int dstride, int sstride, int sizex, int sizey, unsigned int strength, unsigned int bd);
-RTCD_EXTERN void (*aom_clpf_block_hbd)(uint16_t *dst, const uint16_t *src, int dstride, int sstride, int sizex, int sizey, unsigned int strength, unsigned int bd);
-
-void aom_clpf_hblock_c(uint8_t *dst, const uint16_t *src, int dstride, int sstride, int sizex, int sizey, unsigned int strength, unsigned int bd);
-void aom_clpf_hblock_sse2(uint8_t *dst, const uint16_t *src, int dstride, int sstride, int sizex, int sizey, unsigned int strength, unsigned int bd);
-void aom_clpf_hblock_ssse3(uint8_t *dst, const uint16_t *src, int dstride, int sstride, int sizex, int sizey, unsigned int strength, unsigned int bd);
-void aom_clpf_hblock_sse4_1(uint8_t *dst, const uint16_t *src, int dstride, int sstride, int sizex, int sizey, unsigned int strength, unsigned int bd);
-RTCD_EXTERN void (*aom_clpf_hblock)(uint8_t *dst, const uint16_t *src, int dstride, int sstride, int sizex, int sizey, unsigned int strength, unsigned int bd);
-
-void aom_clpf_hblock_hbd_c(uint16_t *dst, const uint16_t *src, int dstride, int sstride, int sizex, int sizey, unsigned int strength, unsigned int bd);
-void aom_clpf_hblock_hbd_sse2(uint16_t *dst, const uint16_t *src, int dstride, int sstride, int sizex, int sizey, unsigned int strength, unsigned int bd);
-void aom_clpf_hblock_hbd_ssse3(uint16_t *dst, const uint16_t *src, int dstride, int sstride, int sizex, int sizey, unsigned int strength, unsigned int bd);
-void aom_clpf_hblock_hbd_sse4_1(uint16_t *dst, const uint16_t *src, int dstride, int sstride, int sizex, int sizey, unsigned int strength, unsigned int bd);
-RTCD_EXTERN void (*aom_clpf_hblock_hbd)(uint16_t *dst, const uint16_t *src, int dstride, int sstride, int sizex, int sizey, unsigned int strength, unsigned int bd);
+void apply_selfguided_restoration_highbd_c(uint16_t *dat, int width, int height, int stride, int bit_depth, int eps, int *xqd, uint16_t *dst, int dst_stride, int32_t *tmpbuf);
+void apply_selfguided_restoration_highbd_sse4_1(uint16_t *dat, int width, int height, int stride, int bit_depth, int eps, int *xqd, uint16_t *dst, int dst_stride, int32_t *tmpbuf);
+RTCD_EXTERN void (*apply_selfguided_restoration_highbd)(uint16_t *dat, int width, int height, int stride, int bit_depth, int eps, int *xqd, uint16_t *dst, int dst_stride, int32_t *tmpbuf);

 int64_t av1_block_error_c(const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz);
 int64_t av1_block_error_avx2(const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz);
 RTCD_EXTERN int64_t (*av1_block_error)(const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz);

+void av1_convolve_2d_c(const uint8_t *src, int src_stride, CONV_BUF_TYPE *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+void av1_convolve_2d_sse2(const uint8_t *src, int src_stride, CONV_BUF_TYPE *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+RTCD_EXTERN void (*av1_convolve_2d)(const uint8_t *src, int src_stride, CONV_BUF_TYPE *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+
+void av1_convolve_2d_scale_c(const uint8_t *src, int src_stride, CONV_BUF_TYPE *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_qn, const int x_step_qn, const int subpel_y_q4, const int y_step_qn, ConvolveParams *conv_params);
+void av1_convolve_2d_scale_sse4_1(const uint8_t *src, int src_stride, CONV_BUF_TYPE *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_qn, const int x_step_qn, const int subpel_y_q4, const int y_step_qn, ConvolveParams *conv_params);
+RTCD_EXTERN void (*av1_convolve_2d_scale)(const uint8_t *src, int src_stride, CONV_BUF_TYPE *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_qn, const int x_step_qn, const int subpel_y_q4, const int y_step_qn, ConvolveParams *conv_params);
+
 void av1_convolve_horiz_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams fp, const int subpel_x_q4, int x_step_q4, ConvolveParams *conv_params);
 void av1_convolve_horiz_ssse3(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams fp, const int subpel_x_q4, int x_step_q4, ConvolveParams *conv_params);
 RTCD_EXTERN void (*av1_convolve_horiz)(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams fp, const int subpel_x_q4, int x_step_q4, ConvolveParams *conv_params);

+void av1_convolve_rounding_c(const int32_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, int bits);
+void av1_convolve_rounding_avx2(const int32_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, int bits);
+RTCD_EXTERN void (*av1_convolve_rounding)(const int32_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, int bits);
+
 void av1_convolve_vert_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams fp, const int subpel_x_q4, int x_step_q4, ConvolveParams *conv_params);
 void av1_convolve_vert_ssse3(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams fp, const int subpel_x_q4, int x_step_q4, ConvolveParams *conv_params);
 RTCD_EXTERN void (*av1_convolve_vert)(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams fp, const int subpel_x_q4, int x_step_q4, ConvolveParams *conv_params);
@ -76,9 +71,6 @@ RTCD_EXTERN void (*av1_convolve_vert)(const uint8_t *src, int src_stride, uint8_
 int av1_diamond_search_sad_c(struct macroblock *x, const struct search_site_config *cfg,  struct mv *ref_mv, struct mv *best_mv, int search_param, int sad_per_bit, int *num00, const struct aom_variance_vtable *fn_ptr, const struct mv *center_mv);
 #define av1_diamond_search_sad av1_diamond_search_sad_c

-void av1_fdct8x8_quant_c(const int16_t *input, int stride, tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
-#define av1_fdct8x8_quant av1_fdct8x8_quant_c
-
 void av1_fht16x16_c(const int16_t *input, tran_low_t *output, int stride, struct txfm_param *param);
 void av1_fht16x16_sse2(const int16_t *input, tran_low_t *output, int stride, struct txfm_param *param);
 void av1_fht16x16_avx2(const int16_t *input, tran_low_t *output, int stride, struct txfm_param *param);
@ -133,6 +125,14 @@ void av1_fht8x8_c(const int16_t *input, tran_low_t *output, int stride, struct t
 void av1_fht8x8_sse2(const int16_t *input, tran_low_t *output, int stride, struct txfm_param *param);
 RTCD_EXTERN void (*av1_fht8x8)(const int16_t *input, tran_low_t *output, int stride, struct txfm_param *param);

+void av1_filter_intra_edge_c(uint8_t *p, int sz, int strength);
+void av1_filter_intra_edge_sse4_1(uint8_t *p, int sz, int strength);
+RTCD_EXTERN void (*av1_filter_intra_edge)(uint8_t *p, int sz, int strength);
+
+void av1_filter_intra_edge_high_c(uint16_t *p, int sz, int strength);
+void av1_filter_intra_edge_high_sse4_1(uint16_t *p, int sz, int strength);
+RTCD_EXTERN void (*av1_filter_intra_edge_high)(uint16_t *p, int sz, int strength);
+
 int av1_full_range_search_c(const struct macroblock *x, const struct search_site_config *cfg, struct mv *ref_mv, struct mv *best_mv, int search_param, int sad_per_bit, int *num00, const struct aom_variance_vtable *fn_ptr, const struct mv *center_mv);
 #define av1_full_range_search av1_full_range_search_c

@ -141,46 +141,42 @@ int av1_full_search_sadx3(const struct macroblock *x, const struct mv *ref_mv, i
 int av1_full_search_sadx8(const struct macroblock *x, const struct mv *ref_mv, int sad_per_bit, int distance, const struct aom_variance_vtable *fn_ptr, const struct mv *center_mv, struct mv *best_mv);
 RTCD_EXTERN int (*av1_full_search_sad)(const struct macroblock *x, const struct mv *ref_mv, int sad_per_bit, int distance, const struct aom_variance_vtable *fn_ptr, const struct mv *center_mv, struct mv *best_mv);

-void av1_fwd_idtx_c(const int16_t *src_diff, tran_low_t *coeff, int stride, int bs, int tx_type);
+void av1_fwd_idtx_c(const int16_t *src_diff, tran_low_t *coeff, int stride, int bsx, int bsy, TX_TYPE tx_type);
 #define av1_fwd_idtx av1_fwd_idtx_c

-void av1_fwd_txfm2d_16x16_c(const int16_t *input, int32_t *output, int stride, int tx_type, int bd);
-void av1_fwd_txfm2d_16x16_sse4_1(const int16_t *input, int32_t *output, int stride, int tx_type, int bd);
-RTCD_EXTERN void (*av1_fwd_txfm2d_16x16)(const int16_t *input, int32_t *output, int stride, int tx_type, int bd);
+void av1_fwd_txfm2d_16x16_c(const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd);
+void av1_fwd_txfm2d_16x16_sse4_1(const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd);
+RTCD_EXTERN void (*av1_fwd_txfm2d_16x16)(const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd);

-void av1_fwd_txfm2d_16x32_c(const int16_t *input, int32_t *output, int stride, int tx_type, int bd);
+void av1_fwd_txfm2d_16x32_c(const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd);
 #define av1_fwd_txfm2d_16x32 av1_fwd_txfm2d_16x32_c

-void av1_fwd_txfm2d_16x8_c(const int16_t *input, int32_t *output, int stride, int tx_type, int bd);
+void av1_fwd_txfm2d_16x8_c(const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd);
 #define av1_fwd_txfm2d_16x8 av1_fwd_txfm2d_16x8_c

-void av1_fwd_txfm2d_32x16_c(const int16_t *input, int32_t *output, int stride, int tx_type, int bd);
+void av1_fwd_txfm2d_32x16_c(const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd);
 #define av1_fwd_txfm2d_32x16 av1_fwd_txfm2d_32x16_c

-void av1_fwd_txfm2d_32x32_c(const int16_t *input, int32_t *output, int stride, int tx_type, int bd);
-void av1_fwd_txfm2d_32x32_sse4_1(const int16_t *input, int32_t *output, int stride, int tx_type, int bd);
-RTCD_EXTERN void (*av1_fwd_txfm2d_32x32)(const int16_t *input, int32_t *output, int stride, int tx_type, int bd);
+void av1_fwd_txfm2d_32x32_c(const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd);
+void av1_fwd_txfm2d_32x32_sse4_1(const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd);
+RTCD_EXTERN void (*av1_fwd_txfm2d_32x32)(const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd);

-void av1_fwd_txfm2d_4x4_c(const int16_t *input, int32_t *output, int stride, int tx_type, int bd);
-void av1_fwd_txfm2d_4x4_sse4_1(const int16_t *input, int32_t *output, int stride, int tx_type, int bd);
-RTCD_EXTERN void (*av1_fwd_txfm2d_4x4)(const int16_t *input, int32_t *output, int stride, int tx_type, int bd);
+void av1_fwd_txfm2d_4x4_c(const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd);
+void av1_fwd_txfm2d_4x4_sse4_1(const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd);
+RTCD_EXTERN void (*av1_fwd_txfm2d_4x4)(const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd);

-void av1_fwd_txfm2d_4x8_c(const int16_t *input, int32_t *output, int stride, int tx_type, int bd);
+void av1_fwd_txfm2d_4x8_c(const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd);
 #define av1_fwd_txfm2d_4x8 av1_fwd_txfm2d_4x8_c

-void av1_fwd_txfm2d_64x64_c(const int16_t *input, int32_t *output, int stride, int tx_type, int bd);
-void av1_fwd_txfm2d_64x64_sse4_1(const int16_t *input, int32_t *output, int stride, int tx_type, int bd);
-RTCD_EXTERN void (*av1_fwd_txfm2d_64x64)(const int16_t *input, int32_t *output, int stride, int tx_type, int bd);
-
-void av1_fwd_txfm2d_8x16_c(const int16_t *input, int32_t *output, int stride, int tx_type, int bd);
+void av1_fwd_txfm2d_8x16_c(const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd);
 #define av1_fwd_txfm2d_8x16 av1_fwd_txfm2d_8x16_c

-void av1_fwd_txfm2d_8x4_c(const int16_t *input, int32_t *output, int stride, int tx_type, int bd);
+void av1_fwd_txfm2d_8x4_c(const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd);
 #define av1_fwd_txfm2d_8x4 av1_fwd_txfm2d_8x4_c

-void av1_fwd_txfm2d_8x8_c(const int16_t *input, int32_t *output, int stride, int tx_type, int bd);
-void av1_fwd_txfm2d_8x8_sse4_1(const int16_t *input, int32_t *output, int stride, int tx_type, int bd);
-RTCD_EXTERN void (*av1_fwd_txfm2d_8x8)(const int16_t *input, int32_t *output, int stride, int tx_type, int bd);
+void av1_fwd_txfm2d_8x8_c(const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd);
+void av1_fwd_txfm2d_8x8_sse4_1(const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd);
+RTCD_EXTERN void (*av1_fwd_txfm2d_8x8)(const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd);

 void av1_fwht4x4_c(const int16_t *input, tran_low_t *output, int stride);
 #define av1_fwht4x4 av1_fwht4x4_c
@ -207,6 +203,14 @@ void av1_highbd_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint
 void av1_highbd_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
 #define av1_highbd_convolve8_vert av1_highbd_convolve8_vert_c

+void av1_highbd_convolve_2d_c(const uint16_t *src, int src_stride, CONV_BUF_TYPE *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+void av1_highbd_convolve_2d_ssse3(const uint16_t *src, int src_stride, CONV_BUF_TYPE *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+RTCD_EXTERN void (*av1_highbd_convolve_2d)(const uint16_t *src, int src_stride, CONV_BUF_TYPE *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+
+void av1_highbd_convolve_2d_scale_c(const uint16_t *src, int src_stride, CONV_BUF_TYPE *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int x_step_qn, const int subpel_y_q4, const int y_step_qn, ConvolveParams *conv_params, int bd);
+void av1_highbd_convolve_2d_scale_sse4_1(const uint16_t *src, int src_stride, CONV_BUF_TYPE *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int x_step_qn, const int subpel_y_q4, const int y_step_qn, ConvolveParams *conv_params, int bd);
+RTCD_EXTERN void (*av1_highbd_convolve_2d_scale)(const uint16_t *src, int src_stride, CONV_BUF_TYPE *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int x_step_qn, const int subpel_y_q4, const int y_step_qn, ConvolveParams *conv_params, int bd);
+
 void av1_highbd_convolve_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
 #define av1_highbd_convolve_avg av1_highbd_convolve_avg_c

@ -221,6 +225,10 @@ void av1_highbd_convolve_init_c(void);
 void av1_highbd_convolve_init_sse4_1(void);
 RTCD_EXTERN void (*av1_highbd_convolve_init)(void);

+void av1_highbd_convolve_rounding_c(const int32_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, int bits, int bd);
+void av1_highbd_convolve_rounding_avx2(const int32_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, int bits, int bd);
+RTCD_EXTERN void (*av1_highbd_convolve_rounding)(const int32_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, int bits, int bd);
+
 void av1_highbd_convolve_vert_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams fp, const int subpel_x_q4, int x_step_q4, int avg, int bd);
 void av1_highbd_convolve_vert_sse4_1(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams fp, const int subpel_x_q4, int x_step_q4, int avg, int bd);
 RTCD_EXTERN void (*av1_highbd_convolve_vert)(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams fp, const int subpel_x_q4, int x_step_q4, int avg, int bd);
@ -267,9 +275,6 @@ void av1_highbd_iht8x4_32_add_c(const tran_low_t *input, uint8_t *dest, int dest
 void av1_highbd_iht8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, const struct txfm_param *param);
 #define av1_highbd_iht8x8_64_add av1_highbd_iht8x8_64_add_c

-void av1_highbd_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan, int log_scale);
-#define av1_highbd_quantize_b av1_highbd_quantize_b_c
-
 void av1_highbd_quantize_fp_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan, int log_scale);
 void av1_highbd_quantize_fp_sse4_1(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan, int log_scale);
 void av1_highbd_quantize_fp_avx2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan, int log_scale);
@ -282,6 +287,14 @@ void av1_highbd_warp_affine_c(const int32_t *mat, const uint16_t *ref, int width
 void av1_highbd_warp_affine_ssse3(const int32_t *mat, const uint16_t *ref, int width, int height, int stride, uint16_t *pred, int p_col, int p_row, int p_width, int p_height, int p_stride, int subsampling_x, int subsampling_y, int bd, ConvolveParams *conv_params, int16_t alpha, int16_t beta, int16_t gamma, int16_t delta);
 RTCD_EXTERN void (*av1_highbd_warp_affine)(const int32_t *mat, const uint16_t *ref, int width, int height, int stride, uint16_t *pred, int p_col, int p_row, int p_width, int p_height, int p_stride, int subsampling_x, int subsampling_y, int bd, ConvolveParams *conv_params, int16_t alpha, int16_t beta, int16_t gamma, int16_t delta);

+void av1_highpass_filter_c(uint8_t *dgd, int width, int height, int stride, int32_t *dst, int dst_stride, int r, int eps);
+void av1_highpass_filter_sse4_1(uint8_t *dgd, int width, int height, int stride, int32_t *dst, int dst_stride, int r, int eps);
+RTCD_EXTERN void (*av1_highpass_filter)(uint8_t *dgd, int width, int height, int stride, int32_t *dst, int dst_stride, int r, int eps);
+
+void av1_highpass_filter_highbd_c(uint16_t *dgd, int width, int height, int stride, int32_t *dst, int dst_stride, int r, int eps);
+void av1_highpass_filter_highbd_sse4_1(uint16_t *dgd, int width, int height, int stride, int32_t *dst, int dst_stride, int r, int eps);
+RTCD_EXTERN void (*av1_highpass_filter_highbd)(uint16_t *dgd, int width, int height, int stride, int32_t *dst, int dst_stride, int r, int eps);
+
 void av1_iht16x16_256_add_c(const tran_low_t *input, uint8_t *output, int pitch, const struct txfm_param *param);
 void av1_iht16x16_256_add_sse2(const tran_low_t *input, uint8_t *output, int pitch, const struct txfm_param *param);
 void av1_iht16x16_256_add_avx2(const tran_low_t *input, uint8_t *output, int pitch, const struct txfm_param *param);
@ -334,48 +347,45 @@ void av1_iht8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride
 void av1_iht8x8_64_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride, const struct txfm_param *param);
 RTCD_EXTERN void (*av1_iht8x8_64_add)(const tran_low_t *input, uint8_t *dest, int dest_stride, const struct txfm_param *param);

-void av1_inv_txfm2d_add_16x16_c(const int32_t *input, uint16_t *output, int stride, int tx_type, int bd);
-void av1_inv_txfm2d_add_16x16_sse4_1(const int32_t *input, uint16_t *output, int stride, int tx_type, int bd);
-RTCD_EXTERN void (*av1_inv_txfm2d_add_16x16)(const int32_t *input, uint16_t *output, int stride, int tx_type, int bd);
+void av1_inv_txfm2d_add_16x16_c(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd);
+void av1_inv_txfm2d_add_16x16_sse4_1(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd);
+RTCD_EXTERN void (*av1_inv_txfm2d_add_16x16)(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd);

-void av1_inv_txfm2d_add_16x32_c(const int32_t *input, uint16_t *output, int stride, int tx_type, int bd);
+void av1_inv_txfm2d_add_16x32_c(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd);
 #define av1_inv_txfm2d_add_16x32 av1_inv_txfm2d_add_16x32_c

-void av1_inv_txfm2d_add_16x8_c(const int32_t *input, uint16_t *output, int stride, int tx_type, int bd);
+void av1_inv_txfm2d_add_16x8_c(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd);
 #define av1_inv_txfm2d_add_16x8 av1_inv_txfm2d_add_16x8_c

-void av1_inv_txfm2d_add_32x16_c(const int32_t *input, uint16_t *output, int stride, int tx_type, int bd);
+void av1_inv_txfm2d_add_32x16_c(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd);
 #define av1_inv_txfm2d_add_32x16 av1_inv_txfm2d_add_32x16_c

-void av1_inv_txfm2d_add_32x32_c(const int32_t *input, uint16_t *output, int stride, int tx_type, int bd);
-void av1_inv_txfm2d_add_32x32_avx2(const int32_t *input, uint16_t *output, int stride, int tx_type, int bd);
-RTCD_EXTERN void (*av1_inv_txfm2d_add_32x32)(const int32_t *input, uint16_t *output, int stride, int tx_type, int bd);
+void av1_inv_txfm2d_add_32x32_c(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd);
+void av1_inv_txfm2d_add_32x32_avx2(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd);
+RTCD_EXTERN void (*av1_inv_txfm2d_add_32x32)(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd);

-void av1_inv_txfm2d_add_4x4_c(const int32_t *input, uint16_t *output, int stride, int tx_type, int bd);
-void av1_inv_txfm2d_add_4x4_sse4_1(const int32_t *input, uint16_t *output, int stride, int tx_type, int bd);
-RTCD_EXTERN void (*av1_inv_txfm2d_add_4x4)(const int32_t *input, uint16_t *output, int stride, int tx_type, int bd);
+void av1_inv_txfm2d_add_4x4_c(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd);
+void av1_inv_txfm2d_add_4x4_sse4_1(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd);
+RTCD_EXTERN void (*av1_inv_txfm2d_add_4x4)(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd);

-void av1_inv_txfm2d_add_4x8_c(const int32_t *input, uint16_t *output, int stride, int tx_type, int bd);
+void av1_inv_txfm2d_add_4x8_c(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd);
 #define av1_inv_txfm2d_add_4x8 av1_inv_txfm2d_add_4x8_c

-void av1_inv_txfm2d_add_64x64_c(const int32_t *input, uint16_t *output, int stride, int tx_type, int bd);
-#define av1_inv_txfm2d_add_64x64 av1_inv_txfm2d_add_64x64_c
-
-void av1_inv_txfm2d_add_8x16_c(const int32_t *input, uint16_t *output, int stride, int tx_type, int bd);
+void av1_inv_txfm2d_add_8x16_c(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd);
 #define av1_inv_txfm2d_add_8x16 av1_inv_txfm2d_add_8x16_c

-void av1_inv_txfm2d_add_8x4_c(const int32_t *input, uint16_t *output, int stride, int tx_type, int bd);
+void av1_inv_txfm2d_add_8x4_c(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd);
 #define av1_inv_txfm2d_add_8x4 av1_inv_txfm2d_add_8x4_c

-void av1_inv_txfm2d_add_8x8_c(const int32_t *input, uint16_t *output, int stride, int tx_type, int bd);
-void av1_inv_txfm2d_add_8x8_sse4_1(const int32_t *input, uint16_t *output, int stride, int tx_type, int bd);
-RTCD_EXTERN void (*av1_inv_txfm2d_add_8x8)(const int32_t *input, uint16_t *output, int stride, int tx_type, int bd);
+void av1_inv_txfm2d_add_8x8_c(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd);
+void av1_inv_txfm2d_add_8x8_sse4_1(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd);
+RTCD_EXTERN void (*av1_inv_txfm2d_add_8x8)(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd);

 void av1_lowbd_convolve_init_c(void);
 void av1_lowbd_convolve_init_ssse3(void);
 RTCD_EXTERN void (*av1_lowbd_convolve_init)(void);

-void av1_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan, int log_scale);
+void av1_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan, const qm_val_t * qm_ptr, const qm_val_t * iqm_ptr, int log_scale);
 #define av1_quantize_b av1_quantize_b_c

 void av1_quantize_fp_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
@ -387,10 +397,26 @@ void av1_quantize_fp_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int
 void av1_quantize_fp_32x32_avx2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
 RTCD_EXTERN void (*av1_quantize_fp_32x32)(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);

+void av1_selfguided_restoration_c(uint8_t *dgd, int width, int height, int stride, int32_t *dst, int dst_stride, int r, int eps);
+void av1_selfguided_restoration_sse4_1(uint8_t *dgd, int width, int height, int stride, int32_t *dst, int dst_stride, int r, int eps);
+RTCD_EXTERN void (*av1_selfguided_restoration)(uint8_t *dgd, int width, int height, int stride, int32_t *dst, int dst_stride, int r, int eps);
+
+void av1_selfguided_restoration_highbd_c(uint16_t *dgd, int width, int height, int stride, int32_t *dst, int dst_stride, int bit_depth, int r, int eps);
+void av1_selfguided_restoration_highbd_sse4_1(uint16_t *dgd, int width, int height, int stride, int32_t *dst, int dst_stride, int bit_depth, int r, int eps);
+RTCD_EXTERN void (*av1_selfguided_restoration_highbd)(uint16_t *dgd, int width, int height, int stride, int32_t *dst, int dst_stride, int bit_depth, int r, int eps);
+
 void av1_temporal_filter_apply_c(uint8_t *frame1, unsigned int stride, uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int filter_weight, unsigned int *accumulator, uint16_t *count);
 void av1_temporal_filter_apply_sse2(uint8_t *frame1, unsigned int stride, uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int filter_weight, unsigned int *accumulator, uint16_t *count);
 RTCD_EXTERN void (*av1_temporal_filter_apply)(uint8_t *frame1, unsigned int stride, uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int filter_weight, unsigned int *accumulator, uint16_t *count);

+void av1_upsample_intra_edge_c(uint8_t *p, int sz);
+void av1_upsample_intra_edge_sse4_1(uint8_t *p, int sz);
+RTCD_EXTERN void (*av1_upsample_intra_edge)(uint8_t *p, int sz);
+
+void av1_upsample_intra_edge_high_c(uint16_t *p, int sz, int bd);
+void av1_upsample_intra_edge_high_sse4_1(uint16_t *p, int sz, int bd);
+RTCD_EXTERN void (*av1_upsample_intra_edge_high)(uint16_t *p, int sz, int bd);
+
 void av1_warp_affine_c(const int32_t *mat, const uint8_t *ref, int width, int height, int stride, uint8_t *pred, int p_col, int p_row, int p_width, int p_height, int p_stride, int subsampling_x, int subsampling_y, ConvolveParams *conv_params, int16_t alpha, int16_t beta, int16_t gamma, int16_t delta);
 void av1_warp_affine_sse2(const int32_t *mat, const uint8_t *ref, int width, int height, int stride, uint8_t *pred, int p_col, int p_row, int p_width, int p_height, int p_stride, int subsampling_x, int subsampling_y, ConvolveParams *conv_params, int16_t alpha, int16_t beta, int16_t gamma, int16_t delta);
 void av1_warp_affine_ssse3(const int32_t *mat, const uint8_t *ref, int width, int height, int stride, uint8_t *pred, int p_col, int p_row, int p_width, int p_height, int p_stride, int subsampling_x, int subsampling_y, ConvolveParams *conv_params, int16_t alpha, int16_t beta, int16_t gamma, int16_t delta);
@ -408,64 +434,38 @@ uint64_t av1_wedge_sse_from_residuals_c(const int16_t *r1, const int16_t *d, con
 uint64_t av1_wedge_sse_from_residuals_sse2(const int16_t *r1, const int16_t *d, const uint8_t *m, int N);
 RTCD_EXTERN uint64_t (*av1_wedge_sse_from_residuals)(const int16_t *r1, const int16_t *d, const uint8_t *m, int N);

+void cdef_filter_block_c(uint8_t *dst8, uint16_t *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int bsize, int max);
+void cdef_filter_block_sse2(uint8_t *dst8, uint16_t *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int bsize, int max);
+void cdef_filter_block_ssse3(uint8_t *dst8, uint16_t *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int bsize, int max);
+void cdef_filter_block_sse4_1(uint8_t *dst8, uint16_t *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int bsize, int max);
+void cdef_filter_block_avx2(uint8_t *dst8, uint16_t *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int bsize, int max);
+RTCD_EXTERN void (*cdef_filter_block)(uint8_t *dst8, uint16_t *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int bsize, int max);
+
+int cdef_find_dir_c(const uint16_t *img, int stride, int32_t *var, int coeff_shift);
+int cdef_find_dir_sse2(const uint16_t *img, int stride, int32_t *var, int coeff_shift);
+int cdef_find_dir_ssse3(const uint16_t *img, int stride, int32_t *var, int coeff_shift);
+int cdef_find_dir_sse4_1(const uint16_t *img, int stride, int32_t *var, int coeff_shift);
+int cdef_find_dir_avx2(const uint16_t *img, int stride, int32_t *var, int coeff_shift);
+RTCD_EXTERN int (*cdef_find_dir)(const uint16_t *img, int stride, int32_t *var, int coeff_shift);
+
 double compute_cross_correlation_c(unsigned char *im1, int stride1, int x1, int y1, unsigned char *im2, int stride2, int x2, int y2);
 double compute_cross_correlation_sse4_1(unsigned char *im1, int stride1, int x1, int y1, unsigned char *im2, int stride2, int x2, int y2);
 RTCD_EXTERN double (*compute_cross_correlation)(unsigned char *im1, int stride1, int x1, int y1, unsigned char *im2, int stride2, int x2, int y2);

-void copy_4x4_16bit_to_16bit_c(uint16_t *dst, int dstride, const uint16_t *src, int sstride);
-void copy_4x4_16bit_to_16bit_sse2(uint16_t *dst, int dstride, const uint16_t *src, int sstride);
-void copy_4x4_16bit_to_16bit_ssse3(uint16_t *dst, int dstride, const uint16_t *src, int sstride);
-void copy_4x4_16bit_to_16bit_sse4_1(uint16_t *dst, int dstride, const uint16_t *src, int sstride);
-RTCD_EXTERN void (*copy_4x4_16bit_to_16bit)(uint16_t *dst, int dstride, const uint16_t *src, int sstride);
-
-void copy_4x4_16bit_to_8bit_c(uint8_t *dst, int dstride, const uint16_t *src, int sstride);
-void copy_4x4_16bit_to_8bit_sse2(uint8_t *dst, int dstride, const uint16_t *src, int sstride);
-void copy_4x4_16bit_to_8bit_ssse3(uint8_t *dst, int dstride, const uint16_t *src, int sstride);
-void copy_4x4_16bit_to_8bit_sse4_1(uint8_t *dst, int dstride, const uint16_t *src, int sstride);
-RTCD_EXTERN void (*copy_4x4_16bit_to_8bit)(uint8_t *dst, int dstride, const uint16_t *src, int sstride);
-
-void copy_8x8_16bit_to_16bit_c(uint16_t *dst, int dstride, const uint16_t *src, int sstride);
-void copy_8x8_16bit_to_16bit_sse2(uint16_t *dst, int dstride, const uint16_t *src, int sstride);
-void copy_8x8_16bit_to_16bit_ssse3(uint16_t *dst, int dstride, const uint16_t *src, int sstride);
-void copy_8x8_16bit_to_16bit_sse4_1(uint16_t *dst, int dstride, const uint16_t *src, int sstride);
-RTCD_EXTERN void (*copy_8x8_16bit_to_16bit)(uint16_t *dst, int dstride, const uint16_t *src, int sstride);
-
-void copy_8x8_16bit_to_8bit_c(uint8_t *dst, int dstride, const uint16_t *src, int sstride);
-void copy_8x8_16bit_to_8bit_sse2(uint8_t *dst, int dstride, const uint16_t *src, int sstride);
-void copy_8x8_16bit_to_8bit_ssse3(uint8_t *dst, int dstride, const uint16_t *src, int sstride);
-void copy_8x8_16bit_to_8bit_sse4_1(uint8_t *dst, int dstride, const uint16_t *src, int sstride);
-RTCD_EXTERN void (*copy_8x8_16bit_to_8bit)(uint8_t *dst, int dstride, const uint16_t *src, int sstride);
-
 void copy_rect8_16bit_to_16bit_c(uint16_t *dst, int dstride, const uint16_t *src, int sstride, int v, int h);
 void copy_rect8_16bit_to_16bit_sse2(uint16_t *dst, int dstride, const uint16_t *src, int sstride, int v, int h);
 void copy_rect8_16bit_to_16bit_ssse3(uint16_t *dst, int dstride, const uint16_t *src, int sstride, int v, int h);
 void copy_rect8_16bit_to_16bit_sse4_1(uint16_t *dst, int dstride, const uint16_t *src, int sstride, int v, int h);
+void copy_rect8_16bit_to_16bit_avx2(uint16_t *dst, int dstride, const uint16_t *src, int sstride, int v, int h);
 RTCD_EXTERN void (*copy_rect8_16bit_to_16bit)(uint16_t *dst, int dstride, const uint16_t *src, int sstride, int v, int h);

 void copy_rect8_8bit_to_16bit_c(uint16_t *dst, int dstride, const uint8_t *src, int sstride, int v, int h);
 void copy_rect8_8bit_to_16bit_sse2(uint16_t *dst, int dstride, const uint8_t *src, int sstride, int v, int h);
 void copy_rect8_8bit_to_16bit_ssse3(uint16_t *dst, int dstride, const uint8_t *src, int sstride, int v, int h);
 void copy_rect8_8bit_to_16bit_sse4_1(uint16_t *dst, int dstride, const uint8_t *src, int sstride, int v, int h);
+void copy_rect8_8bit_to_16bit_avx2(uint16_t *dst, int dstride, const uint8_t *src, int sstride, int v, int h);
 RTCD_EXTERN void (*copy_rect8_8bit_to_16bit)(uint16_t *dst, int dstride, const uint8_t *src, int sstride, int v, int h);

-int od_dir_find8_c(const od_dering_in *img, int stride, int32_t *var, int coeff_shift);
-int od_dir_find8_sse2(const od_dering_in *img, int stride, int32_t *var, int coeff_shift);
-int od_dir_find8_ssse3(const od_dering_in *img, int stride, int32_t *var, int coeff_shift);
-int od_dir_find8_sse4_1(const od_dering_in *img, int stride, int32_t *var, int coeff_shift);
-RTCD_EXTERN int (*od_dir_find8)(const od_dering_in *img, int stride, int32_t *var, int coeff_shift);
-
-void od_filter_dering_direction_4x4_c(uint16_t *y, int ystride, const uint16_t *in, int threshold, int dir, int damping);
-void od_filter_dering_direction_4x4_sse2(uint16_t *y, int ystride, const uint16_t *in, int threshold, int dir, int damping);
-void od_filter_dering_direction_4x4_ssse3(uint16_t *y, int ystride, const uint16_t *in, int threshold, int dir, int damping);
-void od_filter_dering_direction_4x4_sse4_1(uint16_t *y, int ystride, const uint16_t *in, int threshold, int dir, int damping);
-RTCD_EXTERN void (*od_filter_dering_direction_4x4)(uint16_t *y, int ystride, const uint16_t *in, int threshold, int dir, int damping);
-
-void od_filter_dering_direction_8x8_c(uint16_t *y, int ystride, const uint16_t *in, int threshold, int dir, int damping);
-void od_filter_dering_direction_8x8_sse2(uint16_t *y, int ystride, const uint16_t *in, int threshold, int dir, int damping);
-void od_filter_dering_direction_8x8_ssse3(uint16_t *y, int ystride, const uint16_t *in, int threshold, int dir, int damping);
-void od_filter_dering_direction_8x8_sse4_1(uint16_t *y, int ystride, const uint16_t *in, int threshold, int dir, int damping);
-RTCD_EXTERN void (*od_filter_dering_direction_8x8)(uint16_t *y, int ystride, const uint16_t *in, int threshold, int dir, int damping);
-
 void aom_rtcd(void);

 #ifdef RTCD_C
@ -476,26 +476,20 @@ static void setup_rtcd_internal(void)

    (void)flags;

-    aom_clpf_block = aom_clpf_block_c;
-    if (flags & HAS_SSE2) aom_clpf_block = aom_clpf_block_sse2;
-    if (flags & HAS_SSSE3) aom_clpf_block = aom_clpf_block_ssse3;
-    if (flags & HAS_SSE4_1) aom_clpf_block = aom_clpf_block_sse4_1;
-    aom_clpf_block_hbd = aom_clpf_block_hbd_c;
-    if (flags & HAS_SSE2) aom_clpf_block_hbd = aom_clpf_block_hbd_sse2;
-    if (flags & HAS_SSSE3) aom_clpf_block_hbd = aom_clpf_block_hbd_ssse3;
-    if (flags & HAS_SSE4_1) aom_clpf_block_hbd = aom_clpf_block_hbd_sse4_1;
-    aom_clpf_hblock = aom_clpf_hblock_c;
-    if (flags & HAS_SSE2) aom_clpf_hblock = aom_clpf_hblock_sse2;
-    if (flags & HAS_SSSE3) aom_clpf_hblock = aom_clpf_hblock_ssse3;
-    if (flags & HAS_SSE4_1) aom_clpf_hblock = aom_clpf_hblock_sse4_1;
-    aom_clpf_hblock_hbd = aom_clpf_hblock_hbd_c;
-    if (flags & HAS_SSE2) aom_clpf_hblock_hbd = aom_clpf_hblock_hbd_sse2;
-    if (flags & HAS_SSSE3) aom_clpf_hblock_hbd = aom_clpf_hblock_hbd_ssse3;
-    if (flags & HAS_SSE4_1) aom_clpf_hblock_hbd = aom_clpf_hblock_hbd_sse4_1;
+    apply_selfguided_restoration = apply_selfguided_restoration_c;
+    if (flags & HAS_SSE4_1) apply_selfguided_restoration = apply_selfguided_restoration_sse4_1;
+    apply_selfguided_restoration_highbd = apply_selfguided_restoration_highbd_c;
+    if (flags & HAS_SSE4_1) apply_selfguided_restoration_highbd = apply_selfguided_restoration_highbd_sse4_1;
    av1_block_error = av1_block_error_c;
    if (flags & HAS_AVX2) av1_block_error = av1_block_error_avx2;
+    av1_convolve_2d = av1_convolve_2d_c;
+    if (flags & HAS_SSE2) av1_convolve_2d = av1_convolve_2d_sse2;
+    av1_convolve_2d_scale = av1_convolve_2d_scale_c;
+    if (flags & HAS_SSE4_1) av1_convolve_2d_scale = av1_convolve_2d_scale_sse4_1;
    av1_convolve_horiz = av1_convolve_horiz_c;
    if (flags & HAS_SSSE3) av1_convolve_horiz = av1_convolve_horiz_ssse3;
+    av1_convolve_rounding = av1_convolve_rounding_c;
+    if (flags & HAS_AVX2) av1_convolve_rounding = av1_convolve_rounding_avx2;
    av1_convolve_vert = av1_convolve_vert_c;
    if (flags & HAS_SSSE3) av1_convolve_vert = av1_convolve_vert_ssse3;
    av1_fht16x16 = av1_fht16x16_c;
@ -520,6 +514,10 @@ static void setup_rtcd_internal(void)
    if (flags & HAS_SSE2) av1_fht8x4 = av1_fht8x4_sse2;
    av1_fht8x8 = av1_fht8x8_c;
    if (flags & HAS_SSE2) av1_fht8x8 = av1_fht8x8_sse2;
+    av1_filter_intra_edge = av1_filter_intra_edge_c;
+    if (flags & HAS_SSE4_1) av1_filter_intra_edge = av1_filter_intra_edge_sse4_1;
+    av1_filter_intra_edge_high = av1_filter_intra_edge_high_c;
+    if (flags & HAS_SSE4_1) av1_filter_intra_edge_high = av1_filter_intra_edge_high_sse4_1;
    av1_full_search_sad = av1_full_search_sad_c;
    if (flags & HAS_SSE3) av1_full_search_sad = av1_full_search_sadx3;
    if (flags & HAS_SSE4_1) av1_full_search_sad = av1_full_search_sadx8;
@ -529,16 +527,20 @@ static void setup_rtcd_internal(void)
    if (flags & HAS_SSE4_1) av1_fwd_txfm2d_32x32 = av1_fwd_txfm2d_32x32_sse4_1;
    av1_fwd_txfm2d_4x4 = av1_fwd_txfm2d_4x4_c;
    if (flags & HAS_SSE4_1) av1_fwd_txfm2d_4x4 = av1_fwd_txfm2d_4x4_sse4_1;
-    av1_fwd_txfm2d_64x64 = av1_fwd_txfm2d_64x64_c;
-    if (flags & HAS_SSE4_1) av1_fwd_txfm2d_64x64 = av1_fwd_txfm2d_64x64_sse4_1;
    av1_fwd_txfm2d_8x8 = av1_fwd_txfm2d_8x8_c;
    if (flags & HAS_SSE4_1) av1_fwd_txfm2d_8x8 = av1_fwd_txfm2d_8x8_sse4_1;
    av1_highbd_block_error = av1_highbd_block_error_c;
    if (flags & HAS_SSE2) av1_highbd_block_error = av1_highbd_block_error_sse2;
+    av1_highbd_convolve_2d = av1_highbd_convolve_2d_c;
+    if (flags & HAS_SSSE3) av1_highbd_convolve_2d = av1_highbd_convolve_2d_ssse3;
+    av1_highbd_convolve_2d_scale = av1_highbd_convolve_2d_scale_c;
+    if (flags & HAS_SSE4_1) av1_highbd_convolve_2d_scale = av1_highbd_convolve_2d_scale_sse4_1;
    av1_highbd_convolve_horiz = av1_highbd_convolve_horiz_c;
    if (flags & HAS_SSE4_1) av1_highbd_convolve_horiz = av1_highbd_convolve_horiz_sse4_1;
    av1_highbd_convolve_init = av1_highbd_convolve_init_c;
    if (flags & HAS_SSE4_1) av1_highbd_convolve_init = av1_highbd_convolve_init_sse4_1;
+    av1_highbd_convolve_rounding = av1_highbd_convolve_rounding_c;
+    if (flags & HAS_AVX2) av1_highbd_convolve_rounding = av1_highbd_convolve_rounding_avx2;
    av1_highbd_convolve_vert = av1_highbd_convolve_vert_c;
    if (flags & HAS_SSE4_1) av1_highbd_convolve_vert = av1_highbd_convolve_vert_sse4_1;
    av1_highbd_quantize_fp = av1_highbd_quantize_fp_c;
@ -546,6 +548,10 @@ static void setup_rtcd_internal(void)
    if (flags & HAS_AVX2) av1_highbd_quantize_fp = av1_highbd_quantize_fp_avx2;
    av1_highbd_warp_affine = av1_highbd_warp_affine_c;
    if (flags & HAS_SSSE3) av1_highbd_warp_affine = av1_highbd_warp_affine_ssse3;
+    av1_highpass_filter = av1_highpass_filter_c;
+    if (flags & HAS_SSE4_1) av1_highpass_filter = av1_highpass_filter_sse4_1;
+    av1_highpass_filter_highbd = av1_highpass_filter_highbd_c;
+    if (flags & HAS_SSE4_1) av1_highpass_filter_highbd = av1_highpass_filter_highbd_sse4_1;
    av1_iht16x16_256_add = av1_iht16x16_256_add_c;
    if (flags & HAS_SSE2) av1_iht16x16_256_add = av1_iht16x16_256_add_sse2;
    if (flags & HAS_AVX2) av1_iht16x16_256_add = av1_iht16x16_256_add_avx2;
@ -580,8 +586,16 @@ static void setup_rtcd_internal(void)
    if (flags & HAS_AVX2) av1_quantize_fp = av1_quantize_fp_avx2;
    av1_quantize_fp_32x32 = av1_quantize_fp_32x32_c;
    if (flags & HAS_AVX2) av1_quantize_fp_32x32 = av1_quantize_fp_32x32_avx2;
+    av1_selfguided_restoration = av1_selfguided_restoration_c;
+    if (flags & HAS_SSE4_1) av1_selfguided_restoration = av1_selfguided_restoration_sse4_1;
+    av1_selfguided_restoration_highbd = av1_selfguided_restoration_highbd_c;
+    if (flags & HAS_SSE4_1) av1_selfguided_restoration_highbd = av1_selfguided_restoration_highbd_sse4_1;
    av1_temporal_filter_apply = av1_temporal_filter_apply_c;
    if (flags & HAS_SSE2) av1_temporal_filter_apply = av1_temporal_filter_apply_sse2;
+    av1_upsample_intra_edge = av1_upsample_intra_edge_c;
+    if (flags & HAS_SSE4_1) av1_upsample_intra_edge = av1_upsample_intra_edge_sse4_1;
+    av1_upsample_intra_edge_high = av1_upsample_intra_edge_high_c;
+    if (flags & HAS_SSE4_1) av1_upsample_intra_edge_high = av1_upsample_intra_edge_high_sse4_1;
    av1_warp_affine = av1_warp_affine_c;
    if (flags & HAS_SSE2) av1_warp_affine = av1_warp_affine_sse2;
    if (flags & HAS_SSSE3) av1_warp_affine = av1_warp_affine_ssse3;
@ -591,44 +605,28 @@ static void setup_rtcd_internal(void)
    if (flags & HAS_SSE2) av1_wedge_sign_from_residuals = av1_wedge_sign_from_residuals_sse2;
    av1_wedge_sse_from_residuals = av1_wedge_sse_from_residuals_c;
    if (flags & HAS_SSE2) av1_wedge_sse_from_residuals = av1_wedge_sse_from_residuals_sse2;
+    cdef_filter_block = cdef_filter_block_c;
+    if (flags & HAS_SSE2) cdef_filter_block = cdef_filter_block_sse2;
+    if (flags & HAS_SSSE3) cdef_filter_block = cdef_filter_block_ssse3;
+    if (flags & HAS_SSE4_1) cdef_filter_block = cdef_filter_block_sse4_1;
+    if (flags & HAS_AVX2) cdef_filter_block = cdef_filter_block_avx2;
+    cdef_find_dir = cdef_find_dir_c;
+    if (flags & HAS_SSE2) cdef_find_dir = cdef_find_dir_sse2;
+    if (flags & HAS_SSSE3) cdef_find_dir = cdef_find_dir_ssse3;
+    if (flags & HAS_SSE4_1) cdef_find_dir = cdef_find_dir_sse4_1;
+    if (flags & HAS_AVX2) cdef_find_dir = cdef_find_dir_avx2;
    compute_cross_correlation = compute_cross_correlation_c;
    if (flags & HAS_SSE4_1) compute_cross_correlation = compute_cross_correlation_sse4_1;
-    copy_4x4_16bit_to_16bit = copy_4x4_16bit_to_16bit_c;
-    if (flags & HAS_SSE2) copy_4x4_16bit_to_16bit = copy_4x4_16bit_to_16bit_sse2;
-    if (flags & HAS_SSSE3) copy_4x4_16bit_to_16bit = copy_4x4_16bit_to_16bit_ssse3;
-    if (flags & HAS_SSE4_1) copy_4x4_16bit_to_16bit = copy_4x4_16bit_to_16bit_sse4_1;
-    copy_4x4_16bit_to_8bit = copy_4x4_16bit_to_8bit_c;
-    if (flags & HAS_SSE2) copy_4x4_16bit_to_8bit = copy_4x4_16bit_to_8bit_sse2;
-    if (flags & HAS_SSSE3) copy_4x4_16bit_to_8bit = copy_4x4_16bit_to_8bit_ssse3;
-    if (flags & HAS_SSE4_1) copy_4x4_16bit_to_8bit = copy_4x4_16bit_to_8bit_sse4_1;
-    copy_8x8_16bit_to_16bit = copy_8x8_16bit_to_16bit_c;
-    if (flags & HAS_SSE2) copy_8x8_16bit_to_16bit = copy_8x8_16bit_to_16bit_sse2;
-    if (flags & HAS_SSSE3) copy_8x8_16bit_to_16bit = copy_8x8_16bit_to_16bit_ssse3;
-    if (flags & HAS_SSE4_1) copy_8x8_16bit_to_16bit = copy_8x8_16bit_to_16bit_sse4_1;
-    copy_8x8_16bit_to_8bit = copy_8x8_16bit_to_8bit_c;
-    if (flags & HAS_SSE2) copy_8x8_16bit_to_8bit = copy_8x8_16bit_to_8bit_sse2;
-    if (flags & HAS_SSSE3) copy_8x8_16bit_to_8bit = copy_8x8_16bit_to_8bit_ssse3;
-    if (flags & HAS_SSE4_1) copy_8x8_16bit_to_8bit = copy_8x8_16bit_to_8bit_sse4_1;
    copy_rect8_16bit_to_16bit = copy_rect8_16bit_to_16bit_c;
    if (flags & HAS_SSE2) copy_rect8_16bit_to_16bit = copy_rect8_16bit_to_16bit_sse2;
    if (flags & HAS_SSSE3) copy_rect8_16bit_to_16bit = copy_rect8_16bit_to_16bit_ssse3;
    if (flags & HAS_SSE4_1) copy_rect8_16bit_to_16bit = copy_rect8_16bit_to_16bit_sse4_1;
+    if (flags & HAS_AVX2) copy_rect8_16bit_to_16bit = copy_rect8_16bit_to_16bit_avx2;
    copy_rect8_8bit_to_16bit = copy_rect8_8bit_to_16bit_c;
    if (flags & HAS_SSE2) copy_rect8_8bit_to_16bit = copy_rect8_8bit_to_16bit_sse2;
    if (flags & HAS_SSSE3) copy_rect8_8bit_to_16bit = copy_rect8_8bit_to_16bit_ssse3;
    if (flags & HAS_SSE4_1) copy_rect8_8bit_to_16bit = copy_rect8_8bit_to_16bit_sse4_1;
-    od_dir_find8 = od_dir_find8_c;
-    if (flags & HAS_SSE2) od_dir_find8 = od_dir_find8_sse2;
-    if (flags & HAS_SSSE3) od_dir_find8 = od_dir_find8_ssse3;
-    if (flags & HAS_SSE4_1) od_dir_find8 = od_dir_find8_sse4_1;
-    od_filter_dering_direction_4x4 = od_filter_dering_direction_4x4_c;
-    if (flags & HAS_SSE2) od_filter_dering_direction_4x4 = od_filter_dering_direction_4x4_sse2;
-    if (flags & HAS_SSSE3) od_filter_dering_direction_4x4 = od_filter_dering_direction_4x4_ssse3;
-    if (flags & HAS_SSE4_1) od_filter_dering_direction_4x4 = od_filter_dering_direction_4x4_sse4_1;
-    od_filter_dering_direction_8x8 = od_filter_dering_direction_8x8_c;
-    if (flags & HAS_SSE2) od_filter_dering_direction_8x8 = od_filter_dering_direction_8x8_sse2;
-    if (flags & HAS_SSSE3) od_filter_dering_direction_8x8 = od_filter_dering_direction_8x8_ssse3;
-    if (flags & HAS_SSE4_1) od_filter_dering_direction_8x8 = od_filter_dering_direction_8x8_sse4_1;
+    if (flags & HAS_AVX2) copy_rect8_8bit_to_16bit = copy_rect8_8bit_to_16bit_avx2;
 }
 #endif

--- a/media/libaom/config/win/x64/aom_config.asm
+++ b/media/libaom/config/win/x64/aom_config.asm
@ -46,8 +46,6 @@ CONFIG_AV1 equ 1
 CONFIG_STATIC_MSVCRT equ 0
 CONFIG_SPATIAL_RESAMPLING equ 1
 CONFIG_REALTIME_ONLY equ 0
-CONFIG_ONTHEFLY_BITPACKING equ 0
-CONFIG_ERROR_CONCEALMENT equ 0
 CONFIG_SHARED equ 0
 CONFIG_STATIC equ 1
 CONFIG_SMALL equ 0
@ -60,73 +58,71 @@ CONFIG_ACCOUNTING equ 0
 CONFIG_INSPECTION equ 0
 CONFIG_DECODE_PERF_TESTS equ 0
 CONFIG_ENCODE_PERF_TESTS equ 0
+CONFIG_BITSTREAM_DEBUG equ 0
+CONFIG_SYMBOLRATE equ 0
 CONFIG_COEFFICIENT_RANGE_CHECKING equ 0
 CONFIG_LOWBITDEPTH equ 1
 CONFIG_HIGHBITDEPTH equ 1
 CONFIG_EXPERIMENTAL equ 0
 CONFIG_SIZE_LIMIT equ 1
-CONFIG_COLORSPACE_HEADERS equ 0
 CONFIG_FP_MB_STATS equ 0
 CONFIG_CDEF equ 1
+CONFIG_CDEF_SINGLEPASS equ 1
 CONFIG_VAR_TX equ 1
 CONFIG_RECT_TX equ 1
 CONFIG_RECT_TX_EXT equ 0
 CONFIG_TPL_MV equ 0
 CONFIG_DUAL_FILTER equ 1
-CONFIG_CONVOLVE_ROUND equ 0
+CONFIG_CONVOLVE_ROUND equ 1
 CONFIG_COMPOUND_ROUND equ 0
 CONFIG_EXT_TX equ 1
-CONFIG_DPCM_INTRA equ 0
 CONFIG_TX64X64 equ 0
 CONFIG_EXT_INTRA equ 1
 CONFIG_INTRA_INTERP equ 0
 CONFIG_FILTER_INTRA equ 0
-CONFIG_INTRA_EDGE equ 0
+CONFIG_INTRA_EDGE equ 1
 CONFIG_INTRABC equ 0
-CONFIG_EXT_INTER equ 1
 CONFIG_INTERINTRA equ 1
 CONFIG_WEDGE equ 1
 CONFIG_COMPOUND_SEGMENT equ 1
 CONFIG_EXT_REFS equ 1
-CONFIG_ALTREF2 equ 0
-CONFIG_SPEED_REFS equ 0
-CONFIG_GF_GROUPS equ 0
-CONFIG_FLEX_REFS equ 0
 CONFIG_GLOBAL_MOTION equ 1
 CONFIG_NEW_QUANT equ 0
 CONFIG_SUPERTX equ 0
 CONFIG_ANS equ 0
-CONFIG_LOOP_RESTORATION equ 0
+CONFIG_LOOP_RESTORATION equ 1
+CONFIG_STRIPED_LOOP_RESTORATION equ 0
 CONFIG_EXT_PARTITION equ 0
 CONFIG_EXT_PARTITION_TYPES equ 0
+CONFIG_EXT_PARTITION_TYPES_AB equ 0
 CONFIG_UNPOISON_PARTITION_CTX equ 0
 CONFIG_EXT_TILE equ 0
 CONFIG_MOTION_VAR equ 1
 CONFIG_NCOBMC equ 0
 CONFIG_WARPED_MOTION equ 1
 CONFIG_Q_ADAPT_PROBS equ 0
-CONFIG_BITSTREAM_DEBUG equ 0
 CONFIG_INTER_STATS_ONLY equ 0
-CONFIG_ALT_INTRA equ 1
-CONFIG_PALETTE equ 1
 CONFIG_PALETTE_DELTA_ENCODING equ 0
 CONFIG_RAWBITS equ 0
-CONFIG_EC_SMALLMUL equ 1
+CONFIG_KF_CTX equ 0
 CONFIG_PVQ equ 0
 CONFIG_CFL equ 0
 CONFIG_XIPHRC equ 0
 CONFIG_DCT_ONLY equ 0
+CONFIG_DAALA_TX equ 0
 CONFIG_DAALA_DCT4 equ 0
 CONFIG_DAALA_DCT8 equ 0
+CONFIG_DAALA_DCT16 equ 0
+CONFIG_DAALA_DCT32 equ 0
+CONFIG_DAALA_DCT64 equ 0
 CONFIG_CB4X4 equ 1
 CONFIG_CHROMA_2X2 equ 0
 CONFIG_CHROMA_SUB8X8 equ 1
 CONFIG_FRAME_SIZE equ 0
-CONFIG_DELTA_Q equ 1
 CONFIG_EXT_DELTA_Q equ 1
 CONFIG_ADAPT_SCAN equ 0
-CONFIG_FILTER_7BIT equ 1
 CONFIG_PARALLEL_DEBLOCKING equ 1
+CONFIG_DEBLOCK_13TAP equ 0
 CONFIG_LOOPFILTERING_ACROSS_TILES equ 1
 CONFIG_TEMPMV_SIGNALING equ 1
 CONFIG_RD_DEBUG equ 0
@ -135,29 +131,46 @@ CONFIG_COEF_INTERLEAVE equ 0
 CONFIG_ENTROPY_STATS equ 0
 CONFIG_MASKED_TX equ 0
 CONFIG_DEPENDENT_HORZTILES equ 0
-CONFIG_DIST_8X8 equ 0
-CONFIG_DAALA_DIST equ 0
-CONFIG_TRIPRED equ 0
+CONFIG_DIST_8X8 equ 1
 CONFIG_PALETTE_THROUGHPUT equ 1
 CONFIG_REF_ADAPT equ 0
 CONFIG_LV_MAP equ 0
+CONFIG_CTX1D equ 0
 CONFIG_TXK_SEL equ 0
 CONFIG_MV_COMPRESS equ 1
+CONFIG_SEGMENT_ZEROMV equ 0
 CONFIG_FRAME_SUPERRES equ 0
 CONFIG_NEW_MULTISYMBOL equ 0
 CONFIG_COMPOUND_SINGLEREF equ 0
-CONFIG_AOM_QM equ 0
+CONFIG_AOM_QM equ 1
 CONFIG_ONE_SIDED_COMPOUND equ 1
-CONFIG_EXT_COMP_REFS equ 0
+CONFIG_EXT_COMP_REFS equ 1
 CONFIG_SMOOTH_HV equ 1
 CONFIG_VAR_REFS equ 0
-CONFIG_RECT_INTRA_PRED equ 1
 CONFIG_LGT equ 0
+CONFIG_LGT_FROM_PRED equ 0
 CONFIG_SBL_SYMBOL equ 0
 CONFIG_NCOBMC_ADAPT_WEIGHT equ 0
 CONFIG_BGSPRITE equ 0
 CONFIG_VAR_TX_NO_TX_MODE equ 0
 CONFIG_MRC_TX equ 0
 CONFIG_LPF_DIRECT equ 0
-CONFIG_UV_LVL equ 0
+CONFIG_LOOPFILTER_LEVEL equ 0
+CONFIG_NO_FRAME_CONTEXT_SIGNALING equ 0
+CONFIG_TXMG equ 1
+CONFIG_MAX_TILE equ 0
+CONFIG_HASH_ME equ 0
+CONFIG_COLORSPACE_HEADERS equ 0
+CONFIG_MFMV equ 0
+CONFIG_FRAME_MARKER equ 0
+CONFIG_JNT_COMP equ 0
+CONFIG_FRAME_SIGN_BIAS equ 0
+CONFIG_EXT_SKIP equ 0
+CONFIG_OBU equ 0
+CONFIG_AMVR equ 0
+CONFIG_LPF_SB equ 0
+CONFIG_OPT_REF_MV equ 0
+CONFIG_TMV equ 0
+CONFIG_RESTRICT_COMPRESSED_HDR equ 0
+CONFIG_HORZONLY_FRAME_SUPERRES equ 0
 CONFIG_ANALYZER equ 0
--- a/media/libaom/config/win/x64/aom_config.h
+++ b/media/libaom/config/win/x64/aom_config.h
@ -59,8 +59,6 @@
 #define CONFIG_STATIC_MSVCRT 0
 #define CONFIG_SPATIAL_RESAMPLING 1
 #define CONFIG_REALTIME_ONLY 0
-#define CONFIG_ONTHEFLY_BITPACKING 0
-#define CONFIG_ERROR_CONCEALMENT 0
 #define CONFIG_SHARED 0
 #define CONFIG_STATIC 1
 #define CONFIG_SMALL 0
@ -73,73 +71,71 @@
 #define CONFIG_INSPECTION 0
 #define CONFIG_DECODE_PERF_TESTS 0
 #define CONFIG_ENCODE_PERF_TESTS 0
+#define CONFIG_BITSTREAM_DEBUG 0
+#define CONFIG_SYMBOLRATE 0
 #define CONFIG_COEFFICIENT_RANGE_CHECKING 0
 #define CONFIG_LOWBITDEPTH 1
 #define CONFIG_HIGHBITDEPTH 1
 #define CONFIG_EXPERIMENTAL 0
 #define CONFIG_SIZE_LIMIT 1
-#define CONFIG_COLORSPACE_HEADERS 0
 #define CONFIG_FP_MB_STATS 0
 #define CONFIG_CDEF 1
+#define CONFIG_CDEF_SINGLEPASS 1
 #define CONFIG_VAR_TX 1
 #define CONFIG_RECT_TX 1
 #define CONFIG_RECT_TX_EXT 0
 #define CONFIG_TPL_MV 0
 #define CONFIG_DUAL_FILTER 1
-#define CONFIG_CONVOLVE_ROUND 0
+#define CONFIG_CONVOLVE_ROUND 1
 #define CONFIG_COMPOUND_ROUND 0
 #define CONFIG_EXT_TX 1
-#define CONFIG_DPCM_INTRA 0
 #define CONFIG_TX64X64 0
 #define CONFIG_EXT_INTRA 1
 #define CONFIG_INTRA_INTERP 0
 #define CONFIG_FILTER_INTRA 0
-#define CONFIG_INTRA_EDGE 0
+#define CONFIG_INTRA_EDGE 1
 #define CONFIG_INTRABC 0
-#define CONFIG_EXT_INTER 1
 #define CONFIG_INTERINTRA 1
 #define CONFIG_WEDGE 1
 #define CONFIG_COMPOUND_SEGMENT 1
 #define CONFIG_EXT_REFS 1
-#define CONFIG_ALTREF2 0
-#define CONFIG_SPEED_REFS 0
-#define CONFIG_GF_GROUPS 0
-#define CONFIG_FLEX_REFS 0
 #define CONFIG_GLOBAL_MOTION 1
 #define CONFIG_NEW_QUANT 0
 #define CONFIG_SUPERTX 0
 #define CONFIG_ANS 0
-#define CONFIG_LOOP_RESTORATION 0
+#define CONFIG_LOOP_RESTORATION 1
+#define CONFIG_STRIPED_LOOP_RESTORATION 0
 #define CONFIG_EXT_PARTITION 0
 #define CONFIG_EXT_PARTITION_TYPES 0
+#define CONFIG_EXT_PARTITION_TYPES_AB 0
 #define CONFIG_UNPOISON_PARTITION_CTX 0
 #define CONFIG_EXT_TILE 0
 #define CONFIG_MOTION_VAR 1
 #define CONFIG_NCOBMC 0
 #define CONFIG_WARPED_MOTION 1
 #define CONFIG_Q_ADAPT_PROBS 0
-#define CONFIG_BITSTREAM_DEBUG 0
 #define CONFIG_INTER_STATS_ONLY 0
-#define CONFIG_ALT_INTRA 1
-#define CONFIG_PALETTE 1
 #define CONFIG_PALETTE_DELTA_ENCODING 0
 #define CONFIG_RAWBITS 0
-#define CONFIG_EC_SMALLMUL 1
+#define CONFIG_KF_CTX 0
 #define CONFIG_PVQ 0
 #define CONFIG_CFL 0
 #define CONFIG_XIPHRC 0
 #define CONFIG_DCT_ONLY 0
+#define CONFIG_DAALA_TX 0
 #define CONFIG_DAALA_DCT4 0
 #define CONFIG_DAALA_DCT8 0
+#define CONFIG_DAALA_DCT16 0
+#define CONFIG_DAALA_DCT32 0
+#define CONFIG_DAALA_DCT64 0
 #define CONFIG_CB4X4 1
 #define CONFIG_CHROMA_2X2 0
 #define CONFIG_CHROMA_SUB8X8 1
 #define CONFIG_FRAME_SIZE 0
-#define CONFIG_DELTA_Q 1
 #define CONFIG_EXT_DELTA_Q 1
 #define CONFIG_ADAPT_SCAN 0
-#define CONFIG_FILTER_7BIT 1
 #define CONFIG_PARALLEL_DEBLOCKING 1
+#define CONFIG_DEBLOCK_13TAP 0
 #define CONFIG_LOOPFILTERING_ACROSS_TILES 1
 #define CONFIG_TEMPMV_SIGNALING 1
 #define CONFIG_RD_DEBUG 0
@ -148,31 +144,48 @@
 #define CONFIG_ENTROPY_STATS 0
 #define CONFIG_MASKED_TX 0
 #define CONFIG_DEPENDENT_HORZTILES 0
-#define CONFIG_DIST_8X8 0
-#define CONFIG_DAALA_DIST 0
-#define CONFIG_TRIPRED 0
+#define CONFIG_DIST_8X8 1
 #define CONFIG_PALETTE_THROUGHPUT 1
 #define CONFIG_REF_ADAPT 0
 #define CONFIG_LV_MAP 0
+#define CONFIG_CTX1D 0
 #define CONFIG_TXK_SEL 0
 #define CONFIG_MV_COMPRESS 1
+#define CONFIG_SEGMENT_ZEROMV 0
 #define CONFIG_FRAME_SUPERRES 0
 #define CONFIG_NEW_MULTISYMBOL 0
 #define CONFIG_COMPOUND_SINGLEREF 0
-#define CONFIG_AOM_QM 0
+#define CONFIG_AOM_QM 1
 #define CONFIG_ONE_SIDED_COMPOUND 1
-#define CONFIG_EXT_COMP_REFS 0
+#define CONFIG_EXT_COMP_REFS 1
 #define CONFIG_SMOOTH_HV 1
 #define CONFIG_VAR_REFS 0
-#define CONFIG_RECT_INTRA_PRED 1
 #define CONFIG_LGT 0
+#define CONFIG_LGT_FROM_PRED 0
 #define CONFIG_SBL_SYMBOL 0
 #define CONFIG_NCOBMC_ADAPT_WEIGHT 0
 #define CONFIG_BGSPRITE 0
 #define CONFIG_VAR_TX_NO_TX_MODE 0
 #define CONFIG_MRC_TX 0
 #define CONFIG_LPF_DIRECT 0
-#define CONFIG_UV_LVL 0
+#define CONFIG_LOOPFILTER_LEVEL 0
+#define CONFIG_NO_FRAME_CONTEXT_SIGNALING 0
+#define CONFIG_TXMG 1
+#define CONFIG_MAX_TILE 0
+#define CONFIG_HASH_ME 0
+#define CONFIG_COLORSPACE_HEADERS 0
+#define CONFIG_MFMV 0
+#define CONFIG_FRAME_MARKER 0
+#define CONFIG_JNT_COMP 0
+#define CONFIG_FRAME_SIGN_BIAS 0
+#define CONFIG_EXT_SKIP 0
+#define CONFIG_OBU 0
+#define CONFIG_AMVR 0
+#define CONFIG_LPF_SB 0
+#define CONFIG_OPT_REF_MV 0
+#define CONFIG_TMV 0
+#define CONFIG_RESTRICT_COMPRESSED_HDR 0
+#define CONFIG_HORZONLY_FRAME_SUPERRES 0
 #define CONFIG_ANALYZER 0
 #define DECODE_WIDTH_LIMIT 8192
 #define DECODE_HEIGHT_LIMIT 4608
--- a/media/libaom/config/win/x64/aom_dsp_rtcd.h
+++ b/media/libaom/config/win/x64/aom_dsp_rtcd.h
--- a/media/libaom/config/win/x64/av1_rtcd.h
+++ b/media/libaom/config/win/x64/av1_rtcd.h
@ -31,44 +31,39 @@ struct search_site_config;
 struct mv;
 union int_mv;
 struct yv12_buffer_config;
-typedef uint16_t od_dering_in;

 #ifdef __cplusplus
 extern "C" {
 #endif

-void aom_clpf_block_c(uint8_t *dst, const uint16_t *src, int dstride, int sstride, int sizex, int sizey, unsigned int strength, unsigned int bd);
-void aom_clpf_block_sse2(uint8_t *dst, const uint16_t *src, int dstride, int sstride, int sizex, int sizey, unsigned int strength, unsigned int bd);
-void aom_clpf_block_ssse3(uint8_t *dst, const uint16_t *src, int dstride, int sstride, int sizex, int sizey, unsigned int strength, unsigned int bd);
-void aom_clpf_block_sse4_1(uint8_t *dst, const uint16_t *src, int dstride, int sstride, int sizex, int sizey, unsigned int strength, unsigned int bd);
-RTCD_EXTERN void (*aom_clpf_block)(uint8_t *dst, const uint16_t *src, int dstride, int sstride, int sizex, int sizey, unsigned int strength, unsigned int bd);
+void apply_selfguided_restoration_c(uint8_t *dat, int width, int height, int stride, int eps, int *xqd, uint8_t *dst, int dst_stride, int32_t *tmpbuf);
+void apply_selfguided_restoration_sse4_1(uint8_t *dat, int width, int height, int stride, int eps, int *xqd, uint8_t *dst, int dst_stride, int32_t *tmpbuf);
+RTCD_EXTERN void (*apply_selfguided_restoration)(uint8_t *dat, int width, int height, int stride, int eps, int *xqd, uint8_t *dst, int dst_stride, int32_t *tmpbuf);

-void aom_clpf_block_hbd_c(uint16_t *dst, const uint16_t *src, int dstride, int sstride, int sizex, int sizey, unsigned int strength, unsigned int bd);
-void aom_clpf_block_hbd_sse2(uint16_t *dst, const uint16_t *src, int dstride, int sstride, int sizex, int sizey, unsigned int strength, unsigned int bd);
-void aom_clpf_block_hbd_ssse3(uint16_t *dst, const uint16_t *src, int dstride, int sstride, int sizex, int sizey, unsigned int strength, unsigned int bd);
-void aom_clpf_block_hbd_sse4_1(uint16_t *dst, const uint16_t *src, int dstride, int sstride, int sizex, int sizey, unsigned int strength, unsigned int bd);
-RTCD_EXTERN void (*aom_clpf_block_hbd)(uint16_t *dst, const uint16_t *src, int dstride, int sstride, int sizex, int sizey, unsigned int strength, unsigned int bd);
-
-void aom_clpf_hblock_c(uint8_t *dst, const uint16_t *src, int dstride, int sstride, int sizex, int sizey, unsigned int strength, unsigned int bd);
-void aom_clpf_hblock_sse2(uint8_t *dst, const uint16_t *src, int dstride, int sstride, int sizex, int sizey, unsigned int strength, unsigned int bd);
-void aom_clpf_hblock_ssse3(uint8_t *dst, const uint16_t *src, int dstride, int sstride, int sizex, int sizey, unsigned int strength, unsigned int bd);
-void aom_clpf_hblock_sse4_1(uint8_t *dst, const uint16_t *src, int dstride, int sstride, int sizex, int sizey, unsigned int strength, unsigned int bd);
-RTCD_EXTERN void (*aom_clpf_hblock)(uint8_t *dst, const uint16_t *src, int dstride, int sstride, int sizex, int sizey, unsigned int strength, unsigned int bd);
-
-void aom_clpf_hblock_hbd_c(uint16_t *dst, const uint16_t *src, int dstride, int sstride, int sizex, int sizey, unsigned int strength, unsigned int bd);
-void aom_clpf_hblock_hbd_sse2(uint16_t *dst, const uint16_t *src, int dstride, int sstride, int sizex, int sizey, unsigned int strength, unsigned int bd);
-void aom_clpf_hblock_hbd_ssse3(uint16_t *dst, const uint16_t *src, int dstride, int sstride, int sizex, int sizey, unsigned int strength, unsigned int bd);
-void aom_clpf_hblock_hbd_sse4_1(uint16_t *dst, const uint16_t *src, int dstride, int sstride, int sizex, int sizey, unsigned int strength, unsigned int bd);
-RTCD_EXTERN void (*aom_clpf_hblock_hbd)(uint16_t *dst, const uint16_t *src, int dstride, int sstride, int sizex, int sizey, unsigned int strength, unsigned int bd);
+void apply_selfguided_restoration_highbd_c(uint16_t *dat, int width, int height, int stride, int bit_depth, int eps, int *xqd, uint16_t *dst, int dst_stride, int32_t *tmpbuf);
+void apply_selfguided_restoration_highbd_sse4_1(uint16_t *dat, int width, int height, int stride, int bit_depth, int eps, int *xqd, uint16_t *dst, int dst_stride, int32_t *tmpbuf);
+RTCD_EXTERN void (*apply_selfguided_restoration_highbd)(uint16_t *dat, int width, int height, int stride, int bit_depth, int eps, int *xqd, uint16_t *dst, int dst_stride, int32_t *tmpbuf);

 int64_t av1_block_error_c(const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz);
 int64_t av1_block_error_avx2(const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz);
 RTCD_EXTERN int64_t (*av1_block_error)(const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz);

+void av1_convolve_2d_c(const uint8_t *src, int src_stride, CONV_BUF_TYPE *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+void av1_convolve_2d_sse2(const uint8_t *src, int src_stride, CONV_BUF_TYPE *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+#define av1_convolve_2d av1_convolve_2d_sse2
+
+void av1_convolve_2d_scale_c(const uint8_t *src, int src_stride, CONV_BUF_TYPE *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_qn, const int x_step_qn, const int subpel_y_q4, const int y_step_qn, ConvolveParams *conv_params);
+void av1_convolve_2d_scale_sse4_1(const uint8_t *src, int src_stride, CONV_BUF_TYPE *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_qn, const int x_step_qn, const int subpel_y_q4, const int y_step_qn, ConvolveParams *conv_params);
+RTCD_EXTERN void (*av1_convolve_2d_scale)(const uint8_t *src, int src_stride, CONV_BUF_TYPE *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_qn, const int x_step_qn, const int subpel_y_q4, const int y_step_qn, ConvolveParams *conv_params);
+
 void av1_convolve_horiz_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams fp, const int subpel_x_q4, int x_step_q4, ConvolveParams *conv_params);
 void av1_convolve_horiz_ssse3(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams fp, const int subpel_x_q4, int x_step_q4, ConvolveParams *conv_params);
 RTCD_EXTERN void (*av1_convolve_horiz)(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams fp, const int subpel_x_q4, int x_step_q4, ConvolveParams *conv_params);

+void av1_convolve_rounding_c(const int32_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, int bits);
+void av1_convolve_rounding_avx2(const int32_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, int bits);
+RTCD_EXTERN void (*av1_convolve_rounding)(const int32_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, int bits);
+
 void av1_convolve_vert_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams fp, const int subpel_x_q4, int x_step_q4, ConvolveParams *conv_params);
 void av1_convolve_vert_ssse3(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams fp, const int subpel_x_q4, int x_step_q4, ConvolveParams *conv_params);
 RTCD_EXTERN void (*av1_convolve_vert)(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams fp, const int subpel_x_q4, int x_step_q4, ConvolveParams *conv_params);
@ -76,9 +71,6 @@ RTCD_EXTERN void (*av1_convolve_vert)(const uint8_t *src, int src_stride, uint8_
 int av1_diamond_search_sad_c(struct macroblock *x, const struct search_site_config *cfg,  struct mv *ref_mv, struct mv *best_mv, int search_param, int sad_per_bit, int *num00, const struct aom_variance_vtable *fn_ptr, const struct mv *center_mv);
 #define av1_diamond_search_sad av1_diamond_search_sad_c

-void av1_fdct8x8_quant_c(const int16_t *input, int stride, tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
-#define av1_fdct8x8_quant av1_fdct8x8_quant_c
-
 void av1_fht16x16_c(const int16_t *input, tran_low_t *output, int stride, struct txfm_param *param);
 void av1_fht16x16_sse2(const int16_t *input, tran_low_t *output, int stride, struct txfm_param *param);
 void av1_fht16x16_avx2(const int16_t *input, tran_low_t *output, int stride, struct txfm_param *param);
@ -133,6 +125,14 @@ void av1_fht8x8_c(const int16_t *input, tran_low_t *output, int stride, struct t
 void av1_fht8x8_sse2(const int16_t *input, tran_low_t *output, int stride, struct txfm_param *param);
 #define av1_fht8x8 av1_fht8x8_sse2

+void av1_filter_intra_edge_c(uint8_t *p, int sz, int strength);
+void av1_filter_intra_edge_sse4_1(uint8_t *p, int sz, int strength);
+RTCD_EXTERN void (*av1_filter_intra_edge)(uint8_t *p, int sz, int strength);
+
+void av1_filter_intra_edge_high_c(uint16_t *p, int sz, int strength);
+void av1_filter_intra_edge_high_sse4_1(uint16_t *p, int sz, int strength);
+RTCD_EXTERN void (*av1_filter_intra_edge_high)(uint16_t *p, int sz, int strength);
+
 int av1_full_range_search_c(const struct macroblock *x, const struct search_site_config *cfg, struct mv *ref_mv, struct mv *best_mv, int search_param, int sad_per_bit, int *num00, const struct aom_variance_vtable *fn_ptr, const struct mv *center_mv);
 #define av1_full_range_search av1_full_range_search_c

@ -141,46 +141,42 @@ int av1_full_search_sadx3(const struct macroblock *x, const struct mv *ref_mv, i
 int av1_full_search_sadx8(const struct macroblock *x, const struct mv *ref_mv, int sad_per_bit, int distance, const struct aom_variance_vtable *fn_ptr, const struct mv *center_mv, struct mv *best_mv);
 RTCD_EXTERN int (*av1_full_search_sad)(const struct macroblock *x, const struct mv *ref_mv, int sad_per_bit, int distance, const struct aom_variance_vtable *fn_ptr, const struct mv *center_mv, struct mv *best_mv);

-void av1_fwd_idtx_c(const int16_t *src_diff, tran_low_t *coeff, int stride, int bs, int tx_type);
+void av1_fwd_idtx_c(const int16_t *src_diff, tran_low_t *coeff, int stride, int bsx, int bsy, TX_TYPE tx_type);
 #define av1_fwd_idtx av1_fwd_idtx_c

-void av1_fwd_txfm2d_16x16_c(const int16_t *input, int32_t *output, int stride, int tx_type, int bd);
-void av1_fwd_txfm2d_16x16_sse4_1(const int16_t *input, int32_t *output, int stride, int tx_type, int bd);
-RTCD_EXTERN void (*av1_fwd_txfm2d_16x16)(const int16_t *input, int32_t *output, int stride, int tx_type, int bd);
+void av1_fwd_txfm2d_16x16_c(const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd);
+void av1_fwd_txfm2d_16x16_sse4_1(const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd);
+RTCD_EXTERN void (*av1_fwd_txfm2d_16x16)(const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd);

-void av1_fwd_txfm2d_16x32_c(const int16_t *input, int32_t *output, int stride, int tx_type, int bd);
+void av1_fwd_txfm2d_16x32_c(const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd);
 #define av1_fwd_txfm2d_16x32 av1_fwd_txfm2d_16x32_c

-void av1_fwd_txfm2d_16x8_c(const int16_t *input, int32_t *output, int stride, int tx_type, int bd);
+void av1_fwd_txfm2d_16x8_c(const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd);
 #define av1_fwd_txfm2d_16x8 av1_fwd_txfm2d_16x8_c

-void av1_fwd_txfm2d_32x16_c(const int16_t *input, int32_t *output, int stride, int tx_type, int bd);
+void av1_fwd_txfm2d_32x16_c(const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd);
 #define av1_fwd_txfm2d_32x16 av1_fwd_txfm2d_32x16_c

-void av1_fwd_txfm2d_32x32_c(const int16_t *input, int32_t *output, int stride, int tx_type, int bd);
-void av1_fwd_txfm2d_32x32_sse4_1(const int16_t *input, int32_t *output, int stride, int tx_type, int bd);
-RTCD_EXTERN void (*av1_fwd_txfm2d_32x32)(const int16_t *input, int32_t *output, int stride, int tx_type, int bd);
+void av1_fwd_txfm2d_32x32_c(const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd);
+void av1_fwd_txfm2d_32x32_sse4_1(const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd);
+RTCD_EXTERN void (*av1_fwd_txfm2d_32x32)(const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd);

-void av1_fwd_txfm2d_4x4_c(const int16_t *input, int32_t *output, int stride, int tx_type, int bd);
-void av1_fwd_txfm2d_4x4_sse4_1(const int16_t *input, int32_t *output, int stride, int tx_type, int bd);
-RTCD_EXTERN void (*av1_fwd_txfm2d_4x4)(const int16_t *input, int32_t *output, int stride, int tx_type, int bd);
+void av1_fwd_txfm2d_4x4_c(const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd);
+void av1_fwd_txfm2d_4x4_sse4_1(const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd);
+RTCD_EXTERN void (*av1_fwd_txfm2d_4x4)(const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd);

-void av1_fwd_txfm2d_4x8_c(const int16_t *input, int32_t *output, int stride, int tx_type, int bd);
+void av1_fwd_txfm2d_4x8_c(const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd);
 #define av1_fwd_txfm2d_4x8 av1_fwd_txfm2d_4x8_c

-void av1_fwd_txfm2d_64x64_c(const int16_t *input, int32_t *output, int stride, int tx_type, int bd);
-void av1_fwd_txfm2d_64x64_sse4_1(const int16_t *input, int32_t *output, int stride, int tx_type, int bd);
-RTCD_EXTERN void (*av1_fwd_txfm2d_64x64)(const int16_t *input, int32_t *output, int stride, int tx_type, int bd);
-
-void av1_fwd_txfm2d_8x16_c(const int16_t *input, int32_t *output, int stride, int tx_type, int bd);
+void av1_fwd_txfm2d_8x16_c(const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd);
 #define av1_fwd_txfm2d_8x16 av1_fwd_txfm2d_8x16_c

-void av1_fwd_txfm2d_8x4_c(const int16_t *input, int32_t *output, int stride, int tx_type, int bd);
+void av1_fwd_txfm2d_8x4_c(const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd);
 #define av1_fwd_txfm2d_8x4 av1_fwd_txfm2d_8x4_c

-void av1_fwd_txfm2d_8x8_c(const int16_t *input, int32_t *output, int stride, int tx_type, int bd);
-void av1_fwd_txfm2d_8x8_sse4_1(const int16_t *input, int32_t *output, int stride, int tx_type, int bd);
-RTCD_EXTERN void (*av1_fwd_txfm2d_8x8)(const int16_t *input, int32_t *output, int stride, int tx_type, int bd);
+void av1_fwd_txfm2d_8x8_c(const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd);
+void av1_fwd_txfm2d_8x8_sse4_1(const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd);
+RTCD_EXTERN void (*av1_fwd_txfm2d_8x8)(const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd);

 void av1_fwht4x4_c(const int16_t *input, tran_low_t *output, int stride);
 #define av1_fwht4x4 av1_fwht4x4_c
@ -213,6 +209,14 @@ void av1_highbd_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8
 void av1_highbd_convolve8_vert_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
 #define av1_highbd_convolve8_vert av1_highbd_convolve8_vert_sse2

+void av1_highbd_convolve_2d_c(const uint16_t *src, int src_stride, CONV_BUF_TYPE *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+void av1_highbd_convolve_2d_ssse3(const uint16_t *src, int src_stride, CONV_BUF_TYPE *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+RTCD_EXTERN void (*av1_highbd_convolve_2d)(const uint16_t *src, int src_stride, CONV_BUF_TYPE *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+
+void av1_highbd_convolve_2d_scale_c(const uint16_t *src, int src_stride, CONV_BUF_TYPE *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int x_step_qn, const int subpel_y_q4, const int y_step_qn, ConvolveParams *conv_params, int bd);
+void av1_highbd_convolve_2d_scale_sse4_1(const uint16_t *src, int src_stride, CONV_BUF_TYPE *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int x_step_qn, const int subpel_y_q4, const int y_step_qn, ConvolveParams *conv_params, int bd);
+RTCD_EXTERN void (*av1_highbd_convolve_2d_scale)(const uint16_t *src, int src_stride, CONV_BUF_TYPE *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int x_step_qn, const int subpel_y_q4, const int y_step_qn, ConvolveParams *conv_params, int bd);
+
 void av1_highbd_convolve_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
 #define av1_highbd_convolve_avg av1_highbd_convolve_avg_c

@ -227,6 +231,10 @@ void av1_highbd_convolve_init_c(void);
 void av1_highbd_convolve_init_sse4_1(void);
 RTCD_EXTERN void (*av1_highbd_convolve_init)(void);

+void av1_highbd_convolve_rounding_c(const int32_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, int bits, int bd);
+void av1_highbd_convolve_rounding_avx2(const int32_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, int bits, int bd);
+RTCD_EXTERN void (*av1_highbd_convolve_rounding)(const int32_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, int bits, int bd);
+
 void av1_highbd_convolve_vert_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams fp, const int subpel_x_q4, int x_step_q4, int avg, int bd);
 void av1_highbd_convolve_vert_sse4_1(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams fp, const int subpel_x_q4, int x_step_q4, int avg, int bd);
 RTCD_EXTERN void (*av1_highbd_convolve_vert)(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams fp, const int subpel_x_q4, int x_step_q4, int avg, int bd);
@ -273,9 +281,6 @@ void av1_highbd_iht8x4_32_add_c(const tran_low_t *input, uint8_t *dest, int dest
 void av1_highbd_iht8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, const struct txfm_param *param);
 #define av1_highbd_iht8x8_64_add av1_highbd_iht8x8_64_add_c

-void av1_highbd_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan, int log_scale);
-#define av1_highbd_quantize_b av1_highbd_quantize_b_c
-
 void av1_highbd_quantize_fp_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan, int log_scale);
 void av1_highbd_quantize_fp_sse4_1(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan, int log_scale);
 void av1_highbd_quantize_fp_avx2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan, int log_scale);
@ -288,6 +293,14 @@ void av1_highbd_warp_affine_c(const int32_t *mat, const uint16_t *ref, int width
 void av1_highbd_warp_affine_ssse3(const int32_t *mat, const uint16_t *ref, int width, int height, int stride, uint16_t *pred, int p_col, int p_row, int p_width, int p_height, int p_stride, int subsampling_x, int subsampling_y, int bd, ConvolveParams *conv_params, int16_t alpha, int16_t beta, int16_t gamma, int16_t delta);
 RTCD_EXTERN void (*av1_highbd_warp_affine)(const int32_t *mat, const uint16_t *ref, int width, int height, int stride, uint16_t *pred, int p_col, int p_row, int p_width, int p_height, int p_stride, int subsampling_x, int subsampling_y, int bd, ConvolveParams *conv_params, int16_t alpha, int16_t beta, int16_t gamma, int16_t delta);

+void av1_highpass_filter_c(uint8_t *dgd, int width, int height, int stride, int32_t *dst, int dst_stride, int r, int eps);
+void av1_highpass_filter_sse4_1(uint8_t *dgd, int width, int height, int stride, int32_t *dst, int dst_stride, int r, int eps);
+RTCD_EXTERN void (*av1_highpass_filter)(uint8_t *dgd, int width, int height, int stride, int32_t *dst, int dst_stride, int r, int eps);
+
+void av1_highpass_filter_highbd_c(uint16_t *dgd, int width, int height, int stride, int32_t *dst, int dst_stride, int r, int eps);
+void av1_highpass_filter_highbd_sse4_1(uint16_t *dgd, int width, int height, int stride, int32_t *dst, int dst_stride, int r, int eps);
+RTCD_EXTERN void (*av1_highpass_filter_highbd)(uint16_t *dgd, int width, int height, int stride, int32_t *dst, int dst_stride, int r, int eps);
+
 void av1_iht16x16_256_add_c(const tran_low_t *input, uint8_t *output, int pitch, const struct txfm_param *param);
 void av1_iht16x16_256_add_sse2(const tran_low_t *input, uint8_t *output, int pitch, const struct txfm_param *param);
 void av1_iht16x16_256_add_avx2(const tran_low_t *input, uint8_t *output, int pitch, const struct txfm_param *param);
@ -340,48 +353,45 @@ void av1_iht8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride
 void av1_iht8x8_64_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride, const struct txfm_param *param);
 #define av1_iht8x8_64_add av1_iht8x8_64_add_sse2

-void av1_inv_txfm2d_add_16x16_c(const int32_t *input, uint16_t *output, int stride, int tx_type, int bd);
-void av1_inv_txfm2d_add_16x16_sse4_1(const int32_t *input, uint16_t *output, int stride, int tx_type, int bd);
-RTCD_EXTERN void (*av1_inv_txfm2d_add_16x16)(const int32_t *input, uint16_t *output, int stride, int tx_type, int bd);
+void av1_inv_txfm2d_add_16x16_c(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd);
+void av1_inv_txfm2d_add_16x16_sse4_1(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd);
+RTCD_EXTERN void (*av1_inv_txfm2d_add_16x16)(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd);

-void av1_inv_txfm2d_add_16x32_c(const int32_t *input, uint16_t *output, int stride, int tx_type, int bd);
+void av1_inv_txfm2d_add_16x32_c(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd);
 #define av1_inv_txfm2d_add_16x32 av1_inv_txfm2d_add_16x32_c

-void av1_inv_txfm2d_add_16x8_c(const int32_t *input, uint16_t *output, int stride, int tx_type, int bd);
+void av1_inv_txfm2d_add_16x8_c(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd);
 #define av1_inv_txfm2d_add_16x8 av1_inv_txfm2d_add_16x8_c

-void av1_inv_txfm2d_add_32x16_c(const int32_t *input, uint16_t *output, int stride, int tx_type, int bd);
+void av1_inv_txfm2d_add_32x16_c(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd);
 #define av1_inv_txfm2d_add_32x16 av1_inv_txfm2d_add_32x16_c

-void av1_inv_txfm2d_add_32x32_c(const int32_t *input, uint16_t *output, int stride, int tx_type, int bd);
-void av1_inv_txfm2d_add_32x32_avx2(const int32_t *input, uint16_t *output, int stride, int tx_type, int bd);
-RTCD_EXTERN void (*av1_inv_txfm2d_add_32x32)(const int32_t *input, uint16_t *output, int stride, int tx_type, int bd);
+void av1_inv_txfm2d_add_32x32_c(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd);
+void av1_inv_txfm2d_add_32x32_avx2(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd);
+RTCD_EXTERN void (*av1_inv_txfm2d_add_32x32)(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd);

-void av1_inv_txfm2d_add_4x4_c(const int32_t *input, uint16_t *output, int stride, int tx_type, int bd);
-void av1_inv_txfm2d_add_4x4_sse4_1(const int32_t *input, uint16_t *output, int stride, int tx_type, int bd);
-RTCD_EXTERN void (*av1_inv_txfm2d_add_4x4)(const int32_t *input, uint16_t *output, int stride, int tx_type, int bd);
+void av1_inv_txfm2d_add_4x4_c(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd);
+void av1_inv_txfm2d_add_4x4_sse4_1(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd);
+RTCD_EXTERN void (*av1_inv_txfm2d_add_4x4)(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd);

-void av1_inv_txfm2d_add_4x8_c(const int32_t *input, uint16_t *output, int stride, int tx_type, int bd);
+void av1_inv_txfm2d_add_4x8_c(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd);
 #define av1_inv_txfm2d_add_4x8 av1_inv_txfm2d_add_4x8_c

-void av1_inv_txfm2d_add_64x64_c(const int32_t *input, uint16_t *output, int stride, int tx_type, int bd);
-#define av1_inv_txfm2d_add_64x64 av1_inv_txfm2d_add_64x64_c
-
-void av1_inv_txfm2d_add_8x16_c(const int32_t *input, uint16_t *output, int stride, int tx_type, int bd);
+void av1_inv_txfm2d_add_8x16_c(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd);
 #define av1_inv_txfm2d_add_8x16 av1_inv_txfm2d_add_8x16_c

-void av1_inv_txfm2d_add_8x4_c(const int32_t *input, uint16_t *output, int stride, int tx_type, int bd);
+void av1_inv_txfm2d_add_8x4_c(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd);
 #define av1_inv_txfm2d_add_8x4 av1_inv_txfm2d_add_8x4_c

-void av1_inv_txfm2d_add_8x8_c(const int32_t *input, uint16_t *output, int stride, int tx_type, int bd);
-void av1_inv_txfm2d_add_8x8_sse4_1(const int32_t *input, uint16_t *output, int stride, int tx_type, int bd);
-RTCD_EXTERN void (*av1_inv_txfm2d_add_8x8)(const int32_t *input, uint16_t *output, int stride, int tx_type, int bd);
+void av1_inv_txfm2d_add_8x8_c(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd);
+void av1_inv_txfm2d_add_8x8_sse4_1(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd);
+RTCD_EXTERN void (*av1_inv_txfm2d_add_8x8)(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd);

 void av1_lowbd_convolve_init_c(void);
 void av1_lowbd_convolve_init_ssse3(void);
 RTCD_EXTERN void (*av1_lowbd_convolve_init)(void);

-void av1_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan, int log_scale);
+void av1_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan, const qm_val_t * qm_ptr, const qm_val_t * iqm_ptr, int log_scale);
 #define av1_quantize_b av1_quantize_b_c

 void av1_quantize_fp_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
@ -393,10 +403,26 @@ void av1_quantize_fp_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int
 void av1_quantize_fp_32x32_avx2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
 RTCD_EXTERN void (*av1_quantize_fp_32x32)(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);

+void av1_selfguided_restoration_c(uint8_t *dgd, int width, int height, int stride, int32_t *dst, int dst_stride, int r, int eps);
+void av1_selfguided_restoration_sse4_1(uint8_t *dgd, int width, int height, int stride, int32_t *dst, int dst_stride, int r, int eps);
+RTCD_EXTERN void (*av1_selfguided_restoration)(uint8_t *dgd, int width, int height, int stride, int32_t *dst, int dst_stride, int r, int eps);
+
+void av1_selfguided_restoration_highbd_c(uint16_t *dgd, int width, int height, int stride, int32_t *dst, int dst_stride, int bit_depth, int r, int eps);
+void av1_selfguided_restoration_highbd_sse4_1(uint16_t *dgd, int width, int height, int stride, int32_t *dst, int dst_stride, int bit_depth, int r, int eps);
+RTCD_EXTERN void (*av1_selfguided_restoration_highbd)(uint16_t *dgd, int width, int height, int stride, int32_t *dst, int dst_stride, int bit_depth, int r, int eps);
+
 void av1_temporal_filter_apply_c(uint8_t *frame1, unsigned int stride, uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int filter_weight, unsigned int *accumulator, uint16_t *count);
 void av1_temporal_filter_apply_sse2(uint8_t *frame1, unsigned int stride, uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int filter_weight, unsigned int *accumulator, uint16_t *count);
 #define av1_temporal_filter_apply av1_temporal_filter_apply_sse2

+void av1_upsample_intra_edge_c(uint8_t *p, int sz);
+void av1_upsample_intra_edge_sse4_1(uint8_t *p, int sz);
+RTCD_EXTERN void (*av1_upsample_intra_edge)(uint8_t *p, int sz);
+
+void av1_upsample_intra_edge_high_c(uint16_t *p, int sz, int bd);
+void av1_upsample_intra_edge_high_sse4_1(uint16_t *p, int sz, int bd);
+RTCD_EXTERN void (*av1_upsample_intra_edge_high)(uint16_t *p, int sz, int bd);
+
 void av1_warp_affine_c(const int32_t *mat, const uint8_t *ref, int width, int height, int stride, uint8_t *pred, int p_col, int p_row, int p_width, int p_height, int p_stride, int subsampling_x, int subsampling_y, ConvolveParams *conv_params, int16_t alpha, int16_t beta, int16_t gamma, int16_t delta);
 void av1_warp_affine_sse2(const int32_t *mat, const uint8_t *ref, int width, int height, int stride, uint8_t *pred, int p_col, int p_row, int p_width, int p_height, int p_stride, int subsampling_x, int subsampling_y, ConvolveParams *conv_params, int16_t alpha, int16_t beta, int16_t gamma, int16_t delta);
 void av1_warp_affine_ssse3(const int32_t *mat, const uint8_t *ref, int width, int height, int stride, uint8_t *pred, int p_col, int p_row, int p_width, int p_height, int p_stride, int subsampling_x, int subsampling_y, ConvolveParams *conv_params, int16_t alpha, int16_t beta, int16_t gamma, int16_t delta);
@ -414,64 +440,38 @@ uint64_t av1_wedge_sse_from_residuals_c(const int16_t *r1, const int16_t *d, con
 uint64_t av1_wedge_sse_from_residuals_sse2(const int16_t *r1, const int16_t *d, const uint8_t *m, int N);
 #define av1_wedge_sse_from_residuals av1_wedge_sse_from_residuals_sse2

+void cdef_filter_block_c(uint8_t *dst8, uint16_t *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int bsize, int max);
+void cdef_filter_block_sse2(uint8_t *dst8, uint16_t *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int bsize, int max);
+void cdef_filter_block_ssse3(uint8_t *dst8, uint16_t *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int bsize, int max);
+void cdef_filter_block_sse4_1(uint8_t *dst8, uint16_t *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int bsize, int max);
+void cdef_filter_block_avx2(uint8_t *dst8, uint16_t *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int bsize, int max);
+RTCD_EXTERN void (*cdef_filter_block)(uint8_t *dst8, uint16_t *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int bsize, int max);
+
+int cdef_find_dir_c(const uint16_t *img, int stride, int32_t *var, int coeff_shift);
+int cdef_find_dir_sse2(const uint16_t *img, int stride, int32_t *var, int coeff_shift);
+int cdef_find_dir_ssse3(const uint16_t *img, int stride, int32_t *var, int coeff_shift);
+int cdef_find_dir_sse4_1(const uint16_t *img, int stride, int32_t *var, int coeff_shift);
+int cdef_find_dir_avx2(const uint16_t *img, int stride, int32_t *var, int coeff_shift);
+RTCD_EXTERN int (*cdef_find_dir)(const uint16_t *img, int stride, int32_t *var, int coeff_shift);
+
 double compute_cross_correlation_c(unsigned char *im1, int stride1, int x1, int y1, unsigned char *im2, int stride2, int x2, int y2);
 double compute_cross_correlation_sse4_1(unsigned char *im1, int stride1, int x1, int y1, unsigned char *im2, int stride2, int x2, int y2);
 RTCD_EXTERN double (*compute_cross_correlation)(unsigned char *im1, int stride1, int x1, int y1, unsigned char *im2, int stride2, int x2, int y2);

-void copy_4x4_16bit_to_16bit_c(uint16_t *dst, int dstride, const uint16_t *src, int sstride);
-void copy_4x4_16bit_to_16bit_sse2(uint16_t *dst, int dstride, const uint16_t *src, int sstride);
-void copy_4x4_16bit_to_16bit_ssse3(uint16_t *dst, int dstride, const uint16_t *src, int sstride);
-void copy_4x4_16bit_to_16bit_sse4_1(uint16_t *dst, int dstride, const uint16_t *src, int sstride);
-RTCD_EXTERN void (*copy_4x4_16bit_to_16bit)(uint16_t *dst, int dstride, const uint16_t *src, int sstride);
-
-void copy_4x4_16bit_to_8bit_c(uint8_t *dst, int dstride, const uint16_t *src, int sstride);
-void copy_4x4_16bit_to_8bit_sse2(uint8_t *dst, int dstride, const uint16_t *src, int sstride);
-void copy_4x4_16bit_to_8bit_ssse3(uint8_t *dst, int dstride, const uint16_t *src, int sstride);
-void copy_4x4_16bit_to_8bit_sse4_1(uint8_t *dst, int dstride, const uint16_t *src, int sstride);
-RTCD_EXTERN void (*copy_4x4_16bit_to_8bit)(uint8_t *dst, int dstride, const uint16_t *src, int sstride);
-
-void copy_8x8_16bit_to_16bit_c(uint16_t *dst, int dstride, const uint16_t *src, int sstride);
-void copy_8x8_16bit_to_16bit_sse2(uint16_t *dst, int dstride, const uint16_t *src, int sstride);
-void copy_8x8_16bit_to_16bit_ssse3(uint16_t *dst, int dstride, const uint16_t *src, int sstride);
-void copy_8x8_16bit_to_16bit_sse4_1(uint16_t *dst, int dstride, const uint16_t *src, int sstride);
-RTCD_EXTERN void (*copy_8x8_16bit_to_16bit)(uint16_t *dst, int dstride, const uint16_t *src, int sstride);
-
-void copy_8x8_16bit_to_8bit_c(uint8_t *dst, int dstride, const uint16_t *src, int sstride);
-void copy_8x8_16bit_to_8bit_sse2(uint8_t *dst, int dstride, const uint16_t *src, int sstride);
-void copy_8x8_16bit_to_8bit_ssse3(uint8_t *dst, int dstride, const uint16_t *src, int sstride);
-void copy_8x8_16bit_to_8bit_sse4_1(uint8_t *dst, int dstride, const uint16_t *src, int sstride);
-RTCD_EXTERN void (*copy_8x8_16bit_to_8bit)(uint8_t *dst, int dstride, const uint16_t *src, int sstride);
-
 void copy_rect8_16bit_to_16bit_c(uint16_t *dst, int dstride, const uint16_t *src, int sstride, int v, int h);
 void copy_rect8_16bit_to_16bit_sse2(uint16_t *dst, int dstride, const uint16_t *src, int sstride, int v, int h);
 void copy_rect8_16bit_to_16bit_ssse3(uint16_t *dst, int dstride, const uint16_t *src, int sstride, int v, int h);
 void copy_rect8_16bit_to_16bit_sse4_1(uint16_t *dst, int dstride, const uint16_t *src, int sstride, int v, int h);
+void copy_rect8_16bit_to_16bit_avx2(uint16_t *dst, int dstride, const uint16_t *src, int sstride, int v, int h);
 RTCD_EXTERN void (*copy_rect8_16bit_to_16bit)(uint16_t *dst, int dstride, const uint16_t *src, int sstride, int v, int h);

 void copy_rect8_8bit_to_16bit_c(uint16_t *dst, int dstride, const uint8_t *src, int sstride, int v, int h);
 void copy_rect8_8bit_to_16bit_sse2(uint16_t *dst, int dstride, const uint8_t *src, int sstride, int v, int h);
 void copy_rect8_8bit_to_16bit_ssse3(uint16_t *dst, int dstride, const uint8_t *src, int sstride, int v, int h);
 void copy_rect8_8bit_to_16bit_sse4_1(uint16_t *dst, int dstride, const uint8_t *src, int sstride, int v, int h);
+void copy_rect8_8bit_to_16bit_avx2(uint16_t *dst, int dstride, const uint8_t *src, int sstride, int v, int h);
 RTCD_EXTERN void (*copy_rect8_8bit_to_16bit)(uint16_t *dst, int dstride, const uint8_t *src, int sstride, int v, int h);

-int od_dir_find8_c(const od_dering_in *img, int stride, int32_t *var, int coeff_shift);
-int od_dir_find8_sse2(const od_dering_in *img, int stride, int32_t *var, int coeff_shift);
-int od_dir_find8_ssse3(const od_dering_in *img, int stride, int32_t *var, int coeff_shift);
-int od_dir_find8_sse4_1(const od_dering_in *img, int stride, int32_t *var, int coeff_shift);
-RTCD_EXTERN int (*od_dir_find8)(const od_dering_in *img, int stride, int32_t *var, int coeff_shift);
-
-void od_filter_dering_direction_4x4_c(uint16_t *y, int ystride, const uint16_t *in, int threshold, int dir, int damping);
-void od_filter_dering_direction_4x4_sse2(uint16_t *y, int ystride, const uint16_t *in, int threshold, int dir, int damping);
-void od_filter_dering_direction_4x4_ssse3(uint16_t *y, int ystride, const uint16_t *in, int threshold, int dir, int damping);
-void od_filter_dering_direction_4x4_sse4_1(uint16_t *y, int ystride, const uint16_t *in, int threshold, int dir, int damping);
-RTCD_EXTERN void (*od_filter_dering_direction_4x4)(uint16_t *y, int ystride, const uint16_t *in, int threshold, int dir, int damping);
-
-void od_filter_dering_direction_8x8_c(uint16_t *y, int ystride, const uint16_t *in, int threshold, int dir, int damping);
-void od_filter_dering_direction_8x8_sse2(uint16_t *y, int ystride, const uint16_t *in, int threshold, int dir, int damping);
-void od_filter_dering_direction_8x8_ssse3(uint16_t *y, int ystride, const uint16_t *in, int threshold, int dir, int damping);
-void od_filter_dering_direction_8x8_sse4_1(uint16_t *y, int ystride, const uint16_t *in, int threshold, int dir, int damping);
-RTCD_EXTERN void (*od_filter_dering_direction_8x8)(uint16_t *y, int ystride, const uint16_t *in, int threshold, int dir, int damping);
-
 void aom_rtcd(void);

 #ifdef RTCD_C
@ -482,28 +482,28 @@ static void setup_rtcd_internal(void)

    (void)flags;

-    aom_clpf_block = aom_clpf_block_sse2;
-    if (flags & HAS_SSSE3) aom_clpf_block = aom_clpf_block_ssse3;
-    if (flags & HAS_SSE4_1) aom_clpf_block = aom_clpf_block_sse4_1;
-    aom_clpf_block_hbd = aom_clpf_block_hbd_sse2;
-    if (flags & HAS_SSSE3) aom_clpf_block_hbd = aom_clpf_block_hbd_ssse3;
-    if (flags & HAS_SSE4_1) aom_clpf_block_hbd = aom_clpf_block_hbd_sse4_1;
-    aom_clpf_hblock = aom_clpf_hblock_sse2;
-    if (flags & HAS_SSSE3) aom_clpf_hblock = aom_clpf_hblock_ssse3;
-    if (flags & HAS_SSE4_1) aom_clpf_hblock = aom_clpf_hblock_sse4_1;
-    aom_clpf_hblock_hbd = aom_clpf_hblock_hbd_sse2;
-    if (flags & HAS_SSSE3) aom_clpf_hblock_hbd = aom_clpf_hblock_hbd_ssse3;
-    if (flags & HAS_SSE4_1) aom_clpf_hblock_hbd = aom_clpf_hblock_hbd_sse4_1;
+    apply_selfguided_restoration = apply_selfguided_restoration_c;
+    if (flags & HAS_SSE4_1) apply_selfguided_restoration = apply_selfguided_restoration_sse4_1;
+    apply_selfguided_restoration_highbd = apply_selfguided_restoration_highbd_c;
+    if (flags & HAS_SSE4_1) apply_selfguided_restoration_highbd = apply_selfguided_restoration_highbd_sse4_1;
    av1_block_error = av1_block_error_c;
    if (flags & HAS_AVX2) av1_block_error = av1_block_error_avx2;
+    av1_convolve_2d_scale = av1_convolve_2d_scale_c;
+    if (flags & HAS_SSE4_1) av1_convolve_2d_scale = av1_convolve_2d_scale_sse4_1;
    av1_convolve_horiz = av1_convolve_horiz_c;
    if (flags & HAS_SSSE3) av1_convolve_horiz = av1_convolve_horiz_ssse3;
+    av1_convolve_rounding = av1_convolve_rounding_c;
+    if (flags & HAS_AVX2) av1_convolve_rounding = av1_convolve_rounding_avx2;
    av1_convolve_vert = av1_convolve_vert_c;
    if (flags & HAS_SSSE3) av1_convolve_vert = av1_convolve_vert_ssse3;
    av1_fht16x16 = av1_fht16x16_sse2;
    if (flags & HAS_AVX2) av1_fht16x16 = av1_fht16x16_avx2;
    av1_fht32x32 = av1_fht32x32_sse2;
    if (flags & HAS_AVX2) av1_fht32x32 = av1_fht32x32_avx2;
+    av1_filter_intra_edge = av1_filter_intra_edge_c;
+    if (flags & HAS_SSE4_1) av1_filter_intra_edge = av1_filter_intra_edge_sse4_1;
+    av1_filter_intra_edge_high = av1_filter_intra_edge_high_c;
+    if (flags & HAS_SSE4_1) av1_filter_intra_edge_high = av1_filter_intra_edge_high_sse4_1;
    av1_full_search_sad = av1_full_search_sad_c;
    if (flags & HAS_SSE3) av1_full_search_sad = av1_full_search_sadx3;
    if (flags & HAS_SSE4_1) av1_full_search_sad = av1_full_search_sadx8;
@ -513,14 +513,18 @@ static void setup_rtcd_internal(void)
    if (flags & HAS_SSE4_1) av1_fwd_txfm2d_32x32 = av1_fwd_txfm2d_32x32_sse4_1;
    av1_fwd_txfm2d_4x4 = av1_fwd_txfm2d_4x4_c;
    if (flags & HAS_SSE4_1) av1_fwd_txfm2d_4x4 = av1_fwd_txfm2d_4x4_sse4_1;
-    av1_fwd_txfm2d_64x64 = av1_fwd_txfm2d_64x64_c;
-    if (flags & HAS_SSE4_1) av1_fwd_txfm2d_64x64 = av1_fwd_txfm2d_64x64_sse4_1;
    av1_fwd_txfm2d_8x8 = av1_fwd_txfm2d_8x8_c;
    if (flags & HAS_SSE4_1) av1_fwd_txfm2d_8x8 = av1_fwd_txfm2d_8x8_sse4_1;
+    av1_highbd_convolve_2d = av1_highbd_convolve_2d_c;
+    if (flags & HAS_SSSE3) av1_highbd_convolve_2d = av1_highbd_convolve_2d_ssse3;
+    av1_highbd_convolve_2d_scale = av1_highbd_convolve_2d_scale_c;
+    if (flags & HAS_SSE4_1) av1_highbd_convolve_2d_scale = av1_highbd_convolve_2d_scale_sse4_1;
    av1_highbd_convolve_horiz = av1_highbd_convolve_horiz_c;
    if (flags & HAS_SSE4_1) av1_highbd_convolve_horiz = av1_highbd_convolve_horiz_sse4_1;
    av1_highbd_convolve_init = av1_highbd_convolve_init_c;
    if (flags & HAS_SSE4_1) av1_highbd_convolve_init = av1_highbd_convolve_init_sse4_1;
+    av1_highbd_convolve_rounding = av1_highbd_convolve_rounding_c;
+    if (flags & HAS_AVX2) av1_highbd_convolve_rounding = av1_highbd_convolve_rounding_avx2;
    av1_highbd_convolve_vert = av1_highbd_convolve_vert_c;
    if (flags & HAS_SSE4_1) av1_highbd_convolve_vert = av1_highbd_convolve_vert_sse4_1;
    av1_highbd_quantize_fp = av1_highbd_quantize_fp_c;
@ -528,6 +532,10 @@ static void setup_rtcd_internal(void)
    if (flags & HAS_AVX2) av1_highbd_quantize_fp = av1_highbd_quantize_fp_avx2;
    av1_highbd_warp_affine = av1_highbd_warp_affine_c;
    if (flags & HAS_SSSE3) av1_highbd_warp_affine = av1_highbd_warp_affine_ssse3;
+    av1_highpass_filter = av1_highpass_filter_c;
+    if (flags & HAS_SSE4_1) av1_highpass_filter = av1_highpass_filter_sse4_1;
+    av1_highpass_filter_highbd = av1_highpass_filter_highbd_c;
+    if (flags & HAS_SSE4_1) av1_highpass_filter_highbd = av1_highpass_filter_highbd_sse4_1;
    av1_iht16x16_256_add = av1_iht16x16_256_add_sse2;
    if (flags & HAS_AVX2) av1_iht16x16_256_add = av1_iht16x16_256_add_avx2;
    av1_inv_txfm2d_add_16x16 = av1_inv_txfm2d_add_16x16_c;
@ -544,37 +552,34 @@ static void setup_rtcd_internal(void)
    if (flags & HAS_AVX2) av1_quantize_fp = av1_quantize_fp_avx2;
    av1_quantize_fp_32x32 = av1_quantize_fp_32x32_c;
    if (flags & HAS_AVX2) av1_quantize_fp_32x32 = av1_quantize_fp_32x32_avx2;
+    av1_selfguided_restoration = av1_selfguided_restoration_c;
+    if (flags & HAS_SSE4_1) av1_selfguided_restoration = av1_selfguided_restoration_sse4_1;
+    av1_selfguided_restoration_highbd = av1_selfguided_restoration_highbd_c;
+    if (flags & HAS_SSE4_1) av1_selfguided_restoration_highbd = av1_selfguided_restoration_highbd_sse4_1;
+    av1_upsample_intra_edge = av1_upsample_intra_edge_c;
+    if (flags & HAS_SSE4_1) av1_upsample_intra_edge = av1_upsample_intra_edge_sse4_1;
+    av1_upsample_intra_edge_high = av1_upsample_intra_edge_high_c;
+    if (flags & HAS_SSE4_1) av1_upsample_intra_edge_high = av1_upsample_intra_edge_high_sse4_1;
    av1_warp_affine = av1_warp_affine_sse2;
    if (flags & HAS_SSSE3) av1_warp_affine = av1_warp_affine_ssse3;
+    cdef_filter_block = cdef_filter_block_sse2;
+    if (flags & HAS_SSSE3) cdef_filter_block = cdef_filter_block_ssse3;
+    if (flags & HAS_SSE4_1) cdef_filter_block = cdef_filter_block_sse4_1;
+    if (flags & HAS_AVX2) cdef_filter_block = cdef_filter_block_avx2;
+    cdef_find_dir = cdef_find_dir_sse2;
+    if (flags & HAS_SSSE3) cdef_find_dir = cdef_find_dir_ssse3;
+    if (flags & HAS_SSE4_1) cdef_find_dir = cdef_find_dir_sse4_1;
+    if (flags & HAS_AVX2) cdef_find_dir = cdef_find_dir_avx2;
    compute_cross_correlation = compute_cross_correlation_c;
    if (flags & HAS_SSE4_1) compute_cross_correlation = compute_cross_correlation_sse4_1;
-    copy_4x4_16bit_to_16bit = copy_4x4_16bit_to_16bit_sse2;
-    if (flags & HAS_SSSE3) copy_4x4_16bit_to_16bit = copy_4x4_16bit_to_16bit_ssse3;
-    if (flags & HAS_SSE4_1) copy_4x4_16bit_to_16bit = copy_4x4_16bit_to_16bit_sse4_1;
-    copy_4x4_16bit_to_8bit = copy_4x4_16bit_to_8bit_sse2;
-    if (flags & HAS_SSSE3) copy_4x4_16bit_to_8bit = copy_4x4_16bit_to_8bit_ssse3;
-    if (flags & HAS_SSE4_1) copy_4x4_16bit_to_8bit = copy_4x4_16bit_to_8bit_sse4_1;
-    copy_8x8_16bit_to_16bit = copy_8x8_16bit_to_16bit_sse2;
-    if (flags & HAS_SSSE3) copy_8x8_16bit_to_16bit = copy_8x8_16bit_to_16bit_ssse3;
-    if (flags & HAS_SSE4_1) copy_8x8_16bit_to_16bit = copy_8x8_16bit_to_16bit_sse4_1;
-    copy_8x8_16bit_to_8bit = copy_8x8_16bit_to_8bit_sse2;
-    if (flags & HAS_SSSE3) copy_8x8_16bit_to_8bit = copy_8x8_16bit_to_8bit_ssse3;
-    if (flags & HAS_SSE4_1) copy_8x8_16bit_to_8bit = copy_8x8_16bit_to_8bit_sse4_1;
    copy_rect8_16bit_to_16bit = copy_rect8_16bit_to_16bit_sse2;
    if (flags & HAS_SSSE3) copy_rect8_16bit_to_16bit = copy_rect8_16bit_to_16bit_ssse3;
    if (flags & HAS_SSE4_1) copy_rect8_16bit_to_16bit = copy_rect8_16bit_to_16bit_sse4_1;
+    if (flags & HAS_AVX2) copy_rect8_16bit_to_16bit = copy_rect8_16bit_to_16bit_avx2;
    copy_rect8_8bit_to_16bit = copy_rect8_8bit_to_16bit_sse2;
    if (flags & HAS_SSSE3) copy_rect8_8bit_to_16bit = copy_rect8_8bit_to_16bit_ssse3;
    if (flags & HAS_SSE4_1) copy_rect8_8bit_to_16bit = copy_rect8_8bit_to_16bit_sse4_1;
-    od_dir_find8 = od_dir_find8_sse2;
-    if (flags & HAS_SSSE3) od_dir_find8 = od_dir_find8_ssse3;
-    if (flags & HAS_SSE4_1) od_dir_find8 = od_dir_find8_sse4_1;
-    od_filter_dering_direction_4x4 = od_filter_dering_direction_4x4_sse2;
-    if (flags & HAS_SSSE3) od_filter_dering_direction_4x4 = od_filter_dering_direction_4x4_ssse3;
-    if (flags & HAS_SSE4_1) od_filter_dering_direction_4x4 = od_filter_dering_direction_4x4_sse4_1;
-    od_filter_dering_direction_8x8 = od_filter_dering_direction_8x8_sse2;
-    if (flags & HAS_SSSE3) od_filter_dering_direction_8x8 = od_filter_dering_direction_8x8_ssse3;
-    if (flags & HAS_SSE4_1) od_filter_dering_direction_8x8 = od_filter_dering_direction_8x8_sse4_1;
+    if (flags & HAS_AVX2) copy_rect8_8bit_to_16bit = copy_rect8_8bit_to_16bit_avx2;
 }
 #endif

--- a/media/libaom/sources.mozbuild
+++ b/media/libaom/sources.mozbuild
@ -54,8 +54,10 @@ files = {
    '../../third_party/aom/aom_dsp/variance.c',
    '../../third_party/aom/aom_dsp/x86/aom_asm_stubs.c',
    '../../third_party/aom/aom_dsp/x86/aom_convolve_copy_sse2.asm',
+    '../../third_party/aom/aom_dsp/x86/aom_convolve_hip_sse2.c',
    '../../third_party/aom/aom_dsp/x86/aom_high_subpixel_8t_sse2.asm',
    '../../third_party/aom/aom_dsp/x86/aom_high_subpixel_bilinear_sse2.asm',
+    '../../third_party/aom/aom_dsp/x86/aom_highbd_convolve_hip_ssse3.c',
    '../../third_party/aom/aom_dsp/x86/aom_subpixel_8t_intrin_avx2.c',
    '../../third_party/aom/aom_dsp/x86/aom_subpixel_8t_intrin_ssse3.c',
    '../../third_party/aom/aom_dsp/x86/aom_subpixel_8t_sse2.asm',
@ -74,7 +76,11 @@ files = {
    '../../third_party/aom/aom_dsp/x86/halfpix_variance_impl_sse2.asm',
    '../../third_party/aom/aom_dsp/x86/halfpix_variance_sse2.c',
    '../../third_party/aom/aom_dsp/x86/highbd_convolve_avx2.c',
+    '../../third_party/aom/aom_dsp/x86/highbd_intrapred_avx2.c',
    '../../third_party/aom/aom_dsp/x86/highbd_intrapred_sse2.asm',
+    '../../third_party/aom/aom_dsp/x86/highbd_intrapred_sse2.c',
+    '../../third_party/aom/aom_dsp/x86/highbd_intrapred_ssse3.c',
+    '../../third_party/aom/aom_dsp/x86/highbd_loopfilter_avx2.c',
    '../../third_party/aom/aom_dsp/x86/highbd_loopfilter_sse2.c',
    '../../third_party/aom/aom_dsp/x86/highbd_quantize_intrin_avx2.c',
    '../../third_party/aom/aom_dsp/x86/highbd_quantize_intrin_sse2.c',
@ -85,13 +91,15 @@ files = {
    '../../third_party/aom/aom_dsp/x86/highbd_variance_impl_sse2.asm',
    '../../third_party/aom/aom_dsp/x86/highbd_variance_sse2.c',
    '../../third_party/aom/aom_dsp/x86/highbd_variance_sse4.c',
+    '../../third_party/aom/aom_dsp/x86/intrapred_avx2.c',
    '../../third_party/aom/aom_dsp/x86/intrapred_sse2.asm',
+    '../../third_party/aom/aom_dsp/x86/intrapred_sse2.c',
    '../../third_party/aom/aom_dsp/x86/intrapred_ssse3.asm',
+    '../../third_party/aom/aom_dsp/x86/intrapred_ssse3.c',
    '../../third_party/aom/aom_dsp/x86/inv_txfm_avx2.c',
    '../../third_party/aom/aom_dsp/x86/inv_txfm_sse2.c',
    '../../third_party/aom/aom_dsp/x86/inv_txfm_ssse3.c',
    '../../third_party/aom/aom_dsp/x86/inv_wht_sse2.asm',
-    '../../third_party/aom/aom_dsp/x86/loopfilter_avx2.c',
    '../../third_party/aom/aom_dsp/x86/loopfilter_sse2.c',
    '../../third_party/aom/aom_dsp/x86/masked_sad_intrin_ssse3.c',
    '../../third_party/aom/aom_dsp/x86/masked_variance_intrin_ssse3.c',
@ -138,10 +146,11 @@ files = {
    '../../third_party/aom/av1/common/av1_rtcd.c',
    '../../third_party/aom/av1/common/blockd.c',
    '../../third_party/aom/av1/common/cdef.c',
-    '../../third_party/aom/av1/common/clpf.c',
-    '../../third_party/aom/av1/common/clpf_sse2.c',
-    '../../third_party/aom/av1/common/clpf_sse4.c',
-    '../../third_party/aom/av1/common/clpf_ssse3.c',
+    '../../third_party/aom/av1/common/cdef_block.c',
+    '../../third_party/aom/av1/common/cdef_block_avx2.c',
+    '../../third_party/aom/av1/common/cdef_block_sse2.c',
+    '../../third_party/aom/av1/common/cdef_block_sse4.c',
+    '../../third_party/aom/av1/common/cdef_block_ssse3.c',
    '../../third_party/aom/av1/common/convolve.c',
    '../../third_party/aom/av1/common/daala_tx.c',
    '../../third_party/aom/av1/common/debugmodes.c',
@ -152,16 +161,13 @@ files = {
    '../../third_party/aom/av1/common/frame_buffers.c',
    '../../third_party/aom/av1/common/idct.c',
    '../../third_party/aom/av1/common/mvref_common.c',
-    '../../third_party/aom/av1/common/od_dering.c',
-    '../../third_party/aom/av1/common/od_dering_sse2.c',
-    '../../third_party/aom/av1/common/od_dering_sse4.c',
-    '../../third_party/aom/av1/common/od_dering_ssse3.c',
    '../../third_party/aom/av1/common/odintrin.c',
    '../../third_party/aom/av1/common/pred_common.c',
    '../../third_party/aom/av1/common/quant_common.c',
    '../../third_party/aom/av1/common/reconinter.c',
    '../../third_party/aom/av1/common/reconintra.c',
    '../../third_party/aom/av1/common/resize.c',
+    '../../third_party/aom/av1/common/restoration.c',
    '../../third_party/aom/av1/common/scale.c',
    '../../third_party/aom/av1/common/scan.c',
    '../../third_party/aom/av1/common/seg_common.c',
@ -172,12 +178,16 @@ files = {
    '../../third_party/aom/av1/common/x86/av1_fwd_txfm1d_sse4.c',
    '../../third_party/aom/av1/common/x86/av1_fwd_txfm2d_sse4.c',
    '../../third_party/aom/av1/common/x86/av1_highbd_convolve_sse4.c',
+    '../../third_party/aom/av1/common/x86/convolve_2d_sse2.c',
    '../../third_party/aom/av1/common/x86/convolve_avx2.c',
+    '../../third_party/aom/av1/common/x86/highbd_convolve_2d_ssse3.c',
    '../../third_party/aom/av1/common/x86/highbd_inv_txfm_avx2.c',
    '../../third_party/aom/av1/common/x86/highbd_inv_txfm_sse4.c',
    '../../third_party/aom/av1/common/x86/highbd_warp_plane_ssse3.c',
    '../../third_party/aom/av1/common/x86/hybrid_inv_txfm_avx2.c',
    '../../third_party/aom/av1/common/x86/idct_intrin_sse2.c',
+    '../../third_party/aom/av1/common/x86/intra_edge_sse4.c',
+    '../../third_party/aom/av1/common/x86/selfguided_sse4.c',
    '../../third_party/aom/av1/common/x86/warp_plane_sse2.c',
    '../../third_party/aom/av1/common/x86/warp_plane_ssse3.c',
    '../../third_party/aom/av1/decoder/decodeframe.c',
@ -204,6 +214,7 @@ files = {
    '../../third_party/aom/av1/encoder/extend.c',
    '../../third_party/aom/av1/encoder/firstpass.c',
    '../../third_party/aom/av1/encoder/global_motion.c',
+    '../../third_party/aom/av1/encoder/hash.c',
    '../../third_party/aom/av1/encoder/hybrid_fwd_txfm.c',
    '../../third_party/aom/av1/encoder/lookahead.c',
    '../../third_party/aom/av1/encoder/mbgraph.c',
@ -211,6 +222,7 @@ files = {
    '../../third_party/aom/av1/encoder/palette.c',
    '../../third_party/aom/av1/encoder/pickcdef.c',
    '../../third_party/aom/av1/encoder/picklpf.c',
+    '../../third_party/aom/av1/encoder/pickrst.c',
    '../../third_party/aom/av1/encoder/ransac.c',
    '../../third_party/aom/av1/encoder/ratectrl.c',
    '../../third_party/aom/av1/encoder/rd.c',
@ -230,7 +242,6 @@ files = {
    '../../third_party/aom/av1/encoder/x86/corner_match_sse4.c',
    '../../third_party/aom/av1/encoder/x86/dct_intrin_sse2.c',
    '../../third_party/aom/av1/encoder/x86/dct_sse2.asm',
-    '../../third_party/aom/av1/encoder/x86/dct_ssse3.c',
    '../../third_party/aom/av1/encoder/x86/error_intrin_avx2.c',
    '../../third_party/aom/av1/encoder/x86/error_sse2.asm',
    '../../third_party/aom/av1/encoder/x86/highbd_block_error_intrin_sse2.c',
@ -292,8 +303,10 @@ files = {
    '../../third_party/aom/aom_dsp/variance.c',
    '../../third_party/aom/aom_dsp/x86/aom_asm_stubs.c',
    '../../third_party/aom/aom_dsp/x86/aom_convolve_copy_sse2.asm',
+    '../../third_party/aom/aom_dsp/x86/aom_convolve_hip_sse2.c',
    '../../third_party/aom/aom_dsp/x86/aom_high_subpixel_8t_sse2.asm',
    '../../third_party/aom/aom_dsp/x86/aom_high_subpixel_bilinear_sse2.asm',
+    '../../third_party/aom/aom_dsp/x86/aom_highbd_convolve_hip_ssse3.c',
    '../../third_party/aom/aom_dsp/x86/aom_subpixel_8t_intrin_avx2.c',
    '../../third_party/aom/aom_dsp/x86/aom_subpixel_8t_intrin_ssse3.c',
    '../../third_party/aom/aom_dsp/x86/aom_subpixel_8t_sse2.asm',
@ -310,7 +323,11 @@ files = {
    '../../third_party/aom/aom_dsp/x86/halfpix_variance_impl_sse2.asm',
    '../../third_party/aom/aom_dsp/x86/halfpix_variance_sse2.c',
    '../../third_party/aom/aom_dsp/x86/highbd_convolve_avx2.c',
+    '../../third_party/aom/aom_dsp/x86/highbd_intrapred_avx2.c',
    '../../third_party/aom/aom_dsp/x86/highbd_intrapred_sse2.asm',
+    '../../third_party/aom/aom_dsp/x86/highbd_intrapred_sse2.c',
+    '../../third_party/aom/aom_dsp/x86/highbd_intrapred_ssse3.c',
+    '../../third_party/aom/aom_dsp/x86/highbd_loopfilter_avx2.c',
    '../../third_party/aom/aom_dsp/x86/highbd_loopfilter_sse2.c',
    '../../third_party/aom/aom_dsp/x86/highbd_quantize_intrin_avx2.c',
    '../../third_party/aom/aom_dsp/x86/highbd_quantize_intrin_sse2.c',
@ -321,13 +338,15 @@ files = {
    '../../third_party/aom/aom_dsp/x86/highbd_variance_impl_sse2.asm',
    '../../third_party/aom/aom_dsp/x86/highbd_variance_sse2.c',
    '../../third_party/aom/aom_dsp/x86/highbd_variance_sse4.c',
+    '../../third_party/aom/aom_dsp/x86/intrapred_avx2.c',
    '../../third_party/aom/aom_dsp/x86/intrapred_sse2.asm',
+    '../../third_party/aom/aom_dsp/x86/intrapred_sse2.c',
    '../../third_party/aom/aom_dsp/x86/intrapred_ssse3.asm',
+    '../../third_party/aom/aom_dsp/x86/intrapred_ssse3.c',
    '../../third_party/aom/aom_dsp/x86/inv_txfm_avx2.c',
    '../../third_party/aom/aom_dsp/x86/inv_txfm_sse2.c',
    '../../third_party/aom/aom_dsp/x86/inv_txfm_ssse3.c',
    '../../third_party/aom/aom_dsp/x86/inv_wht_sse2.asm',
-    '../../third_party/aom/aom_dsp/x86/loopfilter_avx2.c',
    '../../third_party/aom/aom_dsp/x86/loopfilter_sse2.c',
    '../../third_party/aom/aom_dsp/x86/masked_sad_intrin_ssse3.c',
    '../../third_party/aom/aom_dsp/x86/masked_variance_intrin_ssse3.c',
@ -371,10 +390,11 @@ files = {
    '../../third_party/aom/av1/common/av1_rtcd.c',
    '../../third_party/aom/av1/common/blockd.c',
    '../../third_party/aom/av1/common/cdef.c',
-    '../../third_party/aom/av1/common/clpf.c',
-    '../../third_party/aom/av1/common/clpf_sse2.c',
-    '../../third_party/aom/av1/common/clpf_sse4.c',
-    '../../third_party/aom/av1/common/clpf_ssse3.c',
+    '../../third_party/aom/av1/common/cdef_block.c',
+    '../../third_party/aom/av1/common/cdef_block_avx2.c',
+    '../../third_party/aom/av1/common/cdef_block_sse2.c',
+    '../../third_party/aom/av1/common/cdef_block_sse4.c',
+    '../../third_party/aom/av1/common/cdef_block_ssse3.c',
    '../../third_party/aom/av1/common/convolve.c',
    '../../third_party/aom/av1/common/daala_tx.c',
    '../../third_party/aom/av1/common/debugmodes.c',
@ -385,16 +405,13 @@ files = {
    '../../third_party/aom/av1/common/frame_buffers.c',
    '../../third_party/aom/av1/common/idct.c',
    '../../third_party/aom/av1/common/mvref_common.c',
-    '../../third_party/aom/av1/common/od_dering.c',
-    '../../third_party/aom/av1/common/od_dering_sse2.c',
-    '../../third_party/aom/av1/common/od_dering_sse4.c',
-    '../../third_party/aom/av1/common/od_dering_ssse3.c',
    '../../third_party/aom/av1/common/odintrin.c',
    '../../third_party/aom/av1/common/pred_common.c',
    '../../third_party/aom/av1/common/quant_common.c',
    '../../third_party/aom/av1/common/reconinter.c',
    '../../third_party/aom/av1/common/reconintra.c',
    '../../third_party/aom/av1/common/resize.c',
+    '../../third_party/aom/av1/common/restoration.c',
    '../../third_party/aom/av1/common/scale.c',
    '../../third_party/aom/av1/common/scan.c',
    '../../third_party/aom/av1/common/seg_common.c',
@ -405,12 +422,16 @@ files = {
    '../../third_party/aom/av1/common/x86/av1_fwd_txfm1d_sse4.c',
    '../../third_party/aom/av1/common/x86/av1_fwd_txfm2d_sse4.c',
    '../../third_party/aom/av1/common/x86/av1_highbd_convolve_sse4.c',
+    '../../third_party/aom/av1/common/x86/convolve_2d_sse2.c',
    '../../third_party/aom/av1/common/x86/convolve_avx2.c',
+    '../../third_party/aom/av1/common/x86/highbd_convolve_2d_ssse3.c',
    '../../third_party/aom/av1/common/x86/highbd_inv_txfm_avx2.c',
    '../../third_party/aom/av1/common/x86/highbd_inv_txfm_sse4.c',
    '../../third_party/aom/av1/common/x86/highbd_warp_plane_ssse3.c',
    '../../third_party/aom/av1/common/x86/hybrid_inv_txfm_avx2.c',
    '../../third_party/aom/av1/common/x86/idct_intrin_sse2.c',
+    '../../third_party/aom/av1/common/x86/intra_edge_sse4.c',
+    '../../third_party/aom/av1/common/x86/selfguided_sse4.c',
    '../../third_party/aom/av1/common/x86/warp_plane_sse2.c',
    '../../third_party/aom/av1/common/x86/warp_plane_ssse3.c',
    '../../third_party/aom/av1/decoder/decodeframe.c',
@ -437,6 +458,7 @@ files = {
    '../../third_party/aom/av1/encoder/extend.c',
    '../../third_party/aom/av1/encoder/firstpass.c',
    '../../third_party/aom/av1/encoder/global_motion.c',
+    '../../third_party/aom/av1/encoder/hash.c',
    '../../third_party/aom/av1/encoder/hybrid_fwd_txfm.c',
    '../../third_party/aom/av1/encoder/lookahead.c',
    '../../third_party/aom/av1/encoder/mbgraph.c',
@ -444,6 +466,7 @@ files = {
    '../../third_party/aom/av1/encoder/palette.c',
    '../../third_party/aom/av1/encoder/pickcdef.c',
    '../../third_party/aom/av1/encoder/picklpf.c',
+    '../../third_party/aom/av1/encoder/pickrst.c',
    '../../third_party/aom/av1/encoder/ransac.c',
    '../../third_party/aom/av1/encoder/ratectrl.c',
    '../../third_party/aom/av1/encoder/rd.c',
@ -462,7 +485,6 @@ files = {
    '../../third_party/aom/av1/encoder/x86/corner_match_sse4.c',
    '../../third_party/aom/av1/encoder/x86/dct_intrin_sse2.c',
    '../../third_party/aom/av1/encoder/x86/dct_sse2.asm',
-    '../../third_party/aom/av1/encoder/x86/dct_ssse3.c',
    '../../third_party/aom/av1/encoder/x86/error_intrin_avx2.c',
    '../../third_party/aom/av1/encoder/x86/error_sse2.asm',
    '../../third_party/aom/av1/encoder/x86/highbd_block_error_intrin_sse2.c',
@ -517,11 +539,6 @@ files = {
    '../../third_party/aom/aom_dsp/arm/idct8x8_add_neon.asm',
    '../../third_party/aom/aom_dsp/arm/intrapred_neon.c',
    '../../third_party/aom/aom_dsp/arm/intrapred_neon_asm.asm',
-    '../../third_party/aom/aom_dsp/arm/loopfilter_16_neon.asm',
-    '../../third_party/aom/aom_dsp/arm/loopfilter_4_neon.asm',
-    '../../third_party/aom/aom_dsp/arm/loopfilter_8_neon.asm',
-    '../../third_party/aom/aom_dsp/arm/loopfilter_mb_neon.asm',
-    '../../third_party/aom/aom_dsp/arm/loopfilter_neon.c',
    '../../third_party/aom/aom_dsp/arm/sad4d_neon.c',
    '../../third_party/aom/aom_dsp/arm/sad_neon.c',
    '../../third_party/aom/aom_dsp/arm/save_reg_neon.asm',
@ -574,8 +591,8 @@ files = {
    '../../third_party/aom/av1/common/av1_rtcd.c',
    '../../third_party/aom/av1/common/blockd.c',
    '../../third_party/aom/av1/common/cdef.c',
-    '../../third_party/aom/av1/common/clpf.c',
-    '../../third_party/aom/av1/common/clpf_neon.c',
+    '../../third_party/aom/av1/common/cdef_block.c',
+    '../../third_party/aom/av1/common/cdef_block_neon.c',
    '../../third_party/aom/av1/common/convolve.c',
    '../../third_party/aom/av1/common/daala_tx.c',
    '../../third_party/aom/av1/common/debugmodes.c',
@ -586,14 +603,13 @@ files = {
    '../../third_party/aom/av1/common/frame_buffers.c',
    '../../third_party/aom/av1/common/idct.c',
    '../../third_party/aom/av1/common/mvref_common.c',
-    '../../third_party/aom/av1/common/od_dering.c',
-    '../../third_party/aom/av1/common/od_dering_neon.c',
    '../../third_party/aom/av1/common/odintrin.c',
    '../../third_party/aom/av1/common/pred_common.c',
    '../../third_party/aom/av1/common/quant_common.c',
    '../../third_party/aom/av1/common/reconinter.c',
    '../../third_party/aom/av1/common/reconintra.c',
    '../../third_party/aom/av1/common/resize.c',
+    '../../third_party/aom/av1/common/restoration.c',
    '../../third_party/aom/av1/common/scale.c',
    '../../third_party/aom/av1/common/scan.c',
    '../../third_party/aom/av1/common/seg_common.c',
@ -625,6 +641,7 @@ files = {
    '../../third_party/aom/av1/encoder/extend.c',
    '../../third_party/aom/av1/encoder/firstpass.c',
    '../../third_party/aom/av1/encoder/global_motion.c',
+    '../../third_party/aom/av1/encoder/hash.c',
    '../../third_party/aom/av1/encoder/hybrid_fwd_txfm.c',
    '../../third_party/aom/av1/encoder/lookahead.c',
    '../../third_party/aom/av1/encoder/mbgraph.c',
@ -632,6 +649,7 @@ files = {
    '../../third_party/aom/av1/encoder/palette.c',
    '../../third_party/aom/av1/encoder/pickcdef.c',
    '../../third_party/aom/av1/encoder/picklpf.c',
+    '../../third_party/aom/av1/encoder/pickrst.c',
    '../../third_party/aom/av1/encoder/ransac.c',
    '../../third_party/aom/av1/encoder/ratectrl.c',
    '../../third_party/aom/av1/encoder/rd.c',
@ -715,7 +733,7 @@ files = {
    '../../third_party/aom/av1/common/av1_rtcd.c',
    '../../third_party/aom/av1/common/blockd.c',
    '../../third_party/aom/av1/common/cdef.c',
-    '../../third_party/aom/av1/common/clpf.c',
+    '../../third_party/aom/av1/common/cdef_block.c',
    '../../third_party/aom/av1/common/convolve.c',
    '../../third_party/aom/av1/common/daala_tx.c',
    '../../third_party/aom/av1/common/debugmodes.c',
@ -726,13 +744,13 @@ files = {
    '../../third_party/aom/av1/common/frame_buffers.c',
    '../../third_party/aom/av1/common/idct.c',
    '../../third_party/aom/av1/common/mvref_common.c',
-    '../../third_party/aom/av1/common/od_dering.c',
    '../../third_party/aom/av1/common/odintrin.c',
    '../../third_party/aom/av1/common/pred_common.c',
    '../../third_party/aom/av1/common/quant_common.c',
    '../../third_party/aom/av1/common/reconinter.c',
    '../../third_party/aom/av1/common/reconintra.c',
    '../../third_party/aom/av1/common/resize.c',
+    '../../third_party/aom/av1/common/restoration.c',
    '../../third_party/aom/av1/common/scale.c',
    '../../third_party/aom/av1/common/scan.c',
    '../../third_party/aom/av1/common/seg_common.c',
@ -763,6 +781,7 @@ files = {
    '../../third_party/aom/av1/encoder/extend.c',
    '../../third_party/aom/av1/encoder/firstpass.c',
    '../../third_party/aom/av1/encoder/global_motion.c',
+    '../../third_party/aom/av1/encoder/hash.c',
    '../../third_party/aom/av1/encoder/hybrid_fwd_txfm.c',
    '../../third_party/aom/av1/encoder/lookahead.c',
    '../../third_party/aom/av1/encoder/mbgraph.c',
@ -770,6 +789,7 @@ files = {
    '../../third_party/aom/av1/encoder/palette.c',
    '../../third_party/aom/av1/encoder/pickcdef.c',
    '../../third_party/aom/av1/encoder/picklpf.c',
+    '../../third_party/aom/av1/encoder/pickrst.c',
    '../../third_party/aom/av1/encoder/ransac.c',
    '../../third_party/aom/av1/encoder/ratectrl.c',
    '../../third_party/aom/av1/encoder/rd.c',
--- a/third_party/aom/.clang-format
+++ b/third_party/aom/.clang-format
@ -1,7 +1,7 @@
 ---
 Language:        Cpp
 # BasedOnStyle:  Google
-# Generated with clang-format 3.9.1
+# Generated with clang-format 4.0.1
 AccessModifierOffset: -1
 AlignAfterOpenBracket: Align
 AlignConsecutiveAssignments: false
@ -60,6 +60,8 @@ IncludeIsMainRegex: '([-_](test|unittest))?$'
 IndentCaseLabels: true
 IndentWidth:     2
 IndentWrappedFunctionNames: false
+JavaScriptQuotes: Leave
+JavaScriptWrapImports: true
 KeepEmptyLinesAtTheStartOfBlocks: false
 MacroBlockBegin: ''
 MacroBlockEnd:   ''
@ -78,6 +80,7 @@ PointerAlignment: Right
 ReflowComments:  true
 SortIncludes:    false
 SpaceAfterCStyleCast: false
+SpaceAfterTemplateKeyword: true
 SpaceBeforeAssignmentOperators: true
 SpaceBeforeParens: ControlStatements
 SpaceInEmptyParentheses: false
--- a/third_party/aom/CMakeLists.txt
+++ b/third_party/aom/CMakeLists.txt
@ -12,17 +12,22 @@ cmake_minimum_required(VERSION 3.5)

 if (NOT EMSCRIPTEN)
  if (NOT CMAKE_BUILD_TYPE)
-    set(CMAKE_BUILD_TYPE "RelWithDebInfo" CACHE
+    set(CMAKE_BUILD_TYPE "Release" CACHE
      "Build type: Debug, Release, RelWithDebInfo or MinSizeRel" STRING FORCE)
  endif ()
 endif ()

+option(ENABLE_ADOPTED_EXPERIMENTS "Enable adopted experiments." ON)
 option(ENABLE_CCACHE "Enable ccache support." OFF)
 option(ENABLE_DISTCC "Enable distcc support." OFF)
 option(ENABLE_DOCS "Enable documentation generation (doxygen required)." ON)
-option(ENABLE_NASM "Use nasm instead of yasm for x86 assembly." OFF)
+option(ENABLE_EXAMPLES "Enables build of example code." ON)
+option(ENABLE_GOMA "Enable goma support." OFF)
 option(ENABLE_IDE_TEST_HOSTING
       "Enables running tests within IDEs like Visual Studio and Xcode." OFF)
+option(ENABLE_NASM "Use nasm instead of yasm for x86 assembly." OFF)
+option(ENABLE_TOOLS "Enable applications in tools sub directory." ON)
+option(ENABLE_WERROR "Converts warnings to errors at compile time." OFF)

 # $BUILD_SHARED_LIBS is a CMake built-in-- it's listed here for visibility.
 option(BUILD_SHARED_LIBS "CMake should generate a shared library build." OFF)
@ -47,6 +52,7 @@ include("${AOM_ROOT}/aom_scale/aom_scale.cmake")
 include("${AOM_ROOT}/aom_util/aom_util.cmake")
 include("${AOM_ROOT}/av1/av1.cmake")
 include("${AOM_ROOT}/test/test.cmake")
+include("${AOM_ROOT}/build/cmake/sanitizers.cmake")
 include("${AOM_ROOT}/build/cmake/util.cmake")

 set(AOM_RTCD_SOURCES
@ -160,6 +166,10 @@ set(AOM_ENCODER_STATS_SOURCES
    "${AOM_ROOT}/rate_hist.c"
    "${AOM_ROOT}/rate_hist.h")

+set(AOM_PKG_CONFIG_SOURCES "${AOM_CONFIG_DIR}/aom.pc")
+
+set(AOM_VERSION_SOURCES "${AOM_CONFIG_DIR}/aom_version.h")
+
 set(AOM_WEBM_DECODER_SOURCES
    "${AOM_ROOT}/webmdec.cc"
    "${AOM_ROOT}/webmdec.h")
@ -171,6 +181,48 @@ set(AOM_WEBM_ENCODER_SOURCES
 include_directories(${AOM_ROOT} ${AOM_CONFIG_DIR})

 # Targets
+add_library(aom_version ${AOM_VERSION_SOURCES})
+add_dummy_source_file_to_target(aom_version c)
+add_custom_command(
+  OUTPUT "${AOM_CONFIG_DIR}/aom_version.h"
+  COMMAND ${CMAKE_COMMAND}
+  ARGS -DAOM_CONFIG_DIR=${AOM_CONFIG_DIR}
+    -DAOM_ROOT=${AOM_ROOT}
+    -DGIT_EXECUTABLE=${GIT_EXECUTABLE}
+    -DPERL_EXECUTABLE=${PERL_EXECUTABLE}
+    -P "${AOM_ROOT}/build/cmake/version.cmake"
+  COMMENT "Writing aom_version.h"
+  VERBATIM)
+
+add_custom_target(aom_version_check
+  COMMAND ${CMAKE_COMMAND}
+    -DAOM_CONFIG_DIR=${AOM_CONFIG_DIR}
+    -DAOM_ROOT=${AOM_ROOT}
+    -DGIT_EXECUTABLE=${GIT_EXECUTABLE}
+    -DPERL_EXECUTABLE=${PERL_EXECUTABLE}
+    -P "${AOM_ROOT}/build/cmake/version.cmake"
+  COMMENT "Updating version info if necessary."
+  VERBATIM)
+add_dependencies(aom_version aom_version_check)
+
+if (NOT MSVC)
+  add_library(aom_pc ${AOM_PKG_CONFIG_SOURCES})
+  add_dummy_source_file_to_target(aom_pc c)
+  add_custom_command(
+    OUTPUT "${AOM_CONFIG_DIR}/aom.pc"
+    COMMAND ${CMAKE_COMMAND}
+    ARGS -DAOM_CONFIG_DIR=${AOM_CONFIG_DIR}
+      -DAOM_ROOT=${AOM_ROOT}
+      -DCMAKE_INSTALL_PREFIX=${CMAKE_INSTALL_PREFIX}
+      -DCMAKE_PROJECT_NAME=${CMAKE_PROJECT_NAME}
+      -DCONFIG_MULTITHREAD=${CONFIG_MULTITHREAD}
+      -DHAVE_PTHREAD_H=${HAVE_PTHREAD_H}
+      -P "${AOM_ROOT}/build/cmake/pkg_config.cmake"
+      COMMENT "Writing aom.pc"
+      VERBATIM)
+  add_dependencies(aom_pc aom_version)
+endif ()
+
 # TODO(tomfinegan): Move rtcd target setup where it belongs for each rtcd
 # source.
 add_rtcd_build_step("${AOM_ROOT}/aom_dsp/aom_dsp_rtcd_defs.pl"
@ -187,9 +239,15 @@ add_rtcd_build_step("${AOM_ROOT}/av1/common/av1_rtcd_defs.pl"
                    "av1_rtcd")

 add_library(aom_rtcd OBJECT ${AOM_RTCD_SOURCES})
+add_dependencies(aom_rtcd aom_version)
+
 add_library(aom_encoder_stats OBJECT ${AOM_ENCODER_STATS_SOURCES})
 add_library(aom ${AOM_SOURCES} $<TARGET_OBJECTS:aom_rtcd>)

+if (NOT MSVC AND NOT APPLE)
+  target_link_libraries(aom ${AOM_LIB_LINK_TYPE} m)
+endif ()
+
 # List of object and static library targets.
 set(AOM_LIB_TARGETS ${AOM_LIB_TARGETS} aom_rtcd aom_encoder_stats aom_mem
    aom_scale aom)
@ -209,29 +267,30 @@ foreach (aom_lib ${AOM_LIB_TARGETS})
  endif ()
 endforeach ()

+# Generate a stub file containing the C function usage_exit(). Users of the
+# aom_common_app_util library must define this function. This is a convenience
+# to allow omission of the function from applications that might want to use
+# other pieces of the util support without defining usage_exit().
+file(WRITE "${AOM_CONFIG_DIR}/usage_exit.c" "void usage_exit(void) {}")
+
 #
 # Application and application support targets.
 #
-add_library(aom_common_app_util OBJECT ${AOM_COMMON_APP_UTIL_SOURCES})
+if (CONFIG_UNIT_TESTS OR ENABLE_EXAMPLES OR ENABLE_TOOLS)
+  add_library(aom_common_app_util OBJECT ${AOM_COMMON_APP_UTIL_SOURCES})
+  if (CONFIG_AV1_DECODER)
+    add_library(aom_decoder_app_util OBJECT ${AOM_DECODER_APP_UTIL_SOURCES})
+  endif ()
+  if (CONFIG_AV1_ENCODER)
+    add_library(aom_encoder_app_util OBJECT ${AOM_ENCODER_APP_UTIL_SOURCES})
+  endif ()
+endif ()

-
-if (CONFIG_AV1_DECODER)
-  add_library(aom_decoder_app_util OBJECT ${AOM_DECODER_APP_UTIL_SOURCES})
+if (CONFIG_AV1_DECODER AND ENABLE_EXAMPLES)
  add_executable(aomdec
                 "${AOM_ROOT}/aomdec.c"
                 $<TARGET_OBJECTS:aom_common_app_util>
                 $<TARGET_OBJECTS:aom_decoder_app_util>)
-
-  if (CONFIG_ANALYZER)
-    add_executable(analyzer
-                   "${AOM_ROOT}/examples/analyzer.cc"
-                   $<TARGET_OBJECTS:aom_common_app_util>
-                   $<TARGET_OBJECTS:aom_decoder_app_util>)
-    target_link_libraries(analyzer ${AOM_LIB_LINK_TYPE} ${wxWidgets_LIBRARIES})
-    set(AOM_APP_TARGETS ${AOM_APP_TARGETS} analyzer)
-    set(AOM_DECODER_EXAMPLE_TARGETS ${AOM_DECODER_EXAMPLE_TARGETS} analyzer)
-  endif ()
-
  add_executable(decode_to_md5
                 "${AOM_ROOT}/examples/decode_to_md5.c"
                 $<TARGET_OBJECTS:aom_common_app_util>
@ -245,6 +304,17 @@ if (CONFIG_AV1_DECODER)
                 $<TARGET_OBJECTS:aom_common_app_util>
                 $<TARGET_OBJECTS:aom_decoder_app_util>)

+  if (CONFIG_ANALYZER)
+    add_executable(analyzer
+                   "${AOM_ROOT}/examples/analyzer.cc"
+                   $<TARGET_OBJECTS:aom_common_app_util>
+                   $<TARGET_OBJECTS:aom_decoder_app_util>)
+    target_link_libraries(analyzer
+                          ${AOM_LIB_LINK_TYPE} ${wxWidgets_LIBRARIES})
+    set(AOM_APP_TARGETS ${AOM_APP_TARGETS} analyzer)
+    set(AOM_DECODER_EXAMPLE_TARGETS ${AOM_DECODER_EXAMPLE_TARGETS} analyzer)
+  endif ()
+
  if (CONFIG_INSPECTION)
    add_executable(inspect
                   "${AOM_ROOT}/examples/inspect.c"
@ -269,50 +339,81 @@ if (CONFIG_AV1_DECODER)
    endif ()
  endif ()

-  # Maintain lists of example and app targets.
+  # Maintain a list of decoder example targets.
  set(AOM_DECODER_EXAMPLE_TARGETS ${AOM_DECODER_EXAMPLE_TARGETS}
-      decode_to_md5 decode_with_drops simple_decoder)
-  set(AOM_APP_TARGETS ${AOM_APP_TARGETS} aomdec ${AOM_DECODER_EXAMPLE_TARGETS})
-endif ()
+      aomdec decode_to_md5 decode_with_drops simple_decoder)

+  # Add decoder examples to the app targets list.
+  set(AOM_APP_TARGETS ${AOM_APP_TARGETS} ${AOM_DECODER_EXAMPLE_TARGETS})
+endif ()

 if (CONFIG_AV1_ENCODER)
-  add_library(aom_encoder_app_util OBJECT ${AOM_ENCODER_APP_UTIL_SOURCES})
-  add_executable(aomenc
-                 "${AOM_ROOT}/aomenc.c"
-                 $<TARGET_OBJECTS:aom_common_app_util>
-                 $<TARGET_OBJECTS:aom_encoder_app_util>
-                 $<TARGET_OBJECTS:aom_encoder_stats>)
-  add_executable(lossless_encoder
-                 "${AOM_ROOT}/examples/lossless_encoder.c"
-                 $<TARGET_OBJECTS:aom_common_app_util>
-                 $<TARGET_OBJECTS:aom_encoder_app_util>)
-  add_executable(set_maps
-                 "${AOM_ROOT}/examples/set_maps.c"
-                 $<TARGET_OBJECTS:aom_common_app_util>
-                 $<TARGET_OBJECTS:aom_encoder_app_util>)
-  add_executable(simple_encoder
-                 "${AOM_ROOT}/examples/simple_encoder.c"
-                 $<TARGET_OBJECTS:aom_common_app_util>
-                 $<TARGET_OBJECTS:aom_encoder_app_util>)
-  add_executable(twopass_encoder
-                 "${AOM_ROOT}/examples/twopass_encoder.c"
-                 $<TARGET_OBJECTS:aom_common_app_util>
-                 $<TARGET_OBJECTS:aom_encoder_app_util>)
+  if (ENABLE_EXAMPLES)
+    add_executable(aomenc
+                   "${AOM_ROOT}/aomenc.c"
+                   $<TARGET_OBJECTS:aom_common_app_util>
+                   $<TARGET_OBJECTS:aom_encoder_app_util>
+                   $<TARGET_OBJECTS:aom_encoder_stats>)
+    add_executable(lossless_encoder
+                   "${AOM_ROOT}/examples/lossless_encoder.c"
+                   $<TARGET_OBJECTS:aom_common_app_util>
+                   $<TARGET_OBJECTS:aom_encoder_app_util>)
+    add_executable(set_maps
+                   "${AOM_ROOT}/examples/set_maps.c"
+                   $<TARGET_OBJECTS:aom_common_app_util>
+                   $<TARGET_OBJECTS:aom_encoder_app_util>)
+    add_executable(simple_encoder
+                   "${AOM_ROOT}/examples/simple_encoder.c"
+                   $<TARGET_OBJECTS:aom_common_app_util>
+                   $<TARGET_OBJECTS:aom_encoder_app_util>)
+    add_executable(twopass_encoder
+                   "${AOM_ROOT}/examples/twopass_encoder.c"
+                   $<TARGET_OBJECTS:aom_common_app_util>
+                   $<TARGET_OBJECTS:aom_encoder_app_util>)

-  # Add encoder apps and examples to target lists.
-  set(AOM_ENCODER_EXAMPLE_TARGETS
-      lossless_encoder set_maps simple_encoder twopass_encoder)
-  set(AOM_APP_TARGETS ${AOM_APP_TARGETS} aomenc ${AOM_ENCODER_EXAMPLE_TARGETS})
+    # Maintain a list of encoder example targets.
+    set(AOM_ENCODER_EXAMPLE_TARGETS
+        aomenc lossless_encoder set_maps simple_encoder twopass_encoder)
+
+    # Add encoder examples to app target list.
+    set(AOM_APP_TARGETS ${AOM_APP_TARGETS} ${AOM_ENCODER_EXAMPLE_TARGETS})
+  endif ()
+
+  if (ENABLE_TOOLS AND CONFIG_ENTROPY_STATS)
+    # TODO(tomfinegan): Sort out why a simple link command with
+    # aom_entropy_optimizer.c won't work on macos, but dragging in all the
+    # helper machinery allows the link to succeed.
+    add_executable(aom_entropy_optimizer
+                   "${AOM_CONFIG_DIR}/usage_exit.c"
+                   "${AOM_ROOT}/tools/aom_entropy_optimizer.c"
+                   $<TARGET_OBJECTS:aom_common_app_util>
+                   $<TARGET_OBJECTS:aom_encoder_app_util>)
+
+    # Maintain a list of encoder tool targets.
+    set(AOM_ENCODER_TOOL_TARGETS
+        ${AOM_ENCODER_TOOL_TARGETS} aom_entropy_optimizer)
+
+      # Add encoder tools to app target list.
+    set(AOM_APP_TARGETS ${AOM_APP_TARGETS} ${AOM_ENCODER_TOOL_TARGETS})
+  endif ()
 endif ()

-# Maintain a separate variable listing only the examples to facilitate
-# installation of example programs into an examples sub directory of
-# $AOM_DIST_DIR/bin when building the dist target.
-set(AOM_EXAMPLE_TARGETS
-    ${AOM_DECODER_EXAMPLE_TARGETS} ${AOM_ENCODER_EXAMPLE_TARGETS})
+if (ENABLE_EXAMPLES)
+  # Maintain a separate variable listing only the examples to facilitate
+  # installation of example programs into an examples sub directory of
+  # $AOM_DIST_DIR/bin when building the dist target.
+  set(AOM_EXAMPLE_TARGETS
+      ${AOM_DECODER_EXAMPLE_TARGETS} ${AOM_ENCODER_EXAMPLE_TARGETS})
+endif ()

-if (CONFIG_AV1_DECODER AND CONFIG_AV1_ENCODER)
+if (ENABLE_TOOLS)
+  # Maintain a separate variable listing only the tools to facilitate
+  # installation of tool programs into a tools sub directory of
+  # $AOM_DIST_DIR/bin when building the dist target.
+  set(AOM_TOOL_TARGETS ${AOM_DECODER_TOOL_TARGETS} ${AOM_ENCODER_TOOL_TARGETS})
+endif ()
+
+if (ENABLE_EXAMPLES AND CONFIG_AV1_DECODER AND CONFIG_AV1_ENCODER)
  add_executable(aom_cx_set_ref
                 "${AOM_ROOT}/examples/aom_cx_set_ref.c"
                 $<TARGET_OBJECTS:aom_common_app_util>
@ -325,41 +426,45 @@ foreach (aom_app ${AOM_APP_TARGETS})
  target_link_libraries(${aom_app} ${AOM_LIB_LINK_TYPE} aom)
 endforeach ()

-if (CONFIG_LIBYUV)
-  add_library(yuv OBJECT ${AOM_LIBYUV_SOURCES})
-  if (NOT MSVC)
-    target_compile_options(yuv PRIVATE -Wno-unused-parameter)
-  endif ()
-  include_directories("${AOM_ROOT}/third_party/libyuv/include")
+if (CONFIG_UNIT_TESTS OR ENABLE_EXAMPLES OR ENABLE_TOOLS)
+  if (CONFIG_LIBYUV)
+    add_library(yuv OBJECT ${AOM_LIBYUV_SOURCES})
+    if (NOT MSVC)
+      target_compile_options(yuv PRIVATE -Wno-unused-parameter)
+    endif ()
+    include_directories("${AOM_ROOT}/third_party/libyuv/include")

-  # Add to existing targets.
-  foreach (aom_app ${AOM_APP_TARGETS})
-    target_sources(${aom_app} PRIVATE $<TARGET_OBJECTS:yuv>)
-    set_property(TARGET ${aom_app} PROPERTY LINKER_LANGUAGE CXX)
-  endforeach ()
-endif ()
-
-if (CONFIG_WEBM_IO)
-  add_library(webm OBJECT ${AOM_LIBWEBM_SOURCES})
-  include_directories("${AOM_ROOT}/third_party/libwebm")
-
-  if (NOT MSVC)
-    target_compile_options(webm PRIVATE -Wno-shadow)
+    # Add to existing targets.
+    foreach (aom_app ${AOM_APP_TARGETS})
+      target_sources(${aom_app} PRIVATE $<TARGET_OBJECTS:yuv>)
+      set_property(TARGET ${aom_app} PROPERTY LINKER_LANGUAGE CXX)
+    endforeach ()
  endif ()

-  # Add to existing targets.
-  if (CONFIG_AV1_DECODER)
-    target_sources(aom_decoder_app_util PRIVATE ${AOM_WEBM_DECODER_SOURCES})
-  endif ()
+  if (CONFIG_WEBM_IO)
+    add_library(webm OBJECT ${AOM_LIBWEBM_SOURCES})
+    include_directories("${AOM_ROOT}/third_party/libwebm")
+    target_compile_definitions(webm PRIVATE __STDC_CONSTANT_MACROS)
+    target_compile_definitions(webm PRIVATE __STDC_LIMIT_MACROS)

-  if (CONFIG_AV1_ENCODER)
-    target_sources(aom_encoder_app_util PRIVATE ${AOM_WEBM_ENCODER_SOURCES})
-  endif ()
+    if (NOT MSVC)
+      target_compile_options(webm PRIVATE -Wno-shadow)
+    endif ()

-  foreach (aom_app ${AOM_APP_TARGETS})
-    target_sources(${aom_app} PRIVATE $<TARGET_OBJECTS:webm>)
-    set_property(TARGET ${aom_app} PROPERTY LINKER_LANGUAGE CXX)
-   endforeach ()
+    # Add to existing targets.
+    if (CONFIG_AV1_DECODER)
+      target_sources(aom_decoder_app_util PRIVATE ${AOM_WEBM_DECODER_SOURCES})
+    endif ()
+
+    if (CONFIG_AV1_ENCODER)
+      target_sources(aom_encoder_app_util PRIVATE ${AOM_WEBM_ENCODER_SOURCES})
+    endif ()
+
+    foreach (aom_app ${AOM_APP_TARGETS})
+      target_sources(${aom_app} PRIVATE $<TARGET_OBJECTS:webm>)
+      set_property(TARGET ${aom_app} PROPERTY LINKER_LANGUAGE CXX)
+     endforeach ()
+  endif ()
 endif ()

 if (CONFIG_UNIT_TESTS)
@ -390,12 +495,25 @@ if (XCODE)
  endif ()
 endif ()

-if ("${CMAKE_GENERATOR}" MATCHES "Makefiles$" )
+if (ENABLE_EXAMPLES AND "${CMAKE_GENERATOR}" MATCHES "Makefiles$")
  # Users of the configure build expect the example targets to be built in the
  # examples sub directory of the configured build directory after running make.
  file(MAKE_DIRECTORY "${AOM_CONFIG_DIR}/examples")
-  set_target_properties(${AOM_EXAMPLE_TARGETS} PROPERTIES
-                        RUNTIME_OUTPUT_DIRECTORY "${AOM_CONFIG_DIR}/examples")
+
+  foreach (target ${AOM_EXAMPLE_TARGETS})
+    if (NOT "${target}" MATCHES "aomdec\|aomenc")
+      set_target_properties(${target} PROPERTIES
+                            RUNTIME_OUTPUT_DIRECTORY
+                            "${AOM_CONFIG_DIR}/examples")
+    endif ()
+  endforeach ()
+
+  if (ENABLE_TOOLS AND AOM_TOOL_TARGETS)
+    # The same expectation is true for tool targets.
+    file(MAKE_DIRECTORY "${AOM_CONFIG_DIR}/tools")
+    set_target_properties(${AOM_TOOL_TARGETS} PROPERTIES
+                          RUNTIME_OUTPUT_DIRECTORY "${AOM_CONFIG_DIR}/tools")
+  endif ()
 endif ()

 if (BUILD_SHARED_LIBS)
@ -404,6 +522,9 @@ if (BUILD_SHARED_LIBS)
  set_target_properties(aom PROPERTIES SOVERSION 0)
 endif ()

+# Handle user supplied compile and link flags last to ensure they're obeyed.
+set_user_flags()
+
 # Aomedia documentation rule.
 if (ENABLE_DOCS)
  include(FindDoxygen)
@ -426,7 +547,10 @@ set(AOM_INSTALL_INCS
    "${AOM_ROOT}/aom/aom.h")

 if (CONFIG_AV1_DECODER)
-  set(AOM_INSTALL_BINS ${AOM_INSTALL_BINS} aomdec)
+  if (ENABLE_EXAMPLES)
+    set(AOM_INSTALL_BINS ${AOM_INSTALL_BINS} aomdec)
+  endif ()
+
  set(AOM_INSTALL_INCS
      ${AOM_INSTALL_INCS}
      "${AOM_ROOT}/aom/aom_decoder.h"
@ -434,11 +558,14 @@ if (CONFIG_AV1_DECODER)
 endif ()

 if (CONFIG_AV1_ENCODER)
+  if (ENABLE_EXAMPLES)
+    set(AOM_INSTALL_BINS ${AOM_INSTALL_BINS} aomenc)
+  endif ()
+
  set(AOM_INSTALL_INCS
      ${AOM_INSTALL_INCS}
      "${AOM_ROOT}/aom/aomcx.h"
      "${AOM_ROOT}/aom/aom_encoder.h")
-  set(AOM_INSTALL_BINS ${AOM_INSTALL_BINS} aomenc)
 endif ()

 set(AOM_INSTALL_LIBS aom)
@ -448,19 +575,30 @@ install(FILES ${AOM_INSTALL_INCS}
 install(FILES "${AOM_CONFIG_DIR}/aom.pc"
        DESTINATION "${CMAKE_INSTALL_PREFIX}/lib/pkgconfig")
 install(TARGETS ${AOM_INSTALL_LIBS} DESTINATION "${CMAKE_INSTALL_PREFIX}/lib")
-install(TARGETS ${AOM_INSTALL_BINS} DESTINATION "${CMAKE_INSTALL_PREFIX}/bin")
+
+if (ENABLE_EXAMPLES)
+  install(TARGETS ${AOM_INSTALL_BINS} DESTINATION "${CMAKE_INSTALL_PREFIX}/bin")
+endif ()

 # Aomedia dist rule.
-if (CONFIG_AV1_DECODER)
+if (CONFIG_AV1_DECODER AND ENABLE_EXAMPLES)
  set(AOM_DIST_APPS ${AOM_DIST_APPS} $<TARGET_FILE:aomdec>)
 endif ()
-if (CONFIG_AV1_ENCODER)
+if (CONFIG_AV1_ENCODER AND ENABLE_EXAMPLES)
  set(AOM_DIST_APPS ${AOM_DIST_APPS} $<TARGET_FILE:aomenc>)
 endif ()

-foreach (example ${AOM_EXAMPLE_TARGETS})
-  list(APPEND AOM_DIST_EXAMPLES $<TARGET_FILE:${example}>)
-endforeach ()
+if (ENABLE_EXAMPLES)
+  foreach (example ${AOM_EXAMPLE_TARGETS})
+    list(APPEND AOM_DIST_EXAMPLES $<TARGET_FILE:${example}>)
+  endforeach ()
+endif ()
+
+if (ENABLE_TOOLS)
+  foreach (tool ${AOM_TOOL_TARGETS})
+    list(APPEND AOM_DIST_TOOLS $<TARGET_FILE:${tool}>)
+  endforeach ()
+endif ()

 if (NOT AOM_DIST_DIR)
  set(AOM_DIST_DIR "${AOM_CONFIG_DIR}/dist")
@ -473,12 +611,14 @@ add_custom_target(dist
                  -DAOM_DIST_DIR=${AOM_DIST_DIR}
                  -DAOM_DIST_APPS="${AOM_DIST_APPS}"
                  -DAOM_DIST_EXAMPLES="${AOM_DIST_EXAMPLES}"
+                  -DAOM_DIST_TOOLS="${AOM_DIST_TOOLS}"
                  -DAOM_DIST_INCLUDES="${AOM_INSTALL_INCS}"
                  -DAOM_DIST_LIBS=$<TARGET_FILE:aom>
                  -DENABLE_DOCS=${ENABLE_DOCS}
                  -P "${AOM_ROOT}/build/cmake/dist.cmake"
                  DEPENDS ${AOM_INSTALL_BINS} ${AOM_INSTALL_LIBS}
-                  ${AOM_INSTALL_INCS} ${AOM_EXAMPLE_TARGETS})
+                  ${AOM_INSTALL_INCS} ${AOM_EXAMPLE_TARGETS}
+                  ${AOM_TOOL_TARGETS})

 if (ENABLE_DOCS)
  add_dependencies(dist docs)
--- a/third_party/aom/README.md
+++ b/third_party/aom/README.md
@ -63,6 +63,35 @@ CMake built in variable `BUILD_SHARED_LIBS`:

 This is currently only supported on non-Windows targets.

+### Debugging
+
+Depending on the generator used there are multiple ways of going about
+debugging AV1 components. For single configuration generators like the Unix
+Makefiles generator, setting `CMAKE_BUILD_TYPE` to Debug is sufficient:
+
+~~~
+    $ cmake path/to/aom -DCMAKE_BUILD_TYPE=Debug
+~~~
+
+For Xcode, mainly because configuration controls for Xcode builds are buried two
+configuration windows deep and must be set for each subproject within the Xcode
+IDE individually, `CMAKE_CONFIGURATION_TYPES` should be set to Debug:
+
+~~~
+    $ cmake path/to/aom -G Xcode -DCMAKE_CONFIGURATION_TYPES=Debug
+~~~
+
+For Visual Studio the in-IDE configuration controls should be used. Simply set
+the IDE project configuration to Debug to allow for stepping through the code.
+
+In addition to the above it can sometimes be useful to debug only C and C++
+code. To disable all assembly code and intrinsics set `AOM_TARGET_CPU` to
+generic at generation time:
+
+~~~
+    $ cmake path/to/aom -DAOM_TARGET_CPU=generic
+~~~
+
 ### Cross compiling

 For the purposes of building the AV1 codec and applications and relative to the
@ -81,7 +110,9 @@ The toolchain files available at the time of this writing are:
 - x86-ios-simulator.cmake
 - x86-linux.cmake
 - x86-macos.cmake
+ - x86-mingw-gcc.cmake
 - x86\_64-ios-simulator.cmake
+ - x86\_64-mingw-gcc.cmake

 The following example demonstrates use of the x86-macos.cmake toolchain file on
 a x86\_64 MacOS host:
@ -109,6 +140,20 @@ In addition to the above it's important to note that the toolchain files
 suffixed with gcc behave differently than the others. These toolchain files
 attempt to obey the $CROSS environment variable.

+### Sanitizers
+
+Sanitizer integration is built into the CMake build system. To enable a
+sanitizer, add `-DSANITIZE=<type>` to the CMake command line. For example, to
+enable address sanitizer:
+
+~~~
+    $ cmake path/to/aom -DSANITIZE=address
+    $ make
+~~~
+
+Sanitizers available vary by platform, target, and compiler. Consult your
+compiler documentation to determine which, if any, are available.
+
 ### Microsoft Visual Studio builds

 Building the AV1 codec library in Microsoft Visual Studio is supported. The
@ -249,11 +294,8 @@ test jobs. Sharded test runs can be achieved in a couple of ways.
   # Set the environment variable GTEST_TOTAL_SHARDS to 9 to run 10 test shards
   # (GTEST shard indexing is 0 based).
   $ export GTEST_TOTAL_SHARDS=9
-   $ for shard in $(seq 0 ${GTEST_TOTAL_SHARDS}); do \
-       [ ${shard} -lt ${GTEST_TOTAL_SHARDS} ] \
-         && GTEST_SHARD_INDEX=${shard} ./test_libaom & \
-     done
-
+   $ seq 0 $(( $GTEST_TOTAL_SHARDS - 1 )) \
+       | xargs -n 1 -P 0 -I{} env GTEST_SHARD_INDEX={} ./test_libaom
 ~~~

 To create a test shard for each CPU core available on the current system set
--- a/third_party/aom/aom/aom.h
+++ b/third_party/aom/aom/aom.h
@ -45,9 +45,7 @@ extern "C" {
 enum aom_com_control_id {
  /*!\brief pass in an external frame into decoder to be used as reference frame
   */
-  AOM_SET_REFERENCE = 1,
-  AOM_COPY_REFERENCE = 2, /**< get a copy of reference frame from the decoder */
-  AOM_SET_POSTPROC = 3,   /**< set the decoder's post processing settings  */
+  AOM_SET_POSTPROC = 3, /**< set the decoder's post processing settings  */
  AOM_SET_DBG_COLOR_REF_FRAME =
      4, /**< set the reference frames to color for each macroblock */
  AOM_SET_DBG_COLOR_MB_MODES = 5, /**< set which macro block modes to color */
@ -59,6 +57,9 @@ enum aom_com_control_id {
   * AOM_DECODER_CTRL_ID_START range next time we're ready to break the ABI.
   */
  AV1_GET_REFERENCE = 128, /**< get a pointer to a reference frame */
+  AV1_SET_REFERENCE = 129, /**< write a frame into a reference buffer */
+  AV1_COPY_REFERENCE =
+      130, /**< get a copy of reference frame from the decoder */
  AOM_COMMON_CTRL_ID_MAX,

  AV1_GET_NEW_FRAME_IMAGE = 192, /**< get a pointer to the new frame */
@ -98,25 +99,6 @@ typedef struct aom_postproc_cfg {
  int noise_level; /**< the strength of additive noise, valid range [0, 16] */
 } aom_postproc_cfg_t;

-/*!\brief reference frame type
- *
- * The set of macros define the type of AOM reference frames
- */
-typedef enum aom_ref_frame_type {
-  AOM_LAST_FRAME = 1,
-  AOM_GOLD_FRAME = 2,
-  AOM_ALTR_FRAME = 4
-} aom_ref_frame_type_t;
-
-/*!\brief reference frame data struct
- *
- * Define the data struct to access aom reference frames.
- */
-typedef struct aom_ref_frame {
-  aom_ref_frame_type_t frame_type; /**< which reference frame */
-  aom_image_t img;                 /**< reference frame data in image format */
-} aom_ref_frame_t;
-
 /*!\brief AV1 specific reference frame data struct
 *
 * Define the data struct to access av1 reference frames.
@ -131,10 +113,6 @@ typedef struct av1_ref_frame {
 *
 * defines the data type for each of AOM decoder control function requires
 */
-AOM_CTRL_USE_TYPE(AOM_SET_REFERENCE, aom_ref_frame_t *)
-#define AOM_CTRL_AOM_SET_REFERENCE
-AOM_CTRL_USE_TYPE(AOM_COPY_REFERENCE, aom_ref_frame_t *)
-#define AOM_CTRL_AOM_COPY_REFERENCE
 AOM_CTRL_USE_TYPE(AOM_SET_POSTPROC, aom_postproc_cfg_t *)
 #define AOM_CTRL_AOM_SET_POSTPROC
 AOM_CTRL_USE_TYPE(AOM_SET_DBG_COLOR_REF_FRAME, int)
@ -147,6 +125,10 @@ AOM_CTRL_USE_TYPE(AOM_SET_DBG_DISPLAY_MV, int)
 #define AOM_CTRL_AOM_SET_DBG_DISPLAY_MV
 AOM_CTRL_USE_TYPE(AV1_GET_REFERENCE, av1_ref_frame_t *)
 #define AOM_CTRL_AV1_GET_REFERENCE
+AOM_CTRL_USE_TYPE(AV1_SET_REFERENCE, av1_ref_frame_t *)
+#define AOM_CTRL_AV1_SET_REFERENCE
+AOM_CTRL_USE_TYPE(AV1_COPY_REFERENCE, av1_ref_frame_t *)
+#define AOM_CTRL_AV1_COPY_REFERENCE
 AOM_CTRL_USE_TYPE(AV1_GET_NEW_FRAME_IMAGE, aom_image_t *)
 #define AOM_CTRL_AV1_GET_NEW_FRAME_IMAGE

--- a/third_party/aom/aom/aom_decoder.h
+++ b/third_party/aom/aom/aom_decoder.h
@ -55,8 +55,6 @@ extern "C" {
 #define AOM_CODEC_CAP_PUT_SLICE 0x10000 /**< Will issue put_slice callbacks */
 #define AOM_CODEC_CAP_PUT_FRAME 0x20000 /**< Will issue put_frame callbacks */
 #define AOM_CODEC_CAP_POSTPROC 0x40000  /**< Can postprocess decoded frame */
-/*!\brief Can conceal errors due to packet loss */
-#define AOM_CODEC_CAP_ERROR_CONCEALMENT 0x80000
 /*!\brief Can receive encoded frames one fragment at a time */
 #define AOM_CODEC_CAP_INPUT_FRAGMENTS 0x100000

@ -73,8 +71,6 @@ extern "C" {
 #define AOM_CODEC_CAP_EXTERNAL_FRAME_BUFFER 0x400000

 #define AOM_CODEC_USE_POSTPROC 0x10000 /**< Postprocess decoded frame */
-/*!\brief Conceal errors in decoded frames */
-#define AOM_CODEC_USE_ERROR_CONCEALMENT 0x20000
 /*!\brief The input frame should be passed to the decoder one fragment at a
 * time */
 #define AOM_CODEC_USE_INPUT_FRAGMENTS 0x40000
--- a/third_party/aom/aom/aom_encoder.h
+++ b/third_party/aom/aom/aom_encoder.h
@ -372,21 +372,21 @@ typedef struct aom_codec_enc_cfg {
   */
  unsigned int rc_resize_mode;

-  /*!\brief Frame resize numerator.
+  /*!\brief Frame resize denominator.
   *
-   * The numerator for resize to use, assuming 16 as the denominator.
+   * The denominator for resize to use, assuming 8 as the numerator.
   *
-   * Valid numerators are  8 - 16 for now.
+   * Valid denominators are 8 - 16 for now.
   */
-  unsigned int rc_resize_numerator;
+  unsigned int rc_resize_denominator;

-  /*!\brief Keyframe resize numerator.
+  /*!\brief Keyframe resize denominator.
   *
-   * The numerator for resize to use, assuming 16 as the denominator.
+   * The denominator for resize to use, assuming 8 as the numerator.
   *
-   * Valid numerators are  8 - 16 for now.
+   * Valid denominators are 8 - 16 for now.
   */
-  unsigned int rc_resize_kf_numerator;
+  unsigned int rc_resize_kf_denominator;

  /*!\brief Frame super-resolution scaling mode.
   *
@ -394,32 +394,50 @@ typedef struct aom_codec_enc_cfg {
   * upscaling after the encode/decode process. Taking control of upscaling and
   * using restoration filters should allow it to outperform normal resizing.
   *
-   * Mode 0 is SUPERRES_NONE, mode 1 is SUPERRES_FIXED, and mode 2 is
-   * SUPERRES_DYNAMIC.
+   * Mode 0 is SUPERRES_NONE, mode 1 is SUPERRES_FIXED, mode 2 is
+   * SUPERRES_RANDOM and mode 3 is SUPERRES_QTHRESH.
   */
  unsigned int rc_superres_mode;

-  /*!\brief Frame super-resolution numerator.
+  /*!\brief Frame super-resolution denominator.
   *
-   * The numerator for superres to use. If fixed it will only change if the
+   * The denominator for superres to use. If fixed it will only change if the
   * cumulative scale change over resizing and superres is greater than 1/2;
   * this forces superres to reduce scaling.
   *
-   * Valid numerators are 8 to 16.
+   * Valid denominators are 8 to 16.
   *
-   * Ignored by SUPERRES_DYNAMIC.
+   * Used only by SUPERRES_FIXED.
   */
-  unsigned int rc_superres_numerator;
+  unsigned int rc_superres_denominator;

-  /*!\brief Keyframe super-resolution numerator.
+  /*!\brief Keyframe super-resolution denominator.
   *
-   * The numerator for superres to use. If fixed it will only change if the
+   * The denominator for superres to use. If fixed it will only change if the
   * cumulative scale change over resizing and superres is greater than 1/2;
   * this forces superres to reduce scaling.
   *
-   * Valid numerators are 8 - 16 for now.
+   * Valid denominators are 8 - 16 for now.
   */
-  unsigned int rc_superres_kf_numerator;
+  unsigned int rc_superres_kf_denominator;
+
+  /*!\brief Frame super-resolution q threshold.
+   *
+   * The q level threshold after which superres is used.
+   * Valid values are 1 to 63.
+   *
+   * Used only by SUPERRES_QTHRESH.
+   */
+  unsigned int rc_superres_qthresh;
+
+  /*!\brief Keyframe super-resolution q threshold.
+   *
+   * The q level threshold after which superres is used for key frames.
+   * Valid values are 1 to 63.
+   *
+   * Used only by SUPERRES_QTHRESH.
+   */
+  unsigned int rc_superres_kf_qthresh;

  /*!\brief Rate control algorithm to use.
   *
@ -601,6 +619,48 @@ typedef struct aom_codec_enc_cfg {
   * implies a large-scale tile coding.
   */
  unsigned int large_scale_tile;
+
+  /*!\brief Number of explicit tile widths specified
+   *
+   * This value indicates the number of tile widths specified.
+   * A value of 0 implies no tile widths are specified.
+   * Tile widths are given in the array tile_widths[].
+   */
+  int tile_width_count;
+
+  /*!\brief Number of explicit tile heights specified
+   *
+   * This value indicates the number of tile heights specified.
+   * A value of 0 implies no tile heights are specified.
+   * Tile heights are given in the array tile_heights[].
+   */
+  int tile_height_count;
+
+/*!\brief Maximum number of tile widths in tile widths array
+ *
+ * This define gives the maximum number of elements in the tile_widths array.
+ */
+#define MAX_TILE_WIDTHS 64  // maximum tile width array length
+
+  /*!\brief Array of specified tile widths
+   *
+   * This array specifies tile widths (and may be empty).
+   * The number of widths specified is given by tile_width_count.
+   */
+  int tile_widths[MAX_TILE_WIDTHS];
+
+/*!\brief Maximum number of tile heights in tile heights array.
+ *
+ * This define gives the maximum number of elements in the tile_heights array.
+ */
+#define MAX_TILE_HEIGHTS 64  // maximum tile height array length
+
+  /*!\brief Array of specified tile heights
+   *
+   * This array specifies tile heights (and may be empty).
+   * The number of heights specified is given by tile_height_count.
+   */
+  int tile_heights[MAX_TILE_HEIGHTS];
 } aom_codec_enc_cfg_t; /**< alias for struct aom_codec_enc_cfg */

 /*!\brief Initialize an encoder instance
@ -616,7 +676,7 @@ typedef struct aom_codec_enc_cfg {
 *
 * \param[in]    ctx     Pointer to this instance's context.
 * \param[in]    iface   Pointer to the algorithm interface to use.
- * \param[in]    cfg     Configuration to use, if known. May be NULL.
+ * \param[in]    cfg     Configuration to use, if known.
 * \param[in]    flags   Bitfield of AOM_CODEC_USE_* flags
 * \param[in]    ver     ABI version number. Must be set to
 *                       AOM_ENCODER_ABI_VERSION
@ -646,7 +706,7 @@ aom_codec_err_t aom_codec_enc_init_ver(aom_codec_ctx_t *ctx,
 *
 * \param[in]    ctx     Pointer to this instance's context.
 * \param[in]    iface   Pointer to the algorithm interface to use.
- * \param[in]    cfg     Configuration to use, if known. May be NULL.
+ * \param[in]    cfg     Configuration to use, if known.
 * \param[in]    num_enc Total number of encoders.
 * \param[in]    flags   Bitfield of AOM_CODEC_USE_* flags
 * \param[in]    dsf     Pointer to down-sampling factors.
--- a/third_party/aom/aom/aom_image.h
+++ b/third_party/aom/aom/aom_image.h
@ -35,8 +35,6 @@ extern "C" {
 #define AOM_IMG_FMT_HAS_ALPHA 0x400    /**< Image has an alpha channel. */
 #define AOM_IMG_FMT_HIGHBITDEPTH 0x800 /**< Image uses 16bit framebuffer. */

-#include "./aom_config.h"
-
 /*!\brief List of supported image formats */
 typedef enum aom_img_fmt {
  AOM_IMG_FMT_NONE,
@ -71,25 +69,19 @@ typedef enum aom_img_fmt {

 /*!\brief List of supported color spaces */
 typedef enum aom_color_space {
-  AOM_CS_UNKNOWN = 0,   /**< Unknown */
-  AOM_CS_BT_601 = 1,    /**< BT.601 */
-  AOM_CS_BT_709 = 2,    /**< BT.709 */
-  AOM_CS_SMPTE_170 = 3, /**< SMPTE.170 */
-  AOM_CS_SMPTE_240 = 4, /**< SMPTE.240 */
-#if CONFIG_COLORSPACE_HEADERS
+  AOM_CS_UNKNOWN = 0,     /**< Unknown */
+  AOM_CS_BT_601 = 1,      /**< BT.601 */
+  AOM_CS_BT_709 = 2,      /**< BT.709 */
+  AOM_CS_SMPTE_170 = 3,   /**< SMPTE.170 */
+  AOM_CS_SMPTE_240 = 4,   /**< SMPTE.240 */
  AOM_CS_BT_2020_NCL = 5, /**< BT.2020 non-constant luminance (BT.2100) */
  AOM_CS_BT_2020_CL = 6,  /**< BT.2020 constant luminance */
  AOM_CS_SRGB = 7,        /**< sRGB */
  AOM_CS_ICTCP = 8,       /**< ICtCp, ITU-R BT.2100 */
  AOM_CS_RESERVED = 9     /**< Values 9..31 are reserved */
-#else
-  AOM_CS_BT_2020 = 5,  /**< BT.2020 */
-  AOM_CS_RESERVED = 6, /**< Reserved */
-  AOM_CS_SRGB = 7      /**< sRGB */
-#endif
-} aom_color_space_t; /**< alias for enum aom_color_space */
+} aom_color_space_t;      /**< alias for enum aom_color_space */

-#if CONFIG_COLORSPACE_HEADERS
+/*!\brief List of supported transfer functions */
 typedef enum aom_transfer_function {
  AOM_TF_UNKNOWN = 0,      /**< Unknown */
  AOM_TF_BT_709 = 1,       /**< BT.709 */
@ -97,7 +89,6 @@ typedef enum aom_transfer_function {
  AOM_TF_HLG = 3,          /**< Hybrid Log-Gamma */
  AOM_TF_RESERVED = 4      /**< Values 4..31 are reserved */
 } aom_transfer_function_t; /**< alias for enum aom_transfer_function */
-#endif

 /*!\brief List of supported color range */
 typedef enum aom_color_range {
@ -105,7 +96,7 @@ typedef enum aom_color_range {
  AOM_CR_FULL_RANGE = 1    /**< YUV/RGB [0..255] */
 } aom_color_range_t;       /**< alias for enum aom_color_range */

-#if CONFIG_COLORSPACE_HEADERS
+/*!\brief List of chroma sample positions */
 typedef enum aom_chroma_sample_position {
  AOM_CSP_UNKNOWN = 0,          /**< Unknown */
  AOM_CSP_VERTICAL = 1,         /**< Horizontally co-located with luma(0, 0)*/
@ -113,17 +104,14 @@ typedef enum aom_chroma_sample_position {
  AOM_CSP_COLOCATED = 2,        /**< Co-located with luma(0, 0) sample */
  AOM_CSP_RESERVED = 3          /**< Reserved value */
 } aom_chroma_sample_position_t; /**< alias for enum aom_transfer_function */
-#endif

 /**\brief Image Descriptor */
 typedef struct aom_image {
-  aom_img_fmt_t fmt;    /**< Image Format */
-  aom_color_space_t cs; /**< Color Space */
-#if CONFIG_COLORSPACE_HEADERS
+  aom_img_fmt_t fmt;                /**< Image Format */
+  aom_color_space_t cs;             /**< Color Space */
  aom_transfer_function_t tf;       /**< transfer function */
  aom_chroma_sample_position_t csp; /**< chroma sample position */
-#endif
-  aom_color_range_t range; /**< Color Range */
+  aom_color_range_t range;          /**< Color Range */

  /* Image storage dimensions */
  unsigned int w;         /**< Stored image width */
@ -252,6 +240,24 @@ void aom_img_flip(aom_image_t *img);
 */
 void aom_img_free(aom_image_t *img);

+/*!\brief Get the width of a plane
+ *
+ * For plane > 0 with x_chroma_shift > 0 this is (d_w + 1) >> x_chroma_shift;
+ * otherwise it is d_w unchanged.
+ * \param[in]    img       Image descriptor
+ * \param[in]    plane     Plane index
+ */
+int aom_img_plane_width(const aom_image_t *img, int plane);
+
+/*!\brief Get the height of a plane
+ *
+ * For plane > 0 with y_chroma_shift > 0 this is (d_h + 1) >> y_chroma_shift;
+ * otherwise it is d_h unchanged.
+ * \param[in]    img       Image descriptor
+ * \param[in]    plane     Plane index
+ */
+int aom_img_plane_height(const aom_image_t *img, int plane);
+
 #ifdef __cplusplus
 }  // extern "C"
 #endif
--- a/third_party/aom/aom/aomcx.h
+++ b/third_party/aom/aom/aomcx.h
@ -341,7 +341,6 @@ enum aome_enc_control_id {
   */
  AV1E_SET_COLOR_SPACE,

-#if CONFIG_COLORSPACE_HEADERS
  /*!\brief Codec control function to set transfer function info.
   * \note Valid ranges: 0..4, default is "UNKNOWN".
   *                     0 = UNKNOWN,
@ -360,7 +359,6 @@ enum aome_enc_control_id {
   *                     3 = RESERVED
   */
  AV1E_SET_CHROMA_SAMPLE_POSITION,
-#endif

  /*!\brief Codec control function to set minimum interval between GF/ARF frames
   *
@ -458,6 +456,21 @@ enum aome_enc_control_id {
   */
  AV1E_SET_QM_MAX,

+  /*!\brief Codec control function to encode with dist_8x8.
+   *
+   *  The dist_8x8 is enabled automatically for model tuning parameters that
+   *  require measuring distortion at the 8x8 level. This control also allows
+   *  measuring distortion at the 8x8 level for other tuning options
+   *  (e.g., PSNR), for testing purposes.
+   *                          0 = do not use dist_8x8
+   *                          1 = use dist_8x8
+   *
+   *  By default, the encoder does not use dist_8x8
+   *
+   * Experiment: DIST_8X8
+   */
+  AV1E_SET_ENABLE_DIST_8X8,
+
  /*!\brief Codec control function to set a maximum number of tile groups.
   *
   * This will set the maximum number of tile groups. This will be
@ -567,24 +580,31 @@ typedef enum aom_scaling_mode_1d {
  AOME_ONETWO = 3
 } AOM_SCALING_MODE;

+/*!\brief Max number of segments
+ *
+ * This is the limit on the number of segments allowed within a frame.
+ *
+ * Currently same as "MAX_SEGMENTS" in AV1, the maximum that AV1 supports.
+ *
+ */
+#define AOM_MAX_SEGMENTS 8
+
 /*!\brief  aom region of interest map
 *
 * These defines the data structures for the region of interest map
 *
+ * TODO(yaowu): create a unit test for ROI map related APIs
+ *
 */
-
 typedef struct aom_roi_map {
-  /*! An id between 0 and 3 for each 16x16 region within a frame. */
+  /*! An id between 0 and 7 for each 8x8 region within a frame. */
  unsigned char *roi_map;
-  unsigned int rows; /**< Number of rows. */
-  unsigned int cols; /**< Number of columns. */
-  // TODO(paulwilkins): broken for AV1 which has 8 segments
-  // q and loop filter deltas for each segment
-  // (see MAX_MB_SEGMENTS)
-  int delta_q[4];  /**< Quantizer deltas. */
-  int delta_lf[4]; /**< Loop filter deltas. */
+  unsigned int rows;              /**< Number of rows. */
+  unsigned int cols;              /**< Number of columns. */
+  int delta_q[AOM_MAX_SEGMENTS];  /**< Quantizer deltas. */
+  int delta_lf[AOM_MAX_SEGMENTS]; /**< Loop filter deltas. */
  /*! Static breakout threshold for each segment. */
-  unsigned int static_threshold[4];
+  unsigned int static_threshold[AOM_MAX_SEGMENTS];
 } aom_roi_map_t;

 /*!\brief  aom active region map
@ -622,7 +642,14 @@ typedef enum {
 * Changes the encoder to tune for certain types of input material.
 *
 */
-typedef enum { AOM_TUNE_PSNR, AOM_TUNE_SSIM } aom_tune_metric;
+typedef enum {
+  AOM_TUNE_PSNR,
+  AOM_TUNE_SSIM,
+#ifdef CONFIG_DIST_8X8 /* NOTE(review): sibling CONFIG_* checks use #if */
+  AOM_TUNE_CDEF_DIST,
+  AOM_TUNE_DAALA_DIST
+#endif /* CONFIG_DIST_8X8 */
+} aom_tune_metric;

 /*!\cond */
 /*!\brief Encoder control function parameter type
@ -632,7 +659,7 @@ typedef enum { AOM_TUNE_PSNR, AOM_TUNE_SSIM } aom_tune_metric;
 *
 */

-AOM_CTRL_USE_TYPE_DEPRECATED(AOME_USE_REFERENCE, int)
+AOM_CTRL_USE_TYPE(AOME_USE_REFERENCE, int)
 #define AOM_CTRL_AOME_USE_REFERENCE
 AOM_CTRL_USE_TYPE(AOME_SET_ROI_MAP, aom_roi_map_t *)
 #define AOM_CTRL_AOME_SET_ROI_MAP
@ -693,6 +720,9 @@ AOM_CTRL_USE_TYPE(AV1E_SET_LOSSLESS, unsigned int)
 AOM_CTRL_USE_TYPE(AV1E_SET_ENABLE_QM, unsigned int)
 #define AOM_CTRL_AV1E_SET_ENABLE_QM

+AOM_CTRL_USE_TYPE(AV1E_SET_ENABLE_DIST_8X8, unsigned int)
+#define AOM_CTRL_AV1E_SET_ENABLE_DIST_8X8
+
 AOM_CTRL_USE_TYPE(AV1E_SET_QM_MIN, unsigned int)
 #define AOM_CTRL_AV1E_SET_QM_MIN

@ -728,13 +758,11 @@ AOM_CTRL_USE_TYPE(AV1E_SET_TUNE_CONTENT, int) /* aom_tune_content */
 AOM_CTRL_USE_TYPE(AV1E_SET_COLOR_SPACE, int)
 #define AOM_CTRL_AV1E_SET_COLOR_SPACE

-#if CONFIG_COLORSPACE_HEADERS
 AOM_CTRL_USE_TYPE(AV1E_SET_TRANSFER_FUNCTION, int)
 #define AOM_CTRL_AV1E_SET_TRANSFER_FUNCTION

 AOM_CTRL_USE_TYPE(AV1E_SET_CHROMA_SAMPLE_POSITION, int)
 #define AOM_CTRL_AV1E_SET_CHROMA_SAMPLE_POSITION
-#endif

 AOM_CTRL_USE_TYPE(AV1E_SET_MIN_GF_INTERVAL, unsigned int)
 #define AOM_CTRL_AV1E_SET_MIN_GF_INTERVAL
--- a/third_party/aom/aom/exports_com
+++ b/third_party/aom/aom/exports_com
@ -12,5 +12,7 @@ text aom_codec_version_str
 text aom_img_alloc
 text aom_img_flip
 text aom_img_free
+text aom_img_plane_width
+text aom_img_plane_height
 text aom_img_set_rect
 text aom_img_wrap
--- a/third_party/aom/aom/src/aom_decoder.c
+++ b/third_party/aom/aom/src/aom_decoder.c
@ -37,9 +37,6 @@ aom_codec_err_t aom_codec_dec_init_ver(aom_codec_ctx_t *ctx,
  else if ((flags & AOM_CODEC_USE_POSTPROC) &&
           !(iface->caps & AOM_CODEC_CAP_POSTPROC))
    res = AOM_CODEC_INCAPABLE;
-  else if ((flags & AOM_CODEC_USE_ERROR_CONCEALMENT) &&
-           !(iface->caps & AOM_CODEC_CAP_ERROR_CONCEALMENT))
-    res = AOM_CODEC_INCAPABLE;
  else if ((flags & AOM_CODEC_USE_INPUT_FRAGMENTS) &&
           !(iface->caps & AOM_CODEC_CAP_INPUT_FRAGMENTS))
    res = AOM_CODEC_INCAPABLE;
--- a/third_party/aom/aom/src/aom_image.c
+++ b/third_party/aom/aom/src/aom_image.c
@ -238,3 +238,17 @@ void aom_img_free(aom_image_t *img) {
    if (img->self_allocd) free(img);
  }
 }
+
+int aom_img_plane_width(const aom_image_t *img, int plane) {
+  if (plane > 0 && img->x_chroma_shift > 0)  // non-zero plane, shifted in x
+    return (img->d_w + 1) >> img->x_chroma_shift;  // rounds up when shift == 1
+  else
+    return img->d_w;  // plane 0, or no x shift: width is used as-is
+}
+
+int aom_img_plane_height(const aom_image_t *img, int plane) {
+  if (plane > 0 && img->y_chroma_shift > 0)  // non-zero plane, shifted in y
+    return (img->d_h + 1) >> img->y_chroma_shift;  // rounds up when shift == 1
+  else
+    return img->d_h;  // plane 0, or no y shift: height is used as-is
+}
--- a/third_party/aom/aom_dsp/aom_dsp.cmake
+++ b/third_party/aom/aom_dsp/aom_dsp.cmake
@ -23,6 +23,7 @@ set(AOM_DSP_COMMON_SOURCES
    "${AOM_ROOT}/aom_dsp/blend_a64_mask.c"
    "${AOM_ROOT}/aom_dsp/blend_a64_vmask.c"
    "${AOM_ROOT}/aom_dsp/intrapred.c"
+    "${AOM_ROOT}/aom_dsp/intrapred_common.h"
    "${AOM_ROOT}/aom_dsp/loopfilter.c"
    "${AOM_ROOT}/aom_dsp/prob.c"
    "${AOM_ROOT}/aom_dsp/prob.h"
@ -45,7 +46,9 @@ set(AOM_DSP_COMMON_ASM_SSE2
 set(AOM_DSP_COMMON_INTRIN_SSE2
    "${AOM_ROOT}/aom_dsp/x86/aom_asm_stubs.c"
    "${AOM_ROOT}/aom_dsp/x86/convolve.h"
+    "${AOM_ROOT}/aom_dsp/x86/intrapred_sse2.c"
    "${AOM_ROOT}/aom_dsp/x86/txfm_common_sse2.h"
+    "${AOM_ROOT}/aom_dsp/x86/lpf_common_sse2.h"
    "${AOM_ROOT}/aom_dsp/x86/loopfilter_sse2.c")

 set(AOM_DSP_COMMON_ASM_SSSE3
@ -55,6 +58,7 @@ set(AOM_DSP_COMMON_ASM_SSSE3

 set(AOM_DSP_COMMON_INTRIN_SSSE3
    "${AOM_ROOT}/aom_dsp/x86/aom_subpixel_8t_intrin_ssse3.c"
+    "${AOM_ROOT}/aom_dsp/x86/intrapred_ssse3.c"
    "${AOM_ROOT}/aom_dsp/x86/inv_txfm_ssse3.c")

 set(AOM_DSP_COMMON_INTRIN_SSE4_1
@ -64,16 +68,28 @@ set(AOM_DSP_COMMON_INTRIN_SSE4_1

 set(AOM_DSP_COMMON_INTRIN_AVX2
    "${AOM_ROOT}/aom_dsp/x86/aom_subpixel_8t_intrin_avx2.c"
-    "${AOM_ROOT}/aom_dsp/x86/loopfilter_avx2.c"
+    "${AOM_ROOT}/aom_dsp/x86/intrapred_avx2.c"
    "${AOM_ROOT}/aom_dsp/x86/inv_txfm_avx2.c"
+    "${AOM_ROOT}/aom_dsp/x86/common_avx2.h"
    "${AOM_ROOT}/aom_dsp/x86/inv_txfm_common_avx2.h"
    "${AOM_ROOT}/aom_dsp/x86/txfm_common_avx2.h")

+if (NOT CONFIG_PARALLEL_DEBLOCKING)
+  set(AOM_DSP_COMMON_INTRIN_AVX2
+      ${AOM_DSP_COMMON_INTRIN_AVX2}
+      "${AOM_ROOT}/aom_dsp/x86/loopfilter_avx2.c")
+endif ()
+
+if (NOT CONFIG_EXT_PARTITION)
+  set(AOM_DSP_COMMON_ASM_NEON
+      "${AOM_ROOT}/aom_dsp/arm/aom_convolve8_avg_neon_asm.asm"
+      "${AOM_ROOT}/aom_dsp/arm/aom_convolve8_neon_asm.asm"
+      "${AOM_ROOT}/aom_dsp/arm/aom_convolve_avg_neon_asm.asm"
+      "${AOM_ROOT}/aom_dsp/arm/aom_convolve_copy_neon_asm.asm")
+endif ()
+
 set(AOM_DSP_COMMON_ASM_NEON
-    "${AOM_ROOT}/aom_dsp/arm/aom_convolve8_avg_neon_asm.asm"
-    "${AOM_ROOT}/aom_dsp/arm/aom_convolve8_neon_asm.asm"
-    "${AOM_ROOT}/aom_dsp/arm/aom_convolve_avg_neon_asm.asm"
-    "${AOM_ROOT}/aom_dsp/arm/aom_convolve_copy_neon_asm.asm"
+    ${AOM_DSP_COMMON_ASM_NEON}
    "${AOM_ROOT}/aom_dsp/arm/idct16x16_1_add_neon.asm"
    "${AOM_ROOT}/aom_dsp/arm/idct16x16_add_neon.asm"
    "${AOM_ROOT}/aom_dsp/arm/idct32x32_1_add_neon.asm"
@ -83,33 +99,53 @@ set(AOM_DSP_COMMON_ASM_NEON
    "${AOM_ROOT}/aom_dsp/arm/idct8x8_1_add_neon.asm"
    "${AOM_ROOT}/aom_dsp/arm/idct8x8_add_neon.asm"
    "${AOM_ROOT}/aom_dsp/arm/intrapred_neon_asm.asm"
-    "${AOM_ROOT}/aom_dsp/arm/loopfilter_16_neon.asm"
-    "${AOM_ROOT}/aom_dsp/arm/loopfilter_4_neon.asm"
-    "${AOM_ROOT}/aom_dsp/arm/loopfilter_8_neon.asm"
-    "${AOM_ROOT}/aom_dsp/arm/loopfilter_mb_neon.asm"
    "${AOM_ROOT}/aom_dsp/arm/save_reg_neon.asm")

+if (NOT CONFIG_PARALLEL_DEBLOCKING)
+  set(AOM_DSP_COMMON_ASM_NEON
+      ${AOM_DSP_COMMON_ASM_NEON}
+      "${AOM_ROOT}/aom_dsp/arm/loopfilter_16_neon.asm"
+      "${AOM_ROOT}/aom_dsp/arm/loopfilter_4_neon.asm"
+      "${AOM_ROOT}/aom_dsp/arm/loopfilter_8_neon.asm"
+      "${AOM_ROOT}/aom_dsp/arm/loopfilter_mb_neon.asm")
+endif ()
+
+if (NOT CONFIG_EXT_PARTITION)
+  set(AOM_DSP_COMMON_INTRIN_NEON
+      "${AOM_ROOT}/aom_dsp/arm/aom_convolve_neon.c")
+endif ()
+
 set(AOM_DSP_COMMON_INTRIN_NEON
-    "${AOM_ROOT}/aom_dsp/arm/aom_convolve_neon.c"
+    ${AOM_DSP_COMMON_INTRIN_NEON}
    "${AOM_ROOT}/aom_dsp/arm/avg_neon.c"
    "${AOM_ROOT}/aom_dsp/arm/fwd_txfm_neon.c"
    "${AOM_ROOT}/aom_dsp/arm/hadamard_neon.c"
    "${AOM_ROOT}/aom_dsp/arm/idct16x16_neon.c"
    "${AOM_ROOT}/aom_dsp/arm/intrapred_neon.c"
-    "${AOM_ROOT}/aom_dsp/arm/loopfilter_neon.c"
    "${AOM_ROOT}/aom_dsp/arm/sad4d_neon.c"
    "${AOM_ROOT}/aom_dsp/arm/sad_neon.c"
    "${AOM_ROOT}/aom_dsp/arm/subpel_variance_neon.c"
    "${AOM_ROOT}/aom_dsp/arm/subtract_neon.c"
    "${AOM_ROOT}/aom_dsp/arm/variance_neon.c")

-if ("${AOM_TARGET_CPU}" STREQUAL "arm64")
+if (NOT CONFIG_PARALLEL_DEBLOCKING)
+  set(AOM_DSP_COMMON_INTRIN_NEON
+      ${AOM_DSP_COMMON_INTRIN_NEON}
+      "${AOM_ROOT}/aom_dsp/arm/loopfilter_neon.c")
+endif ()
+
+if ("${AOM_TARGET_CPU}" STREQUAL "arm64")
+  if (NOT CONFIG_EXT_PARTITION)
+    set(AOM_DSP_COMMON_INTRIN_NEON
+        ${AOM_DSP_COMMON_INTRIN_NEON}
+        "${AOM_ROOT}/aom_dsp/arm/aom_convolve8_avg_neon.c"
+        "${AOM_ROOT}/aom_dsp/arm/aom_convolve8_neon.c"
+        "${AOM_ROOT}/aom_dsp/arm/aom_convolve_avg_neon.c"
+        "${AOM_ROOT}/aom_dsp/arm/aom_convolve_copy_neon.c")
+  endif ()
+
  set(AOM_DSP_COMMON_INTRIN_NEON
      ${AOM_DSP_COMMON_INTRIN_NEON}
-      "${AOM_ROOT}/aom_dsp/arm/aom_convolve8_avg_neon.c"
-      "${AOM_ROOT}/aom_dsp/arm/aom_convolve8_neon.c"
-      "${AOM_ROOT}/aom_dsp/arm/aom_convolve_avg_neon.c"
-      "${AOM_ROOT}/aom_dsp/arm/aom_convolve_copy_neon.c"
      "${AOM_ROOT}/aom_dsp/arm/idct16x16_1_add_neon.c"
      "${AOM_ROOT}/aom_dsp/arm/idct16x16_add_neon.c"
      "${AOM_ROOT}/aom_dsp/arm/idct32x32_1_add_neon.c"
@ -118,10 +154,15 @@ if ("${AOM_TARGET_CPU}" STREQUAL "arm64")
      "${AOM_ROOT}/aom_dsp/arm/idct4x4_add_neon.c"
      "${AOM_ROOT}/aom_dsp/arm/idct8x8_1_add_neon.c"
      "${AOM_ROOT}/aom_dsp/arm/idct8x8_add_neon.c"
-      "${AOM_ROOT}/aom_dsp/arm/intrapred_neon.c"
-      "${AOM_ROOT}/aom_dsp/arm/loopfilter_16_neon.c"
-      "${AOM_ROOT}/aom_dsp/arm/loopfilter_4_neon.c"
-      "${AOM_ROOT}/aom_dsp/arm/loopfilter_8_neon.c")
+      "${AOM_ROOT}/aom_dsp/arm/intrapred_neon.c")
+
+  if (NOT CONFIG_PARALLEL_DEBLOCKING)
+    set(AOM_DSP_COMMON_INTRIN_NEON
+        ${AOM_DSP_COMMON_INTRIN_NEON}
+        "${AOM_ROOT}/aom_dsp/arm/loopfilter_16_neon.c"
+        "${AOM_ROOT}/aom_dsp/arm/loopfilter_4_neon.c"
+        "${AOM_ROOT}/aom_dsp/arm/loopfilter_8_neon.c")
+  endif ()
 endif ()

 set(AOM_DSP_COMMON_INTRIN_DSPR2
@ -141,14 +182,19 @@ set(AOM_DSP_COMMON_INTRIN_DSPR2
    "${AOM_ROOT}/aom_dsp/mips/intrapred16_dspr2.c"
    "${AOM_ROOT}/aom_dsp/mips/intrapred4_dspr2.c"
    "${AOM_ROOT}/aom_dsp/mips/intrapred8_dspr2.c"
-    "${AOM_ROOT}/aom_dsp/mips/inv_txfm_dspr2.h"
-    "${AOM_ROOT}/aom_dsp/mips/loopfilter_filters_dspr2.c"
-    "${AOM_ROOT}/aom_dsp/mips/loopfilter_filters_dspr2.h"
-    "${AOM_ROOT}/aom_dsp/mips/loopfilter_macros_dspr2.h"
-    "${AOM_ROOT}/aom_dsp/mips/loopfilter_masks_dspr2.h"
-    "${AOM_ROOT}/aom_dsp/mips/loopfilter_mb_dspr2.c"
-    "${AOM_ROOT}/aom_dsp/mips/loopfilter_mb_horiz_dspr2.c"
-    "${AOM_ROOT}/aom_dsp/mips/loopfilter_mb_vert_dspr2.c")
+    "${AOM_ROOT}/aom_dsp/mips/inv_txfm_dspr2.h")
+
+if (NOT CONFIG_PARALLEL_DEBLOCKING)
+  set(AOM_DSP_COMMON_INTRIN_DSPR2
+      ${AOM_DSP_COMMON_INTRIN_DSPR2}
+      "${AOM_ROOT}/aom_dsp/mips/loopfilter_filters_dspr2.c"
+      "${AOM_ROOT}/aom_dsp/mips/loopfilter_filters_dspr2.h"
+      "${AOM_ROOT}/aom_dsp/mips/loopfilter_macros_dspr2.h"
+      "${AOM_ROOT}/aom_dsp/mips/loopfilter_masks_dspr2.h"
+      "${AOM_ROOT}/aom_dsp/mips/loopfilter_mb_dspr2.c"
+      "${AOM_ROOT}/aom_dsp/mips/loopfilter_mb_horiz_dspr2.c"
+      "${AOM_ROOT}/aom_dsp/mips/loopfilter_mb_vert_dspr2.c")
+endif ()

 set(AOM_DSP_COMMON_INTRIN_MSA
    "${AOM_ROOT}/aom_dsp/mips/aom_convolve8_avg_horiz_msa.c"
@ -169,13 +215,18 @@ set(AOM_DSP_COMMON_INTRIN_MSA
    "${AOM_ROOT}/aom_dsp/mips/idct8x8_msa.c"
    "${AOM_ROOT}/aom_dsp/mips/intrapred_msa.c"
    "${AOM_ROOT}/aom_dsp/mips/inv_txfm_msa.h"
-    "${AOM_ROOT}/aom_dsp/mips/loopfilter_16_msa.c"
-    "${AOM_ROOT}/aom_dsp/mips/loopfilter_4_msa.c"
-    "${AOM_ROOT}/aom_dsp/mips/loopfilter_8_msa.c"
-    "${AOM_ROOT}/aom_dsp/mips/loopfilter_msa.h"
    "${AOM_ROOT}/aom_dsp/mips/macros_msa.h"
    "${AOM_ROOT}/aom_dsp/mips/txfm_macros_msa.h")

+if (NOT CONFIG_PARALLEL_DEBLOCKING)
+  set(AOM_DSP_COMMON_INTRIN_MSA
+      ${AOM_DSP_COMMON_INTRIN_MSA}
+      "${AOM_ROOT}/aom_dsp/mips/loopfilter_16_msa.c"
+      "${AOM_ROOT}/aom_dsp/mips/loopfilter_4_msa.c"
+      "${AOM_ROOT}/aom_dsp/mips/loopfilter_8_msa.c"
+      "${AOM_ROOT}/aom_dsp/mips/loopfilter_msa.h")
+endif ()
+
 if (CONFIG_HIGHBITDEPTH)
  set(AOM_DSP_COMMON_ASM_SSE2
      ${AOM_DSP_COMMON_ASM_SSE2}
@ -185,11 +236,18 @@ if (CONFIG_HIGHBITDEPTH)

  set(AOM_DSP_COMMON_INTRIN_SSE2
      ${AOM_DSP_COMMON_INTRIN_SSE2}
+      "${AOM_ROOT}/aom_dsp/x86/highbd_intrapred_sse2.c"
      "${AOM_ROOT}/aom_dsp/x86/highbd_loopfilter_sse2.c")

+  set(AOM_DSP_COMMON_INTRIN_SSSE3
+      ${AOM_DSP_COMMON_INTRIN_SSSE3}
+      "${AOM_ROOT}/aom_dsp/x86/highbd_intrapred_ssse3.c")
+
  set(AOM_DSP_COMMON_INTRIN_AVX2
      ${AOM_DSP_COMMON_INTRIN_AVX2}
-      "${AOM_ROOT}/aom_dsp/x86/highbd_convolve_avx2.c")
+      "${AOM_ROOT}/aom_dsp/x86/highbd_convolve_avx2.c"
+      "${AOM_ROOT}/aom_dsp/x86/highbd_intrapred_avx2.c"
+      "${AOM_ROOT}/aom_dsp/x86/highbd_loopfilter_avx2.c")
 else ()
  set(AOM_DSP_COMMON_INTRIN_DSPR2
      ${AOM_DSP_COMMON_INTRIN_DSPR2}
@ -332,12 +390,10 @@ if (CONFIG_AV1_ENCODER)
        "${AOM_ROOT}/aom_dsp/mips/variance_msa.c"
        "${AOM_ROOT}/aom_dsp/mips/sub_pixel_variance_msa.c")

-    if (CONFIG_EXT_INTER)
      set(AOM_DSP_ENCODER_INTRIN_SSSE3
          ${AOM_DSP_ENCODER_INTRIN_SSSE3}
          "${AOM_ROOT}/aom_dsp/x86/masked_sad_intrin_ssse3.c"
          "${AOM_ROOT}/aom_dsp/x86/masked_variance_intrin_ssse3.c")
-    endif ()

    if (CONFIG_HIGHBITDEPTH)
      set(AOM_DSP_ENCODER_INTRIN_SSE2
--- a/third_party/aom/aom_dsp/aom_dsp.mk
+++ b/third_party/aom/aom_dsp/aom_dsp.mk
@ -64,6 +64,7 @@ endif

 # intra predictions
 DSP_SRCS-yes += intrapred.c
+DSP_SRCS-yes += intrapred_common.h

 ifneq ($(CONFIG_ANS),yes)
 DSP_SRCS-yes += entcode.c
@ -75,9 +76,16 @@ DSP_SRCS-$(HAVE_SSE2) += x86/intrapred_sse2.asm
 DSP_SRCS-$(HAVE_SSSE3) += x86/intrapred_ssse3.asm
 DSP_SRCS-$(HAVE_SSSE3) += x86/aom_subpixel_8t_ssse3.asm

+DSP_SRCS-$(HAVE_SSE2) += x86/intrapred_sse2.c
+DSP_SRCS-$(HAVE_SSSE3) += x86/intrapred_ssse3.c
+DSP_SRCS-$(HAVE_AVX2) += x86/intrapred_avx2.c
+
 ifeq ($(CONFIG_HIGHBITDEPTH),yes)
 DSP_SRCS-$(HAVE_SSE)  += x86/highbd_intrapred_sse2.asm
 DSP_SRCS-$(HAVE_SSE2) += x86/highbd_intrapred_sse2.asm
+DSP_SRCS-$(HAVE_SSE2) += x86/highbd_intrapred_sse2.c
+DSP_SRCS-$(HAVE_SSSE3) += x86/highbd_intrapred_ssse3.c
+DSP_SRCS-$(HAVE_AVX2) += x86/highbd_intrapred_avx2.c
 endif  # CONFIG_HIGHBITDEPTH

 DSP_SRCS-$(HAVE_NEON_ASM) += arm/intrapred_neon_asm$(ASM)
@ -120,6 +128,7 @@ DSP_SRCS-$(HAVE_AVX2)  += x86/highbd_convolve_avx2.c
 endif
 DSP_SRCS-$(HAVE_SSE2)  += x86/aom_convolve_copy_sse2.asm

+ifneq ($(CONFIG_EXT_PARTITION),yes)
 ifeq ($(HAVE_NEON_ASM),yes)
 DSP_SRCS-yes += arm/aom_convolve_copy_neon_asm$(ASM)
 DSP_SRCS-yes += arm/aom_convolve8_avg_neon_asm$(ASM)
@ -135,6 +144,7 @@ DSP_SRCS-yes += arm/aom_convolve_avg_neon.c
 DSP_SRCS-yes += arm/aom_convolve_neon.c
 endif  # HAVE_NEON
 endif  # HAVE_NEON_ASM
+endif  # CONFIG_EXT_PARTITION

 # common (msa)
 DSP_SRCS-$(HAVE_MSA) += mips/aom_convolve8_avg_horiz_msa.c
@ -164,7 +174,10 @@ DSP_SRCS-$(HAVE_DSPR2)  += mips/convolve8_vert_dspr2.c
 DSP_SRCS-yes += loopfilter.c

 DSP_SRCS-$(ARCH_X86)$(ARCH_X86_64)   += x86/loopfilter_sse2.c
-DSP_SRCS-$(HAVE_AVX2)                += x86/loopfilter_avx2.c
+DSP_SRCS-$(HAVE_SSE2)                += x86/lpf_common_sse2.h
+
+ifneq ($(CONFIG_PARALLEL_DEBLOCKING),yes)
+DSP_SRCS-$(HAVE_AVX2)   += x86/loopfilter_avx2.c

 DSP_SRCS-$(HAVE_NEON)   += arm/loopfilter_neon.c
 ifeq ($(HAVE_NEON_ASM),yes)
@ -191,13 +204,16 @@ DSP_SRCS-$(HAVE_DSPR2)  += mips/loopfilter_masks_dspr2.h
 DSP_SRCS-$(HAVE_DSPR2)  += mips/loopfilter_mb_dspr2.c
 DSP_SRCS-$(HAVE_DSPR2)  += mips/loopfilter_mb_horiz_dspr2.c
 DSP_SRCS-$(HAVE_DSPR2)  += mips/loopfilter_mb_vert_dspr2.c
+endif  # !CONFIG_PARALLEL_DEBLOCKING

 ifeq ($(CONFIG_HIGHBITDEPTH),yes)
 DSP_SRCS-$(HAVE_SSE2)   += x86/highbd_loopfilter_sse2.c
+DSP_SRCS-$(HAVE_AVX2)   += x86/highbd_loopfilter_avx2.c
 endif  # CONFIG_HIGHBITDEPTH

 DSP_SRCS-yes            += txfm_common.h
 DSP_SRCS-yes            += x86/txfm_common_intrin.h
+DSP_SRCS-$(HAVE_AVX2)   += x86/common_avx2.h
 DSP_SRCS-$(HAVE_SSE2)   += x86/txfm_common_sse2.h
 DSP_SRCS-$(HAVE_SSSE3)  += x86/obmc_intrinsic_ssse3.h
 DSP_SRCS-$(HAVE_MSA)    += mips/txfm_macros_msa.h
@ -343,10 +359,8 @@ DSP_SRCS-$(HAVE_AVX2)   += x86/sad_highbd_avx2.c
 endif

 ifeq ($(CONFIG_AV1_ENCODER),yes)
-ifeq ($(CONFIG_EXT_INTER),yes)
 DSP_SRCS-$(HAVE_SSSE3)  += x86/masked_sad_intrin_ssse3.c
 DSP_SRCS-$(HAVE_SSSE3)  += x86/masked_variance_intrin_ssse3.c
-endif  #CONFIG_EXT_INTER
 ifeq ($(CONFIG_MOTION_VAR),yes)
 DSP_SRCS-$(HAVE_SSE4_1) += x86/obmc_sad_sse4.c
 DSP_SRCS-$(HAVE_SSE4_1) += x86/obmc_variance_sse4.c
--- a/third_party/aom/aom_dsp/aom_dsp_common.h
+++ b/third_party/aom/aom_dsp/aom_dsp_common.h
@ -52,10 +52,9 @@ extern "C" {
 #define UNLIKELY(v) (v)
 #endif

-#if CONFIG_AOM_QM
 typedef uint16_t qm_val_t;
 #define AOM_QM_BITS 5
-#endif
+
 #if CONFIG_HIGHBITDEPTH
 // Note:
 // tran_low_t  is the datatype used for final transform coefficients.
@ -78,6 +77,10 @@ static INLINE int clamp(int value, int low, int high) {
  return value < low ? low : (value > high ? high : value);
 }

+static INLINE uint32_t clamp32u(uint32_t value, uint32_t low, uint32_t high) {
+  return value < low ? low : (value > high ? high : value);
+}
+
 static INLINE int64_t clamp64(int64_t value, int64_t low, int64_t high) {
  return value < low ? low : (value > high ? high : value);
 }
--- a/third_party/aom/aom_dsp/aom_dsp_rtcd_defs.pl
+++ b/third_party/aom/aom_dsp/aom_dsp_rtcd_defs.pl
--- a/third_party/aom/aom_dsp/arm/intrapred_neon.c
+++ b/third_party/aom/aom_dsp/arm/intrapred_neon.c
@ -529,229 +529,4 @@ void aom_h_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride,
    }
  }
 }
-
-void aom_tm_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
-                               const uint8_t *above, const uint8_t *left) {
-  int i;
-  uint16x8_t q1u16, q3u16;
-  int16x8_t q1s16;
-  uint8x8_t d0u8 = vdup_n_u8(0);
-  uint32x2_t d2u32 = vdup_n_u32(0);
-
-  d0u8 = vld1_dup_u8(above - 1);
-  d2u32 = vld1_lane_u32((const uint32_t *)above, d2u32, 0);
-  q3u16 = vsubl_u8(vreinterpret_u8_u32(d2u32), d0u8);
-  for (i = 0; i < 4; i++, dst += stride) {
-    q1u16 = vdupq_n_u16((uint16_t)left[i]);
-    q1s16 =
-        vaddq_s16(vreinterpretq_s16_u16(q1u16), vreinterpretq_s16_u16(q3u16));
-    d0u8 = vqmovun_s16(q1s16);
-    vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d0u8), 0);
-  }
-}
-
-void aom_tm_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride,
-                               const uint8_t *above, const uint8_t *left) {
-  int j;
-  uint16x8_t q0u16, q3u16, q10u16;
-  int16x8_t q0s16;
-  uint16x4_t d20u16;
-  uint8x8_t d0u8, d2u8, d30u8;
-
-  d0u8 = vld1_dup_u8(above - 1);
-  d30u8 = vld1_u8(left);
-  d2u8 = vld1_u8(above);
-  q10u16 = vmovl_u8(d30u8);
-  q3u16 = vsubl_u8(d2u8, d0u8);
-  d20u16 = vget_low_u16(q10u16);
-  for (j = 0; j < 2; j++, d20u16 = vget_high_u16(q10u16)) {
-    q0u16 = vdupq_lane_u16(d20u16, 0);
-    q0s16 =
-        vaddq_s16(vreinterpretq_s16_u16(q3u16), vreinterpretq_s16_u16(q0u16));
-    d0u8 = vqmovun_s16(q0s16);
-    vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d0u8));
-    dst += stride;
-    q0u16 = vdupq_lane_u16(d20u16, 1);
-    q0s16 =
-        vaddq_s16(vreinterpretq_s16_u16(q3u16), vreinterpretq_s16_u16(q0u16));
-    d0u8 = vqmovun_s16(q0s16);
-    vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d0u8));
-    dst += stride;
-    q0u16 = vdupq_lane_u16(d20u16, 2);
-    q0s16 =
-        vaddq_s16(vreinterpretq_s16_u16(q3u16), vreinterpretq_s16_u16(q0u16));
-    d0u8 = vqmovun_s16(q0s16);
-    vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d0u8));
-    dst += stride;
-    q0u16 = vdupq_lane_u16(d20u16, 3);
-    q0s16 =
-        vaddq_s16(vreinterpretq_s16_u16(q3u16), vreinterpretq_s16_u16(q0u16));
-    d0u8 = vqmovun_s16(q0s16);
-    vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d0u8));
-    dst += stride;
-  }
-}
-
-void aom_tm_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride,
-                                 const uint8_t *above, const uint8_t *left) {
-  int j, k;
-  uint16x8_t q0u16, q2u16, q3u16, q8u16, q10u16;
-  uint8x16_t q0u8, q1u8;
-  int16x8_t q0s16, q1s16, q8s16, q11s16;
-  uint16x4_t d20u16;
-  uint8x8_t d2u8, d3u8, d18u8, d22u8, d23u8;
-
-  q0u8 = vld1q_dup_u8(above - 1);
-  q1u8 = vld1q_u8(above);
-  q2u16 = vsubl_u8(vget_low_u8(q1u8), vget_low_u8(q0u8));
-  q3u16 = vsubl_u8(vget_high_u8(q1u8), vget_high_u8(q0u8));
-  for (k = 0; k < 2; k++, left += 8) {
-    d18u8 = vld1_u8(left);
-    q10u16 = vmovl_u8(d18u8);
-    d20u16 = vget_low_u16(q10u16);
-    for (j = 0; j < 2; j++, d20u16 = vget_high_u16(q10u16)) {
-      q0u16 = vdupq_lane_u16(d20u16, 0);
-      q8u16 = vdupq_lane_u16(d20u16, 1);
-      q1s16 =
-          vaddq_s16(vreinterpretq_s16_u16(q0u16), vreinterpretq_s16_u16(q2u16));
-      q0s16 =
-          vaddq_s16(vreinterpretq_s16_u16(q0u16), vreinterpretq_s16_u16(q3u16));
-      q11s16 =
-          vaddq_s16(vreinterpretq_s16_u16(q8u16), vreinterpretq_s16_u16(q2u16));
-      q8s16 =
-          vaddq_s16(vreinterpretq_s16_u16(q8u16), vreinterpretq_s16_u16(q3u16));
-      d2u8 = vqmovun_s16(q1s16);
-      d3u8 = vqmovun_s16(q0s16);
-      d22u8 = vqmovun_s16(q11s16);
-      d23u8 = vqmovun_s16(q8s16);
-      vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d2u8));
-      vst1_u64((uint64_t *)(dst + 8), vreinterpret_u64_u8(d3u8));
-      dst += stride;
-      vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d22u8));
-      vst1_u64((uint64_t *)(dst + 8), vreinterpret_u64_u8(d23u8));
-      dst += stride;
-
-      q0u16 = vdupq_lane_u16(d20u16, 2);
-      q8u16 = vdupq_lane_u16(d20u16, 3);
-      q1s16 =
-          vaddq_s16(vreinterpretq_s16_u16(q0u16), vreinterpretq_s16_u16(q2u16));
-      q0s16 =
-          vaddq_s16(vreinterpretq_s16_u16(q0u16), vreinterpretq_s16_u16(q3u16));
-      q11s16 =
-          vaddq_s16(vreinterpretq_s16_u16(q8u16), vreinterpretq_s16_u16(q2u16));
-      q8s16 =
-          vaddq_s16(vreinterpretq_s16_u16(q8u16), vreinterpretq_s16_u16(q3u16));
-      d2u8 = vqmovun_s16(q1s16);
-      d3u8 = vqmovun_s16(q0s16);
-      d22u8 = vqmovun_s16(q11s16);
-      d23u8 = vqmovun_s16(q8s16);
-      vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d2u8));
-      vst1_u64((uint64_t *)(dst + 8), vreinterpret_u64_u8(d3u8));
-      dst += stride;
-      vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d22u8));
-      vst1_u64((uint64_t *)(dst + 8), vreinterpret_u64_u8(d23u8));
-      dst += stride;
-    }
-  }
-}
-
-void aom_tm_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride,
-                                 const uint8_t *above, const uint8_t *left) {
-  int j, k;
-  uint16x8_t q0u16, q3u16, q8u16, q9u16, q10u16, q11u16;
-  uint8x16_t q0u8, q1u8, q2u8;
-  int16x8_t q12s16, q13s16, q14s16, q15s16;
-  uint16x4_t d6u16;
-  uint8x8_t d0u8, d1u8, d2u8, d3u8, d26u8;
-
-  q0u8 = vld1q_dup_u8(above - 1);
-  q1u8 = vld1q_u8(above);
-  q2u8 = vld1q_u8(above + 16);
-  q8u16 = vsubl_u8(vget_low_u8(q1u8), vget_low_u8(q0u8));
-  q9u16 = vsubl_u8(vget_high_u8(q1u8), vget_high_u8(q0u8));
-  q10u16 = vsubl_u8(vget_low_u8(q2u8), vget_low_u8(q0u8));
-  q11u16 = vsubl_u8(vget_high_u8(q2u8), vget_high_u8(q0u8));
-  for (k = 0; k < 4; k++, left += 8) {
-    d26u8 = vld1_u8(left);
-    q3u16 = vmovl_u8(d26u8);
-    d6u16 = vget_low_u16(q3u16);
-    for (j = 0; j < 2; j++, d6u16 = vget_high_u16(q3u16)) {
-      q0u16 = vdupq_lane_u16(d6u16, 0);
-      q12s16 =
-          vaddq_s16(vreinterpretq_s16_u16(q0u16), vreinterpretq_s16_u16(q8u16));
-      q13s16 =
-          vaddq_s16(vreinterpretq_s16_u16(q0u16), vreinterpretq_s16_u16(q9u16));
-      q14s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
-                         vreinterpretq_s16_u16(q10u16));
-      q15s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
-                         vreinterpretq_s16_u16(q11u16));
-      d0u8 = vqmovun_s16(q12s16);
-      d1u8 = vqmovun_s16(q13s16);
-      d2u8 = vqmovun_s16(q14s16);
-      d3u8 = vqmovun_s16(q15s16);
-      q0u8 = vcombine_u8(d0u8, d1u8);
-      q1u8 = vcombine_u8(d2u8, d3u8);
-      vst1q_u64((uint64_t *)dst, vreinterpretq_u64_u8(q0u8));
-      vst1q_u64((uint64_t *)(dst + 16), vreinterpretq_u64_u8(q1u8));
-      dst += stride;
-
-      q0u16 = vdupq_lane_u16(d6u16, 1);
-      q12s16 =
-          vaddq_s16(vreinterpretq_s16_u16(q0u16), vreinterpretq_s16_u16(q8u16));
-      q13s16 =
-          vaddq_s16(vreinterpretq_s16_u16(q0u16), vreinterpretq_s16_u16(q9u16));
-      q14s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
-                         vreinterpretq_s16_u16(q10u16));
-      q15s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
-                         vreinterpretq_s16_u16(q11u16));
-      d0u8 = vqmovun_s16(q12s16);
-      d1u8 = vqmovun_s16(q13s16);
-      d2u8 = vqmovun_s16(q14s16);
-      d3u8 = vqmovun_s16(q15s16);
-      q0u8 = vcombine_u8(d0u8, d1u8);
-      q1u8 = vcombine_u8(d2u8, d3u8);
-      vst1q_u64((uint64_t *)dst, vreinterpretq_u64_u8(q0u8));
-      vst1q_u64((uint64_t *)(dst + 16), vreinterpretq_u64_u8(q1u8));
-      dst += stride;
-
-      q0u16 = vdupq_lane_u16(d6u16, 2);
-      q12s16 =
-          vaddq_s16(vreinterpretq_s16_u16(q0u16), vreinterpretq_s16_u16(q8u16));
-      q13s16 =
-          vaddq_s16(vreinterpretq_s16_u16(q0u16), vreinterpretq_s16_u16(q9u16));
-      q14s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
-                         vreinterpretq_s16_u16(q10u16));
-      q15s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
-                         vreinterpretq_s16_u16(q11u16));
-      d0u8 = vqmovun_s16(q12s16);
-      d1u8 = vqmovun_s16(q13s16);
-      d2u8 = vqmovun_s16(q14s16);
-      d3u8 = vqmovun_s16(q15s16);
-      q0u8 = vcombine_u8(d0u8, d1u8);
-      q1u8 = vcombine_u8(d2u8, d3u8);
-      vst1q_u64((uint64_t *)dst, vreinterpretq_u64_u8(q0u8));
-      vst1q_u64((uint64_t *)(dst + 16), vreinterpretq_u64_u8(q1u8));
-      dst += stride;
-
-      q0u16 = vdupq_lane_u16(d6u16, 3);
-      q12s16 =
-          vaddq_s16(vreinterpretq_s16_u16(q0u16), vreinterpretq_s16_u16(q8u16));
-      q13s16 =
-          vaddq_s16(vreinterpretq_s16_u16(q0u16), vreinterpretq_s16_u16(q9u16));
-      q14s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
-                         vreinterpretq_s16_u16(q10u16));
-      q15s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
-                         vreinterpretq_s16_u16(q11u16));
-      d0u8 = vqmovun_s16(q12s16);
-      d1u8 = vqmovun_s16(q13s16);
-      d2u8 = vqmovun_s16(q14s16);
-      d3u8 = vqmovun_s16(q15s16);
-      q0u8 = vcombine_u8(d0u8, d1u8);
-      q1u8 = vcombine_u8(d2u8, d3u8);
-      vst1q_u64((uint64_t *)dst, vreinterpretq_u64_u8(q0u8));
-      vst1q_u64((uint64_t *)(dst + 16), vreinterpretq_u64_u8(q1u8));
-      dst += stride;
-    }
-  }
-}
 #endif  // !HAVE_NEON_ASM
--- a/third_party/aom/aom_dsp/arm/intrapred_neon_asm.asm
+++ b/third_party/aom/aom_dsp/arm/intrapred_neon_asm.asm
@ -19,10 +19,6 @@
    EXPORT  |aom_h_predictor_8x8_neon|
    EXPORT  |aom_h_predictor_16x16_neon|
    EXPORT  |aom_h_predictor_32x32_neon|
-    EXPORT  |aom_tm_predictor_4x4_neon|
-    EXPORT  |aom_tm_predictor_8x8_neon|
-    EXPORT  |aom_tm_predictor_16x16_neon|
-    EXPORT  |aom_tm_predictor_32x32_neon|
    ARM
    REQUIRE8
    PRESERVE8
@ -289,345 +285,3 @@ loop_h
    bgt                 loop_h
    bx                  lr
    ENDP                ; |aom_h_predictor_32x32_neon|
-
-;void aom_tm_predictor_4x4_neon (uint8_t *dst, ptrdiff_t y_stride,
-;                                const uint8_t *above,
-;                                const uint8_t *left)
-; r0  uint8_t *dst
-; r1  ptrdiff_t y_stride
-; r2  const uint8_t *above
-; r3  const uint8_t *left
-
-|aom_tm_predictor_4x4_neon| PROC
-    ; Load ytop_left = above[-1];
-    sub                 r12, r2, #1
-    vld1.u8             {d0[]}, [r12]
-
-    ; Load above 4 pixels
-    vld1.32             {d2[0]}, [r2]
-
-    ; Compute above - ytop_left
-    vsubl.u8            q3, d2, d0
-
-    ; Load left row by row and compute left + (above - ytop_left)
-    ; 1st row and 2nd row
-    vld1.u8             {d2[]}, [r3]!
-    vld1.u8             {d4[]}, [r3]!
-    vmovl.u8            q1, d2
-    vmovl.u8            q2, d4
-    vadd.s16            q1, q1, q3
-    vadd.s16            q2, q2, q3
-    vqmovun.s16         d0, q1
-    vqmovun.s16         d1, q2
-    vst1.32             {d0[0]}, [r0], r1
-    vst1.32             {d1[0]}, [r0], r1
-
-    ; 3rd row and 4th row
-    vld1.u8             {d2[]}, [r3]!
-    vld1.u8             {d4[]}, [r3]
-    vmovl.u8            q1, d2
-    vmovl.u8            q2, d4
-    vadd.s16            q1, q1, q3
-    vadd.s16            q2, q2, q3
-    vqmovun.s16         d0, q1
-    vqmovun.s16         d1, q2
-    vst1.32             {d0[0]}, [r0], r1
-    vst1.32             {d1[0]}, [r0], r1
-    bx                  lr
-    ENDP                ; |aom_tm_predictor_4x4_neon|
-
-;void aom_tm_predictor_8x8_neon (uint8_t *dst, ptrdiff_t y_stride,
-;                                const uint8_t *above,
-;                                const uint8_t *left)
-; r0  uint8_t *dst
-; r1  ptrdiff_t y_stride
-; r2  const uint8_t *above
-; r3  const uint8_t *left
-
-|aom_tm_predictor_8x8_neon| PROC
-    ; Load ytop_left = above[-1];
-    sub                 r12, r2, #1
-    vld1.8              {d0[]}, [r12]
-
-    ; preload 8 left
-    vld1.8              {d30}, [r3]
-
-    ; Load above 8 pixels
-    vld1.64             {d2}, [r2]
-
-    vmovl.u8            q10, d30
-
-    ; Compute above - ytop_left
-    vsubl.u8            q3, d2, d0
-
-    ; Load left row by row and compute left + (above - ytop_left)
-    ; 1st row and 2nd row
-    vdup.16             q0, d20[0]
-    vdup.16             q1, d20[1]
-    vadd.s16            q0, q3, q0
-    vadd.s16            q1, q3, q1
-
-    ; 3rd row and 4th row
-    vdup.16             q8, d20[2]
-    vdup.16             q9, d20[3]
-    vadd.s16            q8, q3, q8
-    vadd.s16            q9, q3, q9
-
-    vqmovun.s16         d0, q0
-    vqmovun.s16         d1, q1
-    vqmovun.s16         d2, q8
-    vqmovun.s16         d3, q9
-
-    vst1.64             {d0}, [r0], r1
-    vst1.64             {d1}, [r0], r1
-    vst1.64             {d2}, [r0], r1
-    vst1.64             {d3}, [r0], r1
-
-    ; 5th row and 6th row
-    vdup.16             q0, d21[0]
-    vdup.16             q1, d21[1]
-    vadd.s16            q0, q3, q0
-    vadd.s16            q1, q3, q1
-
-    ; 7th row and 8th row
-    vdup.16             q8, d21[2]
-    vdup.16             q9, d21[3]
-    vadd.s16            q8, q3, q8
-    vadd.s16            q9, q3, q9
-
-    vqmovun.s16         d0, q0
-    vqmovun.s16         d1, q1
-    vqmovun.s16         d2, q8
-    vqmovun.s16         d3, q9
-
-    vst1.64             {d0}, [r0], r1
-    vst1.64             {d1}, [r0], r1
-    vst1.64             {d2}, [r0], r1
-    vst1.64             {d3}, [r0], r1
-
-    bx                  lr
-    ENDP                ; |aom_tm_predictor_8x8_neon|
-
-;void aom_tm_predictor_16x16_neon (uint8_t *dst, ptrdiff_t y_stride,
-;                                const uint8_t *above,
-;                                const uint8_t *left)
-; r0  uint8_t *dst
-; r1  ptrdiff_t y_stride
-; r2  const uint8_t *above
-; r3  const uint8_t *left
-
-|aom_tm_predictor_16x16_neon| PROC
-    ; Load ytop_left = above[-1];
-    sub                 r12, r2, #1
-    vld1.8              {d0[]}, [r12]
-
-    ; Load above 8 pixels
-    vld1.8              {q1}, [r2]
-
-    ; preload 8 left into r12
-    vld1.8              {d18}, [r3]!
-
-    ; Compute above - ytop_left
-    vsubl.u8            q2, d2, d0
-    vsubl.u8            q3, d3, d0
-
-    vmovl.u8            q10, d18
-
-    ; Load left row by row and compute left + (above - ytop_left)
-    ; Process 8 rows in each single loop and loop 2 times to process 16 rows.
-    mov                 r2, #2
-
-loop_16x16_neon
-    ; Process two rows.
-    vdup.16             q0, d20[0]
-    vdup.16             q8, d20[1]
-    vadd.s16            q1, q0, q2
-    vadd.s16            q0, q0, q3
-    vadd.s16            q11, q8, q2
-    vadd.s16            q8, q8, q3
-    vqmovun.s16         d2, q1
-    vqmovun.s16         d3, q0
-    vqmovun.s16         d22, q11
-    vqmovun.s16         d23, q8
-    vdup.16             q0, d20[2]                  ; proload next 2 rows data
-    vdup.16             q8, d20[3]
-    vst1.64             {d2,d3}, [r0], r1
-    vst1.64             {d22,d23}, [r0], r1
-
-    ; Process two rows.
-    vadd.s16            q1, q0, q2
-    vadd.s16            q0, q0, q3
-    vadd.s16            q11, q8, q2
-    vadd.s16            q8, q8, q3
-    vqmovun.s16         d2, q1
-    vqmovun.s16         d3, q0
-    vqmovun.s16         d22, q11
-    vqmovun.s16         d23, q8
-    vdup.16             q0, d21[0]                  ; proload next 2 rows data
-    vdup.16             q8, d21[1]
-    vst1.64             {d2,d3}, [r0], r1
-    vst1.64             {d22,d23}, [r0], r1
-
-    vadd.s16            q1, q0, q2
-    vadd.s16            q0, q0, q3
-    vadd.s16            q11, q8, q2
-    vadd.s16            q8, q8, q3
-    vqmovun.s16         d2, q1
-    vqmovun.s16         d3, q0
-    vqmovun.s16         d22, q11
-    vqmovun.s16         d23, q8
-    vdup.16             q0, d21[2]                  ; proload next 2 rows data
-    vdup.16             q8, d21[3]
-    vst1.64             {d2,d3}, [r0], r1
-    vst1.64             {d22,d23}, [r0], r1
-
-
-    vadd.s16            q1, q0, q2
-    vadd.s16            q0, q0, q3
-    vadd.s16            q11, q8, q2
-    vadd.s16            q8, q8, q3
-    vqmovun.s16         d2, q1
-    vqmovun.s16         d3, q0
-    vqmovun.s16         d22, q11
-    vqmovun.s16         d23, q8
-    vld1.8              {d18}, [r3]!                  ; preload 8 left into r12
-    vmovl.u8            q10, d18
-    vst1.64             {d2,d3}, [r0], r1
-    vst1.64             {d22,d23}, [r0], r1
-
-    subs                r2, r2, #1
-    bgt                 loop_16x16_neon
-
-    bx                  lr
-    ENDP                ; |aom_tm_predictor_16x16_neon|
-
-;void aom_tm_predictor_32x32_neon (uint8_t *dst, ptrdiff_t y_stride,
-;                                  const uint8_t *above,
-;                                  const uint8_t *left)
-; r0  uint8_t *dst
-; r1  ptrdiff_t y_stride
-; r2  const uint8_t *above
-; r3  const uint8_t *left
-
-|aom_tm_predictor_32x32_neon| PROC
-    ; Load ytop_left = above[-1];
-    sub                 r12, r2, #1
-    vld1.8              {d0[]}, [r12]
-
-    ; Load above 32 pixels
-    vld1.8              {q1}, [r2]!
-    vld1.8              {q2}, [r2]
-
-    ; preload 8 left pixels
-    vld1.8              {d26}, [r3]!
-
-    ; Compute above - ytop_left
-    vsubl.u8            q8, d2, d0
-    vsubl.u8            q9, d3, d0
-    vsubl.u8            q10, d4, d0
-    vsubl.u8            q11, d5, d0
-
-    vmovl.u8            q3, d26
-
-    ; Load left row by row and compute left + (above - ytop_left)
-    ; Process 8 rows in each single loop and loop 4 times to process 32 rows.
-    mov                 r2, #4
-
-loop_32x32_neon
-    ; Process two rows.
-    vdup.16             q0, d6[0]
-    vdup.16             q2, d6[1]
-    vadd.s16            q12, q0, q8
-    vadd.s16            q13, q0, q9
-    vadd.s16            q14, q0, q10
-    vadd.s16            q15, q0, q11
-    vqmovun.s16         d0, q12
-    vqmovun.s16         d1, q13
-    vadd.s16            q12, q2, q8
-    vadd.s16            q13, q2, q9
-    vqmovun.s16         d2, q14
-    vqmovun.s16         d3, q15
-    vadd.s16            q14, q2, q10
-    vadd.s16            q15, q2, q11
-    vst1.64             {d0-d3}, [r0], r1
-    vqmovun.s16         d24, q12
-    vqmovun.s16         d25, q13
-    vqmovun.s16         d26, q14
-    vqmovun.s16         d27, q15
-    vdup.16             q1, d6[2]
-    vdup.16             q2, d6[3]
-    vst1.64             {d24-d27}, [r0], r1
-
-    ; Process two rows.
-    vadd.s16            q12, q1, q8
-    vadd.s16            q13, q1, q9
-    vadd.s16            q14, q1, q10
-    vadd.s16            q15, q1, q11
-    vqmovun.s16         d0, q12
-    vqmovun.s16         d1, q13
-    vadd.s16            q12, q2, q8
-    vadd.s16            q13, q2, q9
-    vqmovun.s16         d2, q14
-    vqmovun.s16         d3, q15
-    vadd.s16            q14, q2, q10
-    vadd.s16            q15, q2, q11
-    vst1.64             {d0-d3}, [r0], r1
-    vqmovun.s16         d24, q12
-    vqmovun.s16         d25, q13
-    vqmovun.s16         d26, q14
-    vqmovun.s16         d27, q15
-    vdup.16             q0, d7[0]
-    vdup.16             q2, d7[1]
-    vst1.64             {d24-d27}, [r0], r1
-
-    ; Process two rows.
-    vadd.s16            q12, q0, q8
-    vadd.s16            q13, q0, q9
-    vadd.s16            q14, q0, q10
-    vadd.s16            q15, q0, q11
-    vqmovun.s16         d0, q12
-    vqmovun.s16         d1, q13
-    vadd.s16            q12, q2, q8
-    vadd.s16            q13, q2, q9
-    vqmovun.s16         d2, q14
-    vqmovun.s16         d3, q15
-    vadd.s16            q14, q2, q10
-    vadd.s16            q15, q2, q11
-    vst1.64             {d0-d3}, [r0], r1
-    vqmovun.s16         d24, q12
-    vqmovun.s16         d25, q13
-    vqmovun.s16         d26, q14
-    vqmovun.s16         d27, q15
-    vdup.16             q0, d7[2]
-    vdup.16             q2, d7[3]
-    vst1.64             {d24-d27}, [r0], r1
-
-    ; Process two rows.
-    vadd.s16            q12, q0, q8
-    vadd.s16            q13, q0, q9
-    vadd.s16            q14, q0, q10
-    vadd.s16            q15, q0, q11
-    vqmovun.s16         d0, q12
-    vqmovun.s16         d1, q13
-    vadd.s16            q12, q2, q8
-    vadd.s16            q13, q2, q9
-    vqmovun.s16         d2, q14
-    vqmovun.s16         d3, q15
-    vadd.s16            q14, q2, q10
-    vadd.s16            q15, q2, q11
-    vst1.64             {d0-d3}, [r0], r1
-    vqmovun.s16         d24, q12
-    vqmovun.s16         d25, q13
-    vld1.8              {d0}, [r3]!                   ; preload 8 left pixels
-    vqmovun.s16         d26, q14
-    vqmovun.s16         d27, q15
-    vmovl.u8            q3, d0
-    vst1.64             {d24-d27}, [r0], r1
-
-    subs                r2, r2, #1
-    bgt                 loop_32x32_neon
-
-    bx                  lr
-    ENDP                ; |aom_tm_predictor_32x32_neon|
-
-    END
--- a/third_party/aom/aom_dsp/binary_codes_reader.c
+++ b/third_party/aom/aom_dsp/binary_codes_reader.c
@ -53,6 +53,15 @@ uint16_t aom_read_primitive_quniform_(aom_reader *r,
  return v < m ? v : (v << 1) - m + aom_read_bit(r, ACCT_STR_NAME);
 }

+static uint16_t aom_rb_read_primitive_quniform(struct aom_read_bit_buffer *rb,
+                                               uint16_t n) {
+  if (n <= 1) return 0;  // n <= 1: no bits are consumed, result is 0
+  const int l = get_msb(n - 1) + 1;
+  const int m = (1 << l) - n;  // prefixes below m are complete after l - 1 bits
+  const int v = aom_rb_read_literal(rb, l - 1);
+  return v < m ? v : (v << 1) - m + aom_rb_read_bit(rb);  // else read one more
+}
+
 uint16_t aom_read_primitive_refbilevel_(aom_reader *r, uint16_t n, uint16_t p,
                                        uint16_t ref ACCT_STR_PARAM) {
  if (n <= 1) return 0;
@ -101,15 +110,42 @@ uint16_t aom_read_primitive_subexpfin_(aom_reader *r, uint16_t n,
  return v;
 }

-// Decode finite subexponential code that for a symbol v in [0, n-1] with
-// parameter k
-// based on a reference ref also in [0, n-1].
+static uint16_t aom_rb_read_primitive_subexpfin(struct aom_read_bit_buffer *rb,
+                                                uint16_t n, uint16_t k) {
+  int i = 0;
+  int mk = 0;  // base added to whatever is decoded next
+  uint16_t v;
+  while (1) {
+    int b = (i ? k + i - 1 : k);  // literal width: k, k, k + 1, k + 2, ...
+    int a = (1 << b);  // number of values covered by the current bucket
+    if (n <= mk + 3 * a) {  // few values left: finish with a quniform code
+      v = aom_rb_read_primitive_quniform(rb, n - mk) + mk;
+      break;
+    } else {
+      if (aom_rb_read_bit(rb)) {  // 1: value is past this bucket, move on
+        i = i + 1;
+        mk += a;
+      } else {  // 0: value is mk plus the literal read with width b
+        v = aom_rb_read_literal(rb, b) + mk;
+        break;
+      }
+    }
+  }
+  return v;
+}
+
 uint16_t aom_read_primitive_refsubexpfin_(aom_reader *r, uint16_t n, uint16_t k,
                                          uint16_t ref ACCT_STR_PARAM) {
  return inv_recenter_finite_nonneg(
      n, ref, aom_read_primitive_subexpfin(r, n, k, ACCT_STR_NAME));
 }

+static uint16_t aom_rb_read_primitive_refsubexpfin(
+    struct aom_read_bit_buffer *rb, uint16_t n, uint16_t k, uint16_t ref) {
+  return inv_recenter_finite_nonneg(n, ref,  // same as the aom_reader version
+                                    aom_rb_read_primitive_subexpfin(rb, n, k));
+}
+
 // Decode finite subexponential code that for a symbol v in [-(n-1), n-1] with
 // parameter k based on a reference ref also in [-(n-1), n-1].
 int16_t aom_read_signed_primitive_refsubexpfin_(aom_reader *r, uint16_t n,
@ -120,3 +156,10 @@ int16_t aom_read_signed_primitive_refsubexpfin_(aom_reader *r, uint16_t n,
  return aom_read_primitive_refsubexpfin(r, scaled_n, k, ref, ACCT_STR_NAME) -
         n + 1;
 }
+
+int16_t aom_rb_read_signed_primitive_refsubexpfin(
+    struct aom_read_bit_buffer *rb, uint16_t n, uint16_t k, int16_t ref) {
+  ref += n - 1;  // bias ref by n - 1; the same bias is removed on return
+  const uint16_t scaled_n = (n << 1) - 1;  // n for the unsigned reader: 2n - 1
+  return aom_rb_read_primitive_refsubexpfin(rb, scaled_n, k, ref) - n + 1;
+}
--- a/third_party/aom/aom_dsp/binary_codes_reader.h
+++ b/third_party/aom/aom_dsp/binary_codes_reader.h
@ -17,9 +17,11 @@ extern "C" {
 #endif

 #include <assert.h>
+
 #include "./aom_config.h"
 #include "aom/aom_integer.h"
 #include "aom_dsp/bitreader.h"
+#include "aom_dsp/bitreader_buffer.h"

 #define aom_read_primitive_symmetric(r, n, ACCT_STR_NAME) \
  aom_read_primitive_symmetric_(r, n ACCT_STR_ARG(ACCT_STR_NAME))
@ -47,6 +49,9 @@ uint16_t aom_read_primitive_refsubexpfin_(aom_reader *r, uint16_t n, uint16_t k,
 int16_t aom_read_signed_primitive_refsubexpfin_(aom_reader *r, uint16_t n,
                                                uint16_t k,
                                                int16_t ref ACCT_STR_PARAM);
+
+int16_t aom_rb_read_signed_primitive_refsubexpfin(
+    struct aom_read_bit_buffer *rb, uint16_t n, uint16_t k, int16_t ref);
 #ifdef __cplusplus
 }  // extern "C"
 #endif
--- a/third_party/aom/aom_dsp/binary_codes_writer.c
+++ b/third_party/aom/aom_dsp/binary_codes_writer.c
@ -10,6 +10,7 @@
 */

 #include "aom_dsp/bitwriter.h"
+#include "aom_dsp/binary_codes_writer.h"

 #include "av1/common/common.h"

@ -68,6 +69,19 @@ void aom_write_primitive_quniform(aom_writer *w, uint16_t n, uint16_t v) {
  }
 }

+static void aom_wb_write_primitive_quniform(struct aom_write_bit_buffer *wb,
+                                            uint16_t n, uint16_t v) {
+  if (n <= 1) return;  // n <= 1: nothing is written
+  const int l = get_msb(n - 1) + 1;
+  const int m = (1 << l) - n;  // threshold between the two code lengths
+  if (v < m) {  // short form: v itself as an (l - 1)-bit literal
+    aom_wb_write_literal(wb, v, l - 1);
+  } else {  // long form: (l - 1)-bit prefix, then the low bit of v - m
+    aom_wb_write_literal(wb, m + ((v - m) >> 1), l - 1);
+    aom_wb_write_bit(wb, (v - m) & 1);
+  }
+}
+
 int aom_count_primitive_quniform(uint16_t n, uint16_t v) {
  if (n <= 1) return 0;
  const int l = get_msb(n - 1) + 1;
@ -155,6 +169,31 @@ void aom_write_primitive_subexpfin(aom_writer *w, uint16_t n, uint16_t k,
  }
 }

+static void aom_wb_write_primitive_subexpfin(struct aom_write_bit_buffer *wb,
+                                             uint16_t n, uint16_t k,
+                                             uint16_t v) {
+  int i = 0;
+  int mk = 0;  // lower bound of the bucket currently being tested
+  while (1) {
+    int b = (i ? k + i - 1 : k);  // literal width: k, k, k + 1, k + 2, ...
+    int a = (1 << b);  // number of values covered by the current bucket
+    if (n <= mk + 3 * a) {  // few values left: finish with a quniform code
+      aom_wb_write_primitive_quniform(wb, n - mk, v - mk);
+      break;
+    } else {
+      int t = (v >= mk + a);  // 1 if v lies past the current bucket
+      aom_wb_write_bit(wb, t);
+      if (t) {
+        i = i + 1;
+        mk += a;
+      } else {
+        aom_wb_write_literal(wb, v - mk, b);  // offset of v above mk, width b
+        break;
+      }
+    }
+  }
+}
+
 int aom_count_primitive_subexpfin(uint16_t n, uint16_t k, uint16_t v) {
  int count = 0;
  int i = 0;
@ -184,19 +223,34 @@ int aom_count_primitive_subexpfin(uint16_t n, uint16_t k, uint16_t v) {
 // based on a reference ref also in [0, n-1].
 // Recenters symbol around r first and then uses a finite subexponential code.
 void aom_write_primitive_refsubexpfin(aom_writer *w, uint16_t n, uint16_t k,
-                                      int16_t ref, int16_t v) {
+                                      uint16_t ref, uint16_t v) {
  aom_write_primitive_subexpfin(w, n, k, recenter_finite_nonneg(n, ref, v));
 }

+static void aom_wb_write_primitive_refsubexpfin(struct aom_write_bit_buffer *wb,
+                                                uint16_t n, uint16_t k,
+                                                uint16_t ref, uint16_t v) {  // same as the aom_writer version
+  aom_wb_write_primitive_subexpfin(wb, n, k, recenter_finite_nonneg(n, ref, v));
+}
+
 void aom_write_signed_primitive_refsubexpfin(aom_writer *w, uint16_t n,
-                                             uint16_t k, uint16_t ref,
-                                             uint16_t v) {
+                                             uint16_t k, int16_t ref,
+                                             int16_t v) {
  ref += n - 1;
  v += n - 1;
  const uint16_t scaled_n = (n << 1) - 1;
  aom_write_primitive_refsubexpfin(w, scaled_n, k, ref, v);
 }

+void aom_wb_write_signed_primitive_refsubexpfin(struct aom_write_bit_buffer *wb,
+                                                uint16_t n, uint16_t k,
+                                                int16_t ref, int16_t v) {
+  ref += n - 1;  // bias ref and v by n - 1 before the unsigned writer
+  v += n - 1;
+  const uint16_t scaled_n = (n << 1) - 1;  // n for the unsigned writer: 2n - 1
+  aom_wb_write_primitive_refsubexpfin(wb, scaled_n, k, ref, v);
+}
+
 int aom_count_primitive_refsubexpfin(uint16_t n, uint16_t k, uint16_t ref,
                                     uint16_t v) {
  return aom_count_primitive_subexpfin(n, k, recenter_finite_nonneg(n, ref, v));
--- a/third_party/aom/aom_dsp/binary_codes_writer.h
+++ b/third_party/aom/aom_dsp/binary_codes_writer.h
@ -20,6 +20,7 @@ extern "C" {
 #include "./aom_config.h"
 #include "aom/aom_integer.h"
 #include "aom_dsp/bitwriter.h"
+#include "aom_dsp/bitwriter_buffer.h"

 // Codes a symbol v in [-2^mag_bits, 2^mag_bits]
 // mag_bits is number of bits for magnitude. The alphabet is of size
@ -53,6 +54,10 @@ void aom_write_signed_primitive_refsubexpfin(aom_writer *w, uint16_t n,
                                             uint16_t k, int16_t ref,
                                             int16_t v);

+void aom_wb_write_signed_primitive_refsubexpfin(struct aom_write_bit_buffer *wb,
+                                                uint16_t n, uint16_t k,
+                                                int16_t ref, int16_t v);
+
 // Functions that counts bits for the above primitives
 int aom_count_primitive_symmetric(int16_t v, unsigned int mag_bits);
 int aom_count_primitive_quniform(uint16_t n, uint16_t v);
--- a/third_party/aom/aom_dsp/bitreader.h
+++ b/third_party/aom/aom_dsp/bitreader.h
@ -50,6 +50,11 @@
 #define aom_read_symbol(r, cdf, nsymbs, ACCT_STR_NAME) \
  aom_read_symbol_(r, cdf, nsymbs ACCT_STR_ARG(ACCT_STR_NAME))

+#if CONFIG_LV_MAP
+#define aom_read_bin(r, cdf, nsymbs, ACCT_STR_NAME) \
+  aom_read_bin_(r, cdf, nsymbs ACCT_STR_ARG(ACCT_STR_NAME))
+#endif  // CONFIG_LV_MAP
+
 #ifdef __cplusplus
 extern "C" {
 #endif
@ -198,6 +203,16 @@ static INLINE int aom_read_symbol_(aom_reader *r, aom_cdf_prob *cdf,
  return ret;
 }

+#if CONFIG_LV_MAP
+static INLINE int aom_read_bin_(aom_reader *r, aom_cdf_prob *cdf,
+                                int nsymbs ACCT_STR_PARAM) {
+  int ret;
+  ret = aom_read_cdf(r, cdf, nsymbs, ACCT_STR_NAME);
+  update_cdf(cdf, ret, nsymbs);  // feed the decoded symbol back into cdf
+  return ret;
+}
+#endif  // CONFIG_LV_MAP
+
 static INLINE int aom_read_tree_as_cdf(aom_reader *r,
                                       const aom_tree_index *tree,
                                       const aom_prob *probs) {
--- a/third_party/aom/aom_dsp/bitwriter.h
+++ b/third_party/aom/aom_dsp/bitwriter.h
@ -62,9 +62,8 @@ static INLINE void init_token_stats(TOKEN_STATS *token_stats) {

 static INLINE void aom_start_encode(aom_writer *bc, uint8_t *buffer) {
 #if CONFIG_ANS
-  (void)bc;
-  (void)buffer;
-  assert(0 && "buf_ans requires a more complicated startup procedure");
+  aom_buf_ans_alloc(bc, /* error context*/ NULL);
+  buf_ans_write_init(bc, buffer);
 #else
  aom_daala_start_encode(bc, buffer);
 #endif
@ -72,8 +71,8 @@ static INLINE void aom_start_encode(aom_writer *bc, uint8_t *buffer) {

 static INLINE void aom_stop_encode(aom_writer *bc) {
 #if CONFIG_ANS
-  (void)bc;
-  assert(0 && "buf_ans requires a more complicated shutdown procedure");
+  aom_buf_ans_flush(bc);
+  bc->pos = buf_ans_write_end(bc);
 #else
  aom_daala_stop_encode(bc);
 #endif
@ -143,6 +142,14 @@ static INLINE void aom_write_symbol(aom_writer *w, int symb, aom_cdf_prob *cdf,
  update_cdf(cdf, symb, nsymbs);
 }

+#if CONFIG_LV_MAP
+static INLINE void aom_write_bin(aom_writer *w, int symb, aom_cdf_prob *cdf,
+                                 int nsymbs) {
+  aom_write_cdf(w, symb, cdf, nsymbs);
+  update_cdf(cdf, symb, nsymbs);  // feed the coded symbol back into cdf
+}
+#endif  // CONFIG_LV_MAP
+
 static INLINE void aom_write_tree_as_cdf(aom_writer *w,
                                         const aom_tree_index *tree,
                                         const aom_prob *probs, int bits,
--- a/third_party/aom/aom_dsp/buf_ans.c
+++ b/third_party/aom/aom_dsp/buf_ans.c
@ -16,9 +16,8 @@
 #include "aom/internal/aom_codec_internal.h"

 void aom_buf_ans_alloc(struct BufAnsCoder *c,
-                       struct aom_internal_error_info *error, int size) {
+                       struct aom_internal_error_info *error) {
  c->error = error;
-  c->size = size;
  assert(c->size > 1);
  AOM_CHECK_MEM_ERROR(error, c->buf, aom_malloc(c->size * sizeof(*c->buf)));
  // Initialize to overfull to trigger the assert in write.
--- a/third_party/aom/aom_dsp/buf_ans.h
+++ b/third_party/aom/aom_dsp/buf_ans.h
@ -46,6 +46,7 @@ struct BufAnsCoder {
 #if ANS_MAX_SYMBOLS
  int window_size;
 #endif
+  int pos;  // Dummy variable to store the output buffer after closing
 };

 // Allocate a buffered ANS coder to store size symbols.
@ -54,7 +55,7 @@ struct BufAnsCoder {
 // When ANS_MAX_SYMBOLS is turned off, size is merely an initial hint and the
 // buffer will grow on demand
 void aom_buf_ans_alloc(struct BufAnsCoder *c,
-                       struct aom_internal_error_info *error, int hint);
+                       struct aom_internal_error_info *error);

 void aom_buf_ans_free(struct BufAnsCoder *c);

--- a/third_party/aom/aom_dsp/daalaboolreader.c
+++ b/third_party/aom/aom_dsp/daalaboolreader.c
@ -17,7 +17,7 @@ int aom_daala_reader_init(daala_reader *r, const uint8_t *buffer, int size) {
  }
  r->buffer_end = buffer + size;
  r->buffer = buffer;
-  od_ec_dec_init(&r->ec, buffer, size - 1);
+  od_ec_dec_init(&r->ec, buffer, size);
 #if CONFIG_ACCOUNTING
  r->accounting = NULL;
 #endif
--- a/third_party/aom/aom_dsp/daalaboolreader.h
+++ b/third_party/aom/aom_dsp/daalaboolreader.h
@ -45,11 +45,7 @@ uint32_t aom_daala_reader_tell_frac(const daala_reader *r);

 static INLINE int aom_daala_read(daala_reader *r, int prob) {
  int bit;
-#if CONFIG_EC_SMALLMUL
  int p = (0x7FFFFF - (prob << 15) + prob) >> 8;
-#else
-  int p = ((prob << 15) + 256 - prob) >> 8;
-#endif
 #if CONFIG_BITSTREAM_DEBUG
 /*{
  const int queue_r = bitstream_queue_get_read();
@ -113,6 +109,7 @@ static INLINE int aom_daala_reader_has_error(daala_reader *r) {
 static INLINE int daala_read_symbol(daala_reader *r, const aom_cdf_prob *cdf,
                                    int nsymbs) {
  int symb;
+  assert(cdf != NULL);
  symb = od_ec_decode_cdf_q15(&r->ec, cdf, nsymbs);

 #if CONFIG_BITSTREAM_DEBUG
--- a/third_party/aom/aom_dsp/daalaboolwriter.c
+++ b/third_party/aom/aom_dsp/daalaboolwriter.c
@ -24,9 +24,5 @@ void aom_daala_stop_encode(daala_writer *br) {
  daala_data = od_ec_enc_done(&br->ec, &daala_bytes);
  memcpy(br->buffer, daala_data, daala_bytes);
  br->pos = daala_bytes;
-  /* Prevent ec bitstream from being detected as a superframe marker.
-     Must always be added, so that rawbits knows the exact length of the
-      bitstream. */
-  br->buffer[br->pos++] = 0;
  od_ec_enc_clear(&br->ec);
 }
--- a/third_party/aom/aom_dsp/daalaboolwriter.h
+++ b/third_party/aom/aom_dsp/daalaboolwriter.h
@ -36,11 +36,7 @@ void aom_daala_start_encode(daala_writer *w, uint8_t *buffer);
 void aom_daala_stop_encode(daala_writer *w);

 static INLINE void aom_daala_write(daala_writer *w, int bit, int prob) {
-#if CONFIG_EC_SMALLMUL
  int p = (0x7FFFFF - (prob << 15) + prob) >> 8;
-#else
-  int p = ((prob << 15) + 256 - prob) >> 8;
-#endif
 #if CONFIG_BITSTREAM_DEBUG
  aom_cdf_prob cdf[2] = { (aom_cdf_prob)p, 32767 };
  /*int queue_r = 0;
--- a/third_party/aom/aom_dsp/entcode.h
+++ b/third_party/aom/aom_dsp/entcode.h
@ -28,15 +28,11 @@ typedef uint32_t od_ec_window;
   3 => 1/8th bits.*/
 #define OD_BITRES (3)

-/*With CONFIG_EC_SMALLMUL, the value stored in a CDF is 32768 minus the actual
-   Q15 cumulative probability (an "inverse" CDF).
+/*The value stored in an iCDF is 32768 minus the actual Q15 cumulative
+   probability (an "inverse" CDF).
  This function converts from one representation to the other (and is its own
   inverse).*/
-#if CONFIG_EC_SMALLMUL
 #define OD_ICDF(x) (32768U - (x))
-#else
-#define OD_ICDF(x) (x)
-#endif

 /*See entcode.c for further documentation.*/

--- a/third_party/aom/aom_dsp/entdec.c
+++ b/third_party/aom/aom_dsp/entdec.c
@ -114,12 +114,8 @@ static int od_ec_dec_normalize(od_ec_dec *dec, od_ec_window dif, unsigned rng,
  OD_ASSERT(rng <= 65535U);
  d = 16 - OD_ILOG_NZ(rng);
  dec->cnt -= d;
-#if CONFIG_EC_SMALLMUL
  /*This is equivalent to shifting in 1's instead of 0's.*/
  dec->dif = ((dif + 1) << d) - 1;
-#else
-  dec->dif = dif << d;
-#endif
  dec->rng = rng << d;
  if (dec->cnt < 0) od_ec_dec_refill(dec);
  return ret;
@ -137,11 +133,7 @@ void od_ec_dec_init(od_ec_dec *dec, const unsigned char *buf,
  dec->tell_offs = 10 - (OD_EC_WINDOW_SIZE - 8);
  dec->end = buf + storage;
  dec->bptr = buf;
-#if CONFIG_EC_SMALLMUL
  dec->dif = ((od_ec_window)1 << (OD_EC_WINDOW_SIZE - 1)) - 1;
-#else
-  dec->dif = 0;
-#endif
  dec->rng = 0x8000;
  dec->cnt = -15;
  dec->error = 0;
@ -149,8 +141,7 @@ void od_ec_dec_init(od_ec_dec *dec, const unsigned char *buf,
 }

 /*Decode a single binary value.
-  {EC_SMALLMUL} f: The probability that the bit is one, scaled by 32768.
-  {else} f: The probability that the bit is zero, scaled by 32768.
+  f: The probability that the bit is one, scaled by 32768.
  Return: The value decoded (0 or 1).*/
 int od_ec_decode_bool_q15(od_ec_dec *dec, unsigned f) {
  od_ec_window dif;
@ -165,7 +156,6 @@ int od_ec_decode_bool_q15(od_ec_dec *dec, unsigned f) {
  r = dec->rng;
  OD_ASSERT(dif >> (OD_EC_WINDOW_SIZE - 16) < r);
  OD_ASSERT(32768U <= r);
-#if CONFIG_EC_SMALLMUL
  v = (r >> 8) * (uint32_t)f >> 7;
  vw = (od_ec_window)v << (OD_EC_WINDOW_SIZE - 16);
  ret = 1;
@ -175,30 +165,19 @@ int od_ec_decode_bool_q15(od_ec_dec *dec, unsigned f) {
    dif -= vw;
    ret = 0;
  }
-#else
-  v = f * (uint32_t)r >> 15;
-  vw = (od_ec_window)v << (OD_EC_WINDOW_SIZE - 16);
-  ret = 0;
-  r_new = v;
-  if (dif >= vw) {
-    r_new = r - v;
-    dif -= vw;
-    ret = 1;
-  }
-#endif
  return od_ec_dec_normalize(dec, dif, r_new, ret);
 }

-/*Decodes a symbol given a cumulative distribution function (CDF) table in Q15.
-  cdf: The CDF, such that symbol s falls in the range
-        [s > 0 ? cdf[s - 1] : 0, cdf[s]).
-       The values must be monotonically non-increasing, and cdf[nsyms - 1]
-        must be 32768.
-       {EC_SMALLMUL}: The CDF contains 32768 minus those values.
+/*Decodes a symbol given an inverse cumulative distribution function (CDF)
+   table in Q15.
+  icdf: 32768 minus the CDF, such that symbol s falls in the range
+         [s > 0 ? (32768 - icdf[s - 1]) : 0, 32768 - icdf[s]).
+        The values must be monotonically non-increasing, and icdf[nsyms - 1]
+         must be 0.
  nsyms: The number of symbols in the alphabet.
         This should be at most 16.
  Return: The decoded symbol s.*/
-int od_ec_decode_cdf_q15(od_ec_dec *dec, const uint16_t *cdf, int nsyms) {
+int od_ec_decode_cdf_q15(od_ec_dec *dec, const uint16_t *icdf, int nsyms) {
  od_ec_window dif;
  unsigned r;
  unsigned c;
@ -209,33 +188,19 @@ int od_ec_decode_cdf_q15(od_ec_dec *dec, const uint16_t *cdf, int nsyms) {
  dif = dec->dif;
  r = dec->rng;
  OD_ASSERT(dif >> (OD_EC_WINDOW_SIZE - 16) < r);
-  OD_ASSERT(cdf[nsyms - 1] == OD_ICDF(32768U));
+  OD_ASSERT(icdf[nsyms - 1] == OD_ICDF(32768U));
  OD_ASSERT(32768U <= r);
-#if CONFIG_EC_SMALLMUL
  c = (unsigned)(dif >> (OD_EC_WINDOW_SIZE - 16));
  v = r;
  ret = -1;
  do {
    u = v;
-    v = (r >> 8) * (uint32_t)cdf[++ret] >> 7;
+    v = (r >> 8) * (uint32_t)icdf[++ret] >> 7;
  } while (c < v);
  OD_ASSERT(v < u);
  OD_ASSERT(u <= r);
  r = u - v;
  dif -= (od_ec_window)v << (OD_EC_WINDOW_SIZE - 16);
-#else
-  c = (unsigned)(dif >> (OD_EC_WINDOW_SIZE - 16));
-  v = 0;
-  ret = -1;
-  do {
-    u = v;
-    v = cdf[++ret] * (uint32_t)r >> 15;
-  } while (v <= c);
-  OD_ASSERT(u < v);
-  OD_ASSERT(v <= r);
-  r = v - u;
-  dif -= (od_ec_window)u << (OD_EC_WINDOW_SIZE - 16);
-#endif
  return od_ec_dec_normalize(dec, dif, r, ret);
 }

--- a/third_party/aom/aom_dsp/entdec.h
+++ b/third_party/aom/aom_dsp/entdec.h
@ -47,10 +47,8 @@ struct od_ec_dec {
  const unsigned char *end;
  /*The read pointer for the entropy-coded bits.*/
  const unsigned char *bptr;
-  /*The difference between the coded value and the low end of the current
-     range.
-    {EC_SMALLMUL} The difference between the high end of the current range,
-     (low + rng), and the coded value, minus 1.
+  /*The difference between the high end of the current range, (low + rng), and
+     the coded value, minus 1.
    This stores up to OD_EC_WINDOW_SIZE bits of that difference, but the
     decoder only uses the top 16 bits of the window to decode the next symbol.
    As we shift up during renormalization, if we don't have enough bits left in
--- a/third_party/aom/aom_dsp/entenc.c
+++ b/third_party/aom/aom_dsp/entenc.c
@ -143,11 +143,10 @@ void od_ec_enc_clear(od_ec_enc *enc) {
 }

 /*Encodes a symbol given its frequency in Q15.
-  fl: The cumulative frequency of all symbols that come before the one to be
-       encoded.
-  fh: The cumulative frequency of all symbols up to and including the one to
-       be encoded.
-  {EC_SMALLMUL} Both values are 32768 minus that.*/
+  fl: 32768 minus the cumulative frequency of all symbols that come before the
+       one to be encoded.
+  fh: 32768 minus the cumulative frequency of all symbols up to and including
+       the one to be encoded.*/
 static void od_ec_encode_q15(od_ec_enc *enc, unsigned fl, unsigned fh) {
  od_ec_window l;
  unsigned r;
@ -156,7 +155,6 @@ static void od_ec_encode_q15(od_ec_enc *enc, unsigned fl, unsigned fh) {
  l = enc->low;
  r = enc->rng;
  OD_ASSERT(32768U <= r);
-#if CONFIG_EC_SMALLMUL
  OD_ASSERT(fh < fl);
  OD_ASSERT(fl <= 32768U);
  if (fl < 32768U) {
@ -167,14 +165,6 @@ static void od_ec_encode_q15(od_ec_enc *enc, unsigned fl, unsigned fh) {
  } else {
    r -= (r >> 8) * (uint32_t)fh >> 7;
  }
-#else
-  OD_ASSERT(fl < fh);
-  OD_ASSERT(fh <= 32768U);
-  u = fl * (uint32_t)r >> 15;
-  v = fh * (uint32_t)r >> 15;
-  r = v - u;
-  l += u;
-#endif
  od_ec_enc_normalize(enc, l, r);
 #if OD_MEASURE_EC_OVERHEAD
  enc->entropy -= OD_LOG2((double)(OD_ICDF(fh) - OD_ICDF(fl)) / 32768.);
@ -184,8 +174,7 @@ static void od_ec_encode_q15(od_ec_enc *enc, unsigned fl, unsigned fh) {

 /*Encode a single binary value.
  val: The value to encode (0 or 1).
-  {EC_SMALLMUL} f: The probability that the val is one, scaled by 32768.
-  {else} f: The probability that val is zero, scaled by 32768.*/
+  f: The probability that the val is one, scaled by 32768.*/
 void od_ec_encode_bool_q15(od_ec_enc *enc, int val, unsigned f) {
  od_ec_window l;
  unsigned r;
@ -195,15 +184,9 @@ void od_ec_encode_bool_q15(od_ec_enc *enc, int val, unsigned f) {
  l = enc->low;
  r = enc->rng;
  OD_ASSERT(32768U <= r);
-#if CONFIG_EC_SMALLMUL
  v = (r >> 8) * (uint32_t)f >> 7;
  if (val) l += r - v;
  r = val ? v : r - v;
-#else
-  v = f * (uint32_t)r >> 15;
-  if (val) l += v;
-  r = val ? r - v : v;
-#endif
  od_ec_enc_normalize(enc, l, r);
 #if OD_MEASURE_EC_OVERHEAD
  enc->entropy -=
@ -214,19 +197,19 @@ void od_ec_encode_bool_q15(od_ec_enc *enc, int val, unsigned f) {

 /*Encodes a symbol given a cumulative distribution function (CDF) table in Q15.
  s: The index of the symbol to encode.
-  cdf: The CDF, such that symbol s falls in the range
-        [s > 0 ? cdf[s - 1] : 0, cdf[s]).
-       The values must be monotonically non-decreasing, and the last value
-        must be exactly 32768.
+  icdf: 32768 minus the CDF, such that symbol s falls in the range
+         [s > 0 ? (32768 - icdf[s - 1]) : 0, 32768 - icdf[s]).
+        The values must be monotonically decreasing, and icdf[nsyms - 1] must
+         be 0.
  nsyms: The number of symbols in the alphabet.
         This should be at most 16.*/
-void od_ec_encode_cdf_q15(od_ec_enc *enc, int s, const uint16_t *cdf,
+void od_ec_encode_cdf_q15(od_ec_enc *enc, int s, const uint16_t *icdf,
                          int nsyms) {
  (void)nsyms;
  OD_ASSERT(s >= 0);
  OD_ASSERT(s < nsyms);
-  OD_ASSERT(cdf[nsyms - 1] == OD_ICDF(32768U));
-  od_ec_encode_q15(enc, s > 0 ? cdf[s - 1] : OD_ICDF(0), cdf[s]);
+  OD_ASSERT(icdf[nsyms - 1] == OD_ICDF(32768U));
+  od_ec_encode_q15(enc, s > 0 ? icdf[s - 1] : OD_ICDF(0), icdf[s]);
 }

 #if CONFIG_RAWBITS
--- a/third_party/aom/aom_dsp/intrapred.c
+++ b/third_party/aom/aom_dsp/intrapred.c
@ -16,6 +16,7 @@
 #include "./aom_dsp_rtcd.h"

 #include "aom_dsp/aom_dsp_common.h"
+#include "aom_dsp/intrapred_common.h"
 #include "aom_mem/aom_mem.h"
 #include "aom_ports/bitops.h"

@ -179,7 +180,6 @@ static INLINE void h_predictor(uint8_t *dst, ptrdiff_t stride, int bw, int bh,
  }
 }

-#if CONFIG_ALT_INTRA
 static INLINE int abs_diff(int a, int b) { return (a > b) ? a - b : b - a; }

 static INLINE uint16_t paeth_predictor_single(uint16_t left, uint16_t top,
@ -208,40 +208,6 @@ static INLINE void paeth_predictor(uint8_t *dst, ptrdiff_t stride, int bw,
  }
 }

-// Weights are quadratic from '1' to '1 / block_size', scaled by
-// 2^sm_weight_log2_scale.
-static const int sm_weight_log2_scale = 8;
-
-#if CONFIG_TX64X64
-// max(block_size_wide[BLOCK_LARGEST], block_size_high[BLOCK_LARGEST])
-#define MAX_BLOCK_DIM 64
-#else
-#define MAX_BLOCK_DIM 32
-#endif  // CONFIG_TX64X64
-
-static const uint8_t sm_weight_arrays[2 * MAX_BLOCK_DIM] = {
-  // Unused, because we always offset by bs, which is at least 2.
-  0, 0,
-  // bs = 2
-  255, 128,
-  // bs = 4
-  255, 149, 85, 64,
-  // bs = 8
-  255, 197, 146, 105, 73, 50, 37, 32,
-  // bs = 16
-  255, 225, 196, 170, 145, 123, 102, 84, 68, 54, 43, 33, 26, 20, 17, 16,
-  // bs = 32
-  255, 240, 225, 210, 196, 182, 169, 157, 145, 133, 122, 111, 101, 92, 83, 74,
-  66, 59, 52, 45, 39, 34, 29, 25, 21, 17, 14, 12, 10, 9, 8, 8,
-#if CONFIG_TX64X64
-  // bs = 64
-  255, 248, 240, 233, 225, 218, 210, 203, 196, 189, 182, 176, 169, 163, 156,
-  150, 144, 138, 133, 127, 121, 116, 111, 106, 101, 96, 91, 86, 82, 77, 73, 69,
-  65, 61, 57, 54, 50, 47, 44, 41, 38, 35, 32, 29, 27, 25, 22, 20, 18, 16, 15,
-  13, 12, 10, 9, 8, 7, 6, 6, 5, 5, 4, 4, 4,
-#endif  // CONFIG_TX64X64
-};
-
 // Some basic checks on weights for smooth predictor.
 #define sm_weights_sanity_checks(weights_w, weights_h, weights_scale, \
                                 pred_scale)                          \
@ -344,21 +310,6 @@ static INLINE void smooth_h_predictor(uint8_t *dst, ptrdiff_t stride, int bw,
 }
 #endif  // CONFIG_SMOOTH_HV

-#else
-
-static INLINE void tm_predictor(uint8_t *dst, ptrdiff_t stride, int bw, int bh,
-                                const uint8_t *above, const uint8_t *left) {
-  int r, c;
-  int ytop_left = above[-1];
-
-  for (r = 0; r < bh; r++) {
-    for (c = 0; c < bw; c++)
-      dst[c] = clip_pixel(left[r] + above[c] - ytop_left);
-    dst += stride;
-  }
-}
-#endif  // CONFIG_ALT_INTRA
-
 static INLINE void dc_128_predictor(uint8_t *dst, ptrdiff_t stride, int bw,
                                    int bh, const uint8_t *above,
                                    const uint8_t *left) {
@ -794,7 +745,6 @@ void aom_highbd_d153_predictor_2x2_c(uint16_t *dst, ptrdiff_t stride,
  DST(1, 1) = AVG3(J, I, X);
 }

-#if CONFIG_ALT_INTRA
 static INLINE void highbd_paeth_predictor(uint16_t *dst, ptrdiff_t stride,
                                          int bw, int bh, const uint16_t *above,
                                          const uint16_t *left, int bd) {
@ -901,23 +851,7 @@ static INLINE void highbd_smooth_h_predictor(uint16_t *dst, ptrdiff_t stride,
    dst += stride;
  }
 }
-#endif
-
-#else
-static INLINE void highbd_tm_predictor(uint16_t *dst, ptrdiff_t stride, int bw,
-                                       int bh, const uint16_t *above,
-                                       const uint16_t *left, int bd) {
-  int r, c;
-  int ytop_left = above[-1];
-  (void)bd;
-
-  for (r = 0; r < bh; r++) {
-    for (c = 0; c < bw; c++)
-      dst[c] = clip_pixel_highbd(left[r] + above[c] - ytop_left, bd);
-    dst += stride;
-  }
-}
-#endif  // CONFIG_ALT_INTRA
+#endif  // CONFIG_SMOOTH_HV

 static INLINE void highbd_dc_128_predictor(uint16_t *dst, ptrdiff_t stride,
                                           int bw, int bh,
@ -1017,12 +951,16 @@ static INLINE void highbd_dc_predictor(uint16_t *dst, ptrdiff_t stride, int bw,
  intra_pred_sized(type, 16, 8) \
  intra_pred_sized(type, 16, 32) \
  intra_pred_sized(type, 32, 16) \
+  intra_pred_sized(type, 32, 64) \
+  intra_pred_sized(type, 64, 32) \
  intra_pred_highbd_sized(type, 4, 8) \
  intra_pred_highbd_sized(type, 8, 4) \
  intra_pred_highbd_sized(type, 8, 16) \
  intra_pred_highbd_sized(type, 16, 8) \
  intra_pred_highbd_sized(type, 16, 32) \
-  intra_pred_highbd_sized(type, 32, 16)
+  intra_pred_highbd_sized(type, 32, 16) \
+  intra_pred_highbd_sized(type, 32, 64) \
+  intra_pred_highbd_sized(type, 64, 32)
 #define intra_pred_above_4x4(type) \
  intra_pred_sized(type, 8, 8) \
  intra_pred_sized(type, 16, 16) \
@ -1078,7 +1016,9 @@ static INLINE void highbd_dc_predictor(uint16_t *dst, ptrdiff_t stride, int bw,
  intra_pred_sized(type, 8, 16) \
  intra_pred_sized(type, 16, 8) \
  intra_pred_sized(type, 16, 32) \
-  intra_pred_sized(type, 32, 16)
+  intra_pred_sized(type, 32, 16) \
+  intra_pred_sized(type, 32, 64) \
+  intra_pred_sized(type, 64, 32)
 #define intra_pred_above_4x4(type) \
  intra_pred_sized(type, 8, 8) \
  intra_pred_sized(type, 16, 16) \
@ -1118,16 +1058,12 @@ intra_pred_above_4x4(d135)
 intra_pred_above_4x4(d153)
 intra_pred_allsizes(v)
 intra_pred_allsizes(h)
-#if CONFIG_ALT_INTRA
 intra_pred_allsizes(smooth)
 #if CONFIG_SMOOTH_HV
 intra_pred_allsizes(smooth_v)
 intra_pred_allsizes(smooth_h)
 #endif  // CONFIG_SMOOTH_HV
 intra_pred_allsizes(paeth)
-#else
-intra_pred_allsizes(tm)
-#endif  // CONFIG_ALT_INTRA
 intra_pred_allsizes(dc_128)
 intra_pred_allsizes(dc_left)
 intra_pred_allsizes(dc_top)
--- a/third_party/aom/aom_dsp/intrapred_common.h
+++ b/third_party/aom/aom_dsp/intrapred_common.h
@ -0,0 +1,51 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+// Shared constants for the smooth intra predictors.
+//
+// The guard name deliberately does not start with an underscore followed by
+// an uppercase letter: identifiers of that form are reserved for the
+// implementation.
+#ifndef AOM_DSP_INTRAPRED_COMMON_H_
+#define AOM_DSP_INTRAPRED_COMMON_H_
+
+#include <stdint.h>  // uint8_t, so the header does not rely on its includer.
+
+#include "./aom_config.h"
+
+// Weights are quadratic from '1' to '1 / block_size', scaled by
+// 2^sm_weight_log2_scale.
+static const int sm_weight_log2_scale = 8;
+
+#if CONFIG_TX64X64
+// max(block_size_wide[BLOCK_LARGEST], block_size_high[BLOCK_LARGEST])
+#define MAX_BLOCK_DIM 64
+#else
+#define MAX_BLOCK_DIM 32
+#endif  // CONFIG_TX64X64
+
+// Weight tables for every supported block size, stored back to back. The
+// table for block size bs occupies indices [bs, 2 * bs), so a caller reaches
+// it by offsetting the array by bs.
+static const uint8_t sm_weight_arrays[2 * MAX_BLOCK_DIM] = {
+  // Unused, because we always offset by bs, which is at least 2.
+  0, 0,
+  // bs = 2
+  255, 128,
+  // bs = 4
+  255, 149, 85, 64,
+  // bs = 8
+  255, 197, 146, 105, 73, 50, 37, 32,
+  // bs = 16
+  255, 225, 196, 170, 145, 123, 102, 84, 68, 54, 43, 33, 26, 20, 17, 16,
+  // bs = 32
+  255, 240, 225, 210, 196, 182, 169, 157, 145, 133, 122, 111, 101, 92, 83, 74,
+  66, 59, 52, 45, 39, 34, 29, 25, 21, 17, 14, 12, 10, 9, 8, 8,
+#if CONFIG_TX64X64
+  // bs = 64
+  255, 248, 240, 233, 225, 218, 210, 203, 196, 189, 182, 176, 169, 163, 156,
+  150, 144, 138, 133, 127, 121, 116, 111, 106, 101, 96, 91, 86, 82, 77, 73, 69,
+  65, 61, 57, 54, 50, 47, 44, 41, 38, 35, 32, 29, 27, 25, 22, 20, 18, 16, 15,
+  13, 12, 10, 9, 8, 7, 6, 6, 5, 5, 4, 4, 4,
+#endif  // CONFIG_TX64X64
+};
+
+#endif  // AOM_DSP_INTRAPRED_COMMON_H_
--- a/third_party/aom/aom_dsp/inv_txfm.c
+++ b/third_party/aom/aom_dsp/inv_txfm.c
@ -14,7 +14,8 @@

 #include "./aom_dsp_rtcd.h"
 #include "aom_dsp/inv_txfm.h"
-#if CONFIG_DAALA_DCT4 || CONFIG_DAALA_DCT8
+#if CONFIG_DAALA_DCT4 || CONFIG_DAALA_DCT8 || CONFIG_DAALA_DCT16 || \
+    CONFIG_DAALA_DCT32 || CONFIG_DAALA_DCT64
 #include "av1/common/daala_tx.h"
 #endif

@ -96,18 +97,6 @@ void aom_iwht4x4_1_add_c(const tran_low_t *in, uint8_t *dest, int dest_stride) {
  }
 }

-#if CONFIG_DAALA_DCT4
-void aom_idct4_c(const tran_low_t *input, tran_low_t *output) {
-  int i;
-  od_coeff x[4];
-  od_coeff y[4];
-  for (i = 0; i < 4; i++) y[i] = input[i];
-  od_bin_idct4(x, 1, y);
-  for (i = 0; i < 4; i++) output[i] = (tran_low_t)x[i];
-}
-
-#else
-
 void aom_idct4_c(const tran_low_t *input, tran_low_t *output) {
  tran_low_t step[4];
  tran_high_t temp1, temp2;
@ -127,7 +116,6 @@ void aom_idct4_c(const tran_low_t *input, tran_low_t *output) {
  output[2] = WRAPLOW(step[1] - step[2]);
  output[3] = WRAPLOW(step[0] - step[3]);
 }
-#endif

 void aom_idct4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
  tran_low_t out[4 * 4];
@ -172,18 +160,6 @@ void aom_idct4x4_1_add_c(const tran_low_t *input, uint8_t *dest,
  }
 }

-#if CONFIG_DAALA_DCT8
-void aom_idct8_c(const tran_low_t *input, tran_low_t *output) {
-  int i;
-  od_coeff x[8];
-  od_coeff y[8];
-  for (i = 0; i < 8; i++) y[i] = (od_coeff)input[i];
-  od_bin_idct8(x, 1, y);
-  for (i = 0; i < 8; i++) output[i] = (tran_low_t)x[i];
-}
-
-#else
-
 void aom_idct8_c(const tran_low_t *input, tran_low_t *output) {
  tran_low_t step1[8], step2[8];
  tran_high_t temp1, temp2;
@ -237,7 +213,6 @@ void aom_idct8_c(const tran_low_t *input, tran_low_t *output) {
  output[6] = WRAPLOW(step1[1] - step1[6]);
  output[7] = WRAPLOW(step1[0] - step1[7]);
 }
-#endif

 void aom_idct8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
  tran_low_t out[8 * 8];
@ -313,18 +288,6 @@ void aom_iadst4_c(const tran_low_t *input, tran_low_t *output) {
  output[3] = WRAPLOW(dct_const_round_shift(s0 + s1 - s3));
 }

-#if CONFIG_DAALA_DCT8
-void aom_iadst8_c(const tran_low_t *input, tran_low_t *output) {
-  int i;
-  od_coeff x[8];
-  od_coeff y[8];
-  for (i = 0; i < 8; i++) y[i] = (od_coeff)input[i];
-  od_bin_idst8(x, 1, y);
-  for (i = 0; i < 8; i++) output[i] = (tran_low_t)x[i];
-}
-
-#else
-
 void aom_iadst8_c(const tran_low_t *input, tran_low_t *output) {
  int s0, s1, s2, s3, s4, s5, s6, s7;

@ -402,8 +365,6 @@ void aom_iadst8_c(const tran_low_t *input, tran_low_t *output) {
  output[7] = WRAPLOW(-x1);
 }

-#endif
-
 void aom_idct8x8_12_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
  tran_low_t out[8 * 8] = { 0 };
  tran_low_t *outptr = out;
@ -1224,7 +1185,7 @@ void aom_idct32_c(const tran_low_t *input, tran_low_t *output) {

 #if CONFIG_MRC_TX
 void aom_imrc32x32_1024_add_c(const tran_low_t *input, uint8_t *dest,
-                              int stride, int *mask) {
+                              int stride, uint8_t *mask) {
  tran_low_t out[32 * 32];
  tran_low_t *outptr = out;
  int i, j;
@ -1265,7 +1226,7 @@ void aom_imrc32x32_1024_add_c(const tran_low_t *input, uint8_t *dest,
 }

 void aom_imrc32x32_135_add_c(const tran_low_t *input, uint8_t *dest, int stride,
-                             int *mask) {
+                             uint8_t *mask) {
  tran_low_t out[32 * 32] = { 0 };
  tran_low_t *outptr = out;
  int i, j;
@ -1295,7 +1256,7 @@ void aom_imrc32x32_135_add_c(const tran_low_t *input, uint8_t *dest, int stride,
 }

 void aom_imrc32x32_34_add_c(const tran_low_t *input, uint8_t *dest, int stride,
-                            int *mask) {
+                            uint8_t *mask) {
  tran_low_t out[32 * 32] = { 0 };
  tran_low_t *outptr = out;
  int i, j;
--- a/third_party/aom/aom_dsp/inv_txfm.h
+++ b/third_party/aom/aom_dsp/inv_txfm.h
@ -55,19 +55,22 @@ static INLINE tran_high_t check_range(tran_high_t input, int bd) {
 #if CONFIG_MRC_TX
 // These each perform dct but add coefficients based on a mask
 void aom_imrc32x32_1024_add_c(const tran_low_t *input, uint8_t *dest,
-                              int stride, int *mask);
+                              int stride, uint8_t *mask);

 void aom_imrc32x32_135_add_c(const tran_low_t *input, uint8_t *dest, int stride,
-                             int *mask);
+                             uint8_t *mask);

 void aom_imrc32x32_34_add_c(const tran_low_t *input, uint8_t *dest, int stride,
-                            int *mask);
+                            uint8_t *mask);
 #endif  // CONFIG_MRC_TX

 void aom_idct4_c(const tran_low_t *input, tran_low_t *output);
 void aom_idct8_c(const tran_low_t *input, tran_low_t *output);
 void aom_idct16_c(const tran_low_t *input, tran_low_t *output);
 void aom_idct32_c(const tran_low_t *input, tran_low_t *output);
+#if CONFIG_TX64X64 && CONFIG_DAALA_DCT64
+void aom_idct64_c(const tran_low_t *input, tran_low_t *output);
+#endif
 void aom_iadst4_c(const tran_low_t *input, tran_low_t *output);
 void aom_iadst8_c(const tran_low_t *input, tran_low_t *output);
 void aom_iadst16_c(const tran_low_t *input, tran_low_t *output);
--- a/third_party/aom/aom_dsp/loopfilter.c
+++ b/third_party/aom/aom_dsp/loopfilter.c
@ -23,6 +23,14 @@ static INLINE int8_t signed_char_clamp(int t) {
 #define PARALLEL_DEBLOCKING_11_TAP 0
 #define PARALLEL_DEBLOCKING_9_TAP 0

+// CONFIG_DEBLOCK_13TAP turns on two filter variants together: the 13-tap
+// filter (code guarded by PARALLEL_DEBLOCKING_13_TAP) and the 5-tap chroma
+// filter (code guarded by PARALLEL_DEBLOCKING_5_TAP_CHROMA). With the config
+// flag off both switches are 0 and that code is compiled out.
+#if CONFIG_DEBLOCK_13TAP
+#define PARALLEL_DEBLOCKING_13_TAP 1
+#define PARALLEL_DEBLOCKING_5_TAP_CHROMA 1
+#else
+#define PARALLEL_DEBLOCKING_13_TAP 0
+#define PARALLEL_DEBLOCKING_5_TAP_CHROMA 0
+#endif
+
 #if CONFIG_HIGHBITDEPTH
 static INLINE int16_t signed_char_clamp_high(int t, int bd) {
  switch (bd) {
@ -58,6 +66,19 @@ static INLINE int8_t filter_mask(uint8_t limit, uint8_t blimit, uint8_t p3,
  return ~mask;
 }

+#if PARALLEL_DEBLOCKING_5_TAP_CHROMA
+// Flatness test over three samples on each side of an edge.
+// Each comparison ORs all-ones into `mask` when the absolute difference
+// exceeds `thresh`, so the inverted result is -1 (all bits set) only when p1
+// and p2 are both within `thresh` of p0 and q1 and q2 are both within
+// `thresh` of q0. If any of the four differences exceeds `thresh` the result
+// is 0.
+static INLINE int8_t flat_mask3_chroma(uint8_t thresh, uint8_t p2, uint8_t p1,
+                                       uint8_t p0, uint8_t q0, uint8_t q1,
+                                       uint8_t q2) {
+  int8_t mask = 0;
+  mask |= (abs(p1 - p0) > thresh) * -1;
+  mask |= (abs(q1 - q0) > thresh) * -1;
+  mask |= (abs(p2 - p0) > thresh) * -1;
+  mask |= (abs(q2 - q0) > thresh) * -1;
+  return ~mask;
+}
+#endif
+
 static INLINE int8_t flat_mask4(uint8_t thresh, uint8_t p3, uint8_t p2,
                                uint8_t p1, uint8_t p0, uint8_t q0, uint8_t q1,
                                uint8_t q2, uint8_t q3) {
@ -216,6 +237,25 @@ void aom_lpf_vertical_4_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0,
  aom_lpf_vertical_4_c(s + 8 * pitch, pitch, blimit1, limit1, thresh1);
 }

+#if PARALLEL_DEBLOCKING_5_TAP_CHROMA
+// Filters the six samples op2..oq2 that straddle an edge, in place.
+// When both `flat` and `mask` are nonzero, op1, op0, oq0 and oq1 are
+// overwritten with weighted sums of the original samples; op2 and oq2 are
+// only read. In every other case the work is delegated to filter4 on the
+// inner four samples.
+static INLINE void filter6(int8_t mask, uint8_t thresh, int8_t flat,
+                           uint8_t *op2, uint8_t *op1, uint8_t *op0,
+                           uint8_t *oq0, uint8_t *oq1, uint8_t *oq2) {
+  if (flat && mask) {
+    // Snapshot all inputs first so that each output below is computed from
+    // the unfiltered values rather than from an already-updated neighbour.
+    const uint8_t p2 = *op2, p1 = *op1, p0 = *op0;
+    const uint8_t q0 = *oq0, q1 = *oq1, q2 = *oq2;
+
+    // 5-tap filter [1, 2, 2, 2, 1]
+    // The weights in each sum add up to 8, matching the shift of 3 passed to
+    // ROUND_POWER_OF_TWO. For op1 and oq1 the outermost sample (p2 / q2) gets
+    // weight 3.
+    *op1 = ROUND_POWER_OF_TWO(p2 * 3 + p1 * 2 + p0 * 2 + q0, 3);
+    *op0 = ROUND_POWER_OF_TWO(p2 + p1 * 2 + p0 * 2 + q0 * 2 + q1, 3);
+    *oq0 = ROUND_POWER_OF_TWO(p1 + p0 * 2 + q0 * 2 + q1 * 2 + q2, 3);
+    *oq1 = ROUND_POWER_OF_TWO(p0 + q0 * 2 + q1 * 2 + q2 * 3, 3);
+  } else {
+    filter4(mask, thresh, op1, op0, oq0, oq1);
+  }
+}
+#endif
+
 static INLINE void filter8(int8_t mask, uint8_t thresh, int8_t flat,
                           uint8_t *op3, uint8_t *op2, uint8_t *op1,
                           uint8_t *op0, uint8_t *oq0, uint8_t *oq1,
@ -236,6 +276,32 @@ static INLINE void filter8(int8_t mask, uint8_t thresh, int8_t flat,
  }
 }

+#if PARALLEL_DEBLOCKING_5_TAP_CHROMA
+// Applies filter6 at `count` consecutive positions starting at `s`.
+//   s:      pointer to the first q0 sample; advanced by one element per
+//           iteration.
+//   p:      element distance between successive samples across the edge
+//           (samples are read at s[-4 * p] .. s[3 * p]).
+//   blimit, limit, thresh: pointers whose first element is passed to
+//           filter_mask / filter6.
+// `count` is 4 when both CONFIG_PARALLEL_DEBLOCKING and CONFIG_CB4X4 are set,
+// and 8 otherwise.
+void aom_lpf_horizontal_6_c(uint8_t *s, int p, const uint8_t *blimit,
+                            const uint8_t *limit, const uint8_t *thresh) {
+  int i;
+#if CONFIG_PARALLEL_DEBLOCKING && CONFIG_CB4X4
+  int count = 4;
+#else
+  int count = 8;
+#endif
+
+  // loop filter designed to work using chars so that we can make maximum use
+  // of 8 bit simd instructions.
+  for (i = 0; i < count; ++i) {
+    const uint8_t p3 = s[-4 * p], p2 = s[-3 * p], p1 = s[-2 * p], p0 = s[-p];
+    const uint8_t q0 = s[0 * p], q1 = s[1 * p], q2 = s[2 * p], q3 = s[3 * p];
+
+    // The edge mask still looks at four samples per side (p3..q3); only the
+    // flatness test and the filter itself are limited to three per side.
+    const int8_t mask =
+        filter_mask(*limit, *blimit, p3, p2, p1, p0, q0, q1, q2, q3);
+    const int8_t flat = flat_mask3_chroma(1, p2, p1, p0, q0, q1, q2);
+    filter6(mask, *thresh, flat, s - 3 * p, s - 2 * p, s - 1 * p, s, s + 1 * p,
+            s + 2 * p);
+    ++s;
+  }
+}
+#endif
+
 void aom_lpf_horizontal_8_c(uint8_t *s, int p, const uint8_t *blimit,
                            const uint8_t *limit, const uint8_t *thresh) {
  int i;
@ -268,6 +334,28 @@ void aom_lpf_horizontal_8_dual_c(uint8_t *s, int p, const uint8_t *blimit0,
  aom_lpf_horizontal_8_c(s + 8, p, blimit1, limit1, thresh1);
 }

+#if PARALLEL_DEBLOCKING_5_TAP_CHROMA
+// Applies filter6 at `count` consecutive positions starting at `s`.
+//   s:      pointer to the first q0 sample; advanced by `pitch` elements per
+//           iteration.
+//   pitch:  element distance between successive filtered positions.
+//   blimit, limit, thresh: pointers whose first element is passed to
+//           filter_mask / filter6.
+// Samples across the edge are adjacent in memory (s[-4] .. s[3]).
+// `count` is 4 when both CONFIG_PARALLEL_DEBLOCKING and CONFIG_CB4X4 are set,
+// and 8 otherwise.
+void aom_lpf_vertical_6_c(uint8_t *s, int pitch, const uint8_t *blimit,
+                          const uint8_t *limit, const uint8_t *thresh) {
+  int i;
+#if CONFIG_PARALLEL_DEBLOCKING && CONFIG_CB4X4
+  int count = 4;
+#else
+  int count = 8;
+#endif
+
+  for (i = 0; i < count; ++i) {
+    const uint8_t p3 = s[-4], p2 = s[-3], p1 = s[-2], p0 = s[-1];
+    const uint8_t q0 = s[0], q1 = s[1], q2 = s[2], q3 = s[3];
+    // The edge mask uses four samples per side; the flatness test and the
+    // filter use three per side.
+    const int8_t mask =
+        filter_mask(*limit, *blimit, p3, p2, p1, p0, q0, q1, q2, q3);
+    const int8_t flat = flat_mask3_chroma(1, p2, p1, p0, q0, q1, q2);
+    filter6(mask, *thresh, flat, s - 3, s - 2, s - 1, s, s + 1, s + 2);
+    s += pitch;
+  }
+}
+#endif
+
 void aom_lpf_vertical_8_c(uint8_t *s, int pitch, const uint8_t *blimit,
                          const uint8_t *limit, const uint8_t *thresh) {
  int i;
@ -297,6 +385,56 @@ void aom_lpf_vertical_8_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0,
  aom_lpf_vertical_8_c(s + 8 * pitch, pitch, blimit1, limit1, thresh1);
 }

+#if PARALLEL_DEBLOCKING_13_TAP
+// Filters the fourteen samples op6..oq6 that straddle an edge, in place.
+// When `flat2`, `flat` and `mask` are all nonzero, the twelve samples
+// op5..op0 and oq0..oq5 are overwritten with weighted sums of the original
+// samples; op6 and oq6 are only read. In every other case the work is
+// delegated to filter8 on op3..oq3.
+static INLINE void filter14(int8_t mask, uint8_t thresh, int8_t flat,
+                            int8_t flat2, uint8_t *op6, uint8_t *op5,
+                            uint8_t *op4, uint8_t *op3, uint8_t *op2,
+                            uint8_t *op1, uint8_t *op0, uint8_t *oq0,
+                            uint8_t *oq1, uint8_t *oq2, uint8_t *oq3,
+                            uint8_t *oq4, uint8_t *oq5, uint8_t *oq6) {
+  if (flat2 && flat && mask) {
+    // Snapshot all inputs first so that each output below is computed from
+    // the unfiltered values rather than from an already-updated neighbour.
+    const uint8_t p6 = *op6, p5 = *op5, p4 = *op4, p3 = *op3, p2 = *op2,
+                  p1 = *op1, p0 = *op0;
+    const uint8_t q0 = *oq0, q1 = *oq1, q2 = *oq2, q3 = *oq3, q4 = *oq4,
+                  q5 = *oq5, q6 = *oq6;
+
+    // 13-tap filter [1, 1, 1, 1, 1, 2, 2, 2, 1, 1, 1, 1, 1]
+    // The weights in every sum add up to 16, matching the shift of 4 passed
+    // to ROUND_POWER_OF_TWO. Towards either end of the window the outermost
+    // sample (p6 / q6) carries an increasingly large weight.
+    *op5 = ROUND_POWER_OF_TWO(p6 * 7 + p5 * 2 + p4 * 2 + p3 + p2 + p1 + p0 + q0,
+                              4);
+    *op4 = ROUND_POWER_OF_TWO(
+        p6 * 5 + p5 * 2 + p4 * 2 + p3 * 2 + p2 + p1 + p0 + q0 + q1, 4);
+    *op3 = ROUND_POWER_OF_TWO(
+        p6 * 4 + p5 + p4 * 2 + p3 * 2 + p2 * 2 + p1 + p0 + q0 + q1 + q2, 4);
+    *op2 = ROUND_POWER_OF_TWO(
+        p6 * 3 + p5 + p4 + p3 * 2 + p2 * 2 + p1 * 2 + p0 + q0 + q1 + q2 + q3,
+        4);
+    *op1 = ROUND_POWER_OF_TWO(p6 * 2 + p5 + p4 + p3 + p2 * 2 + p1 * 2 + p0 * 2 +
+                                  q0 + q1 + q2 + q3 + q4,
+                              4);
+    *op0 = ROUND_POWER_OF_TWO(p6 + p5 + p4 + p3 + p2 + p1 * 2 + p0 * 2 +
+                                  q0 * 2 + q1 + q2 + q3 + q4 + q5,
+                              4);
+    *oq0 = ROUND_POWER_OF_TWO(p5 + p4 + p3 + p2 + p1 + p0 * 2 + q0 * 2 +
+                                  q1 * 2 + q2 + q3 + q4 + q5 + q6,
+                              4);
+    *oq1 = ROUND_POWER_OF_TWO(p4 + p3 + p2 + p1 + p0 + q0 * 2 + q1 * 2 +
+                                  q2 * 2 + q3 + q4 + q5 + q6 * 2,
+                              4);
+    *oq2 = ROUND_POWER_OF_TWO(
+        p3 + p2 + p1 + p0 + q0 + q1 * 2 + q2 * 2 + q3 * 2 + q4 + q5 + q6 * 3,
+        4);
+    *oq3 = ROUND_POWER_OF_TWO(
+        p2 + p1 + p0 + q0 + q1 + q2 * 2 + q3 * 2 + q4 * 2 + q5 + q6 * 4, 4);
+    *oq4 = ROUND_POWER_OF_TWO(
+        p1 + p0 + q0 + q1 + q2 + q3 * 2 + q4 * 2 + q5 * 2 + q6 * 5, 4);
+    *oq5 = ROUND_POWER_OF_TWO(p0 + q0 + q1 + q2 + q3 + q4 * 2 + q5 * 2 + q6 * 7,
+                              4);
+  } else {
+    filter8(mask, thresh, flat, op3, op2, op1, op0, oq0, oq1, oq2, oq3);
+  }
+}
+#endif
+
 #if PARALLEL_DEBLOCKING_11_TAP
 static INLINE void filter12(int8_t mask, uint8_t thresh, int8_t flat,
                            int8_t flat2, uint8_t *op5, uint8_t *op4,
@ -428,7 +566,16 @@ static void mb_lpf_horizontal_edge_w(uint8_t *s, int p, const uint8_t *blimit,
        filter_mask(*limit, *blimit, p3, p2, p1, p0, q0, q1, q2, q3);
    const int8_t flat = flat_mask4(1, p3, p2, p1, p0, q0, q1, q2, q3);

-#if PARALLEL_DEBLOCKING_11_TAP
+#if PARALLEL_DEBLOCKING_13_TAP
+    (void)p7;
+    (void)q7;
+    const int8_t flat2 = flat_mask4(1, p6, p5, p4, p0, q0, q4, q5, q6);
+
+    filter14(mask, *thresh, flat, flat2, s - 7 * p, s - 6 * p, s - 5 * p,
+             s - 4 * p, s - 3 * p, s - 2 * p, s - 1 * p, s, s + 1 * p,
+             s + 2 * p, s + 3 * p, s + 4 * p, s + 5 * p, s + 6 * p);
+
+#elif PARALLEL_DEBLOCKING_11_TAP
    const int8_t flat2 = flat_mask3(1, p5, p4, p0, q0, q4, q5);

    filter12(mask, *thresh, flat, flat2, s - 6 * p, s - 5 * p, s - 4 * p,
@ -482,7 +629,14 @@ static void mb_lpf_vertical_edge_w(uint8_t *s, int p, const uint8_t *blimit,
        filter_mask(*limit, *blimit, p3, p2, p1, p0, q0, q1, q2, q3);
    const int8_t flat = flat_mask4(1, p3, p2, p1, p0, q0, q1, q2, q3);

-#if PARALLEL_DEBLOCKING_11_TAP
+#if PARALLEL_DEBLOCKING_13_TAP
+    (void)p7;
+    (void)q7;
+    const int8_t flat2 = flat_mask4(1, p6, p5, p4, p0, q0, q4, q5, q6);
+
+    filter14(mask, *thresh, flat, flat2, s - 7, s - 6, s - 5, s - 4, s - 3,
+             s - 2, s - 1, s, s + 1, s + 2, s + 3, s + 4, s + 5, s + 6);
+#elif PARALLEL_DEBLOCKING_11_TAP
    const int8_t flat2 = flat_mask3(1, p5, p4, p0, q0, q4, q5);

    filter12(mask, *thresh, flat, flat2, s - 6, s - 5, s - 4, s - 3, s - 2,
@ -553,6 +707,21 @@ static INLINE int8_t highbd_filter_mask(uint8_t limit, uint8_t blimit,
  return ~mask;
 }

+#if PARALLEL_DEBLOCKING_5_TAP_CHROMA
+// Flatness test over three 16-bit samples on each side of an edge.
+// `thresh` is shifted left by (bd - 8) before the comparisons. The result is
+// -1 (all bits set) only when p1 and p2 are both within the scaled threshold
+// of p0 and q1 and q2 are both within it of q0; if any of the four
+// differences exceeds the scaled threshold the result is 0.
+static INLINE int8_t highbd_flat_mask3_chroma(uint8_t thresh, uint16_t p2,
+                                              uint16_t p1, uint16_t p0,
+                                              uint16_t q0, uint16_t q1,
+                                              uint16_t q2, int bd) {
+  int8_t mask = 0;
+  int16_t thresh16 = (uint16_t)thresh << (bd - 8);
+  mask |= (abs(p1 - p0) > thresh16) * -1;
+  mask |= (abs(q1 - q0) > thresh16) * -1;
+  mask |= (abs(p2 - p0) > thresh16) * -1;
+  mask |= (abs(q2 - q0) > thresh16) * -1;
+  return ~mask;
+}
+#endif
+
 static INLINE int8_t highbd_flat_mask4(uint8_t thresh, uint16_t p3, uint16_t p2,
                                       uint16_t p1, uint16_t p0, uint16_t q0,
                                       uint16_t q1, uint16_t q2, uint16_t q3,
@ -708,6 +877,26 @@ void aom_highbd_lpf_vertical_4_dual_c(
                              bd);
 }

+#if PARALLEL_DEBLOCKING_5_TAP_CHROMA
+// Filters the six 16-bit samples op2..oq2 that straddle an edge, in place.
+// When both `flat` and `mask` are nonzero, op1, op0, oq0 and oq1 are
+// overwritten with weighted sums of the original samples; op2 and oq2 are
+// only read, and `bd` is not used on that path. In every other case the work
+// is delegated to highbd_filter4 on the inner four samples.
+static INLINE void highbd_filter6(int8_t mask, uint8_t thresh, int8_t flat,
+                                  uint16_t *op2, uint16_t *op1, uint16_t *op0,
+                                  uint16_t *oq0, uint16_t *oq1, uint16_t *oq2,
+                                  int bd) {
+  if (flat && mask) {
+    // Snapshot all inputs first so that each output below is computed from
+    // the unfiltered values rather than from an already-updated neighbour.
+    const uint16_t p2 = *op2, p1 = *op1, p0 = *op0;
+    const uint16_t q0 = *oq0, q1 = *oq1, q2 = *oq2;
+
+    // 5-tap filter [1, 2, 2, 2, 1]
+    // The weights in each sum add up to 8, matching the shift of 3 passed to
+    // ROUND_POWER_OF_TWO.
+    *op1 = ROUND_POWER_OF_TWO(p2 * 3 + p1 * 2 + p0 * 2 + q0, 3);
+    *op0 = ROUND_POWER_OF_TWO(p2 + p1 * 2 + p0 * 2 + q0 * 2 + q1, 3);
+    *oq0 = ROUND_POWER_OF_TWO(p1 + p0 * 2 + q0 * 2 + q1 * 2 + q2, 3);
+    *oq1 = ROUND_POWER_OF_TWO(p0 + q0 * 2 + q1 * 2 + q2 * 3, 3);
+  } else {
+    highbd_filter4(mask, thresh, op1, op0, oq0, oq1, bd);
+  }
+}
+#endif
+
 static INLINE void highbd_filter8(int8_t mask, uint8_t thresh, int8_t flat,
                                  uint16_t *op3, uint16_t *op2, uint16_t *op1,
                                  uint16_t *op0, uint16_t *oq0, uint16_t *oq1,
@ -754,6 +943,33 @@ void aom_highbd_lpf_horizontal_8_c(uint16_t *s, int p, const uint8_t *blimit,
  }
 }

+#if PARALLEL_DEBLOCKING_5_TAP_CHROMA
+// Applies highbd_filter6 at `count` consecutive positions starting at `s`.
+//   s:      pointer to the first q0 sample; advanced by one element per
+//           iteration.
+//   p:      element distance between successive samples across the edge
+//           (samples are read at s[-4 * p] .. s[3 * p]).
+//   blimit, limit, thresh: pointers whose first element is passed to
+//           highbd_filter_mask / highbd_filter6.
+//   bd:     forwarded to the mask, flatness and filter helpers.
+// `count` is 4 when both CONFIG_PARALLEL_DEBLOCKING and CONFIG_CB4X4 are set,
+// and 8 otherwise.
+void aom_highbd_lpf_horizontal_6_c(uint16_t *s, int p, const uint8_t *blimit,
+                                   const uint8_t *limit, const uint8_t *thresh,
+                                   int bd) {
+  int i;
+#if CONFIG_PARALLEL_DEBLOCKING && CONFIG_CB4X4
+  int count = 4;
+#else
+  int count = 8;
+#endif
+
+  // Each iteration filters one set of uint16_t samples across the edge. The
+  // limits arrive as 8-bit values and `bd` is handed to the helpers along
+  // with them.
+  for (i = 0; i < count; ++i) {
+    const uint16_t p3 = s[-4 * p], p2 = s[-3 * p], p1 = s[-2 * p], p0 = s[-p];
+    const uint16_t q0 = s[0 * p], q1 = s[1 * p], q2 = s[2 * p], q3 = s[3 * p];
+
+    // The edge mask uses four samples per side; the flatness test and the
+    // filter use three per side.
+    const int8_t mask =
+        highbd_filter_mask(*limit, *blimit, p3, p2, p1, p0, q0, q1, q2, q3, bd);
+    const int8_t flat = highbd_flat_mask3_chroma(1, p2, p1, p0, q0, q1, q2, bd);
+    highbd_filter6(mask, *thresh, flat, s - 3 * p, s - 2 * p, s - 1 * p, s,
+                   s + 1 * p, s + 2 * p, bd);
+    ++s;
+  }
+}
+#endif
+
 void aom_highbd_lpf_horizontal_8_dual_c(
    uint16_t *s, int p, const uint8_t *blimit0, const uint8_t *limit0,
    const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1,
@ -762,6 +978,30 @@ void aom_highbd_lpf_horizontal_8_dual_c(
  aom_highbd_lpf_horizontal_8_c(s + 8, p, blimit1, limit1, thresh1, bd);
 }

+#if PARALLEL_DEBLOCKING_5_TAP_CHROMA
+// Applies highbd_filter6 at `count` consecutive positions starting at `s`.
+//   s:      pointer to the first q0 sample; advanced by `pitch` elements per
+//           iteration.
+//   pitch:  element distance between successive filtered positions.
+//   blimit, limit, thresh: pointers whose first element is passed to
+//           highbd_filter_mask / highbd_filter6.
+//   bd:     forwarded to the mask, flatness and filter helpers.
+// Samples across the edge are adjacent in memory (s[-4] .. s[3]).
+// `count` is 4 when both CONFIG_PARALLEL_DEBLOCKING and CONFIG_CB4X4 are set,
+// and 8 otherwise.
+void aom_highbd_lpf_vertical_6_c(uint16_t *s, int pitch, const uint8_t *blimit,
+                                 const uint8_t *limit, const uint8_t *thresh,
+                                 int bd) {
+  int i;
+#if CONFIG_PARALLEL_DEBLOCKING && CONFIG_CB4X4
+  int count = 4;
+#else
+  int count = 8;
+#endif
+
+  for (i = 0; i < count; ++i) {
+    const uint16_t p3 = s[-4], p2 = s[-3], p1 = s[-2], p0 = s[-1];
+    const uint16_t q0 = s[0], q1 = s[1], q2 = s[2], q3 = s[3];
+    // The edge mask uses four samples per side; the flatness test and the
+    // filter use three per side.
+    const int8_t mask =
+        highbd_filter_mask(*limit, *blimit, p3, p2, p1, p0, q0, q1, q2, q3, bd);
+    const int8_t flat = highbd_flat_mask3_chroma(1, p2, p1, p0, q0, q1, q2, bd);
+    highbd_filter6(mask, *thresh, flat, s - 3, s - 2, s - 1, s, s + 1, s + 2,
+                   bd);
+    s += pitch;
+  }
+}
+#endif
+
 void aom_highbd_lpf_vertical_8_c(uint16_t *s, int pitch, const uint8_t *blimit,
                                 const uint8_t *limit, const uint8_t *thresh,
                                 int bd) {
@ -794,6 +1034,68 @@ void aom_highbd_lpf_vertical_8_dual_c(
                              bd);
 }

+#if PARALLEL_DEBLOCKING_13_TAP
+// Filters the fourteen 16-bit samples op6..oq6 that straddle an edge, in
+// place. When `flat2`, `flat` and `mask` are all nonzero, the twelve samples
+// op5..op0 and oq0..oq5 are overwritten with weighted sums of the original
+// samples; op6 and oq6 are only read, and `bd` is not used on that path. In
+// every other case the work is delegated to highbd_filter8 on op3..oq3.
+static INLINE void highbd_filter14(int8_t mask, uint8_t thresh, int8_t flat,
+                                   int8_t flat2, uint16_t *op6, uint16_t *op5,
+                                   uint16_t *op4, uint16_t *op3, uint16_t *op2,
+                                   uint16_t *op1, uint16_t *op0, uint16_t *oq0,
+                                   uint16_t *oq1, uint16_t *oq2, uint16_t *oq3,
+                                   uint16_t *oq4, uint16_t *oq5, uint16_t *oq6,
+                                   int bd) {
+  if (flat2 && flat && mask) {
+    // Snapshot all inputs first so that each output below is computed from
+    // the unfiltered values rather than from an already-updated neighbour.
+    const uint16_t p6 = *op6;
+    const uint16_t p5 = *op5;
+    const uint16_t p4 = *op4;
+    const uint16_t p3 = *op3;
+    const uint16_t p2 = *op2;
+    const uint16_t p1 = *op1;
+    const uint16_t p0 = *op0;
+    const uint16_t q0 = *oq0;
+    const uint16_t q1 = *oq1;
+    const uint16_t q2 = *oq2;
+    const uint16_t q3 = *oq3;
+    const uint16_t q4 = *oq4;
+    const uint16_t q5 = *oq5;
+    const uint16_t q6 = *oq6;
+
+    // 13-tap filter [1, 1, 1, 1, 1, 2, 2, 2, 1, 1, 1, 1, 1]
+    // The weights in every sum add up to 16, matching the shift of 4 passed
+    // to ROUND_POWER_OF_TWO. Towards either end of the window the outermost
+    // sample (p6 / q6) carries an increasingly large weight.
+    *op5 = ROUND_POWER_OF_TWO(p6 * 7 + p5 * 2 + p4 * 2 + p3 + p2 + p1 + p0 + q0,
+                              4);
+    *op4 = ROUND_POWER_OF_TWO(
+        p6 * 5 + p5 * 2 + p4 * 2 + p3 * 2 + p2 + p1 + p0 + q0 + q1, 4);
+    *op3 = ROUND_POWER_OF_TWO(
+        p6 * 4 + p5 + p4 * 2 + p3 * 2 + p2 * 2 + p1 + p0 + q0 + q1 + q2, 4);
+    *op2 = ROUND_POWER_OF_TWO(
+        p6 * 3 + p5 + p4 + p3 * 2 + p2 * 2 + p1 * 2 + p0 + q0 + q1 + q2 + q3,
+        4);
+    *op1 = ROUND_POWER_OF_TWO(p6 * 2 + p5 + p4 + p3 + p2 * 2 + p1 * 2 + p0 * 2 +
+                                  q0 + q1 + q2 + q3 + q4,
+                              4);
+    *op0 = ROUND_POWER_OF_TWO(p6 + p5 + p4 + p3 + p2 + p1 * 2 + p0 * 2 +
+                                  q0 * 2 + q1 + q2 + q3 + q4 + q5,
+                              4);
+    *oq0 = ROUND_POWER_OF_TWO(p5 + p4 + p3 + p2 + p1 + p0 * 2 + q0 * 2 +
+                                  q1 * 2 + q2 + q3 + q4 + q5 + q6,
+                              4);
+    *oq1 = ROUND_POWER_OF_TWO(p4 + p3 + p2 + p1 + p0 + q0 * 2 + q1 * 2 +
+                                  q2 * 2 + q3 + q4 + q5 + q6 * 2,
+                              4);
+    *oq2 = ROUND_POWER_OF_TWO(
+        p3 + p2 + p1 + p0 + q0 + q1 * 2 + q2 * 2 + q3 * 2 + q4 + q5 + q6 * 3,
+        4);
+    *oq3 = ROUND_POWER_OF_TWO(
+        p2 + p1 + p0 + q0 + q1 + q2 * 2 + q3 * 2 + q4 * 2 + q5 + q6 * 4, 4);
+    *oq4 = ROUND_POWER_OF_TWO(
+        p1 + p0 + q0 + q1 + q2 + q3 * 2 + q4 * 2 + q5 * 2 + q6 * 5, 4);
+    *oq5 = ROUND_POWER_OF_TWO(p0 + q0 + q1 + q2 + q3 + q4 * 2 + q5 * 2 + q6 * 7,
+                              4);
+  } else {
+    highbd_filter8(mask, thresh, flat, op3, op2, op1, op0, oq0, oq1, oq2, oq3,
+                   bd);
+  }
+}
+#endif
+
 static INLINE void highbd_filter16(int8_t mask, uint8_t thresh, int8_t flat,
                                   int8_t flat2, uint16_t *op7, uint16_t *op6,
                                   uint16_t *op5, uint16_t *op4, uint16_t *op3,
@ -887,6 +1189,16 @@ static void highbd_mb_lpf_horizontal_edge_w(uint16_t *s, int p,
        highbd_filter_mask(*limit, *blimit, p3, p2, p1, p0, q0, q1, q2, q3, bd);
    const int8_t flat =
        highbd_flat_mask4(1, p3, p2, p1, p0, q0, q1, q2, q3, bd);
+
+#if PARALLEL_DEBLOCKING_13_TAP
+    const int8_t flat2 =
+        highbd_flat_mask4(1, s[-7 * p], s[-6 * p], s[-5 * p], p0, q0, s[4 * p],
+                          s[5 * p], s[6 * p], bd);
+
+    highbd_filter14(mask, *thresh, flat, flat2, s - 7 * p, s - 6 * p, s - 5 * p,
+                    s - 4 * p, s - 3 * p, s - 2 * p, s - 1 * p, s, s + 1 * p,
+                    s + 2 * p, s + 3 * p, s + 4 * p, s + 5 * p, s + 6 * p, bd);
+#else
    const int8_t flat2 =
        highbd_flat_mask5(1, s[-8 * p], s[-7 * p], s[-6 * p], s[-5 * p], p0, q0,
                          s[4 * p], s[5 * p], s[6 * p], s[7 * p], bd);
@ -895,6 +1207,7 @@ static void highbd_mb_lpf_horizontal_edge_w(uint16_t *s, int p,
                    s - 5 * p, s - 4 * p, s - 3 * p, s - 2 * p, s - 1 * p, s,
                    s + 1 * p, s + 2 * p, s + 3 * p, s + 4 * p, s + 5 * p,
                    s + 6 * p, s + 7 * p, bd);
+#endif
    ++s;
  }
 }
@ -937,12 +1250,21 @@ static void highbd_mb_lpf_vertical_edge_w(uint16_t *s, int p,
        highbd_filter_mask(*limit, *blimit, p3, p2, p1, p0, q0, q1, q2, q3, bd);
    const int8_t flat =
        highbd_flat_mask4(1, p3, p2, p1, p0, q0, q1, q2, q3, bd);
+#if PARALLEL_DEBLOCKING_13_TAP
+    const int8_t flat2 =
+        highbd_flat_mask4(1, s[-7], s[-6], s[-5], p0, q0, s[4], s[5], s[6], bd);
+
+    highbd_filter14(mask, *thresh, flat, flat2, s - 7, s - 6, s - 5, s - 4,
+                    s - 3, s - 2, s - 1, s, s + 1, s + 2, s + 3, s + 4, s + 5,
+                    s + 6, bd);
+#else
    const int8_t flat2 = highbd_flat_mask5(1, s[-8], s[-7], s[-6], s[-5], p0,
                                           q0, s[4], s[5], s[6], s[7], bd);

    highbd_filter16(mask, *thresh, flat, flat2, s - 8, s - 7, s - 6, s - 5,
                    s - 4, s - 3, s - 2, s - 1, s, s + 1, s + 2, s + 3, s + 4,
                    s + 5, s + 6, s + 7, bd);
+#endif
    s += p;
  }
 }
--- a/third_party/aom/aom_dsp/mips/convolve8_avg_dspr2.c
+++ b/third_party/aom/aom_dsp/mips/convolve8_avg_dspr2.c
@ -407,6 +407,11 @@ void aom_convolve_avg_dspr2(const uint8_t *src, ptrdiff_t src_stride,
  uint32_t tp1, tp2, tn1;
  uint32_t tp3, tp4, tn2;

+  (void)filter_x;
+  (void)filter_x_stride;
+  (void)filter_y;
+  (void)filter_y_stride;
+
  /* prefetch data to cache memory */
  prefetch_load(src);
  prefetch_load(src + 32);
--- a/third_party/aom/aom_dsp/mips/convolve8_dspr2.c
+++ b/third_party/aom/aom_dsp/mips/convolve8_dspr2.c
@ -1304,6 +1304,8 @@ void aom_convolve8_dspr2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
  int32_t intermediate_height = ((h * y_step_q4) >> 4) + 7;
  uint32_t pos = 38;

+  (void)x_step_q4;
+
  assert(x_step_q4 == 16);
  assert(y_step_q4 == 16);
  assert(((const int32_t *)filter_x)[1] != 0x800000);
@ -1400,6 +1402,11 @@ void aom_convolve_copy_dspr2(const uint8_t *src, ptrdiff_t src_stride,
                             int w, int h) {
  int x, y;

+  (void)filter_x;
+  (void)filter_x_stride;
+  (void)filter_y;
+  (void)filter_y_stride;
+
  /* prefetch data to cache memory */
  prefetch_load(src);
  prefetch_load(src + 32);
--- a/third_party/aom/aom_dsp/mips/intrapred16_dspr2.c
+++ b/third_party/aom/aom_dsp/mips/intrapred16_dspr2.c
@ -17,6 +17,8 @@ void aom_h_predictor_16x16_dspr2(uint8_t *dst, ptrdiff_t stride,
  int32_t tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8;
  int32_t tmp9, tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, tmp16;

+  (void)above;
+
  __asm__ __volatile__(
      "lb         %[tmp1],      (%[left])                    \n\t"
      "lb         %[tmp2],      1(%[left])                   \n\t"
--- a/third_party/aom/aom_dsp/mips/intrapred4_dspr2.c
+++ b/third_party/aom/aom_dsp/mips/intrapred4_dspr2.c
@ -15,6 +15,7 @@
 void aom_h_predictor_4x4_dspr2(uint8_t *dst, ptrdiff_t stride,
                               const uint8_t *above, const uint8_t *left) {
  int32_t tmp1, tmp2, tmp3, tmp4;
+  (void)above;

  __asm__ __volatile__(
      "lb         %[tmp1],      (%[left])                    \n\t"
@ -78,148 +79,4 @@ void aom_dc_predictor_4x4_dspr2(uint8_t *dst, ptrdiff_t stride,
      : [above] "r"(above), [left] "r"(left), [dst] "r"(dst),
        [stride] "r"(stride));
 }
-
-void aom_tm_predictor_4x4_dspr2(uint8_t *dst, ptrdiff_t stride,
-                                const uint8_t *above, const uint8_t *left) {
-  int32_t abovel, abover;
-  int32_t left0, left1, left2, left3;
-  int32_t res0, res1;
-  int32_t resl;
-  int32_t resr;
-  int32_t top_left;
-  uint8_t *cm = aom_ff_cropTbl;
-
-  __asm__ __volatile__(
-      "ulw             %[resl],       (%[above])                         \n\t"
-
-      "lbu             %[left0],       (%[left])                         \n\t"
-      "lbu             %[left1],       1(%[left])                        \n\t"
-      "lbu             %[left2],       2(%[left])                        \n\t"
-      "lbu             %[left3],       3(%[left])                        \n\t"
-
-      "lbu             %[top_left],    -1(%[above])                      \n\t"
-
-      "preceu.ph.qbl   %[abovel],      %[resl]                           \n\t"
-      "preceu.ph.qbr   %[abover],      %[resl]                           \n\t"
-
-      "replv.ph        %[left0],       %[left0]                          \n\t"
-      "replv.ph        %[left1],       %[left1]                          \n\t"
-      "replv.ph        %[left2],       %[left2]                          \n\t"
-      "replv.ph        %[left3],       %[left3]                          \n\t"
-
-      "replv.ph        %[top_left],    %[top_left]                       \n\t"
-
-      "addu.ph         %[resl],        %[abovel],         %[left0]       \n\t"
-      "subu.ph         %[resl],        %[resl],           %[top_left]    \n\t"
-
-      "addu.ph         %[resr],        %[abover],         %[left0]       \n\t"
-      "subu.ph         %[resr],        %[resr],           %[top_left]    \n\t"
-
-      "sll             %[res0],        %[resr],           16             \n\t"
-      "sra             %[res0],        %[res0],           16             \n\t"
-      "lbux            %[res0],        %[res0](%[cm])                    \n\t"
-
-      "sra             %[res1],        %[resr],           16             \n\t"
-      "lbux            %[res1],        %[res1](%[cm])                    \n\t"
-      "sb              %[res0],        (%[dst])                          \n\t"
-
-      "sll             %[res0],        %[resl],           16             \n\t"
-      "sra             %[res0],        %[res0],           16             \n\t"
-      "lbux            %[res0],        %[res0](%[cm])                    \n\t"
-      "sb              %[res1],        1(%[dst])                         \n\t"
-
-      "sra             %[res1],        %[resl],           16             \n\t"
-      "lbux            %[res1],        %[res1](%[cm])                    \n\t"
-
-      "addu.ph         %[resl],        %[abovel],         %[left1]       \n\t"
-      "subu.ph         %[resl],        %[resl],           %[top_left]    \n\t"
-
-      "addu.ph         %[resr],        %[abover],         %[left1]       \n\t"
-      "subu.ph         %[resr],        %[resr],           %[top_left]    \n\t"
-
-      "sb              %[res0],        2(%[dst])                         \n\t"
-      "sb              %[res1],        3(%[dst])                         \n\t"
-
-      "add             %[dst],          %[dst],           %[stride]      \n\t"
-
-      "sll             %[res0],        %[resr],           16             \n\t"
-      "sra             %[res0],        %[res0],           16             \n\t"
-      "lbux            %[res0],        %[res0](%[cm])                    \n\t"
-
-      "sra             %[res1],        %[resr],           16             \n\t"
-      "lbux            %[res1],        %[res1](%[cm])                    \n\t"
-      "sb              %[res0],        (%[dst])                          \n\t"
-
-      "sll             %[res0],        %[resl],           16             \n\t"
-      "sra             %[res0],        %[res0],           16             \n\t"
-      "lbux            %[res0],        %[res0](%[cm])                    \n\t"
-
-      "sb              %[res1],        1(%[dst])                         \n\t"
-      "sra             %[res1],        %[resl],           16             \n\t"
-      "lbux            %[res1],        %[res1](%[cm])                    \n\t"
-
-      "addu.ph         %[resl],        %[abovel],         %[left2]       \n\t"
-      "subu.ph         %[resl],        %[resl],           %[top_left]    \n\t"
-
-      "addu.ph         %[resr],        %[abover],         %[left2]       \n\t"
-      "subu.ph         %[resr],        %[resr],           %[top_left]    \n\t"
-
-      "sb              %[res0],        2(%[dst])                         \n\t"
-      "sb              %[res1],        3(%[dst])                         \n\t"
-
-      "add             %[dst],          %[dst],           %[stride]      \n\t"
-
-      "sll             %[res0],        %[resr],           16             \n\t"
-      "sra             %[res0],        %[res0],           16             \n\t"
-      "lbux            %[res0],        %[res0](%[cm])                    \n\t"
-
-      "sra             %[res1],        %[resr],           16             \n\t"
-      "lbux            %[res1],        %[res1](%[cm])                    \n\t"
-      "sb              %[res0],        (%[dst])                          \n\t"
-
-      "sll             %[res0],        %[resl],           16             \n\t"
-      "sra             %[res0],        %[res0],           16             \n\t"
-      "lbux            %[res0],        %[res0](%[cm])                    \n\t"
-
-      "sb              %[res1],        1(%[dst])                         \n\t"
-      "sra             %[res1],        %[resl],           16             \n\t"
-      "lbux            %[res1],        %[res1](%[cm])                    \n\t"
-
-      "addu.ph         %[resl],        %[abovel],        %[left3]        \n\t"
-      "subu.ph         %[resl],        %[resl],          %[top_left]     \n\t"
-
-      "addu.ph         %[resr],        %[abover],        %[left3]        \n\t"
-      "subu.ph         %[resr],        %[resr],          %[top_left]     \n\t"
-
-      "sb              %[res0],        2(%[dst])                         \n\t"
-      "sb              %[res1],        3(%[dst])                         \n\t"
-
-      "add             %[dst],          %[dst],          %[stride]       \n\t"
-
-      "sll             %[res0],        %[resr],           16             \n\t"
-      "sra             %[res0],        %[res0],           16             \n\t"
-      "lbux            %[res0],        %[res0](%[cm])                    \n\t"
-
-      "sra             %[res1],        %[resr],           16             \n\t"
-      "lbux            %[res1],        %[res1](%[cm])                    \n\t"
-      "sb              %[res0],        (%[dst])                          \n\t"
-
-      "sll             %[res0],        %[resl],           16             \n\t"
-      "sra             %[res0],        %[res0],           16             \n\t"
-      "lbux            %[res0],        %[res0](%[cm])                    \n\t"
-      "sb              %[res1],        1(%[dst])                         \n\t"
-
-      "sra             %[res1],        %[resl],           16             \n\t"
-      "lbux            %[res1],        %[res1](%[cm])                    \n\t"
-
-      "sb              %[res0],        2(%[dst])                         \n\t"
-      "sb              %[res1],        3(%[dst])                         \n\t"
-
-      : [abovel] "=&r"(abovel), [abover] "=&r"(abover), [left0] "=&r"(left0),
-        [left1] "=&r"(left1), [left2] "=&r"(left2), [res0] "=&r"(res0),
-        [res1] "=&r"(res1), [left3] "=&r"(left3), [resl] "=&r"(resl),
-        [resr] "=&r"(resr), [top_left] "=&r"(top_left)
-      : [above] "r"(above), [left] "r"(left), [dst] "r"(dst),
-        [stride] "r"(stride), [cm] "r"(cm));
-}
 #endif  // #if HAVE_DSPR2
--- a/third_party/aom/aom_dsp/mips/intrapred8_dspr2.c
+++ b/third_party/aom/aom_dsp/mips/intrapred8_dspr2.c
@ -15,6 +15,7 @@
 void aom_h_predictor_8x8_dspr2(uint8_t *dst, ptrdiff_t stride,
                               const uint8_t *above, const uint8_t *left) {
  int32_t tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8;
+  (void)above;

  __asm__ __volatile__(
      "lb         %[tmp1],      (%[left])                   \n\t"
@ -146,458 +147,4 @@ void aom_dc_predictor_8x8_dspr2(uint8_t *dst, ptrdiff_t stride,
      : [above] "r"(above), [left] "r"(left), [dst] "r"(dst),
        [stride] "r"(stride));
 }
-
-void aom_tm_predictor_8x8_dspr2(uint8_t *dst, ptrdiff_t stride,
-                                const uint8_t *above, const uint8_t *left) {
-  int32_t abovel, abover;
-  int32_t abovel_1, abover_1;
-  int32_t left0;
-  int32_t res0, res1, res2, res3;
-  int32_t reshw;
-  int32_t top_left;
-  uint8_t *cm = aom_ff_cropTbl;
-
-  __asm__ __volatile__(
-      "ulw             %[reshw],       (%[above])                         \n\t"
-      "ulw             %[top_left],    4(%[above])                        \n\t"
-
-      "lbu             %[left0],       (%[left])                          \n\t"
-
-      "preceu.ph.qbl   %[abovel],      %[reshw]                           \n\t"
-      "preceu.ph.qbr   %[abover],      %[reshw]                           \n\t"
-      "preceu.ph.qbl   %[abovel_1],    %[top_left]                        \n\t"
-      "preceu.ph.qbr   %[abover_1],    %[top_left]                        \n\t"
-
-      "lbu             %[top_left],    -1(%[above])                       \n\t"
-      "replv.ph        %[left0],       %[left0]                           \n\t"
-
-      "replv.ph        %[top_left],    %[top_left]                        \n\t"
-
-      "addu.ph         %[reshw],       %[abovel],           %[left0]      \n\t"
-      "subu.ph         %[reshw],       %[reshw],            %[top_left]   \n\t"
-
-      "sll             %[res2],        %[reshw],            16            \n\t"
-      "sra             %[res2],        %[res2],             16            \n\t"
-      "sra             %[res3],        %[reshw],            16            \n\t"
-
-      "addu.ph         %[reshw],       %[abover],           %[left0]      \n\t"
-      "subu.ph         %[reshw],       %[reshw],            %[top_left]   \n\t"
-
-      "sll             %[res0],        %[reshw],            16            \n\t"
-      "sra             %[res0],        %[res0],             16            \n\t"
-      "sra             %[res1],        %[reshw],            16            \n\t"
-
-      "lbux            %[res0],        %[res0](%[cm])                     \n\t"
-      "lbux            %[res1],        %[res1](%[cm])                     \n\t"
-      "lbux            %[res2],        %[res2](%[cm])                     \n\t"
-      "lbux            %[res3],        %[res3](%[cm])                     \n\t"
-
-      "sb              %[res0],        (%[dst])                           \n\t"
-      "sb              %[res1],        1(%[dst])                          \n\t"
-      "sb              %[res2],        2(%[dst])                          \n\t"
-      "sb              %[res3],        3(%[dst])                          \n\t"
-
-      "addu.ph         %[reshw],       %[abovel_1],         %[left0]      \n\t"
-      "subu.ph         %[reshw],       %[reshw],            %[top_left]   \n\t"
-
-      "sll             %[res2],        %[reshw],            16            \n\t"
-      "sra             %[res2],        %[res2],             16            \n\t"
-      "sra             %[res3],        %[reshw],            16            \n\t"
-
-      "addu.ph         %[reshw],       %[abover_1],         %[left0]      \n\t"
-      "subu.ph         %[reshw],       %[reshw],            %[top_left]   \n\t"
-
-      "sll             %[res0],        %[reshw],            16            \n\t"
-      "sra             %[res0],        %[res0],             16            \n\t"
-      "sra             %[res1],        %[reshw],            16            \n\t"
-
-      "lbu             %[left0],       1(%[left])                         \n\t"
-
-      "lbux            %[res0],        %[res0](%[cm])                     \n\t"
-      "lbux            %[res1],        %[res1](%[cm])                     \n\t"
-      "lbux            %[res2],        %[res2](%[cm])                     \n\t"
-      "lbux            %[res3],        %[res3](%[cm])                     \n\t"
-
-      "sb              %[res0],        4(%[dst])                          \n\t"
-      "sb              %[res1],        5(%[dst])                          \n\t"
-      "sb              %[res2],        6(%[dst])                          \n\t"
-      "sb              %[res3],        7(%[dst])                          \n\t"
-
-      "replv.ph        %[left0],       %[left0]                           \n\t"
-      "add             %[dst],          %[dst],             %[stride]     \n\t"
-
-      "addu.ph         %[reshw],       %[abovel],           %[left0]      \n\t"
-      "subu.ph         %[reshw],       %[reshw],            %[top_left]   \n\t"
-
-      "sll             %[res2],        %[reshw],            16            \n\t"
-      "sra             %[res2],        %[res2],             16            \n\t"
-      "sra             %[res3],        %[reshw],            16            \n\t"
-
-      "addu.ph         %[reshw],       %[abover],           %[left0]      \n\t"
-      "subu.ph         %[reshw],       %[reshw],            %[top_left]   \n\t"
-
-      "sll             %[res0],        %[reshw],            16            \n\t"
-      "sra             %[res0],        %[res0],             16            \n\t"
-      "sra             %[res1],        %[reshw],            16            \n\t"
-
-      "lbux            %[res0],        %[res0](%[cm])                     \n\t"
-      "lbux            %[res1],        %[res1](%[cm])                     \n\t"
-      "lbux            %[res2],        %[res2](%[cm])                     \n\t"
-      "lbux            %[res3],        %[res3](%[cm])                     \n\t"
-
-      "sb              %[res0],        (%[dst])                           \n\t"
-      "sb              %[res1],        1(%[dst])                          \n\t"
-      "sb              %[res2],        2(%[dst])                          \n\t"
-      "sb              %[res3],        3(%[dst])                          \n\t"
-
-      "addu.ph         %[reshw],       %[abovel_1],         %[left0]      \n\t"
-      "subu.ph         %[reshw],       %[reshw],            %[top_left]   \n\t"
-
-      "sll             %[res2],        %[reshw],            16            \n\t"
-      "sra             %[res2],        %[res2],             16            \n\t"
-      "sra             %[res3],        %[reshw],            16            \n\t"
-
-      "addu.ph         %[reshw],       %[abover_1],         %[left0]      \n\t"
-      "subu.ph         %[reshw],       %[reshw],            %[top_left]   \n\t"
-
-      "sll             %[res0],        %[reshw],            16            \n\t"
-      "sra             %[res0],        %[res0],             16            \n\t"
-      "sra             %[res1],        %[reshw],            16            \n\t"
-
-      "lbu             %[left0],       2(%[left])                         \n\t"
-
-      "lbux            %[res0],        %[res0](%[cm])                     \n\t"
-      "lbux            %[res1],        %[res1](%[cm])                     \n\t"
-      "lbux            %[res2],        %[res2](%[cm])                     \n\t"
-      "lbux            %[res3],        %[res3](%[cm])                     \n\t"
-
-      "sb              %[res0],        4(%[dst])                          \n\t"
-      "sb              %[res1],        5(%[dst])                          \n\t"
-      "sb              %[res2],        6(%[dst])                          \n\t"
-      "sb              %[res3],        7(%[dst])                          \n\t"
-
-      "replv.ph        %[left0],       %[left0]                           \n\t"
-      "add             %[dst],          %[dst],             %[stride]     \n\t"
-
-      "addu.ph         %[reshw],       %[abovel],           %[left0]      \n\t"
-      "subu.ph         %[reshw],       %[reshw],            %[top_left]   \n\t"
-
-      "sll             %[res2],        %[reshw],            16            \n\t"
-      "sra             %[res2],        %[res2],             16            \n\t"
-      "sra             %[res3],        %[reshw],            16            \n\t"
-
-      "addu.ph         %[reshw],       %[abover],           %[left0]      \n\t"
-      "subu.ph         %[reshw],       %[reshw],            %[top_left]   \n\t"
-
-      "sll             %[res0],        %[reshw],            16            \n\t"
-      "sra             %[res0],        %[res0],             16            \n\t"
-      "sra             %[res1],        %[reshw],            16            \n\t"
-
-      "lbux            %[res0],        %[res0](%[cm])                     \n\t"
-      "lbux            %[res1],        %[res1](%[cm])                     \n\t"
-      "lbux            %[res2],        %[res2](%[cm])                     \n\t"
-      "lbux            %[res3],        %[res3](%[cm])                     \n\t"
-
-      "sb              %[res0],        (%[dst])                           \n\t"
-      "sb              %[res1],        1(%[dst])                          \n\t"
-      "sb              %[res2],        2(%[dst])                          \n\t"
-      "sb              %[res3],        3(%[dst])                          \n\t"
-
-      "addu.ph         %[reshw],       %[abovel_1],         %[left0]      \n\t"
-      "subu.ph         %[reshw],       %[reshw],            %[top_left]   \n\t"
-
-      "sll             %[res2],        %[reshw],            16            \n\t"
-      "sra             %[res2],        %[res2],             16            \n\t"
-      "sra             %[res3],        %[reshw],            16            \n\t"
-
-      "addu.ph         %[reshw],       %[abover_1],         %[left0]      \n\t"
-      "subu.ph         %[reshw],       %[reshw],            %[top_left]   \n\t"
-
-      "sll             %[res0],        %[reshw],            16            \n\t"
-      "sra             %[res0],        %[res0],             16            \n\t"
-      "sra             %[res1],        %[reshw],            16            \n\t"
-
-      "lbu             %[left0],       3(%[left])                         \n\t"
-
-      "lbux            %[res0],        %[res0](%[cm])                     \n\t"
-      "lbux            %[res1],        %[res1](%[cm])                     \n\t"
-      "lbux            %[res2],        %[res2](%[cm])                     \n\t"
-      "lbux            %[res3],        %[res3](%[cm])                     \n\t"
-
-      "sb              %[res0],        4(%[dst])                          \n\t"
-      "sb              %[res1],        5(%[dst])                          \n\t"
-      "sb              %[res2],        6(%[dst])                          \n\t"
-      "sb              %[res3],        7(%[dst])                          \n\t"
-
-      "replv.ph        %[left0],       %[left0]                           \n\t"
-      "add             %[dst],          %[dst],             %[stride]     \n\t"
-
-      "addu.ph         %[reshw],       %[abovel],           %[left0]      \n\t"
-      "subu.ph         %[reshw],       %[reshw],            %[top_left]   \n\t"
-
-      "sll             %[res2],        %[reshw],            16            \n\t"
-      "sra             %[res2],        %[res2],             16            \n\t"
-      "sra             %[res3],        %[reshw],            16            \n\t"
-
-      "addu.ph         %[reshw],       %[abover],           %[left0]      \n\t"
-      "subu.ph         %[reshw],       %[reshw],            %[top_left]   \n\t"
-
-      "sll             %[res0],        %[reshw],            16            \n\t"
-      "sra             %[res0],        %[res0],             16            \n\t"
-      "sra             %[res1],        %[reshw],            16            \n\t"
-
-      "lbux            %[res0],        %[res0](%[cm])                     \n\t"
-      "lbux            %[res1],        %[res1](%[cm])                     \n\t"
-      "lbux            %[res2],        %[res2](%[cm])                     \n\t"
-      "lbux            %[res3],        %[res3](%[cm])                     \n\t"
-
-      "sb              %[res0],        (%[dst])                           \n\t"
-      "sb              %[res1],        1(%[dst])                          \n\t"
-      "sb              %[res2],        2(%[dst])                          \n\t"
-      "sb              %[res3],        3(%[dst])                          \n\t"
-
-      "addu.ph         %[reshw],       %[abovel_1],         %[left0]      \n\t"
-      "subu.ph         %[reshw],       %[reshw],            %[top_left]   \n\t"
-
-      "sll             %[res2],        %[reshw],            16            \n\t"
-      "sra             %[res2],        %[res2],             16            \n\t"
-      "sra             %[res3],        %[reshw],            16            \n\t"
-
-      "addu.ph         %[reshw],       %[abover_1],         %[left0]      \n\t"
-      "subu.ph         %[reshw],       %[reshw],            %[top_left]   \n\t"
-
-      "sll             %[res0],        %[reshw],            16            \n\t"
-      "sra             %[res0],        %[res0],             16            \n\t"
-      "sra             %[res1],        %[reshw],            16            \n\t"
-
-      "lbu             %[left0],       4(%[left])                         \n\t"
-
-      "lbux            %[res0],        %[res0](%[cm])                     \n\t"
-      "lbux            %[res1],        %[res1](%[cm])                     \n\t"
-      "lbux            %[res2],        %[res2](%[cm])                     \n\t"
-      "lbux            %[res3],        %[res3](%[cm])                     \n\t"
-
-      "sb              %[res0],        4(%[dst])                          \n\t"
-      "sb              %[res1],        5(%[dst])                          \n\t"
-      "sb              %[res2],        6(%[dst])                          \n\t"
-      "sb              %[res3],        7(%[dst])                          \n\t"
-
-      "replv.ph        %[left0],       %[left0]                           \n\t"
-      "add             %[dst],          %[dst],             %[stride]     \n\t"
-
-      "addu.ph         %[reshw],       %[abovel],           %[left0]      \n\t"
-      "subu.ph         %[reshw],       %[reshw],            %[top_left]   \n\t"
-
-      "sll             %[res2],        %[reshw],            16            \n\t"
-      "sra             %[res2],        %[res2],             16            \n\t"
-      "sra             %[res3],        %[reshw],            16            \n\t"
-
-      "addu.ph         %[reshw],       %[abover],           %[left0]      \n\t"
-      "subu.ph         %[reshw],       %[reshw],            %[top_left]   \n\t"
-
-      "sll             %[res0],        %[reshw],            16            \n\t"
-      "sra             %[res0],        %[res0],             16            \n\t"
-      "sra             %[res1],        %[reshw],            16            \n\t"
-
-      "lbux            %[res0],        %[res0](%[cm])                     \n\t"
-      "lbux            %[res1],        %[res1](%[cm])                     \n\t"
-      "lbux            %[res2],        %[res2](%[cm])                     \n\t"
-      "lbux            %[res3],        %[res3](%[cm])                     \n\t"
-
-      "sb              %[res0],        (%[dst])                           \n\t"
-      "sb              %[res1],        1(%[dst])                          \n\t"
-      "sb              %[res2],        2(%[dst])                          \n\t"
-      "sb              %[res3],        3(%[dst])                          \n\t"
-
-      "addu.ph         %[reshw],       %[abovel_1],         %[left0]      \n\t"
-      "subu.ph         %[reshw],       %[reshw],            %[top_left]   \n\t"
-
-      "sll             %[res2],        %[reshw],            16            \n\t"
-      "sra             %[res2],        %[res2],             16            \n\t"
-      "sra             %[res3],        %[reshw],            16            \n\t"
-
-      "addu.ph         %[reshw],       %[abover_1],         %[left0]      \n\t"
-      "subu.ph         %[reshw],       %[reshw],            %[top_left]   \n\t"
-
-      "sll             %[res0],        %[reshw],            16            \n\t"
-      "sra             %[res0],        %[res0],             16            \n\t"
-      "sra             %[res1],        %[reshw],            16            \n\t"
-
-      "lbu             %[left0],       5(%[left])                         \n\t"
-
-      "lbux            %[res0],        %[res0](%[cm])                     \n\t"
-      "lbux            %[res1],        %[res1](%[cm])                     \n\t"
-      "lbux            %[res2],        %[res2](%[cm])                     \n\t"
-      "lbux            %[res3],        %[res3](%[cm])                     \n\t"
-
-      "sb              %[res0],        4(%[dst])                          \n\t"
-      "sb              %[res1],        5(%[dst])                          \n\t"
-      "sb              %[res2],        6(%[dst])                          \n\t"
-      "sb              %[res3],        7(%[dst])                          \n\t"
-
-      "replv.ph        %[left0],       %[left0]                           \n\t"
-      "add             %[dst],          %[dst],             %[stride]     \n\t"
-
-      "addu.ph         %[reshw],       %[abovel],           %[left0]      \n\t"
-      "subu.ph         %[reshw],       %[reshw],            %[top_left]   \n\t"
-
-      "sll             %[res2],        %[reshw],            16            \n\t"
-      "sra             %[res2],        %[res2],             16            \n\t"
-      "sra             %[res3],        %[reshw],            16            \n\t"
-
-      "addu.ph         %[reshw],       %[abover],           %[left0]      \n\t"
-      "subu.ph         %[reshw],       %[reshw],            %[top_left]   \n\t"
-
-      "sll             %[res0],        %[reshw],            16            \n\t"
-      "sra             %[res0],        %[res0],             16            \n\t"
-      "sra             %[res1],        %[reshw],            16            \n\t"
-
-      "lbux            %[res0],        %[res0](%[cm])                     \n\t"
-      "lbux            %[res1],        %[res1](%[cm])                     \n\t"
-      "lbux            %[res2],        %[res2](%[cm])                     \n\t"
-      "lbux            %[res3],        %[res3](%[cm])                     \n\t"
-
-      "sb              %[res0],        (%[dst])                           \n\t"
-      "sb              %[res1],        1(%[dst])                          \n\t"
-      "sb              %[res2],        2(%[dst])                          \n\t"
-      "sb              %[res3],        3(%[dst])                          \n\t"
-
-      "addu.ph         %[reshw],       %[abovel_1],         %[left0]      \n\t"
-      "subu.ph         %[reshw],       %[reshw],            %[top_left]   \n\t"
-
-      "sll             %[res2],        %[reshw],            16            \n\t"
-      "sra             %[res2],        %[res2],             16            \n\t"
-      "sra             %[res3],        %[reshw],            16            \n\t"
-
-      "addu.ph         %[reshw],       %[abover_1],         %[left0]      \n\t"
-      "subu.ph         %[reshw],       %[reshw],            %[top_left]   \n\t"
-
-      "sll             %[res0],        %[reshw],            16            \n\t"
-      "sra             %[res0],        %[res0],             16            \n\t"
-      "sra             %[res1],        %[reshw],            16            \n\t"
-
-      "lbu             %[left0],       6(%[left])                         \n\t"
-
-      "lbux            %[res0],        %[res0](%[cm])                     \n\t"
-      "lbux            %[res1],        %[res1](%[cm])                     \n\t"
-      "lbux            %[res2],        %[res2](%[cm])                     \n\t"
-      "lbux            %[res3],        %[res3](%[cm])                     \n\t"
-
-      "sb              %[res0],        4(%[dst])                          \n\t"
-      "sb              %[res1],        5(%[dst])                          \n\t"
-      "sb              %[res2],        6(%[dst])                          \n\t"
-      "sb              %[res3],        7(%[dst])                          \n\t"
-
-      "replv.ph        %[left0],       %[left0]                           \n\t"
-      "add             %[dst],          %[dst],             %[stride]     \n\t"
-
-      "addu.ph         %[reshw],       %[abovel],           %[left0]      \n\t"
-      "subu.ph         %[reshw],       %[reshw],            %[top_left]   \n\t"
-
-      "sll             %[res2],        %[reshw],            16            \n\t"
-      "sra             %[res2],        %[res2],             16            \n\t"
-      "sra             %[res3],        %[reshw],            16            \n\t"
-
-      "addu.ph         %[reshw],       %[abover],           %[left0]      \n\t"
-      "subu.ph         %[reshw],       %[reshw],            %[top_left]   \n\t"
-
-      "sll             %[res0],        %[reshw],            16            \n\t"
-      "sra             %[res0],        %[res0],             16            \n\t"
-      "sra             %[res1],        %[reshw],            16            \n\t"
-
-      "lbux            %[res0],        %[res0](%[cm])                     \n\t"
-      "lbux            %[res1],        %[res1](%[cm])                     \n\t"
-      "lbux            %[res2],        %[res2](%[cm])                     \n\t"
-      "lbux            %[res3],        %[res3](%[cm])                     \n\t"
-
-      "sb              %[res0],        (%[dst])                           \n\t"
-      "sb              %[res1],        1(%[dst])                          \n\t"
-      "sb              %[res2],        2(%[dst])                          \n\t"
-      "sb              %[res3],        3(%[dst])                          \n\t"
-
-      "addu.ph         %[reshw],       %[abovel_1],         %[left0]      \n\t"
-      "subu.ph         %[reshw],       %[reshw],            %[top_left]   \n\t"
-
-      "sll             %[res2],        %[reshw],            16            \n\t"
-      "sra             %[res2],        %[res2],             16            \n\t"
-      "sra             %[res3],        %[reshw],            16            \n\t"
-
-      "addu.ph         %[reshw],       %[abover_1],         %[left0]      \n\t"
-      "subu.ph         %[reshw],       %[reshw],            %[top_left]   \n\t"
-
-      "sll             %[res0],        %[reshw],            16            \n\t"
-      "sra             %[res0],        %[res0],             16            \n\t"
-      "sra             %[res1],        %[reshw],            16            \n\t"
-
-      "lbu             %[left0],       7(%[left])                         \n\t"
-
-      "lbux            %[res0],        %[res0](%[cm])                     \n\t"
-      "lbux            %[res1],        %[res1](%[cm])                     \n\t"
-      "lbux            %[res2],        %[res2](%[cm])                     \n\t"
-      "lbux            %[res3],        %[res3](%[cm])                     \n\t"
-
-      "sb              %[res0],        4(%[dst])                          \n\t"
-      "sb              %[res1],        5(%[dst])                          \n\t"
-      "sb              %[res2],        6(%[dst])                          \n\t"
-      "sb              %[res3],        7(%[dst])                          \n\t"
-
-      "replv.ph        %[left0],       %[left0]                           \n\t"
-      "add             %[dst],          %[dst],             %[stride]     \n\t"
-
-      "addu.ph         %[reshw],       %[abovel],           %[left0]      \n\t"
-      "subu.ph         %[reshw],       %[reshw],            %[top_left]   \n\t"
-
-      "sll             %[res2],        %[reshw],            16            \n\t"
-      "sra             %[res2],        %[res2],             16            \n\t"
-      "sra             %[res3],        %[reshw],            16            \n\t"
-
-      "addu.ph         %[reshw],       %[abover],           %[left0]      \n\t"
-      "subu.ph         %[reshw],       %[reshw],            %[top_left]   \n\t"
-
-      "sll             %[res0],        %[reshw],            16            \n\t"
-      "sra             %[res0],        %[res0],             16            \n\t"
-      "sra             %[res1],        %[reshw],            16            \n\t"
-
-      "lbux            %[res0],        %[res0](%[cm])                     \n\t"
-      "lbux            %[res1],        %[res1](%[cm])                     \n\t"
-      "lbux            %[res2],        %[res2](%[cm])                     \n\t"
-      "lbux            %[res3],        %[res3](%[cm])                     \n\t"
-
-      "sb              %[res0],        (%[dst])                           \n\t"
-      "sb              %[res1],        1(%[dst])                          \n\t"
-      "sb              %[res2],        2(%[dst])                          \n\t"
-      "sb              %[res3],        3(%[dst])                          \n\t"
-
-      "addu.ph         %[reshw],       %[abovel_1],         %[left0]      \n\t"
-      "subu.ph         %[reshw],       %[reshw],            %[top_left]   \n\t"
-
-      "sll             %[res2],        %[reshw],            16            \n\t"
-      "sra             %[res2],        %[res2],             16            \n\t"
-      "sra             %[res3],        %[reshw],            16            \n\t"
-
-      "addu.ph         %[reshw],       %[abover_1],         %[left0]      \n\t"
-      "subu.ph         %[reshw],       %[reshw],            %[top_left]   \n\t"
-
-      "sll             %[res0],        %[reshw],            16            \n\t"
-      "sra             %[res0],        %[res0],             16            \n\t"
-      "sra             %[res1],        %[reshw],            16            \n\t"
-
-      "lbux            %[res0],        %[res0](%[cm])                     \n\t"
-      "lbux            %[res1],        %[res1](%[cm])                     \n\t"
-      "lbux            %[res2],        %[res2](%[cm])                     \n\t"
-      "lbux            %[res3],        %[res3](%[cm])                     \n\t"
-
-      "sb              %[res0],        4(%[dst])                          \n\t"
-      "sb              %[res1],        5(%[dst])                          \n\t"
-      "sb              %[res2],        6(%[dst])                          \n\t"
-      "sb              %[res3],        7(%[dst])                          \n\t"
-
-      : [abovel] "=&r"(abovel), [abover] "=&r"(abover),
-        [abovel_1] "=&r"(abovel_1), [abover_1] "=&r"(abover_1),
-        [left0] "=&r"(left0), [res2] "=&r"(res2), [res3] "=&r"(res3),
-        [res0] "=&r"(res0), [res1] "=&r"(res1), [reshw] "=&r"(reshw),
-        [top_left] "=&r"(top_left)
-      : [above] "r"(above), [left] "r"(left), [dst] "r"(dst),
-        [stride] "r"(stride), [cm] "r"(cm));
-}
 #endif  // #if HAVE_DSPR2
--- a/third_party/aom/aom_dsp/mips/intrapred_msa.c
+++ b/third_party/aom/aom_dsp/mips/intrapred_msa.c
@ -382,176 +382,6 @@ static void intra_predict_128dc_32x32_msa(uint8_t *dst, int32_t dst_stride) {
  }
 }

-static void intra_predict_tm_4x4_msa(const uint8_t *src_top_ptr,
-                                     const uint8_t *src_left, uint8_t *dst,
-                                     int32_t dst_stride) {
-  uint32_t val;
-  uint8_t top_left = src_top_ptr[-1];
-  v16i8 src_left0, src_left1, src_left2, src_left3, tmp0, tmp1, src_top = { 0 };
-  v16u8 src0, src1, src2, src3;
-  v8u16 src_top_left, vec0, vec1, vec2, vec3;
-
-  src_top_left = (v8u16)__msa_fill_h(top_left);
-  val = LW(src_top_ptr);
-  src_top = (v16i8)__msa_insert_w((v4i32)src_top, 0, val);
-
-  src_left0 = __msa_fill_b(src_left[0]);
-  src_left1 = __msa_fill_b(src_left[1]);
-  src_left2 = __msa_fill_b(src_left[2]);
-  src_left3 = __msa_fill_b(src_left[3]);
-
-  ILVR_B4_UB(src_left0, src_top, src_left1, src_top, src_left2, src_top,
-             src_left3, src_top, src0, src1, src2, src3);
-  HADD_UB4_UH(src0, src1, src2, src3, vec0, vec1, vec2, vec3);
-  IPRED_SUBS_UH2_UH(src_top_left, src_top_left, vec0, vec1);
-  IPRED_SUBS_UH2_UH(src_top_left, src_top_left, vec2, vec3);
-  SAT_UH4_UH(vec0, vec1, vec2, vec3, 7);
-  PCKEV_B2_SB(vec1, vec0, vec3, vec2, tmp0, tmp1);
-  ST4x4_UB(tmp0, tmp1, 0, 2, 0, 2, dst, dst_stride);
-}
-
-static void intra_predict_tm_8x8_msa(const uint8_t *src_top_ptr,
-                                     const uint8_t *src_left, uint8_t *dst,
-                                     int32_t dst_stride) {
-  uint64_t val;
-  uint8_t top_left = src_top_ptr[-1];
-  uint32_t loop_cnt;
-  v16i8 src_left0, src_left1, src_left2, src_left3, tmp0, tmp1, src_top = { 0 };
-  v8u16 src_top_left, vec0, vec1, vec2, vec3;
-  v16u8 src0, src1, src2, src3;
-
-  val = LD(src_top_ptr);
-  src_top = (v16i8)__msa_insert_d((v2i64)src_top, 0, val);
-  src_top_left = (v8u16)__msa_fill_h(top_left);
-
-  for (loop_cnt = 2; loop_cnt--;) {
-    src_left0 = __msa_fill_b(src_left[0]);
-    src_left1 = __msa_fill_b(src_left[1]);
-    src_left2 = __msa_fill_b(src_left[2]);
-    src_left3 = __msa_fill_b(src_left[3]);
-    src_left += 4;
-
-    ILVR_B4_UB(src_left0, src_top, src_left1, src_top, src_left2, src_top,
-               src_left3, src_top, src0, src1, src2, src3);
-    HADD_UB4_UH(src0, src1, src2, src3, vec0, vec1, vec2, vec3);
-    IPRED_SUBS_UH2_UH(src_top_left, src_top_left, vec0, vec1);
-    IPRED_SUBS_UH2_UH(src_top_left, src_top_left, vec2, vec3);
-    SAT_UH4_UH(vec0, vec1, vec2, vec3, 7);
-    PCKEV_B2_SB(vec1, vec0, vec3, vec2, tmp0, tmp1);
-    ST8x4_UB(tmp0, tmp1, dst, dst_stride);
-    dst += (4 * dst_stride);
-  }
-}
-
-static void intra_predict_tm_16x16_msa(const uint8_t *src_top_ptr,
-                                       const uint8_t *src_left, uint8_t *dst,
-                                       int32_t dst_stride) {
-  uint8_t top_left = src_top_ptr[-1];
-  uint32_t loop_cnt;
-  v16i8 src_top, src_left0, src_left1, src_left2, src_left3;
-  v8u16 src_top_left, res_r, res_l;
-
-  src_top = LD_SB(src_top_ptr);
-  src_top_left = (v8u16)__msa_fill_h(top_left);
-
-  for (loop_cnt = 4; loop_cnt--;) {
-    src_left0 = __msa_fill_b(src_left[0]);
-    src_left1 = __msa_fill_b(src_left[1]);
-    src_left2 = __msa_fill_b(src_left[2]);
-    src_left3 = __msa_fill_b(src_left[3]);
-    src_left += 4;
-
-    ILVRL_B2_UH(src_left0, src_top, res_r, res_l);
-    HADD_UB2_UH(res_r, res_l, res_r, res_l);
-    IPRED_SUBS_UH2_UH(src_top_left, src_top_left, res_r, res_l);
-
-    SAT_UH2_UH(res_r, res_l, 7);
-    PCKEV_ST_SB(res_r, res_l, dst);
-    dst += dst_stride;
-
-    ILVRL_B2_UH(src_left1, src_top, res_r, res_l);
-    HADD_UB2_UH(res_r, res_l, res_r, res_l);
-    IPRED_SUBS_UH2_UH(src_top_left, src_top_left, res_r, res_l);
-    SAT_UH2_UH(res_r, res_l, 7);
-    PCKEV_ST_SB(res_r, res_l, dst);
-    dst += dst_stride;
-
-    ILVRL_B2_UH(src_left2, src_top, res_r, res_l);
-    HADD_UB2_UH(res_r, res_l, res_r, res_l);
-    IPRED_SUBS_UH2_UH(src_top_left, src_top_left, res_r, res_l);
-    SAT_UH2_UH(res_r, res_l, 7);
-    PCKEV_ST_SB(res_r, res_l, dst);
-    dst += dst_stride;
-
-    ILVRL_B2_UH(src_left3, src_top, res_r, res_l);
-    HADD_UB2_UH(res_r, res_l, res_r, res_l);
-    IPRED_SUBS_UH2_UH(src_top_left, src_top_left, res_r, res_l);
-    SAT_UH2_UH(res_r, res_l, 7);
-    PCKEV_ST_SB(res_r, res_l, dst);
-    dst += dst_stride;
-  }
-}
-
-static void intra_predict_tm_32x32_msa(const uint8_t *src_top,
-                                       const uint8_t *src_left, uint8_t *dst,
-                                       int32_t dst_stride) {
-  uint8_t top_left = src_top[-1];
-  uint32_t loop_cnt;
-  v16i8 src_top0, src_top1, src_left0, src_left1, src_left2, src_left3;
-  v8u16 src_top_left, res_r0, res_r1, res_l0, res_l1;
-
-  LD_SB2(src_top, 16, src_top0, src_top1);
-  src_top_left = (v8u16)__msa_fill_h(top_left);
-
-  for (loop_cnt = 8; loop_cnt--;) {
-    src_left0 = __msa_fill_b(src_left[0]);
-    src_left1 = __msa_fill_b(src_left[1]);
-    src_left2 = __msa_fill_b(src_left[2]);
-    src_left3 = __msa_fill_b(src_left[3]);
-    src_left += 4;
-
-    ILVR_B2_UH(src_left0, src_top0, src_left0, src_top1, res_r0, res_r1);
-    ILVL_B2_UH(src_left0, src_top0, src_left0, src_top1, res_l0, res_l1);
-    HADD_UB4_UH(res_r0, res_l0, res_r1, res_l1, res_r0, res_l0, res_r1, res_l1);
-    IPRED_SUBS_UH2_UH(src_top_left, src_top_left, res_r0, res_l0);
-    IPRED_SUBS_UH2_UH(src_top_left, src_top_left, res_r1, res_l1);
-    SAT_UH4_UH(res_r0, res_l0, res_r1, res_l1, 7);
-    PCKEV_ST_SB(res_r0, res_l0, dst);
-    PCKEV_ST_SB(res_r1, res_l1, dst + 16);
-    dst += dst_stride;
-
-    ILVR_B2_UH(src_left1, src_top0, src_left1, src_top1, res_r0, res_r1);
-    ILVL_B2_UH(src_left1, src_top0, src_left1, src_top1, res_l0, res_l1);
-    HADD_UB4_UH(res_r0, res_l0, res_r1, res_l1, res_r0, res_l0, res_r1, res_l1);
-    IPRED_SUBS_UH2_UH(src_top_left, src_top_left, res_r0, res_l0);
-    IPRED_SUBS_UH2_UH(src_top_left, src_top_left, res_r1, res_l1);
-    SAT_UH4_UH(res_r0, res_l0, res_r1, res_l1, 7);
-    PCKEV_ST_SB(res_r0, res_l0, dst);
-    PCKEV_ST_SB(res_r1, res_l1, dst + 16);
-    dst += dst_stride;
-
-    ILVR_B2_UH(src_left2, src_top0, src_left2, src_top1, res_r0, res_r1);
-    ILVL_B2_UH(src_left2, src_top0, src_left2, src_top1, res_l0, res_l1);
-    HADD_UB4_UH(res_r0, res_l0, res_r1, res_l1, res_r0, res_l0, res_r1, res_l1);
-    IPRED_SUBS_UH2_UH(src_top_left, src_top_left, res_r0, res_l0);
-    IPRED_SUBS_UH2_UH(src_top_left, src_top_left, res_r1, res_l1);
-    SAT_UH4_UH(res_r0, res_l0, res_r1, res_l1, 7);
-    PCKEV_ST_SB(res_r0, res_l0, dst);
-    PCKEV_ST_SB(res_r1, res_l1, dst + 16);
-    dst += dst_stride;
-
-    ILVR_B2_UH(src_left3, src_top0, src_left3, src_top1, res_r0, res_r1);
-    ILVL_B2_UH(src_left3, src_top0, src_left3, src_top1, res_l0, res_l1);
-    HADD_UB4_UH(res_r0, res_l0, res_r1, res_l1, res_r0, res_l0, res_r1, res_l1);
-    IPRED_SUBS_UH2_UH(src_top_left, src_top_left, res_r0, res_l0);
-    IPRED_SUBS_UH2_UH(src_top_left, src_top_left, res_r1, res_l1);
-    SAT_UH4_UH(res_r0, res_l0, res_r1, res_l1, 7);
-    PCKEV_ST_SB(res_r0, res_l0, dst);
-    PCKEV_ST_SB(res_r1, res_l1, dst + 16);
-    dst += dst_stride;
-  }
-}
-
 void aom_v_predictor_4x4_msa(uint8_t *dst, ptrdiff_t y_stride,
                             const uint8_t *above, const uint8_t *left) {
  (void)left;
@ -717,23 +547,3 @@ void aom_dc_128_predictor_32x32_msa(uint8_t *dst, ptrdiff_t y_stride,

  intra_predict_128dc_32x32_msa(dst, y_stride);
 }
-
-void aom_tm_predictor_4x4_msa(uint8_t *dst, ptrdiff_t y_stride,
-                              const uint8_t *above, const uint8_t *left) {
-  intra_predict_tm_4x4_msa(above, left, dst, y_stride);
-}
-
-void aom_tm_predictor_8x8_msa(uint8_t *dst, ptrdiff_t y_stride,
-                              const uint8_t *above, const uint8_t *left) {
-  intra_predict_tm_8x8_msa(above, left, dst, y_stride);
-}
-
-void aom_tm_predictor_16x16_msa(uint8_t *dst, ptrdiff_t y_stride,
-                                const uint8_t *above, const uint8_t *left) {
-  intra_predict_tm_16x16_msa(above, left, dst, y_stride);
-}
-
-void aom_tm_predictor_32x32_msa(uint8_t *dst, ptrdiff_t y_stride,
-                                const uint8_t *above, const uint8_t *left) {
-  intra_predict_tm_32x32_msa(above, left, dst, y_stride);
-}
--- a/third_party/aom/aom_dsp/mips/inv_txfm_dspr2.h
+++ b/third_party/aom/aom_dsp/mips/inv_txfm_dspr2.h
@ -24,10 +24,12 @@ extern "C" {
 #endif

 #if HAVE_DSPR2
+/* Note: this macro expects a local int32_t named out to exist, and will write
+ * to that variable. */
 #define DCT_CONST_ROUND_SHIFT_TWICE_COSPI_16_64(input)                         \
  ({                                                                           \
                                                                               \
-    int32_t tmp, out;                                                          \
+    int32_t tmp;                                                               \
    int dct_cost_rounding = DCT_CONST_ROUNDING;                                \
    int in = input;                                                            \
                                                                               \
--- a/third_party/aom/aom_dsp/prob.h
+++ b/third_party/aom/aom_dsp/prob.h
@ -46,6 +46,14 @@ typedef uint16_t aom_cdf_prob;

 #define MAX_PROB 255

+#define LV_MAP_PROB 1
+
+#define BR_NODE 1
+
+#if CONFIG_ADAPT_SCAN
+#define CACHE_SCAN_PROB 1
+#endif
+
 #define aom_prob_half ((aom_prob)128)

 typedef int8_t aom_tree_index;
@ -149,7 +157,11 @@ static INLINE void av1_tree_to_cdf(const aom_tree_index *tree,
 void av1_indices_from_tree(int *ind, int *inv, const aom_tree_index *tree);

 static INLINE void update_cdf(aom_cdf_prob *cdf, int val, int nsymbs) {
-  const int rate = 4 + (cdf[nsymbs] > 31) + get_msb(nsymbs);
+  int rate = 4 + (cdf[nsymbs] > 31) + get_msb(nsymbs);
+#if CONFIG_LV_MAP
+  if (nsymbs == 2)
+    rate = 4 + (cdf[nsymbs] > 7) + (cdf[nsymbs] > 15) + get_msb(nsymbs);
+#endif
  const int rate2 = 5;
  int i, tmp;
  int diff;
@ -158,7 +170,7 @@ static INLINE void update_cdf(aom_cdf_prob *cdf, int val, int nsymbs) {
  tmp = AOM_ICDF(tmp0);
  diff = ((CDF_PROB_TOP - (nsymbs << rate2)) >> rate) << rate;
 // Single loop (faster)
-#if !CONFIG_ANS && CONFIG_EC_SMALLMUL
+#if !CONFIG_ANS
  for (i = 0; i < nsymbs - 1; ++i, tmp -= tmp0) {
    tmp -= (i == val ? diff : 0);
    cdf[i] += ((tmp - cdf[i]) >> rate);
@ -183,6 +195,12 @@ static INLINE void update_cdf(aom_cdf_prob *cdf, int val, int nsymbs) {
  cdf[nsymbs] += (cdf[nsymbs] < 32);
 }

+#if CONFIG_LV_MAP
+static INLINE void update_bin(aom_cdf_prob *cdf, int val, int nsymbs) {
+  update_cdf(cdf, val, nsymbs);
+}
+#endif
+
 #ifdef __cplusplus
 }  // extern "C"
 #endif
--- a/third_party/aom/aom_dsp/psnr.c
+++ b/third_party/aom/aom_dsp/psnr.c
@ -289,6 +289,27 @@ int64_t aom_highbd_get_v_sse(const YV12_BUFFER_CONFIG *a,
 }
 #endif  // CONFIG_HIGHBITDEPTH

+int64_t aom_get_sse_plane(const YV12_BUFFER_CONFIG *a,
+                          const YV12_BUFFER_CONFIG *b, int plane, int highbd) {
+#if CONFIG_HIGHBITDEPTH
+  if (highbd) {
+    switch (plane) {
+      case 0: return aom_highbd_get_y_sse(a, b);
+      case 1: return aom_highbd_get_u_sse(a, b);
+      case 2: return aom_highbd_get_v_sse(a, b);
+      default: assert(plane >= 0 && plane <= 2); return 0;
+    }
+  }
+#endif
+  (void)highbd;
+  switch (plane) {
+    case 0: return aom_get_y_sse(a, b);
+    case 1: return aom_get_u_sse(a, b);
+    case 2: return aom_get_v_sse(a, b);
+    default: assert(plane >= 0 && plane <= 2); return 0;
+  }
+}
+
 #if CONFIG_HIGHBITDEPTH
 void aom_calc_highbd_psnr(const YV12_BUFFER_CONFIG *a,
                          const YV12_BUFFER_CONFIG *b, PSNR_STATS *psnr,
@ -296,9 +317,7 @@ void aom_calc_highbd_psnr(const YV12_BUFFER_CONFIG *a,
  const int widths[3] = { a->y_crop_width, a->uv_crop_width, a->uv_crop_width };
  const int heights[3] = { a->y_crop_height, a->uv_crop_height,
                           a->uv_crop_height };
-  const uint8_t *a_planes[3] = { a->y_buffer, a->u_buffer, a->v_buffer };
  const int a_strides[3] = { a->y_stride, a->uv_stride, a->uv_stride };
-  const uint8_t *b_planes[3] = { b->y_buffer, b->u_buffer, b->v_buffer };
  const int b_strides[3] = { b->y_stride, b->uv_stride, b->uv_stride };
  int i;
  uint64_t total_sse = 0;
@ -313,14 +332,15 @@ void aom_calc_highbd_psnr(const YV12_BUFFER_CONFIG *a,
    uint64_t sse;
    if (a->flags & YV12_FLAG_HIGHBITDEPTH) {
      if (input_shift) {
-        sse = highbd_get_sse_shift(a_planes[i], a_strides[i], b_planes[i],
+        sse = highbd_get_sse_shift(a->buffers[i], a_strides[i], b->buffers[i],
                                   b_strides[i], w, h, input_shift);
      } else {
-        sse = highbd_get_sse(a_planes[i], a_strides[i], b_planes[i],
+        sse = highbd_get_sse(a->buffers[i], a_strides[i], b->buffers[i],
                             b_strides[i], w, h);
      }
    } else {
-      sse = get_sse(a_planes[i], a_strides[i], b_planes[i], b_strides[i], w, h);
+      sse = get_sse(a->buffers[i], a_strides[i], b->buffers[i], b_strides[i], w,
+                    h);
    }
    psnr->sse[1 + i] = sse;
    psnr->samples[1 + i] = samples;
@ -344,9 +364,7 @@ void aom_calc_psnr(const YV12_BUFFER_CONFIG *a, const YV12_BUFFER_CONFIG *b,
  const int widths[3] = { a->y_crop_width, a->uv_crop_width, a->uv_crop_width };
  const int heights[3] = { a->y_crop_height, a->uv_crop_height,
                           a->uv_crop_height };
-  const uint8_t *a_planes[3] = { a->y_buffer, a->u_buffer, a->v_buffer };
  const int a_strides[3] = { a->y_stride, a->uv_stride, a->uv_stride };
-  const uint8_t *b_planes[3] = { b->y_buffer, b->u_buffer, b->v_buffer };
  const int b_strides[3] = { b->y_stride, b->uv_stride, b->uv_stride };
  int i;
  uint64_t total_sse = 0;
@ -357,7 +375,7 @@ void aom_calc_psnr(const YV12_BUFFER_CONFIG *a, const YV12_BUFFER_CONFIG *b,
    const int h = heights[i];
    const uint32_t samples = w * h;
    const uint64_t sse =
-        get_sse(a_planes[i], a_strides[i], b_planes[i], b_strides[i], w, h);
+        get_sse(a->buffers[i], a_strides[i], b->buffers[i], b_strides[i], w, h);
    psnr->sse[1 + i] = sse;
    psnr->samples[1 + i] = samples;
    psnr->psnr[1 + i] = aom_sse_to_psnr(samples, peak, (double)sse);
--- a/third_party/aom/aom_dsp/psnr.h
+++ b/third_party/aom/aom_dsp/psnr.h
@ -47,6 +47,8 @@ int64_t aom_get_v_sse_part(const YV12_BUFFER_CONFIG *a,
                           const YV12_BUFFER_CONFIG *b, int hstart, int width,
                           int vstart, int height);
 int64_t aom_get_v_sse(const YV12_BUFFER_CONFIG *a, const YV12_BUFFER_CONFIG *b);
+int64_t aom_get_sse_plane(const YV12_BUFFER_CONFIG *a,
+                          const YV12_BUFFER_CONFIG *b, int plane, int highbd);
 #if CONFIG_HIGHBITDEPTH
 int64_t aom_highbd_get_y_sse_part(const YV12_BUFFER_CONFIG *a,
                                  const YV12_BUFFER_CONFIG *b, int hstart,
--- a/third_party/aom/aom_dsp/quantize.c
+++ b/third_party/aom/aom_dsp/quantize.c
@ -12,18 +12,14 @@
 #include "aom_dsp/quantize.h"
 #include "aom_mem/aom_mem.h"

-static void quantize_b_helper_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
-                                int skip_block, const int16_t *zbin_ptr,
-                                const int16_t *round_ptr,
-                                const int16_t *quant_ptr,
-                                const int16_t *quant_shift_ptr,
-                                tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
-                                const int16_t *dequant_ptr, uint16_t *eob_ptr,
-                                const int16_t *scan, const int16_t *iscan,
-#if CONFIG_AOM_QM
-                                const qm_val_t *qm_ptr, const qm_val_t *iqm_ptr,
-#endif
-                                const int log_scale) {
+void quantize_b_helper_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+                         int skip_block, const int16_t *zbin_ptr,
+                         const int16_t *round_ptr, const int16_t *quant_ptr,
+                         const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
+                         tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr,
+                         uint16_t *eob_ptr, const int16_t *scan,
+                         const int16_t *iscan, const qm_val_t *qm_ptr,
+                         const qm_val_t *iqm_ptr, const int log_scale) {
  const int zbins[2] = { ROUND_POWER_OF_TWO(zbin_ptr[0], log_scale),
                         ROUND_POWER_OF_TWO(zbin_ptr[1], log_scale) };
  const int nzbins[2] = { zbins[0] * -1, zbins[1] * -1 };
@ -37,20 +33,12 @@ static void quantize_b_helper_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
    // Pre-scan pass
    for (i = (int)n_coeffs - 1; i >= 0; i--) {
      const int rc = scan[i];
-#if CONFIG_AOM_QM
-      const qm_val_t wt = qm_ptr[rc];
+      const qm_val_t wt = qm_ptr != NULL ? qm_ptr[rc] : (1 << AOM_QM_BITS);
      const int coeff = coeff_ptr[rc] * wt;
-#else
-      const int coeff = coeff_ptr[rc];
-#endif  // CONFIG_AOM_QM

-#if CONFIG_AOM_QM
-      if (coeff < (zbins[rc != 0] << AOM_QM_BITS) &&
-          coeff > (nzbins[rc != 0] << AOM_QM_BITS))
+      if (coeff < (zbins[rc != 0] * (1 << AOM_QM_BITS)) &&
+          coeff > (nzbins[rc != 0] * (1 << AOM_QM_BITS)))
        non_zero_count--;
-#else
-      if (coeff < zbins[rc != 0] && coeff > nzbins[rc != 0]) non_zero_count--;
-#endif  // CONFIG_AOM_QM
      else
        break;
    }
@ -64,35 +52,21 @@ static void quantize_b_helper_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
      const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
      int tmp32;

-#if CONFIG_AOM_QM
-      const qm_val_t wt = qm_ptr[rc];
+      const qm_val_t wt = qm_ptr != NULL ? qm_ptr[rc] : (1 << AOM_QM_BITS);
      if (abs_coeff * wt >= (zbins[rc != 0] << AOM_QM_BITS)) {
-#else
-      if (abs_coeff >= zbins[rc != 0]) {
-#endif  // CONFIG_AOM_QM
        int64_t tmp =
            clamp(abs_coeff + ROUND_POWER_OF_TWO(round_ptr[rc != 0], log_scale),
                  INT16_MIN, INT16_MAX);
-#if CONFIG_AOM_QM
        tmp *= wt;
        tmp32 = (int)(((((tmp * quant_ptr[rc != 0]) >> 16) + tmp) *
                       quant_shift_ptr[rc != 0]) >>
                      (16 - log_scale + AOM_QM_BITS));  // quantization
-#else
-        tmp32 = (int)(((((tmp * quant_ptr[rc != 0]) >> 16) + tmp) *
-                       quant_shift_ptr[rc != 0]) >>
-                      (16 - log_scale));  // quantization
-#endif  // CONFIG_AOM_QM
        qcoeff_ptr[rc] = (tmp32 ^ coeff_sign) - coeff_sign;
-#if CONFIG_AOM_QM
+        const int iwt = iqm_ptr != NULL ? iqm_ptr[rc] : (1 << AOM_QM_BITS);
        const int dequant =
-            (dequant_ptr[rc != 0] * iqm_ptr[rc] + (1 << (AOM_QM_BITS - 1))) >>
+            (dequant_ptr[rc != 0] * iwt + (1 << (AOM_QM_BITS - 1))) >>
            AOM_QM_BITS;
        dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant / (1 << log_scale);
-#else
-        dqcoeff_ptr[rc] =
-            qcoeff_ptr[rc] * dequant_ptr[rc != 0] / (1 << log_scale);
-#endif  // CONFIG_AOM_QM

        if (tmp32) eob = i;
      }
@ -101,25 +75,112 @@ static void quantize_b_helper_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
  *eob_ptr = eob + 1;
 }

+void highbd_quantize_b_helper_c(
+    const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block,
+    const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr,
+    const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
+    tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
+    const int16_t *scan, const int16_t *iscan, const qm_val_t *qm_ptr,
+    const qm_val_t *iqm_ptr, const int log_scale) {
+  int i, eob = -1;
+  const int zbins[2] = { ROUND_POWER_OF_TWO(zbin_ptr[0], log_scale),
+                         ROUND_POWER_OF_TWO(zbin_ptr[1], log_scale) };
+  const int nzbins[2] = { zbins[0] * -1, zbins[1] * -1 };
+  int dequant;
+#if CONFIG_TX64X64
+  int idx_arr[4096];
+#else
+  int idx_arr[1024];
+#endif
+  (void)iscan;
+  int idx = 0;
+
+  memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
+  memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
+
+  if (!skip_block) {
+    // Pre-scan pass
+    for (i = 0; i < n_coeffs; i++) {
+      const int rc = scan[i];
+      const qm_val_t wt = qm_ptr != NULL ? qm_ptr[rc] : (1 << AOM_QM_BITS);
+      const int coeff = coeff_ptr[rc] * wt;
+
+      // If the coefficient is out of the base ZBIN range, keep it for
+      // quantization.
+      if (coeff >= (zbins[rc != 0] * (1 << AOM_QM_BITS)) ||
+          coeff <= (nzbins[rc != 0] * (1 << AOM_QM_BITS)))
+        idx_arr[idx++] = i;
+    }
+
+    // Quantization pass: only process the coefficients selected in
+    // pre-scan pass. Note: idx can be zero.
+    for (i = 0; i < idx; i++) {
+      const int rc = scan[idx_arr[i]];
+      const int coeff = coeff_ptr[rc];
+      const int coeff_sign = (coeff >> 31);
+      const qm_val_t wt = qm_ptr != NULL ? qm_ptr[rc] : (1 << AOM_QM_BITS);
+      const qm_val_t iwt = iqm_ptr != NULL ? iqm_ptr[rc] : (1 << AOM_QM_BITS);
+      const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
+      const int64_t tmp1 =
+          abs_coeff + ROUND_POWER_OF_TWO(round_ptr[rc != 0], log_scale);
+      const int64_t tmpw = tmp1 * wt;
+      const int64_t tmp2 = ((tmpw * quant_ptr[rc != 0]) >> 16) + tmpw;
+      const int abs_qcoeff = (int)((tmp2 * quant_shift_ptr[rc != 0]) >>
+                                   (16 - log_scale + AOM_QM_BITS));
+      qcoeff_ptr[rc] = (tran_low_t)((abs_qcoeff ^ coeff_sign) - coeff_sign);
+      dequant = (dequant_ptr[rc != 0] * iwt + (1 << (AOM_QM_BITS - 1))) >>
+                AOM_QM_BITS;
+      dqcoeff_ptr[rc] = (qcoeff_ptr[rc] * dequant) / (1 << log_scale);
+      if (abs_qcoeff) eob = idx_arr[i];
+    }
+  }
+  *eob_ptr = eob + 1;
+}
+
+void quantize_dc_helper(const tran_low_t *coeff_ptr, int n_coeffs,
+                        int skip_block, const int16_t *round_ptr,
+                        const int16_t quant, tran_low_t *qcoeff_ptr,
+                        tran_low_t *dqcoeff_ptr, const int16_t dequant_ptr,
+                        uint16_t *eob_ptr, const qm_val_t *qm_ptr,
+                        const qm_val_t *iqm_ptr, const int log_scale) {
+  const int rc = 0;
+  const int coeff = coeff_ptr[rc];
+  const int coeff_sign = (coeff >> 31);
+  const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
+  int64_t tmp;
+  int eob = -1;
+  int32_t tmp32;
+  int dequant;
+
+  memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
+  memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
+
+  if (!skip_block) {
+    const int wt = qm_ptr != NULL ? qm_ptr[rc] : (1 << AOM_QM_BITS);
+    const int iwt = iqm_ptr != NULL ? iqm_ptr[rc] : (1 << AOM_QM_BITS);
+    tmp = clamp(abs_coeff + ROUND_POWER_OF_TWO(round_ptr[rc != 0], log_scale),
+                INT16_MIN, INT16_MAX);
+    tmp32 = (int32_t)((tmp * wt * quant) >> (16 - log_scale + AOM_QM_BITS));
+    qcoeff_ptr[rc] = (tmp32 ^ coeff_sign) - coeff_sign;
+    dequant = (dequant_ptr * iwt + (1 << (AOM_QM_BITS - 1))) >> AOM_QM_BITS;
+    dqcoeff_ptr[rc] = (qcoeff_ptr[rc] * dequant) / (1 << log_scale);
+    if (tmp32) eob = 0;
+  }
+  *eob_ptr = eob + 1;
+}
+
+/* These functions should only be called when quantisation matrices
+   are not used. */
 void aom_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
                      int skip_block, const int16_t *zbin_ptr,
                      const int16_t *round_ptr, const int16_t *quant_ptr,
                      const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
                      tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr,
                      uint16_t *eob_ptr, const int16_t *scan,
-                      const int16_t *iscan
-#if CONFIG_AOM_QM
-                      ,
-                      const qm_val_t *qm_ptr, const qm_val_t *iqm_ptr
-#endif
-                      ) {
+                      const int16_t *iscan) {
  quantize_b_helper_c(coeff_ptr, n_coeffs, skip_block, zbin_ptr, round_ptr,
                      quant_ptr, quant_shift_ptr, qcoeff_ptr, dqcoeff_ptr,
-                      dequant_ptr, eob_ptr, scan, iscan,
-#if CONFIG_AOM_QM
-                      qm_ptr, iqm_ptr,
-#endif
-                      0);
+                      dequant_ptr, eob_ptr, scan, iscan, NULL, NULL, 0);
 }

 void aom_quantize_b_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
@ -128,19 +189,10 @@ void aom_quantize_b_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
                            const int16_t *quant_shift_ptr,
                            tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
                            const int16_t *dequant_ptr, uint16_t *eob_ptr,
-                            const int16_t *scan, const int16_t *iscan
-#if CONFIG_AOM_QM
-                            ,
-                            const qm_val_t *qm_ptr, const qm_val_t *iqm_ptr
-#endif
-                            ) {
+                            const int16_t *scan, const int16_t *iscan) {
  quantize_b_helper_c(coeff_ptr, n_coeffs, skip_block, zbin_ptr, round_ptr,
                      quant_ptr, quant_shift_ptr, qcoeff_ptr, dqcoeff_ptr,
-                      dequant_ptr, eob_ptr, scan, iscan,
-#if CONFIG_AOM_QM
-                      qm_ptr, iqm_ptr,
-#endif
-                      1);
+                      dequant_ptr, eob_ptr, scan, iscan, NULL, NULL, 1);
 }

 #if CONFIG_TX64X64
@ -150,427 +202,28 @@ void aom_quantize_b_64x64_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
                            const int16_t *quant_shift_ptr,
                            tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
                            const int16_t *dequant_ptr, uint16_t *eob_ptr,
-                            const int16_t *scan, const int16_t *iscan
-#if CONFIG_AOM_QM
-                            ,
-                            const qm_val_t *qm_ptr, const qm_val_t *iqm_ptr
-#endif
-                            ) {
+                            const int16_t *scan, const int16_t *iscan) {
  quantize_b_helper_c(coeff_ptr, n_coeffs, skip_block, zbin_ptr, round_ptr,
                      quant_ptr, quant_shift_ptr, qcoeff_ptr, dqcoeff_ptr,
-                      dequant_ptr, eob_ptr, scan, iscan,
-#if CONFIG_AOM_QM
-                      qm_ptr, iqm_ptr,
-#endif
-                      2);
+                      dequant_ptr, eob_ptr, scan, iscan, NULL, NULL, 2);
 }
 #endif  // CONFIG_TX64X64

-#if CONFIG_AOM_QM
-void aom_quantize_dc(const tran_low_t *coeff_ptr, int n_coeffs, int skip_block,
-                     const int16_t *round_ptr, const int16_t quant,
-                     tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
-                     const int16_t dequant_ptr, uint16_t *eob_ptr,
-                     const qm_val_t *qm_ptr, const qm_val_t *iqm_ptr) {
-  const int rc = 0;
-  const int coeff = coeff_ptr[rc];
-  const int coeff_sign = (coeff >> 31);
-  const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
-  int64_t tmp, eob = -1;
-  int32_t tmp32;
-  int dequant =
-      (dequant_ptr * iqm_ptr[rc] + (1 << (AOM_QM_BITS - 1))) >> AOM_QM_BITS;
-
-  memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
-  memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
-
-  if (!skip_block) {
-    tmp = clamp(abs_coeff + round_ptr[rc != 0], INT16_MIN, INT16_MAX);
-    tmp32 = (int32_t)((tmp * qm_ptr[rc] * quant) >> (16 + AOM_QM_BITS));
-    qcoeff_ptr[rc] = (tmp32 ^ coeff_sign) - coeff_sign;
-    dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant;
-    if (tmp32) eob = 0;
-  }
-  *eob_ptr = eob + 1;
-}
-
-void aom_quantize_dc_32x32(const tran_low_t *coeff_ptr, int skip_block,
-                           const int16_t *round_ptr, const int16_t quant,
-                           tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
-                           const int16_t dequant_ptr, uint16_t *eob_ptr,
-                           const qm_val_t *qm_ptr, const qm_val_t *iqm_ptr) {
-  const int n_coeffs = 1024;
-  const int rc = 0;
-  const int coeff = coeff_ptr[rc];
-  const int coeff_sign = (coeff >> 31);
-  const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
-  int64_t tmp, eob = -1;
-  int32_t tmp32;
-  int dequant;
-
-  memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
-  memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
-
-  if (!skip_block) {
-    tmp = clamp(abs_coeff + ROUND_POWER_OF_TWO(round_ptr[rc != 0], 1),
-                INT16_MIN, INT16_MAX);
-    tmp32 = (int32_t)((tmp * qm_ptr[rc] * quant) >> (15 + AOM_QM_BITS));
-    qcoeff_ptr[rc] = (tmp32 ^ coeff_sign) - coeff_sign;
-    dequant =
-        (dequant_ptr * iqm_ptr[rc] + (1 << (AOM_QM_BITS - 1))) >> AOM_QM_BITS;
-    dqcoeff_ptr[rc] = (qcoeff_ptr[rc] * dequant) / 2;
-    if (tmp32) eob = 0;
-  }
-  *eob_ptr = eob + 1;
-}
-
-#if CONFIG_TX64X64
-void aom_quantize_dc_64x64(const tran_low_t *coeff_ptr, int skip_block,
-                           const int16_t *round_ptr, const int16_t quant,
-                           tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
-                           const int16_t dequant_ptr, uint16_t *eob_ptr,
-                           const qm_val_t *qm_ptr, const qm_val_t *iqm_ptr) {
-  const int n_coeffs = 1024;
-  const int rc = 0;
-  const int coeff = coeff_ptr[rc];
-  const int coeff_sign = (coeff >> 31);
-  const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
-  int64_t tmp, eob = -1;
-  int32_t tmp32;
-  int dequant;
-
-  memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
-  memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
-
-  if (!skip_block) {
-    tmp = clamp(abs_coeff + ROUND_POWER_OF_TWO(round_ptr[rc != 0], 2),
-                INT16_MIN, INT16_MAX);
-    tmp32 = (int32_t)((tmp * qm_ptr[rc] * quant) >> (14 + AOM_QM_BITS));
-    qcoeff_ptr[rc] = (tmp32 ^ coeff_sign) - coeff_sign;
-    dequant =
-        (dequant_ptr * iqm_ptr[rc] + (1 << (AOM_QM_BITS - 1))) >> AOM_QM_BITS;
-    dqcoeff_ptr[rc] = (qcoeff_ptr[rc] * dequant) / 4;
-    if (tmp32) eob = 0;
-  }
-  *eob_ptr = eob + 1;
-}
-#endif  // CONFIG_TX64X64
-
-void aom_highbd_quantize_dc(const tran_low_t *coeff_ptr, int n_coeffs,
-                            int skip_block, const int16_t *round_ptr,
-                            const int16_t quant, tran_low_t *qcoeff_ptr,
-                            tran_low_t *dqcoeff_ptr, const int16_t dequant_ptr,
-                            uint16_t *eob_ptr, const qm_val_t *qm_ptr,
-                            const qm_val_t *iqm_ptr) {
-  int eob = -1;
-  int dequant =
-      (dequant_ptr * iqm_ptr[0] + (1 << (AOM_QM_BITS - 1))) >> AOM_QM_BITS;
-
-  memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
-  memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
-
-  if (!skip_block) {
-    const int coeff = coeff_ptr[0];
-    const int coeff_sign = (coeff >> 31);
-    const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
-    const int64_t tmp = abs_coeff + round_ptr[0];
-    const uint32_t abs_qcoeff =
-        (uint32_t)((tmp * qm_ptr[0] * quant) >> (16 + AOM_QM_BITS));
-    qcoeff_ptr[0] = (tran_low_t)((abs_qcoeff ^ coeff_sign) - coeff_sign);
-    dqcoeff_ptr[0] = qcoeff_ptr[0] * dequant;
-    if (abs_qcoeff) eob = 0;
-  }
-  *eob_ptr = eob + 1;
-}
-
-void aom_highbd_quantize_dc_32x32(const tran_low_t *coeff_ptr, int skip_block,
-                                  const int16_t *round_ptr, const int16_t quant,
-                                  tran_low_t *qcoeff_ptr,
-                                  tran_low_t *dqcoeff_ptr,
-                                  const int16_t dequant_ptr, uint16_t *eob_ptr,
-                                  const qm_val_t *qm_ptr,
-                                  const qm_val_t *iqm_ptr) {
-  const int n_coeffs = 1024;
-  int eob = -1;
-  int dequant;
-
-  memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
-  memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
-
-  if (!skip_block) {
-    const int coeff = coeff_ptr[0];
-    const int coeff_sign = (coeff >> 31);
-    const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
-    const int64_t tmp = abs_coeff + ROUND_POWER_OF_TWO(round_ptr[0], 1);
-    const uint32_t abs_qcoeff =
-        (uint32_t)((tmp * qm_ptr[0] * quant) >> (15 + AOM_QM_BITS));
-    qcoeff_ptr[0] = (tran_low_t)((abs_qcoeff ^ coeff_sign) - coeff_sign);
-    dequant =
-        (dequant_ptr * iqm_ptr[0] + (1 << (AOM_QM_BITS - 1))) >> AOM_QM_BITS;
-    dqcoeff_ptr[0] = (qcoeff_ptr[0] * dequant) / 2;
-    if (abs_qcoeff) eob = 0;
-  }
-  *eob_ptr = eob + 1;
-}
-
-#if CONFIG_TX64X64
-void aom_highbd_quantize_dc_64x64(const tran_low_t *coeff_ptr, int skip_block,
-                                  const int16_t *round_ptr, const int16_t quant,
-                                  tran_low_t *qcoeff_ptr,
-                                  tran_low_t *dqcoeff_ptr,
-                                  const int16_t dequant_ptr, uint16_t *eob_ptr,
-                                  const qm_val_t *qm_ptr,
-                                  const qm_val_t *iqm_ptr) {
-  const int n_coeffs = 1024;
-  int eob = -1;
-  int dequant;
-
-  memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
-  memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
-
-  if (!skip_block) {
-    const int coeff = coeff_ptr[0];
-    const int coeff_sign = (coeff >> 31);
-    const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
-    const int64_t tmp = abs_coeff + ROUND_POWER_OF_TWO(round_ptr[0], 2);
-    const uint32_t abs_qcoeff =
-        (uint32_t)((tmp * qm_ptr[0] * quant) >> (14 + AOM_QM_BITS));
-    qcoeff_ptr[0] = (tran_low_t)((abs_qcoeff ^ coeff_sign) - coeff_sign);
-    dequant =
-        (dequant_ptr * iqm_ptr[0] + (1 << (AOM_QM_BITS - 1))) >> AOM_QM_BITS;
-    dqcoeff_ptr[0] = (qcoeff_ptr[0] * dequant) / 4;
-    if (abs_qcoeff) eob = 0;
-  }
-  *eob_ptr = eob + 1;
-}
-#endif  // CONFIG_TX64X64
-
-void aom_highbd_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
-                             int skip_block, const int16_t *zbin_ptr,
-                             const int16_t *round_ptr, const int16_t *quant_ptr,
-                             const int16_t *quant_shift_ptr,
-                             tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
-                             const int16_t *dequant_ptr, uint16_t *eob_ptr,
-                             const int16_t *scan, const int16_t *iscan,
-                             const qm_val_t *qm_ptr, const qm_val_t *iqm_ptr) {
-  int i, non_zero_count = (int)n_coeffs, eob = -1;
-  const int zbins[2] = { zbin_ptr[0], zbin_ptr[1] };
-  const int nzbins[2] = { zbins[0] * -1, zbins[1] * -1 };
-  int dequant;
-  (void)iscan;
-
-  memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
-  memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
-
-  if (!skip_block) {
-    // Pre-scan pass
-    for (i = (int)n_coeffs - 1; i >= 0; i--) {
-      const int rc = scan[i];
-      const qm_val_t wt = qm_ptr[rc];
-      const int coeff = coeff_ptr[rc] * wt;
-
-      if (coeff < (zbins[rc != 0] << AOM_QM_BITS) &&
-          coeff > (nzbins[rc != 0] << AOM_QM_BITS))
-        non_zero_count--;
-      else
-        break;
-    }
-
-    // Quantization pass: All coefficients with index >= zero_flag are
-    // skippable. Note: zero_flag can be zero.
-    for (i = 0; i < non_zero_count; i++) {
-      const int rc = scan[i];
-      const int coeff = coeff_ptr[rc];
-      const qm_val_t wt = qm_ptr[rc];
-      const int coeff_sign = (coeff >> 31);
-      const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
-
-      if (abs_coeff * wt >= (zbins[rc != 0] << AOM_QM_BITS)) {
-        const int64_t tmp1 = abs_coeff + round_ptr[rc != 0];
-        const int64_t tmpw = tmp1 * wt;
-        const int64_t tmp2 = ((tmpw * quant_ptr[rc != 0]) >> 16) + tmpw;
-        const uint32_t abs_qcoeff =
-            (uint32_t)((tmp2 * quant_shift_ptr[rc != 0]) >> (16 + AOM_QM_BITS));
-        qcoeff_ptr[rc] = (tran_low_t)((abs_qcoeff ^ coeff_sign) - coeff_sign);
-        dequant =
-            (dequant_ptr[rc != 0] * iqm_ptr[rc] + (1 << (AOM_QM_BITS - 1))) >>
-            AOM_QM_BITS;
-        dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant;
-        if (abs_qcoeff) eob = i;
-      }
-    }
-  }
-  *eob_ptr = eob + 1;
-}
-
-void aom_highbd_quantize_b_32x32_c(
-    const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block,
-    const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr,
-    const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
-    tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
-    const int16_t *scan, const int16_t *iscan, const qm_val_t *qm_ptr,
-    const qm_val_t *iqm_ptr) {
-  const int zbins[2] = { ROUND_POWER_OF_TWO(zbin_ptr[0], 1),
-                         ROUND_POWER_OF_TWO(zbin_ptr[1], 1) };
-  const int nzbins[2] = { zbins[0] * -1, zbins[1] * -1 };
-
-  int idx = 0;
-  int idx_arr[1024];
-  int i, eob = -1;
-  int dequant;
-  (void)iscan;
-
-  memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
-  memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
-
-  if (!skip_block) {
-    // Pre-scan pass
-    for (i = 0; i < n_coeffs; i++) {
-      const int rc = scan[i];
-      const qm_val_t wt = qm_ptr[rc];
-      const int coeff = coeff_ptr[rc] * wt;
-
-      // If the coefficient is out of the base ZBIN range, keep it for
-      // quantization.
-      if (coeff >= (zbins[rc != 0] << AOM_QM_BITS) ||
-          coeff <= (nzbins[rc != 0] << AOM_QM_BITS))
-        idx_arr[idx++] = i;
-    }
-
-    // Quantization pass: only process the coefficients selected in
-    // pre-scan pass. Note: idx can be zero.
-    for (i = 0; i < idx; i++) {
-      const int rc = scan[idx_arr[i]];
-      const int coeff = coeff_ptr[rc];
-      const int coeff_sign = (coeff >> 31);
-      const qm_val_t wt = qm_ptr[rc];
-      const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
-      const int64_t tmp1 =
-          abs_coeff + ROUND_POWER_OF_TWO(round_ptr[rc != 0], 1);
-      const int64_t tmpw = tmp1 * wt;
-      const int64_t tmp2 = ((tmpw * quant_ptr[rc != 0]) >> 16) + tmpw;
-      const uint32_t abs_qcoeff =
-          (uint32_t)((tmp2 * quant_shift_ptr[rc != 0]) >> (15 + AOM_QM_BITS));
-      qcoeff_ptr[rc] = (tran_low_t)((abs_qcoeff ^ coeff_sign) - coeff_sign);
-      dequant =
-          (dequant_ptr[rc != 0] * iqm_ptr[rc] + (1 << (AOM_QM_BITS - 1))) >>
-          AOM_QM_BITS;
-      dqcoeff_ptr[rc] = (qcoeff_ptr[rc] * dequant) / 2;
-      if (abs_qcoeff) eob = idx_arr[i];
-    }
-  }
-  *eob_ptr = eob + 1;
-}
-
-#if CONFIG_TX64X64
-void aom_highbd_quantize_b_64x64_c(
-    const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block,
-    const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr,
-    const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
-    tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
-    const int16_t *scan, const int16_t *iscan, const qm_val_t *qm_ptr,
-    const qm_val_t *iqm_ptr) {
-  const int zbins[2] = { ROUND_POWER_OF_TWO(zbin_ptr[0], 2),
-                         ROUND_POWER_OF_TWO(zbin_ptr[1], 2) };
-  const int nzbins[2] = { zbins[0] * -1, zbins[1] * -1 };
-
-  int idx = 0;
-  int idx_arr[4096];
-  int i, eob = -1;
-  int dequant;
-  (void)iscan;
-
-  memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
-  memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
-
-  if (!skip_block) {
-    // Pre-scan pass
-    for (i = 0; i < n_coeffs; i++) {
-      const int rc = scan[i];
-      const qm_val_t wt = qm_ptr[rc];
-      const int coeff = coeff_ptr[rc] * wt;
-
-      // If the coefficient is out of the base ZBIN range, keep it for
-      // quantization.
-      if (coeff >= (zbins[rc != 0] << AOM_QM_BITS) ||
-          coeff <= (nzbins[rc != 0] << AOM_QM_BITS))
-        idx_arr[idx++] = i;
-    }
-
-    // Quantization pass: only process the coefficients selected in
-    // pre-scan pass. Note: idx can be zero.
-    for (i = 0; i < idx; i++) {
-      const int rc = scan[idx_arr[i]];
-      const int coeff = coeff_ptr[rc];
-      const int coeff_sign = (coeff >> 31);
-      const qm_val_t wt = qm_ptr[rc];
-      const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
-      const int64_t tmp1 =
-          abs_coeff + ROUND_POWER_OF_TWO(round_ptr[rc != 0], 2);
-      const int64_t tmpw = tmp1 * wt;
-      const int64_t tmp2 = ((tmpw * quant_ptr[rc != 0]) >> 16) + tmpw;
-      const uint32_t abs_qcoeff =
-          (uint32_t)((tmp2 * quant_shift_ptr[rc != 0]) >> (14 + AOM_QM_BITS));
-      qcoeff_ptr[rc] = (tran_low_t)((abs_qcoeff ^ coeff_sign) - coeff_sign);
-      dequant =
-          (dequant_ptr[rc != 0] * iqm_ptr[rc] + (1 << (AOM_QM_BITS - 1))) >>
-          AOM_QM_BITS;
-      dqcoeff_ptr[rc] = (qcoeff_ptr[rc] * dequant) / 4;
-      if (abs_qcoeff) eob = idx_arr[i];
-    }
-  }
-  *eob_ptr = eob + 1;
-}
-#endif  // CONFIG_TX64X64
-
-#else  // CONFIG_AOM_QM
-
 void aom_quantize_dc(const tran_low_t *coeff_ptr, int n_coeffs, int skip_block,
                     const int16_t *round_ptr, const int16_t quant,
                     tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
                     const int16_t dequant_ptr, uint16_t *eob_ptr) {
-  const int rc = 0;
-  const int coeff = coeff_ptr[rc];
-  const int coeff_sign = (coeff >> 31);
-  const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
-  int tmp, eob = -1;
-
-  memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
-  memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
-
-  if (!skip_block) {
-    tmp = clamp(abs_coeff + round_ptr[rc != 0], INT16_MIN, INT16_MAX);
-    tmp = (tmp * quant) >> 16;
-    qcoeff_ptr[rc] = (tmp ^ coeff_sign) - coeff_sign;
-    dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr;
-    if (tmp) eob = 0;
-  }
-  *eob_ptr = eob + 1;
+  quantize_dc_helper(coeff_ptr, n_coeffs, skip_block, round_ptr, quant,
+                     qcoeff_ptr, dqcoeff_ptr, dequant_ptr, eob_ptr, NULL, NULL,
+                     0);
 }

 void aom_quantize_dc_32x32(const tran_low_t *coeff_ptr, int skip_block,
                           const int16_t *round_ptr, const int16_t quant,
                           tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
                           const int16_t dequant_ptr, uint16_t *eob_ptr) {
-  const int n_coeffs = 1024;
-  const int rc = 0;
-  const int coeff = coeff_ptr[rc];
-  const int coeff_sign = (coeff >> 31);
-  const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
-  int tmp, eob = -1;
-
-  memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
-  memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
-
-  if (!skip_block) {
-    tmp = clamp(abs_coeff + ROUND_POWER_OF_TWO(round_ptr[rc != 0], 1),
-                INT16_MIN, INT16_MAX);
-    tmp = (tmp * quant) >> 15;
-    qcoeff_ptr[rc] = (tmp ^ coeff_sign) - coeff_sign;
-    dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr / 2;
-    if (tmp) eob = 0;
-  }
-  *eob_ptr = eob + 1;
+  quantize_dc_helper(coeff_ptr, 1024, skip_block, round_ptr, quant, qcoeff_ptr,
+                     dqcoeff_ptr, dequant_ptr, eob_ptr, NULL, NULL, 1);
 }

 #if CONFIG_TX64X64
@ -578,100 +231,8 @@ void aom_quantize_dc_64x64(const tran_low_t *coeff_ptr, int skip_block,
                           const int16_t *round_ptr, const int16_t quant,
                           tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
                           const int16_t dequant_ptr, uint16_t *eob_ptr) {
-  const int n_coeffs = 4096;
-  const int rc = 0;
-  const int coeff = coeff_ptr[rc];
-  const int coeff_sign = (coeff >> 31);
-  const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
-  int tmp, eob = -1;
-
-  memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
-  memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
-
-  if (!skip_block) {
-    tmp = clamp(abs_coeff + ROUND_POWER_OF_TWO(round_ptr[rc != 0], 2),
-                INT16_MIN, INT16_MAX);
-    tmp = (tmp * quant) >> 14;
-    qcoeff_ptr[rc] = (tmp ^ coeff_sign) - coeff_sign;
-    dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr / 4;
-    if (tmp) eob = 0;
-  }
-  *eob_ptr = eob + 1;
-}
-#endif  // CONFIG_TX64X64
-
-void aom_highbd_quantize_dc(const tran_low_t *coeff_ptr, int n_coeffs,
-                            int skip_block, const int16_t *round_ptr,
-                            const int16_t quant, tran_low_t *qcoeff_ptr,
-                            tran_low_t *dqcoeff_ptr, const int16_t dequant_ptr,
-                            uint16_t *eob_ptr) {
-  int eob = -1;
-
-  memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
-  memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
-
-  if (!skip_block) {
-    const int coeff = coeff_ptr[0];
-    const int coeff_sign = (coeff >> 31);
-    const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
-    const int64_t tmp = abs_coeff + round_ptr[0];
-    const uint32_t abs_qcoeff = (uint32_t)((tmp * quant) >> 16);
-    qcoeff_ptr[0] = (tran_low_t)((abs_qcoeff ^ coeff_sign) - coeff_sign);
-    dqcoeff_ptr[0] = qcoeff_ptr[0] * dequant_ptr;
-    if (abs_qcoeff) eob = 0;
-  }
-  *eob_ptr = eob + 1;
-}
-
-void aom_highbd_quantize_dc_32x32(const tran_low_t *coeff_ptr, int skip_block,
-                                  const int16_t *round_ptr, const int16_t quant,
-                                  tran_low_t *qcoeff_ptr,
-                                  tran_low_t *dqcoeff_ptr,
-                                  const int16_t dequant_ptr,
-                                  uint16_t *eob_ptr) {
-  const int n_coeffs = 1024;
-  int eob = -1;
-
-  memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
-  memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
-
-  if (!skip_block) {
-    const int coeff = coeff_ptr[0];
-    const int coeff_sign = (coeff >> 31);
-    const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
-    const int64_t tmp = abs_coeff + ROUND_POWER_OF_TWO(round_ptr[0], 1);
-    const uint32_t abs_qcoeff = (uint32_t)((tmp * quant) >> 15);
-    qcoeff_ptr[0] = (tran_low_t)((abs_qcoeff ^ coeff_sign) - coeff_sign);
-    dqcoeff_ptr[0] = qcoeff_ptr[0] * dequant_ptr / 2;
-    if (abs_qcoeff) eob = 0;
-  }
-  *eob_ptr = eob + 1;
-}
-
-#if CONFIG_TX64X64
-void aom_highbd_quantize_dc_64x64(const tran_low_t *coeff_ptr, int skip_block,
-                                  const int16_t *round_ptr, const int16_t quant,
-                                  tran_low_t *qcoeff_ptr,
-                                  tran_low_t *dqcoeff_ptr,
-                                  const int16_t dequant_ptr,
-                                  uint16_t *eob_ptr) {
-  const int n_coeffs = 4096;
-  int eob = -1;
-
-  memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
-  memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
-
-  if (!skip_block) {
-    const int coeff = coeff_ptr[0];
-    const int coeff_sign = (coeff >> 31);
-    const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
-    const int64_t tmp = abs_coeff + ROUND_POWER_OF_TWO(round_ptr[0], 2);
-    const uint32_t abs_qcoeff = (uint32_t)((tmp * quant) >> 14);
-    qcoeff_ptr[0] = (tran_low_t)((abs_qcoeff ^ coeff_sign) - coeff_sign);
-    dqcoeff_ptr[0] = qcoeff_ptr[0] * dequant_ptr / 4;
-    if (abs_qcoeff) eob = 0;
-  }
-  *eob_ptr = eob + 1;
+  quantize_dc_helper(coeff_ptr, 4096, skip_block, round_ptr, quant, qcoeff_ptr,
+                     dqcoeff_ptr, dequant_ptr, eob_ptr, NULL, NULL, 2);
 }
 #endif  // CONFIG_TX64X64

@ -682,45 +243,10 @@ void aom_highbd_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
                             tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
                             const int16_t *dequant_ptr, uint16_t *eob_ptr,
                             const int16_t *scan, const int16_t *iscan) {
-  int i, non_zero_count = (int)n_coeffs, eob = -1;
-  const int zbins[2] = { zbin_ptr[0], zbin_ptr[1] };
-  const int nzbins[2] = { zbins[0] * -1, zbins[1] * -1 };
-  (void)iscan;
-
-  memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
-  memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
-
-  if (!skip_block) {
-    // Pre-scan pass
-    for (i = (int)n_coeffs - 1; i >= 0; i--) {
-      const int rc = scan[i];
-      const int coeff = coeff_ptr[rc];
-
-      if (coeff < zbins[rc != 0] && coeff > nzbins[rc != 0])
-        non_zero_count--;
-      else
-        break;
-    }
-
-    // Quantization pass: All coefficients with index >= zero_flag are
-    // skippable. Note: zero_flag can be zero.
-    for (i = 0; i < non_zero_count; i++) {
-      const int rc = scan[i];
-      const int coeff = coeff_ptr[rc];
-      const int coeff_sign = (coeff >> 31);
-      const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
-
-      if (abs_coeff >= zbins[rc != 0]) {
-        const int64_t tmp1 = abs_coeff + round_ptr[rc != 0];
-        const int64_t tmp2 = ((tmp1 * quant_ptr[rc != 0]) >> 16) + tmp1;
-        const int abs_qcoeff = (int)((tmp2 * quant_shift_ptr[rc != 0]) >> 16);
-        qcoeff_ptr[rc] = (tran_low_t)((abs_qcoeff ^ coeff_sign) - coeff_sign);
-        dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0];
-        if (abs_qcoeff) eob = i;
-      }
-    }
-  }
-  *eob_ptr = eob + 1;
+  highbd_quantize_b_helper_c(coeff_ptr, n_coeffs, skip_block, zbin_ptr,
+                             round_ptr, quant_ptr, quant_shift_ptr, qcoeff_ptr,
+                             dqcoeff_ptr, dequant_ptr, eob_ptr, scan, iscan,
+                             NULL, NULL, 0);
 }

 void aom_highbd_quantize_b_32x32_c(
@ -729,47 +255,10 @@ void aom_highbd_quantize_b_32x32_c(
    const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
    tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
    const int16_t *scan, const int16_t *iscan) {
-  const int zbins[2] = { ROUND_POWER_OF_TWO(zbin_ptr[0], 1),
-                         ROUND_POWER_OF_TWO(zbin_ptr[1], 1) };
-  const int nzbins[2] = { zbins[0] * -1, zbins[1] * -1 };
-
-  int idx = 0;
-  int idx_arr[1024];
-  int i, eob = -1;
-  (void)iscan;
-
-  memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
-  memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
-
-  if (!skip_block) {
-    // Pre-scan pass
-    for (i = 0; i < n_coeffs; i++) {
-      const int rc = scan[i];
-      const int coeff = coeff_ptr[rc];
-
-      // If the coefficient is out of the base ZBIN range, keep it for
-      // quantization.
-      if (coeff >= zbins[rc != 0] || coeff <= nzbins[rc != 0])
-        idx_arr[idx++] = i;
-    }
-
-    // Quantization pass: only process the coefficients selected in
-    // pre-scan pass. Note: idx can be zero.
-    for (i = 0; i < idx; i++) {
-      const int rc = scan[idx_arr[i]];
-      const int coeff = coeff_ptr[rc];
-      const int coeff_sign = (coeff >> 31);
-      const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
-      const int64_t tmp1 =
-          abs_coeff + ROUND_POWER_OF_TWO(round_ptr[rc != 0], 1);
-      const int64_t tmp2 = ((tmp1 * quant_ptr[rc != 0]) >> 16) + tmp1;
-      const int abs_qcoeff = (int)((tmp2 * quant_shift_ptr[rc != 0]) >> 15);
-      qcoeff_ptr[rc] = (tran_low_t)((abs_qcoeff ^ coeff_sign) - coeff_sign);
-      dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0] / 2;
-      if (abs_qcoeff) eob = idx_arr[i];
-    }
-  }
-  *eob_ptr = eob + 1;
+  highbd_quantize_b_helper_c(coeff_ptr, n_coeffs, skip_block, zbin_ptr,
+                             round_ptr, quant_ptr, quant_shift_ptr, qcoeff_ptr,
+                             dqcoeff_ptr, dequant_ptr, eob_ptr, scan, iscan,
+                             NULL, NULL, 1);
 }

 #if CONFIG_TX64X64
@ -779,47 +268,9 @@ void aom_highbd_quantize_b_64x64_c(
    const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
    tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
    const int16_t *scan, const int16_t *iscan) {
-  const int zbins[2] = { ROUND_POWER_OF_TWO(zbin_ptr[0], 2),
-                         ROUND_POWER_OF_TWO(zbin_ptr[1], 2) };
-  const int nzbins[2] = { zbins[0] * -1, zbins[1] * -1 };
-
-  int idx = 0;
-  int idx_arr[4096];
-  int i, eob = -1;
-  (void)iscan;
-
-  memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
-  memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
-
-  if (!skip_block) {
-    // Pre-scan pass
-    for (i = 0; i < n_coeffs; i++) {
-      const int rc = scan[i];
-      const int coeff = coeff_ptr[rc];
-
-      // If the coefficient is out of the base ZBIN range, keep it for
-      // quantization.
-      if (coeff >= zbins[rc != 0] || coeff <= nzbins[rc != 0])
-        idx_arr[idx++] = i;
-    }
-
-    // Quantization pass: only process the coefficients selected in
-    // pre-scan pass. Note: idx can be zero.
-    for (i = 0; i < idx; i++) {
-      const int rc = scan[idx_arr[i]];
-      const int coeff = coeff_ptr[rc];
-      const int coeff_sign = (coeff >> 31);
-      const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
-      const int64_t tmp1 =
-          abs_coeff + ROUND_POWER_OF_TWO(round_ptr[rc != 0], 2);
-      const int64_t tmp2 = ((tmp1 * quant_ptr[rc != 0]) >> 16) + tmp1;
-      const int abs_qcoeff = (int)((tmp2 * quant_shift_ptr[rc != 0]) >> 14);
-      qcoeff_ptr[rc] = (tran_low_t)((abs_qcoeff ^ coeff_sign) - coeff_sign);
-      dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0] / 4;
-      if (abs_qcoeff) eob = idx_arr[i];
-    }
-  }
-  *eob_ptr = eob + 1;
+  highbd_quantize_b_helper_c(coeff_ptr, n_coeffs, skip_block, zbin_ptr,
+                             round_ptr, quant_ptr, quant_shift_ptr, qcoeff_ptr,
+                             dqcoeff_ptr, dequant_ptr, eob_ptr, scan, iscan,
+                             NULL, NULL, 2);
 }
 #endif  // CONFIG_TX64X64
-#endif  // CONFIG_AOM_QM
--- a/third_party/aom/aom_dsp/quantize.h
+++ b/third_party/aom/aom_dsp/quantize.h
@ -19,32 +19,57 @@
 extern "C" {
 #endif

-#if CONFIG_AOM_QM
-void aom_quantize_dc(const tran_low_t *coeff_ptr, int n_coeffs, int skip_block,
-                     const int16_t *round_ptr, const int16_t quant_ptr,
-                     tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
-                     const int16_t dequant_ptr, uint16_t *eob_ptr,
-                     const qm_val_t *qm_ptr, const qm_val_t *iqm_ptr);
-void aom_quantize_dc_32x32(const tran_low_t *coeff_ptr, int skip_block,
-                           const int16_t *round_ptr, const int16_t quant_ptr,
-                           tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
-                           const int16_t dequant_ptr, uint16_t *eob_ptr,
-                           const qm_val_t *qm_ptr, const qm_val_t *iqm_ptr);
-#if CONFIG_TX64X64
-void aom_quantize_dc_64x64(const tran_low_t *coeff_ptr, int skip_block,
-                           const int16_t *round_ptr, const int16_t quant_ptr,
-                           tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
-                           const int16_t dequant_ptr, uint16_t *eob_ptr,
-                           const qm_val_t *qm_ptr, const qm_val_t *iqm_ptr);
-#endif  // CONFIG_TX64X64
+void quantize_b_helper_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+                         int skip_block, const int16_t *zbin_ptr,
+                         const int16_t *round_ptr, const int16_t *quant_ptr,
+                         const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
+                         tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr,
+                         uint16_t *eob_ptr, const int16_t *scan,
+                         const int16_t *iscan, const qm_val_t *qm_ptr,
+                         const qm_val_t *iqm_ptr, const int log_scale);
+
 void aom_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
                      int skip_block, const int16_t *zbin_ptr,
                      const int16_t *round_ptr, const int16_t *quant_ptr,
                      const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
                      tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr,
                      uint16_t *eob_ptr, const int16_t *scan,
-                      const int16_t *iscan, const qm_val_t *qm_ptr,
-                      const qm_val_t *iqm_ptr);
+                      const int16_t *iscan);
+
+void highbd_quantize_b_helper_c(
+    const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block,
+    const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr,
+    const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
+    tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
+    const int16_t *scan, const int16_t *iscan, const qm_val_t *qm_ptr,
+    const qm_val_t *iqm_ptr, const int log_scale);
+
+#if CONFIG_HIGHBITDEPTH
+void aom_highbd_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+                             int skip_block, const int16_t *zbin_ptr,
+                             const int16_t *round_ptr, const int16_t *quant_ptr,
+                             const int16_t *quant_shift_ptr,
+                             tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+                             const int16_t *dequant_ptr, uint16_t *eob_ptr,
+                             const int16_t *scan, const int16_t *iscan);
+#endif
+
+void aom_quantize_dc(const tran_low_t *coeff_ptr, int n_coeffs, int skip_block,
+                     const int16_t *round_ptr, const int16_t quant_ptr,
+                     tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+                     const int16_t dequant_ptr, uint16_t *eob_ptr);
+void aom_quantize_dc_32x32(const tran_low_t *coeff_ptr, int skip_block,
+                           const int16_t *round_ptr, const int16_t quant_ptr,
+                           tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+                           const int16_t dequant_ptr, uint16_t *eob_ptr);
+#if CONFIG_TX64X64
+void aom_quantize_dc_64x64(const tran_low_t *coeff_ptr, int skip_block,
+                           const int16_t *round_ptr, const int16_t quant_ptr,
+                           tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+                           const int16_t dequant_ptr, uint16_t *eob_ptr);
+#endif  // CONFIG_TX64X64
+
+#if CONFIG_AOM_QM
 #if CONFIG_HIGHBITDEPTH
 void aom_highbd_quantize_dc(const tran_low_t *coeff_ptr, int n_coeffs,
                            int skip_block, const int16_t *round_ptr,
@ -64,32 +89,10 @@ void aom_highbd_quantize_dc_64x64(
    const int16_t dequant_ptr, uint16_t *eob_ptr, const qm_val_t *qm_ptr,
    const qm_val_t *iqm_ptr);
 #endif  // CONFIG_TX64X64
-void aom_highbd_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
-                             int skip_block, const int16_t *zbin_ptr,
-                             const int16_t *round_ptr, const int16_t *quant_ptr,
-                             const int16_t *quant_shift_ptr,
-                             tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
-                             const int16_t *dequant_ptr, uint16_t *eob_ptr,
-                             const int16_t *scan, const int16_t *iscan,
-                             const qm_val_t *qm_ptr, const qm_val_t *iqm_ptr);
 #endif  // CONFIG_HIGHBITDEPTH

 #else  // CONFIG_AOM_QM

-void aom_quantize_dc(const tran_low_t *coeff_ptr, int n_coeffs, int skip_block,
-                     const int16_t *round_ptr, const int16_t quant_ptr,
-                     tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
-                     const int16_t dequant_ptr, uint16_t *eob_ptr);
-void aom_quantize_dc_32x32(const tran_low_t *coeff_ptr, int skip_block,
-                           const int16_t *round_ptr, const int16_t quant_ptr,
-                           tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
-                           const int16_t dequant_ptr, uint16_t *eob_ptr);
-#if CONFIG_TX64X64
-void aom_quantize_dc_64x64(const tran_low_t *coeff_ptr, int skip_block,
-                           const int16_t *round_ptr, const int16_t quant_ptr,
-                           tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
-                           const int16_t dequant_ptr, uint16_t *eob_ptr);
-#endif  // CONFIG_TX64X64
 #if CONFIG_HIGHBITDEPTH
 void aom_highbd_quantize_dc(const tran_low_t *coeff_ptr, int n_coeffs,
                            int skip_block, const int16_t *round_ptr,
--- a/third_party/aom/aom_dsp/sad.c
+++ b/third_party/aom/aom_dsp/sad.c
@ -163,11 +163,19 @@ sadMxN(8, 32)
 sadMxNx4D(8, 32)
 sadMxN(32, 8)
 sadMxNx4D(32, 8)
+sadMxN(16, 64)
+sadMxNx4D(16, 64)
+sadMxN(64, 16)
+sadMxNx4D(64, 16)
+sadMxN(32, 128)
+sadMxNx4D(32, 128)
+sadMxN(128, 32)
+sadMxNx4D(128, 32)
 #endif
 /* clang-format on */

 #if CONFIG_HIGHBITDEPTH
-                static INLINE
+                            static INLINE
    unsigned int highbd_sad(const uint8_t *a8, int a_stride, const uint8_t *b8,
                            int b_stride, int width, int height) {
  int y, x;
@ -328,12 +336,20 @@ highbd_sadMxN(8, 32)
 highbd_sadMxNx4D(8, 32)
 highbd_sadMxN(32, 8)
 highbd_sadMxNx4D(32, 8)
+highbd_sadMxN(16, 64)
+highbd_sadMxNx4D(16, 64)
+highbd_sadMxN(64, 16)
+highbd_sadMxNx4D(64, 16)
+highbd_sadMxN(32, 128)
+highbd_sadMxNx4D(32, 128)
+highbd_sadMxN(128, 32)
+highbd_sadMxNx4D(128, 32)
 #endif
 /* clang-format on */
 #endif  // CONFIG_HIGHBITDEPTH

-#if CONFIG_AV1 && CONFIG_EXT_INTER
-                            static INLINE
+#if CONFIG_AV1
+                                                static INLINE
    unsigned int masked_sad(const uint8_t *src, int src_stride,
                            const uint8_t *a, int a_stride, const uint8_t *b,
                            int b_stride, const uint8_t *m, int m_stride,
@ -395,11 +411,15 @@ MASKSADMxN(4, 16)
 MASKSADMxN(16, 4)
 MASKSADMxN(8, 32)
 MASKSADMxN(32, 8)
+MASKSADMxN(16, 64)
+MASKSADMxN(64, 16)
+MASKSADMxN(32, 128)
+MASKSADMxN(128, 32)
 #endif
 /* clang-format on */

 #if CONFIG_HIGHBITDEPTH
-                            static INLINE
+                                static INLINE
    unsigned int highbd_masked_sad(const uint8_t *src8, int src_stride,
                                   const uint8_t *a8, int a_stride,
                                   const uint8_t *b8, int b_stride,
@ -464,9 +484,13 @@ HIGHBD_MASKSADMXN(4, 16)
 HIGHBD_MASKSADMXN(16, 4)
 HIGHBD_MASKSADMXN(8, 32)
 HIGHBD_MASKSADMXN(32, 8)
+HIGHBD_MASKSADMXN(16, 64)
+HIGHBD_MASKSADMXN(64, 16)
+HIGHBD_MASKSADMXN(32, 128)
+HIGHBD_MASKSADMXN(128, 32)
 #endif
 #endif  // CONFIG_HIGHBITDEPTH
-#endif  // CONFIG_AV1 && CONFIG_EXT_INTER
+#endif  // CONFIG_AV1

 #if CONFIG_AV1 && CONFIG_MOTION_VAR
 // pre: predictor being evaluated
@ -522,11 +546,15 @@ OBMCSADMxN(4, 16)
 OBMCSADMxN(16, 4)
 OBMCSADMxN(8, 32)
 OBMCSADMxN(32, 8)
+OBMCSADMxN(16, 64)
+OBMCSADMxN(64, 16)
+OBMCSADMxN(32, 128)
+OBMCSADMxN(128, 32)
 #endif
 /* clang-format on */

 #if CONFIG_HIGHBITDEPTH
-                            static INLINE
+                                static INLINE
    unsigned int highbd_obmc_sad(const uint8_t *pre8, int pre_stride,
                                 const int32_t *wsrc, const int32_t *mask,
                                 int width, int height) {
@ -578,6 +606,10 @@ HIGHBD_OBMCSADMXN(4, 16)
 HIGHBD_OBMCSADMXN(16, 4)
 HIGHBD_OBMCSADMXN(8, 32)
 HIGHBD_OBMCSADMXN(32, 8)
+HIGHBD_OBMCSADMXN(16, 64)
+HIGHBD_OBMCSADMXN(64, 16)
+HIGHBD_OBMCSADMXN(32, 128)
+HIGHBD_OBMCSADMXN(128, 32)
 #endif
 /* clang-format on */
 #endif  // CONFIG_HIGHBITDEPTH
--- a/third_party/aom/aom_dsp/ssim.c
+++ b/third_party/aom/aom_dsp/ssim.c
@ -168,23 +168,16 @@ static double aom_highbd_ssim2(const uint8_t *img1, const uint8_t *img2,

 double aom_calc_ssim(const YV12_BUFFER_CONFIG *source,
                     const YV12_BUFFER_CONFIG *dest, double *weight) {
-  double a, b, c;
-  double ssimv;
-
-  a = aom_ssim2(source->y_buffer, dest->y_buffer, source->y_stride,
-                dest->y_stride, source->y_crop_width, source->y_crop_height);
-
-  b = aom_ssim2(source->u_buffer, dest->u_buffer, source->uv_stride,
-                dest->uv_stride, source->uv_crop_width, source->uv_crop_height);
-
-  c = aom_ssim2(source->v_buffer, dest->v_buffer, source->uv_stride,
-                dest->uv_stride, source->uv_crop_width, source->uv_crop_height);
-
-  ssimv = a * .8 + .1 * (b + c);
+  double abc[3];
+  for (int i = 0; i < 3; ++i) {
+    const int is_uv = i > 0;
+    abc[i] = aom_ssim2(source->buffers[i], dest->buffers[i],
+                       source->strides[is_uv], dest->strides[is_uv],
+                       source->crop_widths[is_uv], source->crop_heights[is_uv]);
+  }

  *weight = 1;
-
-  return ssimv;
+  return abc[0] * .8 + .1 * (abc[1] + abc[2]);
 }

 // traditional ssim as per: http://en.wikipedia.org/wiki/Structural_similarity
@ -433,30 +426,19 @@ double aom_get_ssim_metrics(uint8_t *img1, int img1_pitch, uint8_t *img2,
 double aom_highbd_calc_ssim(const YV12_BUFFER_CONFIG *source,
                            const YV12_BUFFER_CONFIG *dest, double *weight,
                            uint32_t bd, uint32_t in_bd) {
-  double a, b, c;
-  double ssimv;
-  uint32_t shift = 0;
-
  assert(bd >= in_bd);
-  shift = bd - in_bd;
+  const uint32_t shift = bd - in_bd;

-  a = aom_highbd_ssim2(source->y_buffer, dest->y_buffer, source->y_stride,
-                       dest->y_stride, source->y_crop_width,
-                       source->y_crop_height, in_bd, shift);
-
-  b = aom_highbd_ssim2(source->u_buffer, dest->u_buffer, source->uv_stride,
-                       dest->uv_stride, source->uv_crop_width,
-                       source->uv_crop_height, in_bd, shift);
-
-  c = aom_highbd_ssim2(source->v_buffer, dest->v_buffer, source->uv_stride,
-                       dest->uv_stride, source->uv_crop_width,
-                       source->uv_crop_height, in_bd, shift);
-
-  ssimv = a * .8 + .1 * (b + c);
+  double abc[3];
+  for (int i = 0; i < 3; ++i) {
+    const int is_uv = i > 0;
+    abc[i] = aom_highbd_ssim2(source->buffers[i], dest->buffers[i],
+                              source->strides[is_uv], dest->strides[is_uv],
+                              source->crop_widths[is_uv],
+                              source->crop_heights[is_uv], in_bd, shift);
+  }

  *weight = 1;
-
-  return ssimv;
+  return abc[0] * .8 + .1 * (abc[1] + abc[2]);
 }
-
 #endif  // CONFIG_HIGHBITDEPTH
--- a/third_party/aom/aom_dsp/txfm_common.h
+++ b/third_party/aom/aom_dsp/txfm_common.h
@ -13,6 +13,7 @@
 #define AOM_DSP_TXFM_COMMON_H_

 #include "aom_dsp/aom_dsp_common.h"
+#include "av1/common/enums.h"

 // Constants and Macros used by all idct/dct functions
 #define DCT_CONST_BITS 14
@ -23,18 +24,25 @@

 typedef struct txfm_param {
  // for both forward and inverse transforms
-  int tx_type;
-  int tx_size;
+  TX_TYPE tx_type;
+  TX_SIZE tx_size;
  int lossless;
  int bd;
 #if CONFIG_MRC_TX || CONFIG_LGT
+  int is_inter;
+#endif  // CONFIG_MRC_TX || CONFIG_LGT
+#if CONFIG_MRC_TX || CONFIG_LGT_FROM_PRED
  int stride;
  uint8_t *dst;
-#endif  // CONFIG_MRC_TX || CONFIG_LGT
-#if CONFIG_LGT
-  int is_inter;
+#if CONFIG_MRC_TX
+  int *valid_mask;
+  uint8_t *mask;
+#endif  // CONFIG_MRC_TX
+#if CONFIG_LGT_FROM_PRED
  int mode;
-#endif
+  int use_lgt;
+#endif  // CONFIG_LGT_FROM_PRED
+#endif  // CONFIG_MRC_TX || CONFIG_LGT_FROM_PRED
 // for inverse transforms only
 #if CONFIG_ADAPT_SCAN
  const int16_t *eob_threshold;
@ -87,27 +95,608 @@ static const tran_high_t sinpi_4_9 = 15212;

 // 16384 * sqrt(2)
 static const tran_high_t Sqrt2 = 23170;
+static const tran_high_t InvSqrt2 = 11585;

 static INLINE tran_high_t fdct_round_shift(tran_high_t input) {
  tran_high_t rv = ROUND_POWER_OF_TWO(input, DCT_CONST_BITS);
  return rv;
 }

-#if CONFIG_LGT
-// The Line Graph Transforms (LGTs) matrices are written as follows.
-// Each 2D array is 16384 times an LGT matrix, which is the matrix of
-// eigenvectors of the graph Laplacian matrices for the line graph.
+#if CONFIG_LGT_FROM_PRED
+// Use negative numbers so they do not coincide with lgt*[0][0], which are
+// always nonnegative.
+typedef enum {
+  DCT4 = -1,
+  ADST4 = -2,
+  DCT8 = -3,
+  ADST8 = -4,
+  DCT16 = -5,
+  ADST16 = -6,
+  DCT32 = -7,
+  ADST32 = -8,
+} ButterflyLgt;

-// LGT4 name: lgt4_140
-// Self loops: 1.400, 0.000, 0.000, 0.000
-// Edges: 1.000, 1.000, 1.000
-static const tran_high_t lgt4_140[4][4] = {
-  { 4206, 9518, 13524, 15674 },
-  { 11552, 14833, 1560, -13453 },
-  { 15391, -1906, -14393, 9445 },
-  { 12201, -14921, 12016, -4581 },
+/* These are some LGTs already implemented in the codec. When any of them
+ * is chosen, the flgt or ilgt function will call the existing fast
+ * transform instead of the matrix product implementation. Thus, we
+ * do not need the actual basis functions here */
+static const tran_high_t lgt4_000[1][1] = { { (tran_high_t)DCT4 } };
+static const tran_high_t lgt4_100[1][1] = { { (tran_high_t)ADST4 } };
+static const tran_high_t lgt8_000[1][1] = { { (tran_high_t)DCT8 } };
+static const tran_high_t lgt8_200[1][1] = { { (tran_high_t)ADST8 } };
+static const tran_high_t lgt16_000[1][1] = { { (tran_high_t)DCT16 } };
+static const tran_high_t lgt16_200[1][1] = { { (tran_high_t)ADST16 } };
+static const tran_high_t lgt32_000[1][1] = { { (tran_high_t)DCT32 } };
+static const tran_high_t lgt32_200[1][1] = { { (tran_high_t)ADST32 } };
+
+/* The Line Graph Transform (LGT) matrices are written as follows.
+   Each 2D array is a scaled LGT matrix, i.e. the matrix of eigenvectors
+   of the graph Laplacian matrix of the associated line graph. Each row has
+   norm sqrt(2)*16384 (4-point) or 2*16384 (8-point). Some of these
+   transforms have fast algorithms, but those are not implemented yet. */
+
+// LGT4 name: lgt4_150_000w3
+// Self loops: 1.500, 0.000, 0.000, 0.000
+// Edges: 1.000, 1.000, 0.000
+static const tran_high_t lgt4_150_000w3[4][4] = {
+  { 0, 0, 0, 23170 },
+  { 5991, 13537, 17825, 0 },
+  { 15515, 10788, -13408, 0 },
+  { 16133, -15403, 6275, 0 },
 };

+// LGT4 name: lgt4_100_000w3
+// Self loops: 1.000, 0.000, 0.000, 0.000
+// Edges: 1.000, 1.000, 0.000
+static const tran_high_t lgt4_100_000w3[4][4] = {
+  { 0, 0, 0, 23170 },
+  { 7600, 13694, 17076, 0 },
+  { 17076, 7600, -13694, 0 },
+  { 13694, -17076, 7600, 0 },
+};
+
+// LGT4 name: lgt4_060_000w3
+// Self loops: 0.600, 0.000, 0.000, 0.000
+// Edges: 1.000, 1.000, 0.000
+static const tran_high_t lgt4_060_000w3[4][4] = {
+  { 0, 0, 0, 23170 },
+  { 9449, 13755, 16075, 0 },
+  { 17547, 4740, -14370, 0 },
+  { 11819, -18034, 8483, 0 },
+};
+
+// LGT4 name: lgt4_000w3
+// Self loops: 0.000, 0.000, 0.000, 0.000
+// Edges: 1.000, 1.000, 0.000
+static const tran_high_t lgt4_000w3[4][4] = {
+  { 0, 0, 0, 23170 },
+  { 13377, 13377, 13377, 0 },
+  { 16384, 0, -16384, 0 },
+  { 9459, -18919, 9459, 0 },
+};
+
+// LGT4 name: lgt4_150_000w2
+// Self loops: 1.500, 0.000, 0.000, 0.000
+// Edges: 1.000, 0.000, 1.000
+static const tran_high_t lgt4_150_000w2[4][4] = {
+  { 10362, 20724, 0, 0 },
+  { 20724, -10362, 0, 0 },
+  { 0, 0, 16384, 16384 },
+  { 0, 0, 16384, -16384 },
+};
+
+// LGT4 name: lgt4_100_000w2
+// Self loops: 1.000, 0.000, 0.000, 0.000
+// Edges: 1.000, 0.000, 1.000
+static const tran_high_t lgt4_100_000w2[4][4] = {
+  { 12181, 19710, 0, 0 },
+  { 19710, -12181, 0, 0 },
+  { 0, 0, 16384, 16384 },
+  { 0, 0, 16384, -16384 },
+};
+
+// LGT4 name: lgt4_060_000w2
+// Self loops: 0.600, 0.000, 0.000, 0.000
+// Edges: 1.000, 0.000, 1.000
+static const tran_high_t lgt4_060_000w2[4][4] = {
+  { 13831, 18590, 0, 0 },
+  { 18590, -13831, 0, 0 },
+  { 0, 0, 16384, 16384 },
+  { 0, 0, 16384, -16384 },
+};
+
+// LGT4 name: lgt4_000w2
+// Self loops: 0.000, 0.000, 0.000, 0.000
+// Edges: 1.000, 0.000, 1.000
+static const tran_high_t lgt4_000w2[4][4] = {
+  { 16384, 16384, 0, 0 },
+  { 16384, -16384, 0, 0 },
+  { 0, 0, 16384, 16384 },
+  { 0, 0, 16384, -16384 },
+};
+
+// LGT4 name: lgt4_150_000w1
+// Self loops: 1.500, 0.000, 0.000, 0.000
+// Edges: 0.000, 1.000, 1.000
+static const tran_high_t lgt4_150_000w1[4][4] = {
+  { 23170, 0, 0, 0 },
+  { 0, 13377, 13377, 13377 },
+  { 0, 16384, 0, -16384 },
+  { 0, 9459, -18919, 9459 },
+};
+
+// LGT4 name: lgt4_100_000w1
+// Self loops: 1.000, 0.000, 0.000, 0.000
+// Edges: 0.000, 1.000, 1.000
+static const tran_high_t lgt4_100_000w1[4][4] = {
+  { 23170, 0, 0, 0 },
+  { 0, 13377, 13377, 13377 },
+  { 0, 16384, 0, -16384 },
+  { 0, 9459, -18919, 9459 },
+};
+
+// LGT4 name: lgt4_060_000w1
+// Self loops: 0.600, 0.000, 0.000, 0.000
+// Edges: 0.000, 1.000, 1.000
+static const tran_high_t lgt4_060_000w1[4][4] = {
+  { 23170, 0, 0, 0 },
+  { 0, 13377, 13377, 13377 },
+  { 0, 16384, 0, -16384 },
+  { 0, 9459, -18919, 9459 },
+};
+
+// LGT4 name: lgt4_000w1
+// Self loops: 0.000, 0.000, 0.000, 0.000
+// Edges: 0.000, 1.000, 1.000
+static const tran_high_t lgt4_000w1[4][4] = {
+  { 23170, 0, 0, 0 },
+  { 0, 13377, 13377, 13377 },
+  { 0, 16384, 0, -16384 },
+  { 0, 9459, -18919, 9459 },
+};
+
+// LGT4 name: lgt4_060
+// Self loops: 0.600, 0.000, 0.000, 0.000
+// Edges: 1.000, 1.000, 1.000
+static const tran_high_t lgt4_060[4][4] = {
+  { 6971, 10504, 13060, 14400 },
+  { 14939, 11211, -2040, -13559 },
+  { 14096, -8258, -12561, 10593 },
+  { 8150, -15253, 14295, -5784 },
+};
+
+// LGT4 name: lgt4_150
+// Self loops: 1.500, 0.000, 0.000, 0.000
+// Edges: 1.000, 1.000, 1.000
+static const tran_high_t lgt4_150[4][4] = {
+  { 3998, 9435, 13547, 15759 },
+  { 11106, 15105, 1886, -13483 },
+  { 15260, -1032, -14674, 9361 },
+  { 12833, -14786, 11596, -4372 },
+};
+
+// LGT8 name: lgt8_150_000w7
+// Self loops: 1.500, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000
+// Edges: 1.000, 1.000, 1.000, 1.000, 1.000, 1.000, 0.000
+static const tran_high_t lgt8_150_000w7[8][8] = {
+  { 0, 0, 0, 0, 0, 0, 0, 32768 },
+  { 2522, 6185, 9551, 12461, 14775, 16381, 17204, 0 },
+  { 7390, 15399, 16995, 11515, 1240, -9551, -16365, 0 },
+  { 11716, 16625, 3560, -13353, -15831, -1194, 14733, 0 },
+  { 15073, 8866, -14291, -10126, 13398, 11308, -12401, 0 },
+  { 16848, -4177, -13724, 14441, 2923, -16628, 9513, 0 },
+  { 15942, -14888, 5405, 7137, -15640, 15288, -6281, 0 },
+  { 10501, -14293, 16099, -15670, 13063, -8642, 3021, 0 },
+};
+
+// LGT8 name: lgt8_100_000w7
+// Self loops: 1.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000
+// Edges: 1.000, 1.000, 1.000, 1.000, 1.000, 1.000, 0.000
+static const tran_high_t lgt8_100_000w7[8][8] = {
+  { 0, 0, 0, 0, 0, 0, 0, 32768 },
+  { 3518, 6883, 9946, 12575, 14654, 16093, 16829, 0 },
+  { 9946, 16093, 16093, 9946, 0, -9946, -16093, 0 },
+  { 14654, 14654, 0, -14654, -14654, 0, 14654, 0 },
+  { 16829, 3518, -16093, -6883, 14654, 9946, -12575, 0 },
+  { 16093, -9946, -9946, 16093, 0, -16093, 9946, 0 },
+  { 12575, -16829, 9946, 3518, -14654, 16093, -6883, 0 },
+  { 6883, -12575, 16093, -16829, 14654, -9946, 3518, 0 },
+};
+
+// LGT8 name: lgt8_060_000w7
+// Self loops: 0.600, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000
+// Edges: 1.000, 1.000, 1.000, 1.000, 1.000, 1.000, 0.000
+static const tran_high_t lgt8_060_000w7[8][8] = {
+  { 0, 0, 0, 0, 0, 0, 0, 32768 },
+  { 5087, 7951, 10521, 12701, 14411, 15587, 16186, 0 },
+  { 13015, 16486, 14464, 7621, -1762, -10557, -15834, 0 },
+  { 16581, 11475, -4050, -15898, -13311, 1362, 14798, 0 },
+  { 16536, -1414, -16981, -3927, 15746, 8879, -12953, 0 },
+  { 14104, -13151, -7102, 16932, -1912, -15914, 10385, 0 },
+  { 10156, -17168, 11996, 1688, -14174, 16602, -7249, 0 },
+  { 5295, -11721, 15961, -17224, 15274, -10476, 3723, 0 },
+};
+
+// LGT8 name: lgt8_000w7
+// Self loops: 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000
+// Edges: 1.000, 1.000, 1.000, 1.000, 1.000, 1.000, 0.000
+static const tran_high_t lgt8_000w7[8][8] = {
+  { 0, 0, 0, 0, 0, 0, 0, 32768 },
+  { 12385, 12385, 12385, 12385, 12385, 12385, 12385, 0 },
+  { 17076, 13694, 7600, 0, -7600, -13694, -17076, 0 },
+  { 15781, 3898, -10921, -17515, -10921, 3898, 15781, 0 },
+  { 13694, -7600, -17076, 0, 17076, 7600, -13694, 0 },
+  { 10921, -15781, -3898, 17515, -3898, -15781, 10921, 0 },
+  { 7600, -17076, 13694, 0, -13694, 17076, -7600, 0 },
+  { 3898, -10921, 15781, -17515, 15781, -10921, 3898, 0 },
+};
+
+// LGT8 name: lgt8_150_000w6
+// Self loops: 1.500, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000
+// Edges: 1.000, 1.000, 1.000, 1.000, 1.000, 0.000, 1.000
+static const tran_high_t lgt8_150_000w6[8][8] = {
+  { 0, 0, 0, 0, 0, 0, 23170, 23170 },
+  { 0, 0, 0, 0, 0, 0, 23170, -23170 },
+  { 3157, 7688, 11723, 15002, 17312, 18506, 0, 0 },
+  { 9167, 17832, 16604, 6164, -7696, -17286, 0, 0 },
+  { 14236, 15584, -4969, -18539, -6055, 14938, 0, 0 },
+  { 17558, 1891, -18300, 5288, 16225, -11653, 0, 0 },
+  { 17776, -13562, -647, 14380, -17514, 7739, 0, 0 },
+  { 12362, -16318, 17339, -15240, 10399, -3688, 0, 0 },
+};
+
+// LGT8 name: lgt8_100_000w6
+// Self loops: 1.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000
+// Edges: 1.000, 1.000, 1.000, 1.000, 1.000, 0.000, 1.000
+static const tran_high_t lgt8_100_000w6[8][8] = {
+  { 0, 0, 0, 0, 0, 0, 23170, 23170 },
+  { 0, 0, 0, 0, 0, 0, 23170, -23170 },
+  { 4350, 8447, 12053, 14959, 16995, 18044, 0, 0 },
+  { 12053, 18044, 14959, 4350, -8447, -16995, 0, 0 },
+  { 16995, 12053, -8447, -18044, -4350, 14959, 0, 0 },
+  { 18044, -4350, -16995, 8447, 14959, -12053, 0, 0 },
+  { 14959, -16995, 4350, 12053, -18044, 8447, 0, 0 },
+  { 8447, -14959, 18044, -16995, 12053, -4350, 0, 0 },
+};
+
+// LGT8 name: lgt8_060_000w6
+// Self loops: 0.600, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000
+// Edges: 1.000, 1.000, 1.000, 1.000, 1.000, 0.000, 1.000
+static const tran_high_t lgt8_060_000w6[8][8] = {
+  { 0, 0, 0, 0, 0, 0, 23170, 23170 },
+  { 0, 0, 0, 0, 0, 0, 23170, -23170 },
+  { 6154, 9551, 12487, 14823, 16446, 17277, 0, 0 },
+  { 15149, 17660, 12503, 1917, -9502, -16795, 0, 0 },
+  { 18166, 7740, -11772, -17465, -2656, 15271, 0, 0 },
+  { 16682, -8797, -15561, 10779, 14189, -12586, 0, 0 },
+  { 12436, -18234, 7007, 10763, -18483, 8945, 0, 0 },
+  { 6591, -14172, 18211, -17700, 12766, -4642, 0, 0 },
+};
+
+// LGT8 name: lgt8_000w6
+// Self loops: 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000
+// Edges: 1.000, 1.000, 1.000, 1.000, 1.000, 0.000, 1.000
+static const tran_high_t lgt8_000w6[8][8] = {
+  { 0, 0, 0, 0, 0, 0, 23170, 23170 },
+  { 0, 0, 0, 0, 0, 0, 23170, -23170 },
+  { 13377, 13377, 13377, 13377, 13377, 13377, 0, 0 },
+  { 18274, 13377, 4896, -4896, -13377, -18274, 0, 0 },
+  { 16384, 0, -16384, -16384, 0, 16384, 0, 0 },
+  { 13377, -13377, -13377, 13377, 13377, -13377, 0, 0 },
+  { 9459, -18919, 9459, 9459, -18919, 9459, 0, 0 },
+  { 4896, -13377, 18274, -18274, 13377, -4896, 0, 0 },
+};
+
+// LGT8 name: lgt8_150_000w5
+// Self loops: 1.500, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000
+// Edges: 1.000, 1.000, 1.000, 1.000, 0.000, 1.000, 1.000
+static const tran_high_t lgt8_150_000w5[8][8] = {
+  { 0, 0, 0, 0, 0, 18919, 18919, 18919 },
+  { 0, 0, 0, 0, 0, 23170, 0, -23170 },
+  { 0, 0, 0, 0, 0, 13377, -26755, 13377 },
+  { 4109, 9895, 14774, 18299, 20146, 0, 0, 0 },
+  { 11753, 20300, 13161, -4148, -18252, 0, 0, 0 },
+  { 17573, 10921, -16246, -12895, 14679, 0, 0, 0 },
+  { 19760, -9880, -9880, 19760, -9880, 0, 0, 0 },
+  { 14815, -18624, 17909, -12844, 4658, 0, 0, 0 },
+};
+
+// LGT8 name: lgt8_100_000w5
+// Self loops: 1.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000
+// Edges: 1.000, 1.000, 1.000, 1.000, 0.000, 1.000, 1.000
+static const tran_high_t lgt8_100_000w5[8][8] = {
+  { 0, 0, 0, 0, 0, 18919, 18919, 18919 },
+  { 0, 0, 0, 0, 0, 23170, 0, -23170 },
+  { 0, 0, 0, 0, 0, 13377, -26755, 13377 },
+  { 5567, 10683, 14933, 17974, 19559, 0, 0, 0 },
+  { 14933, 19559, 10683, -5567, -17974, 0, 0, 0 },
+  { 19559, 5567, -17974, -10683, 14933, 0, 0, 0 },
+  { 17974, -14933, -5567, 19559, -10683, 0, 0, 0 },
+  { 10683, -17974, 19559, -14933, 5567, 0, 0, 0 },
+};
+
+// LGT8 name: lgt8_060_000w5
+// Self loops: 0.600, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000
+// Edges: 1.000, 1.000, 1.000, 1.000, 0.000, 1.000, 1.000
+static const tran_high_t lgt8_060_000w5[8][8] = {
+  { 0, 0, 0, 0, 0, 18919, 18919, 18919 },
+  { 0, 0, 0, 0, 0, 23170, 0, -23170 },
+  { 0, 0, 0, 0, 0, 13377, -26755, 13377 },
+  { 7650, 11741, 15069, 17415, 18628, 0, 0, 0 },
+  { 17824, 18002, 7558, -7345, -17914, 0, 0, 0 },
+  { 19547, 569, -19303, -8852, 15505, 0, 0, 0 },
+  { 15592, -17548, -2862, 19625, -11374, 0, 0, 0 },
+  { 8505, -17423, 20218, -15907, 6006, 0, 0, 0 },
+};
+
+// LGT8 name: lgt8_000w5
+// Self loops: 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000
+// Edges: 1.000, 1.000, 1.000, 1.000, 0.000, 1.000, 1.000
+static const tran_high_t lgt8_000w5[8][8] = {
+  { 0, 0, 0, 0, 0, 18919, 18919, 18919 },
+  { 0, 0, 0, 0, 0, 23170, 0, -23170 },
+  { 0, 0, 0, 0, 0, 13377, -26755, 13377 },
+  { 14654, 14654, 14654, 14654, 14654, 0, 0, 0 },
+  { 19710, 12181, 0, -12181, -19710, 0, 0, 0 },
+  { 16766, -6404, -20724, -6404, 16766, 0, 0, 0 },
+  { 12181, -19710, 0, 19710, -12181, 0, 0, 0 },
+  { 6404, -16766, 20724, -16766, 6404, 0, 0, 0 },
+};
+
+// LGT8 name: lgt8_150_000w4
+// Self loops: 1.500, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000
+// Edges: 1.000, 1.000, 1.000, 0.000, 1.000, 1.000, 1.000
+static const tran_high_t lgt8_150_000w4[8][8] = {
+  { 5655, 13343, 19159, 22286, 0, 0, 0, 0 },
+  { 15706, 21362, 2667, -19068, 0, 0, 0, 0 },
+  { 21580, -1459, -20752, 13238, 0, 0, 0, 0 },
+  { 18148, -20910, 16399, -6183, 0, 0, 0, 0 },
+  { 0, 0, 0, 0, 16384, 16384, 16384, 16384 },
+  { 0, 0, 0, 0, 21407, 8867, -8867, -21407 },
+  { 0, 0, 0, 0, 16384, -16384, -16384, 16384 },
+  { 0, 0, 0, 0, 8867, -21407, 21407, -8867 },
+};
+
+// LGT8 name: lgt8_100_000w4
+// Self loops: 1.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000
+// Edges: 1.000, 1.000, 1.000, 0.000, 1.000, 1.000, 1.000
+static const tran_high_t lgt8_100_000w4[8][8] = {
+  { 7472, 14042, 18919, 21513, 0, 0, 0, 0 },
+  { 18919, 18919, 0, -18919, 0, 0, 0, 0 },
+  { 21513, -7472, -18919, 14042, 0, 0, 0, 0 },
+  { 14042, -21513, 18919, -7472, 0, 0, 0, 0 },
+  { 0, 0, 0, 0, 16384, 16384, 16384, 16384 },
+  { 0, 0, 0, 0, 21407, 8867, -8867, -21407 },
+  { 0, 0, 0, 0, 16384, -16384, -16384, 16384 },
+  { 0, 0, 0, 0, 8867, -21407, 21407, -8867 },
+};
+
+// LGT8 name: lgt8_060_000w4
+// Self loops: 0.600, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000
+// Edges: 1.000, 1.000, 1.000, 0.000, 1.000, 1.000, 1.000
+static const tran_high_t lgt8_060_000w4[8][8] = {
+  { 9858, 14855, 18470, 20365, 0, 0, 0, 0 },
+  { 21127, 15855, -2886, -19175, 0, 0, 0, 0 },
+  { 19935, -11679, -17764, 14980, 0, 0, 0, 0 },
+  { 11525, -21570, 20217, -8180, 0, 0, 0, 0 },
+  { 0, 0, 0, 0, 16384, 16384, 16384, 16384 },
+  { 0, 0, 0, 0, 21407, 8867, -8867, -21407 },
+  { 0, 0, 0, 0, 16384, -16384, -16384, 16384 },
+  { 0, 0, 0, 0, 8867, -21407, 21407, -8867 },
+};
+
+// LGT8 name: lgt8_000w4
+// Self loops: 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000
+// Edges: 1.000, 1.000, 1.000, 0.000, 1.000, 1.000, 1.000
+static const tran_high_t lgt8_000w4[8][8] = {
+  { 16384, 16384, 16384, 16384, 0, 0, 0, 0 },
+  { 21407, 8867, -8867, -21407, 0, 0, 0, 0 },
+  { 16384, -16384, -16384, 16384, 0, 0, 0, 0 },
+  { 8867, -21407, 21407, -8867, 0, 0, 0, 0 },
+  { 0, 0, 0, 0, 16384, 16384, 16384, 16384 },
+  { 0, 0, 0, 0, 21407, 8867, -8867, -21407 },
+  { 0, 0, 0, 0, 16384, -16384, -16384, 16384 },
+  { 0, 0, 0, 0, 8867, -21407, 21407, -8867 },
+};
+
+// LGT8 name: lgt8_150_000w3
+// Self loops: 1.500, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000
+// Edges: 1.000, 1.000, 0.000, 1.000, 1.000, 1.000, 1.000
+static const tran_high_t lgt8_150_000w3[8][8] = {
+  { 8473, 19144, 25209, 0, 0, 0, 0, 0 },
+  { 21942, 15257, -18961, 0, 0, 0, 0, 0 },
+  { 22815, -21783, 8874, 0, 0, 0, 0, 0 },
+  { 0, 0, 0, 14654, 14654, 14654, 14654, 14654 },
+  { 0, 0, 0, 19710, 12181, 0, -12181, -19710 },
+  { 0, 0, 0, 16766, -6404, -20724, -6404, 16766 },
+  { 0, 0, 0, 12181, -19710, 0, 19710, -12181 },
+  { 0, 0, 0, 6404, -16766, 20724, -16766, 6404 },
+};
+
+// LGT8 name: lgt8_100_000w3
+// Self loops: 1.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000
+// Edges: 1.000, 1.000, 0.000, 1.000, 1.000, 1.000, 1.000
+static const tran_high_t lgt8_100_000w3[8][8] = {
+  { 10747, 19366, 24149, 0, 0, 0, 0, 0 },
+  { 24149, 10747, -19366, 0, 0, 0, 0, 0 },
+  { 19366, -24149, 10747, 0, 0, 0, 0, 0 },
+  { 0, 0, 0, 14654, 14654, 14654, 14654, 14654 },
+  { 0, 0, 0, 19710, 12181, 0, -12181, -19710 },
+  { 0, 0, 0, 16766, -6404, -20724, -6404, 16766 },
+  { 0, 0, 0, 12181, -19710, 0, 19710, -12181 },
+  { 0, 0, 0, 6404, -16766, 20724, -16766, 6404 },
+};
+
+// LGT8 name: lgt8_060_000w3
+// Self loops: 0.600, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000
+// Edges: 1.000, 1.000, 0.000, 1.000, 1.000, 1.000, 1.000
+static const tran_high_t lgt8_060_000w3[8][8] = {
+  { 13363, 19452, 22733, 0, 0, 0, 0, 0 },
+  { 24815, 6704, -20323, 0, 0, 0, 0, 0 },
+  { 16715, -25503, 11997, 0, 0, 0, 0, 0 },
+  { 0, 0, 0, 14654, 14654, 14654, 14654, 14654 },
+  { 0, 0, 0, 19710, 12181, 0, -12181, -19710 },
+  { 0, 0, 0, 16766, -6404, -20724, -6404, 16766 },
+  { 0, 0, 0, 12181, -19710, 0, 19710, -12181 },
+  { 0, 0, 0, 6404, -16766, 20724, -16766, 6404 },
+};
+
+// LGT8 name: lgt8_000w3
+// Self loops: 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000
+// Edges: 1.000, 1.000, 0.000, 1.000, 1.000, 1.000, 1.000
+static const tran_high_t lgt8_000w3[8][8] = {
+  { 18919, 18919, 18919, 0, 0, 0, 0, 0 },
+  { 23170, 0, -23170, 0, 0, 0, 0, 0 },
+  { 13377, -26755, 13377, 0, 0, 0, 0, 0 },
+  { 0, 0, 0, 14654, 14654, 14654, 14654, 14654 },
+  { 0, 0, 0, 19710, 12181, 0, -12181, -19710 },
+  { 0, 0, 0, 16766, -6404, -20724, -6404, 16766 },
+  { 0, 0, 0, 12181, -19710, 0, 19710, -12181 },
+  { 0, 0, 0, 6404, -16766, 20724, -16766, 6404 },
+};
+
+// LGT8 name: lgt8_150_000w2
+// Self loops: 1.500, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000
+// Edges: 1.000, 0.000, 1.000, 1.000, 1.000, 1.000, 1.000
+static const tran_high_t lgt8_150_000w2[8][8] = {
+  { 14654, 29309, 0, 0, 0, 0, 0, 0 },
+  { 29309, -14654, 0, 0, 0, 0, 0, 0 },
+  { 0, 0, 13377, 13377, 13377, 13377, 13377, 13377 },
+  { 0, 0, 18274, 13377, 4896, -4896, -13377, -18274 },
+  { 0, 0, 16384, 0, -16384, -16384, 0, 16384 },
+  { 0, 0, 13377, -13377, -13377, 13377, 13377, -13377 },
+  { 0, 0, 9459, -18919, 9459, 9459, -18919, 9459 },
+  { 0, 0, 4896, -13377, 18274, -18274, 13377, -4896 },
+};
+
+// LGT8 name: lgt8_100_000w2
+// Self loops: 1.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000
+// Edges: 1.000, 0.000, 1.000, 1.000, 1.000, 1.000, 1.000
+static const tran_high_t lgt8_100_000w2[8][8] = {
+  { 17227, 27874, 0, 0, 0, 0, 0, 0 },
+  { 27874, -17227, 0, 0, 0, 0, 0, 0 },
+  { 0, 0, 13377, 13377, 13377, 13377, 13377, 13377 },
+  { 0, 0, 18274, 13377, 4896, -4896, -13377, -18274 },
+  { 0, 0, 16384, 0, -16384, -16384, 0, 16384 },
+  { 0, 0, 13377, -13377, -13377, 13377, 13377, -13377 },
+  { 0, 0, 9459, -18919, 9459, 9459, -18919, 9459 },
+  { 0, 0, 4896, -13377, 18274, -18274, 13377, -4896 },
+};
+
+// LGT8 name: lgt8_060_000w2
+// Self loops: 0.600, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000
+// Edges: 1.000, 0.000, 1.000, 1.000, 1.000, 1.000, 1.000
+static const tran_high_t lgt8_060_000w2[8][8] = {
+  { 19560, 26290, 0, 0, 0, 0, 0, 0 },
+  { 26290, -19560, 0, 0, 0, 0, 0, 0 },
+  { 0, 0, 13377, 13377, 13377, 13377, 13377, 13377 },
+  { 0, 0, 18274, 13377, 4896, -4896, -13377, -18274 },
+  { 0, 0, 16384, 0, -16384, -16384, 0, 16384 },
+  { 0, 0, 13377, -13377, -13377, 13377, 13377, -13377 },
+  { 0, 0, 9459, -18919, 9459, 9459, -18919, 9459 },
+  { 0, 0, 4896, -13377, 18274, -18274, 13377, -4896 },
+};
+
+// LGT8 name: lgt8_000w2
+// Self loops: 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000
+// Edges: 1.000, 0.000, 1.000, 1.000, 1.000, 1.000, 1.000
+static const tran_high_t lgt8_000w2[8][8] = {
+  { 23170, 23170, 0, 0, 0, 0, 0, 0 },
+  { 23170, -23170, 0, 0, 0, 0, 0, 0 },
+  { 0, 0, 13377, 13377, 13377, 13377, 13377, 13377 },
+  { 0, 0, 18274, 13377, 4896, -4896, -13377, -18274 },
+  { 0, 0, 16384, 0, -16384, -16384, 0, 16384 },
+  { 0, 0, 13377, -13377, -13377, 13377, 13377, -13377 },
+  { 0, 0, 9459, -18919, 9459, 9459, -18919, 9459 },
+  { 0, 0, 4896, -13377, 18274, -18274, 13377, -4896 },
+};
+
+// LGT8 name: lgt8_150_000w1
+// Self loops: 1.500, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000
+// Edges: 0.000, 1.000, 1.000, 1.000, 1.000, 1.000, 1.000
+static const tran_high_t lgt8_150_000w1[8][8] = {
+  { 32768, 0, 0, 0, 0, 0, 0, 0 },
+  { 0, 12385, 12385, 12385, 12385, 12385, 12385, 12385 },
+  { 0, 17076, 13694, 7600, 0, -7600, -13694, -17076 },
+  { 0, 15781, 3898, -10921, -17515, -10921, 3898, 15781 },
+  { 0, 13694, -7600, -17076, 0, 17076, 7600, -13694 },
+  { 0, 10921, -15781, -3898, 17515, -3898, -15781, 10921 },
+  { 0, 7600, -17076, 13694, 0, -13694, 17076, -7600 },
+  { 0, 3898, -10921, 15781, -17515, 15781, -10921, 3898 },
+};
+
+// LGT8 name: lgt8_100_000w1
+// Self loops: 1.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000
+// Edges: 0.000, 1.000, 1.000, 1.000, 1.000, 1.000, 1.000
+static const tran_high_t lgt8_100_000w1[8][8] = {
+  { 32768, 0, 0, 0, 0, 0, 0, 0 },
+  { 0, 12385, 12385, 12385, 12385, 12385, 12385, 12385 },
+  { 0, 17076, 13694, 7600, 0, -7600, -13694, -17076 },
+  { 0, 15781, 3898, -10921, -17515, -10921, 3898, 15781 },
+  { 0, 13694, -7600, -17076, 0, 17076, 7600, -13694 },
+  { 0, 10921, -15781, -3898, 17515, -3898, -15781, 10921 },
+  { 0, 7600, -17076, 13694, 0, -13694, 17076, -7600 },
+  { 0, 3898, -10921, 15781, -17515, 15781, -10921, 3898 },
+};
+
+// LGT8 name: lgt8_060_000w1
+// Self loops: 0.600, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000
+// Edges: 0.000, 1.000, 1.000, 1.000, 1.000, 1.000, 1.000
+static const tran_high_t lgt8_060_000w1[8][8] = {
+  { 32768, 0, 0, 0, 0, 0, 0, 0 },
+  { 0, 12385, 12385, 12385, 12385, 12385, 12385, 12385 },
+  { 0, 17076, 13694, 7600, 0, -7600, -13694, -17076 },
+  { 0, 15781, 3898, -10921, -17515, -10921, 3898, 15781 },
+  { 0, 13694, -7600, -17076, 0, 17076, 7600, -13694 },
+  { 0, 10921, -15781, -3898, 17515, -3898, -15781, 10921 },
+  { 0, 7600, -17076, 13694, 0, -13694, 17076, -7600 },
+  { 0, 3898, -10921, 15781, -17515, 15781, -10921, 3898 },
+};
+
+// LGT8 name: lgt8_000w1
+// Self loops: 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000
+// Edges: 0.000, 1.000, 1.000, 1.000, 1.000, 1.000, 1.000
+static const tran_high_t lgt8_000w1[8][8] = {
+  { 32768, 0, 0, 0, 0, 0, 0, 0 },
+  { 0, 12385, 12385, 12385, 12385, 12385, 12385, 12385 },
+  { 0, 17076, 13694, 7600, 0, -7600, -13694, -17076 },
+  { 0, 15781, 3898, -10921, -17515, -10921, 3898, 15781 },
+  { 0, 13694, -7600, -17076, 0, 17076, 7600, -13694 },
+  { 0, 10921, -15781, -3898, 17515, -3898, -15781, 10921 },
+  { 0, 7600, -17076, 13694, 0, -13694, 17076, -7600 },
+  { 0, 3898, -10921, 15781, -17515, 15781, -10921, 3898 },
+};
+
+// LGT8 name: lgt8_060
+// Self loops: 0.600, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000
+// Edges: 1.000, 1.000, 1.000, 1.000, 1.000, 1.000, 1.000
+static const tran_high_t lgt8_060[8][8] = {
+  { 4295, 6746, 8999, 10987, 12653, 13947, 14832, 15280 },
+  { 11303, 15101, 14912, 10786, 3812, -4168, -11047, -15010 },
+  { 15051, 13208, 1823, -10879, -15721, -9207, 3959, 14265 },
+  { 15871, 3800, -13441, -12395, 5516, 15922, 4665, -12939 },
+  { 14630, -7269, -13926, 8618, 13091, -9886, -12133, 11062 },
+  { 12008, -14735, 180, 14586, -12245, -4458, 15932, -8720 },
+  { 8472, -15623, 14088, -4721, -7272, 15221, -14708, 6018 },
+  { 4372, -9862, 13927, -15981, 15727, -13202, 8770, -3071 },
+};
+
+// LGT8 name: lgt8_100
+// Self loops: 1.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000
+// Edges: 1.000, 1.000, 1.000, 1.000, 1.000, 1.000, 1.000
+static const tran_high_t lgt8_100[8][8] = {
+  { 2921, 5742, 8368, 10708, 12684, 14228, 15288, 15827 },
+  { 8368, 14228, 15827, 12684, 5742, -2921, -10708, -15288 },
+  { 12684, 15288, 5742, -8368, -15827, -10708, 2921, 14228 },
+  { 15288, 8368, -10708, -14228, 2921, 15827, 5742, -12684 },
+  { 15827, -2921, -15288, 5742, 14228, -8368, -12684, 10708 },
+  { 14228, -12684, -2921, 15288, -10708, -5742, 15827, -8368 },
+  { 10708, -15827, 12684, -2921, -8368, 15288, -14228, 5742 },
+  { 5742, -10708, 14228, -15827, 15288, -12684, 8368, -2921 },
+};
+#endif  // CONFIG_LGT_FROM_PRED
+
+#if CONFIG_LGT || CONFIG_LGT_FROM_PRED
 // LGT4 name: lgt4_170
 // Self loops: 1.700, 0.000, 0.000, 0.000
 // Edges: 1.000, 1.000, 1.000
@ -118,18 +707,14 @@ static const tran_high_t lgt4_170[4][4] = {
  { 14138, -14420, 10663, -3920 },
 };

-// LGT8 name: lgt8_150
-// Self loops: 1.500, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000
-// Edges: 1.000, 1.000, 1.000, 1.000, 1.000, 1.000, 1.000
-static const tran_high_t lgt8_150[8][8] = {
-  { 2075, 5110, 7958, 10511, 12677, 14376, 15544, 16140 },
-  { 6114, 13307, 16196, 13845, 7015, -2084, -10509, -15534 },
-  { 9816, 16163, 8717, -6168, -15790, -11936, 2104, 14348 },
-  { 12928, 12326, -7340, -15653, 242, 15763, 6905, -12632 },
-  { 15124, 3038, -16033, 1758, 15507, -6397, -13593, 10463 },
-  { 15895, -7947, -7947, 15895, -7947, -7947, 15895, -7947 },
-  { 14325, -15057, 9030, 1050, -10659, 15483, -13358, 5236 },
-  { 9054, -12580, 14714, -15220, 14043, -11312, 7330, -2537 },
+// LGT4 name: lgt4_140
+// Self loops: 1.400, 0.000, 0.000, 0.000
+// Edges: 1.000, 1.000, 1.000
+static const tran_high_t lgt4_140[4][4] = {
+  { 4206, 9518, 13524, 15674 },
+  { 11552, 14833, 1560, -13453 },
+  { 15391, -1906, -14393, 9445 },
+  { 12201, -14921, 12016, -4581 },
 };

 // LGT8 name: lgt8_170
@ -145,5 +730,19 @@ static const tran_high_t lgt8_170[8][8] = {
  { 15533, -13869, 6559, 3421, -12009, 15707, -13011, 5018 },
  { 11357, -13726, 14841, -14600, 13025, -10259, 6556, -2254 },
 };
-#endif  // CONFIG_LGT
+
+// LGT8 name: lgt8_150
+// Self loops: 1.500, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000
+// Edges: 1.000, 1.000, 1.000, 1.000, 1.000, 1.000, 1.000
+static const tran_high_t lgt8_150[8][8] = {
+  { 2075, 5110, 7958, 10511, 12677, 14376, 15544, 16140 },
+  { 6114, 13307, 16196, 13845, 7015, -2084, -10509, -15534 },
+  { 9816, 16163, 8717, -6168, -15790, -11936, 2104, 14348 },
+  { 12928, 12326, -7340, -15653, 242, 15763, 6905, -12632 },
+  { 15124, 3038, -16033, 1758, 15507, -6397, -13593, 10463 },
+  { 15895, -7947, -7947, 15895, -7947, -7947, 15895, -7947 },
+  { 14325, -15057, 9030, 1050, -10659, 15483, -13358, 5236 },
+  { 9054, -12580, 14714, -15220, 14043, -11312, 7330, -2537 },
+};
+#endif  // CONFIG_LGT || CONFIG_LGT_FROM_PRED
 #endif  // AOM_DSP_TXFM_COMMON_H_
--- a/third_party/aom/aom_dsp/variance.c
+++ b/third_party/aom/aom_dsp/variance.c
@ -256,7 +256,13 @@ VARIANCES(4, 16)
 VARIANCES(16, 4)
 VARIANCES(8, 32)
 VARIANCES(32, 8)
-#endif
+VARIANCES(16, 64)
+VARIANCES(64, 16)
+#if CONFIG_EXT_PARTITION
+VARIANCES(32, 128)
+VARIANCES(128, 32)
+#endif  // CONFIG_EXT_PARTITION
+#endif  // CONFIG_AV1 && CONFIG_EXT_PARTITION_TYPES

 GET_VAR(16, 16)
 GET_VAR(8, 8)
@ -661,7 +667,13 @@ HIGHBD_VARIANCES(4, 16)
 HIGHBD_VARIANCES(16, 4)
 HIGHBD_VARIANCES(8, 32)
 HIGHBD_VARIANCES(32, 8)
-#endif
+HIGHBD_VARIANCES(16, 64)
+HIGHBD_VARIANCES(64, 16)
+#if CONFIG_EXT_PARTITION
+HIGHBD_VARIANCES(32, 128)
+HIGHBD_VARIANCES(128, 32)
+#endif  // CONFIG_EXT_PARTITION
+#endif  // CONFIG_AV1 && CONFIG_EXT_PARTITION_TYPES

 HIGHBD_GET_VAR(8)
 HIGHBD_GET_VAR(16)
@ -761,7 +773,7 @@ void aom_highbd_comp_avg_upsampled_pred_c(uint16_t *comp_pred,
 }
 #endif  // CONFIG_HIGHBITDEPTH

-#if CONFIG_AV1 && CONFIG_EXT_INTER
+#if CONFIG_AV1
 void aom_comp_mask_pred_c(uint8_t *comp_pred, const uint8_t *pred, int width,
                          int height, const uint8_t *ref, int ref_stride,
                          const uint8_t *mask, int mask_stride,
@ -848,7 +860,13 @@ MASK_SUBPIX_VAR(4, 16)
 MASK_SUBPIX_VAR(16, 4)
 MASK_SUBPIX_VAR(8, 32)
 MASK_SUBPIX_VAR(32, 8)
-#endif
+MASK_SUBPIX_VAR(16, 64)
+MASK_SUBPIX_VAR(64, 16)
+#if CONFIG_EXT_PARTITION
+MASK_SUBPIX_VAR(32, 128)
+MASK_SUBPIX_VAR(128, 32)
+#endif  // CONFIG_EXT_PARTITION
+#endif  // CONFIG_EXT_PARTITION_TYPES

 #if CONFIG_HIGHBITDEPTH
 void aom_highbd_comp_mask_pred_c(uint16_t *comp_pred, const uint8_t *pred8,
@ -985,9 +1003,15 @@ HIGHBD_MASK_SUBPIX_VAR(4, 16)
 HIGHBD_MASK_SUBPIX_VAR(16, 4)
 HIGHBD_MASK_SUBPIX_VAR(8, 32)
 HIGHBD_MASK_SUBPIX_VAR(32, 8)
-#endif
+HIGHBD_MASK_SUBPIX_VAR(16, 64)
+HIGHBD_MASK_SUBPIX_VAR(64, 16)
+#if CONFIG_EXT_PARTITION
+HIGHBD_MASK_SUBPIX_VAR(32, 128)
+HIGHBD_MASK_SUBPIX_VAR(128, 32)
+#endif  // CONFIG_EXT_PARTITION
+#endif  // CONFIG_EXT_PARTITION_TYPES
 #endif  // CONFIG_HIGHBITDEPTH
-#endif  // CONFIG_AV1 && CONFIG_EXT_INTER
+#endif  // CONFIG_AV1

 #if CONFIG_AV1 && CONFIG_MOTION_VAR
 static INLINE void obmc_variance(const uint8_t *pre, int pre_stride,
@ -1094,7 +1118,17 @@ OBMC_VAR(8, 32)
 OBMC_SUBPIX_VAR(8, 32)
 OBMC_VAR(32, 8)
 OBMC_SUBPIX_VAR(32, 8)
-#endif
+OBMC_VAR(16, 64)
+OBMC_SUBPIX_VAR(16, 64)
+OBMC_VAR(64, 16)
+OBMC_SUBPIX_VAR(64, 16)
+#if CONFIG_EXT_PARTITION
+OBMC_VAR(32, 128)
+OBMC_SUBPIX_VAR(32, 128)
+OBMC_VAR(128, 32)
+OBMC_SUBPIX_VAR(128, 32)
+#endif  // CONFIG_EXT_PARTITION
+#endif  // CONFIG_EXT_PARTITION_TYPES

 #if CONFIG_HIGHBITDEPTH
 static INLINE void highbd_obmc_variance64(const uint8_t *pre8, int pre_stride,
@ -1287,6 +1321,16 @@ HIGHBD_OBMC_VAR(8, 32)
 HIGHBD_OBMC_SUBPIX_VAR(8, 32)
 HIGHBD_OBMC_VAR(32, 8)
 HIGHBD_OBMC_SUBPIX_VAR(32, 8)
-#endif
+HIGHBD_OBMC_VAR(16, 64)
+HIGHBD_OBMC_SUBPIX_VAR(16, 64)
+HIGHBD_OBMC_VAR(64, 16)
+HIGHBD_OBMC_SUBPIX_VAR(64, 16)
+#if CONFIG_EXT_PARTITION
+HIGHBD_OBMC_VAR(32, 128)
+HIGHBD_OBMC_SUBPIX_VAR(32, 128)
+HIGHBD_OBMC_VAR(128, 32)
+HIGHBD_OBMC_SUBPIX_VAR(128, 32)
+#endif  // CONFIG_EXT_PARTITION
+#endif  // CONFIG_EXT_PARTITION_TYPES
 #endif  // CONFIG_HIGHBITDEPTH
 #endif  // CONFIG_AV1 && CONFIG_MOTION_VAR
--- a/third_party/aom/aom_dsp/variance.h
+++ b/third_party/aom/aom_dsp/variance.h
@ -54,7 +54,7 @@ typedef unsigned int (*aom_subp_avg_variance_fn_t)(
    const uint8_t *a, int a_stride, int xoffset, int yoffset, const uint8_t *b,
    int b_stride, unsigned int *sse, const uint8_t *second_pred);

-#if CONFIG_AV1 && CONFIG_EXT_INTER
+#if CONFIG_AV1
 typedef unsigned int (*aom_masked_sad_fn_t)(const uint8_t *src, int src_stride,
                                            const uint8_t *ref, int ref_stride,
                                            const uint8_t *second_pred,
@ -64,7 +64,7 @@ typedef unsigned int (*aom_masked_subpixvariance_fn_t)(
    const uint8_t *src, int src_stride, int xoffset, int yoffset,
    const uint8_t *ref, int ref_stride, const uint8_t *second_pred,
    const uint8_t *msk, int msk_stride, int invert_mask, unsigned int *sse);
-#endif  // CONFIG_AV1 && CONFIG_EXT_INTER
+#endif  // CONFIG_AV1

 #if CONFIG_AV1 && CONFIG_MOTION_VAR
 typedef unsigned int (*aom_obmc_sad_fn_t)(const uint8_t *pred, int pred_stride,
@ -90,10 +90,8 @@ typedef struct aom_variance_vtable {
  aom_sad_multi_fn_t sdx3f;
  aom_sad_multi_fn_t sdx8f;
  aom_sad_multi_d_fn_t sdx4df;
-#if CONFIG_EXT_INTER
  aom_masked_sad_fn_t msdf;
  aom_masked_subpixvariance_fn_t msvf;
-#endif  // CONFIG_EXT_INTER
 #if CONFIG_MOTION_VAR
  aom_obmc_sad_fn_t osdf;
  aom_obmc_variance_fn_t ovf;
--- a/third_party/aom/aom_dsp/x86/aom_subpixel_8t_ssse3.asm
+++ b/third_party/aom/aom_dsp/x86/aom_subpixel_8t_ssse3.asm
@ -346,9 +346,15 @@ cglobal filter_block1d16_%1, 6, 6, 14, LOCAL_VARS_SIZE, \
    psraw         m0, 7
    psraw         m4, 7
 %ifidn %1, h8_add_src
+%if ARCH_X86=1 && CONFIG_PIC=1
+    pcmpeqb       m2, m2                  ;all ones
+    psrlw         m2, 8                   ;even_byte_mask
+%else
+    mova          m2, [GLOBAL(even_byte_mask)]
+%endif
    movu          m5, [srcq]
    mova          m7, m5
-    pand          m5, [even_byte_mask]
+    pand          m5, m2
    psrlw         m7, 8
    paddsw        m0, m5
    paddsw        m4, m7
--- a/third_party/aom/aom_dsp/x86/common_avx2.h
+++ b/third_party/aom/aom_dsp/x86/common_avx2.h
@ -0,0 +1,147 @@
+/*
+ * Copyright (c) 2017, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_DSP_X86_COMMON_AVX2_H
+#define AOM_DSP_X86_COMMON_AVX2_H
+
+#include <immintrin.h>
+
+#include "./aom_config.h"
+
+// Transposes a 16x16 matrix of 16-bit elements; in and out may be the same.
+static INLINE void mm256_transpose_16x16(const __m256i *in, __m256i *out) {
+  __m256i tr0_0 = _mm256_unpacklo_epi16(in[0], in[1]);
+  __m256i tr0_1 = _mm256_unpackhi_epi16(in[0], in[1]);
+  __m256i tr0_2 = _mm256_unpacklo_epi16(in[2], in[3]);
+  __m256i tr0_3 = _mm256_unpackhi_epi16(in[2], in[3]);
+  __m256i tr0_4 = _mm256_unpacklo_epi16(in[4], in[5]);
+  __m256i tr0_5 = _mm256_unpackhi_epi16(in[4], in[5]);
+  __m256i tr0_6 = _mm256_unpacklo_epi16(in[6], in[7]);
+  __m256i tr0_7 = _mm256_unpackhi_epi16(in[6], in[7]);
+
+  __m256i tr0_8 = _mm256_unpacklo_epi16(in[8], in[9]);
+  __m256i tr0_9 = _mm256_unpackhi_epi16(in[8], in[9]);
+  __m256i tr0_a = _mm256_unpacklo_epi16(in[10], in[11]);
+  __m256i tr0_b = _mm256_unpackhi_epi16(in[10], in[11]);
+  __m256i tr0_c = _mm256_unpacklo_epi16(in[12], in[13]);
+  __m256i tr0_d = _mm256_unpackhi_epi16(in[12], in[13]);
+  __m256i tr0_e = _mm256_unpacklo_epi16(in[14], in[15]);
+  __m256i tr0_f = _mm256_unpackhi_epi16(in[14], in[15]);
+
+  // 00 10 01 11 02 12 03 13  08 18 09 19 0a 1a 0b 1b
+  // 04 14 05 15 06 16 07 17  0c 1c 0d 1d 0e 1e 0f 1f
+  // 20 30 21 31 22 32 23 33  28 38 29 39 2a 3a 2b 3b
+  // 24 34 25 35 26 36 27 37  2c 3c 2d 3d 2e 3e 2f 3f
+  // 40 50 41 51 42 52 43 53  48 58 49 59 4a 5a 4b 5b
+  // 44 54 45 55 46 56 47 57  4c 5c 4d 5d 4e 5e 4f 5f
+  // 60 70 61 71 62 72 63 73  68 78 69 79 6a 7a 6b 7b
+  // 64 74 65 75 66 76 67 77  6c 7c 6d 7d 6e 7e 6f 7f
+
+  // 80 90 81 91 82 92 83 93  88 98 89 99 8a 9a 8b 9b
+  // 84 94 85 95 86 96 87 97  8c 9c 8d 9d 8e 9e 8f 9f
+  // a0 b0 a1 b1 a2 b2 a3 b3  a8 b8 a9 b9 aa ba ab bb
+  // a4 b4 a5 b5 a6 b6 a7 b7  ac bc ad bd ae be af bf
+  // c0 d0 c1 d1 c2 d2 c3 d3  c8 d8 c9 d9 ca da cb db
+  // c4 d4 c5 d5 c6 d6 c7 d7  cc dc cd dd ce de cf df
+  // e0 f0 e1 f1 e2 f2 e3 f3  e8 f8 e9 f9 ea fa eb fb
+  // e4 f4 e5 f5 e6 f6 e7 f7  ec fc ed fd ee fe ef ff
+
+  __m256i tr1_0 = _mm256_unpacklo_epi32(tr0_0, tr0_2);
+  __m256i tr1_1 = _mm256_unpackhi_epi32(tr0_0, tr0_2);
+  __m256i tr1_2 = _mm256_unpacklo_epi32(tr0_1, tr0_3);
+  __m256i tr1_3 = _mm256_unpackhi_epi32(tr0_1, tr0_3);
+  __m256i tr1_4 = _mm256_unpacklo_epi32(tr0_4, tr0_6);
+  __m256i tr1_5 = _mm256_unpackhi_epi32(tr0_4, tr0_6);
+  __m256i tr1_6 = _mm256_unpacklo_epi32(tr0_5, tr0_7);
+  __m256i tr1_7 = _mm256_unpackhi_epi32(tr0_5, tr0_7);
+
+  __m256i tr1_8 = _mm256_unpacklo_epi32(tr0_8, tr0_a);
+  __m256i tr1_9 = _mm256_unpackhi_epi32(tr0_8, tr0_a);
+  __m256i tr1_a = _mm256_unpacklo_epi32(tr0_9, tr0_b);
+  __m256i tr1_b = _mm256_unpackhi_epi32(tr0_9, tr0_b);
+  __m256i tr1_c = _mm256_unpacklo_epi32(tr0_c, tr0_e);
+  __m256i tr1_d = _mm256_unpackhi_epi32(tr0_c, tr0_e);
+  __m256i tr1_e = _mm256_unpacklo_epi32(tr0_d, tr0_f);
+  __m256i tr1_f = _mm256_unpackhi_epi32(tr0_d, tr0_f);
+
+  // 00 10 20 30 01 11 21 31  08 18 28 38 09 19 29 39
+  // 02 12 22 32 03 13 23 33  0a 1a 2a 3a 0b 1b 2b 3b
+  // 04 14 24 34 05 15 25 35  0c 1c 2c 3c 0d 1d 2d 3d
+  // 06 16 26 36 07 17 27 37  0e 1e 2e 3e 0f 1f 2f 3f
+  // 40 50 60 70 41 51 61 71  48 58 68 78 49 59 69 79
+  // 42 52 62 72 43 53 63 73  4a 5a 6a 7a 4b 5b 6b 7b
+  // 44 54 64 74 45 55 65 75  4c 5c 6c 7c 4d 5d 6d 7d
+  // 46 56 66 76 47 57 67 77  4e 5e 6e 7e 4f 5f 6f 7f
+
+  // 80 90 a0 b0 81 91 a1 b1  88 98 a8 b8 89 99 a9 b9
+  // 82 92 a2 b2 83 93 a3 b3  8a 9a aa ba 8b 9b ab bb
+  // 84 94 a4 b4 85 95 a5 b5  8c 9c ac bc 8d 9d ad bd
+  // 86 96 a6 b6 87 97 a7 b7  8e 9e ae be 8f 9f af bf
+  // c0 d0 e0 f0 c1 d1 e1 f1  c8 d8 e8 f8 c9 d9 e9 f9
+  // c2 d2 e2 f2 c3 d3 e3 f3  ca da ea fa cb db eb fb
+  // c4 d4 e4 f4 c5 d5 e5 f5  cc dc ec fc cd dd ed fd
+  // c6 d6 e6 f6 c7 d7 e7 f7  ce de ee fe cf df ef ff
+
+  tr0_0 = _mm256_unpacklo_epi64(tr1_0, tr1_4);
+  tr0_1 = _mm256_unpackhi_epi64(tr1_0, tr1_4);
+  tr0_2 = _mm256_unpacklo_epi64(tr1_1, tr1_5);
+  tr0_3 = _mm256_unpackhi_epi64(tr1_1, tr1_5);
+  tr0_4 = _mm256_unpacklo_epi64(tr1_2, tr1_6);
+  tr0_5 = _mm256_unpackhi_epi64(tr1_2, tr1_6);
+  tr0_6 = _mm256_unpacklo_epi64(tr1_3, tr1_7);
+  tr0_7 = _mm256_unpackhi_epi64(tr1_3, tr1_7);
+
+  tr0_8 = _mm256_unpacklo_epi64(tr1_8, tr1_c);
+  tr0_9 = _mm256_unpackhi_epi64(tr1_8, tr1_c);
+  tr0_a = _mm256_unpacklo_epi64(tr1_9, tr1_d);
+  tr0_b = _mm256_unpackhi_epi64(tr1_9, tr1_d);
+  tr0_c = _mm256_unpacklo_epi64(tr1_a, tr1_e);
+  tr0_d = _mm256_unpackhi_epi64(tr1_a, tr1_e);
+  tr0_e = _mm256_unpacklo_epi64(tr1_b, tr1_f);
+  tr0_f = _mm256_unpackhi_epi64(tr1_b, tr1_f);
+
+  // 00 10 20 30 40 50 60 70  08 18 28 38 48 58 68 78
+  // 01 11 21 31 41 51 61 71  09 19 29 39 49 59 69 79
+  // 02 12 22 32 42 52 62 72  0a 1a 2a 3a 4a 5a 6a 7a
+  // 03 13 23 33 43 53 63 73  0b 1b 2b 3b 4b 5b 6b 7b
+  // 04 14 24 34 44 54 64 74  0c 1c 2c 3c 4c 5c 6c 7c
+  // 05 15 25 35 45 55 65 75  0d 1d 2d 3d 4d 5d 6d 7d
+  // 06 16 26 36 46 56 66 76  0e 1e 2e 3e 4e 5e 6e 7e
+  // 07 17 27 37 47 57 67 77  0f 1f 2f 3f 4f 5f 6f 7f
+
+  // 80 90 a0 b0 c0 d0 e0 f0  88 98 a8 b8 c8 d8 e8 f8
+  // 81 91 a1 b1 c1 d1 e1 f1  89 99 a9 b9 c9 d9 e9 f9
+  // 82 92 a2 b2 c2 d2 e2 f2  8a 9a aa ba ca da ea fa
+  // 83 93 a3 b3 c3 d3 e3 f3  8b 9b ab bb cb db eb fb
+  // 84 94 a4 b4 c4 d4 e4 f4  8c 9c ac bc cc dc ec fc
+  // 85 95 a5 b5 c5 d5 e5 f5  8d 9d ad bd cd dd ed fd
+  // 86 96 a6 b6 c6 d6 e6 f6  8e 9e ae be ce de ee fe
+  // 87 97 a7 b7 c7 d7 e7 f7  8f 9f af bf cf df ef ff
+
+  out[0] = _mm256_permute2x128_si256(tr0_0, tr0_8, 0x20);  // 0010 0000
+  out[8] = _mm256_permute2x128_si256(tr0_0, tr0_8, 0x31);  // 0011 0001
+  out[1] = _mm256_permute2x128_si256(tr0_1, tr0_9, 0x20);
+  out[9] = _mm256_permute2x128_si256(tr0_1, tr0_9, 0x31);
+  out[2] = _mm256_permute2x128_si256(tr0_2, tr0_a, 0x20);
+  out[10] = _mm256_permute2x128_si256(tr0_2, tr0_a, 0x31);
+  out[3] = _mm256_permute2x128_si256(tr0_3, tr0_b, 0x20);
+  out[11] = _mm256_permute2x128_si256(tr0_3, tr0_b, 0x31);
+
+  out[4] = _mm256_permute2x128_si256(tr0_4, tr0_c, 0x20);
+  out[12] = _mm256_permute2x128_si256(tr0_4, tr0_c, 0x31);
+  out[5] = _mm256_permute2x128_si256(tr0_5, tr0_d, 0x20);
+  out[13] = _mm256_permute2x128_si256(tr0_5, tr0_d, 0x31);
+  out[6] = _mm256_permute2x128_si256(tr0_6, tr0_e, 0x20);
+  out[14] = _mm256_permute2x128_si256(tr0_6, tr0_e, 0x31);
+  out[7] = _mm256_permute2x128_si256(tr0_7, tr0_f, 0x20);
+  out[15] = _mm256_permute2x128_si256(tr0_7, tr0_f, 0x31);
+}
+#endif
--- a/third_party/aom/aom_dsp/x86/fwd_txfm_avx2.h
+++ b/third_party/aom/aom_dsp/x86/fwd_txfm_avx2.h
@ -15,21 +15,21 @@
 #include "./aom_config.h"

 static INLINE void storeu_output_avx2(const __m256i *coeff, tran_low_t *out) {
-#if CONFIG_HIGHBITDEPTH
-  const __m256i zero = _mm256_setzero_si256();
-  const __m256i sign = _mm256_cmpgt_epi16(zero, *coeff);
+  if (sizeof(tran_low_t) == 4) {
+    const __m256i zero = _mm256_setzero_si256();
+    const __m256i sign = _mm256_cmpgt_epi16(zero, *coeff);

-  __m256i x0 = _mm256_unpacklo_epi16(*coeff, sign);
-  __m256i x1 = _mm256_unpackhi_epi16(*coeff, sign);
+    __m256i x0 = _mm256_unpacklo_epi16(*coeff, sign);
+    __m256i x1 = _mm256_unpackhi_epi16(*coeff, sign);

-  __m256i y0 = _mm256_permute2x128_si256(x0, x1, 0x20);
-  __m256i y1 = _mm256_permute2x128_si256(x0, x1, 0x31);
+    __m256i y0 = _mm256_permute2x128_si256(x0, x1, 0x20);
+    __m256i y1 = _mm256_permute2x128_si256(x0, x1, 0x31);

-  _mm256_storeu_si256((__m256i *)out, y0);
-  _mm256_storeu_si256((__m256i *)(out + 8), y1);
-#else
-  _mm256_storeu_si256((__m256i *)out, *coeff);
-#endif
+    _mm256_storeu_si256((__m256i *)out, y0);
+    _mm256_storeu_si256((__m256i *)(out + 8), y1);
+  } else {
+    _mm256_storeu_si256((__m256i *)out, *coeff);
+  }
 }

 #endif  // AOM_DSP_X86_FWD_TXFM_AVX2_H
--- a/third_party/aom/aom_dsp/x86/fwd_txfm_sse2.h
+++ b/third_party/aom/aom_dsp/x86/fwd_txfm_sse2.h
@ -247,16 +247,16 @@ static INLINE int k_check_epi32_overflow_32(
 }

 static INLINE void store_output(const __m128i *poutput, tran_low_t *dst_ptr) {
-#if CONFIG_HIGHBITDEPTH
-  const __m128i zero = _mm_setzero_si128();
-  const __m128i sign_bits = _mm_cmplt_epi16(*poutput, zero);
-  __m128i out0 = _mm_unpacklo_epi16(*poutput, sign_bits);
-  __m128i out1 = _mm_unpackhi_epi16(*poutput, sign_bits);
-  _mm_store_si128((__m128i *)(dst_ptr), out0);
-  _mm_store_si128((__m128i *)(dst_ptr + 4), out1);
-#else
-  _mm_store_si128((__m128i *)(dst_ptr), *poutput);
-#endif  // CONFIG_HIGHBITDEPTH
+  if (sizeof(tran_low_t) == 4) {
+    const __m128i zero = _mm_setzero_si128();
+    const __m128i sign_bits = _mm_cmplt_epi16(*poutput, zero);
+    __m128i out0 = _mm_unpacklo_epi16(*poutput, sign_bits);
+    __m128i out1 = _mm_unpackhi_epi16(*poutput, sign_bits);
+    _mm_store_si128((__m128i *)(dst_ptr), out0);
+    _mm_store_si128((__m128i *)(dst_ptr + 4), out1);
+  } else {
+    _mm_store_si128((__m128i *)(dst_ptr), *poutput);
+  }
 }

 static INLINE __m128i mult_round_shift(const __m128i *pin0, const __m128i *pin1,
--- a/third_party/aom/aom_dsp/x86/highbd_intrapred_avx2.c
+++ b/third_party/aom/aom_dsp/x86/highbd_intrapred_avx2.c
@ -0,0 +1,239 @@
+/*
+ * Copyright (c) 2017, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <immintrin.h>
+
+#include "./aom_dsp_rtcd.h"
+
+// -----------------------------------------------------------------------------
+// D45E_PRED
+/*
+; ------------------------------------------
+; input: x, y, z; output: result (per 16-bit lane)
+;
+; trick from pascal
+; (x+2y+z+2)>>2 can be calculated as:
+; result = avg(x,z)          -- rounds up: (x+z+1)>>1
+; result -= xor(x,z) & 1     -- undoes the round-up: (x+z)>>1
+; result = avg(result,y)     -- (result+y+1)>>1
+; ------------------------------------------
+*/
+static INLINE __m256i avg3_epu16(const __m256i *x, const __m256i *y,
+                                 const __m256i *z) {
+  const __m256i one = _mm256_set1_epi16(1);
+  const __m256i a = _mm256_avg_epu16(*x, *z);
+  const __m256i b =
+      _mm256_subs_epu16(a, _mm256_and_si256(_mm256_xor_si256(*x, *z), one));
+  return _mm256_avg_epu16(b, *y);
+}
+
+static INLINE void d45e_w16(const __m256i *a0, const __m256i *a1,
+                            const __m256i *a2, uint16_t **dst,
+                            ptrdiff_t stride) {
+  const __m256i y = avg3_epu16(a0, a1, a2);
+  _mm256_storeu_si256((__m256i *)*dst, y);
+  *dst += stride;
+}
+
+void aom_highbd_d45e_predictor_16x8_avx2(uint16_t *dst, ptrdiff_t stride,
+                                         const uint16_t *above,
+                                         const uint16_t *left, int bd) {
+  (void)left;
+  (void)bd;
+  __m256i x0 = _mm256_loadu_si256((const __m256i *)above);
+  __m256i x1 = _mm256_loadu_si256((const __m256i *)(above + 1));
+  __m256i x2 = _mm256_loadu_si256((const __m256i *)(above + 2));
+
+  d45e_w16(&x0, &x1, &x2, &dst, stride);
+
+  int i = 3;
+  do {
+    x0 = _mm256_loadu_si256((const __m256i *)(above + i++));
+    d45e_w16(&x1, &x2, &x0, &dst, stride);
+
+    x1 = _mm256_loadu_si256((const __m256i *)(above + i++));
+    d45e_w16(&x2, &x0, &x1, &dst, stride);
+
+    x2 = _mm256_loadu_si256((const __m256i *)(above + i++));
+    d45e_w16(&x0, &x1, &x2, &dst, stride);
+  } while (i < 9);
+
+  x0 = _mm256_loadu_si256((const __m256i *)(above + 9));
+  x0 = _mm256_insert_epi16(x0, above[23], 15);
+  const __m256i y = avg3_epu16(&x1, &x2, &x0);
+  _mm256_storeu_si256((__m256i *)dst, y);
+}
+
+void aom_highbd_d45e_predictor_16x16_avx2(uint16_t *dst, ptrdiff_t stride,
+                                          const uint16_t *above,
+                                          const uint16_t *left, int bd) {
+  (void)left;
+  (void)bd;
+  __m256i x0 = _mm256_loadu_si256((const __m256i *)above);
+  __m256i x1 = _mm256_loadu_si256((const __m256i *)(above + 1));
+  __m256i x2 = _mm256_loadu_si256((const __m256i *)(above + 2));
+
+  d45e_w16(&x0, &x1, &x2, &dst, stride);
+
+  int i = 3;
+  do {
+    x0 = _mm256_loadu_si256((const __m256i *)(above + i++));
+    d45e_w16(&x1, &x2, &x0, &dst, stride);
+
+    x1 = _mm256_loadu_si256((const __m256i *)(above + i++));
+    d45e_w16(&x2, &x0, &x1, &dst, stride);
+
+    x2 = _mm256_loadu_si256((const __m256i *)(above + i++));
+    d45e_w16(&x0, &x1, &x2, &dst, stride);
+  } while (i < 15);
+
+  x0 = _mm256_loadu_si256((const __m256i *)(above + 15));
+  d45e_w16(&x1, &x2, &x0, &dst, stride);
+
+  x1 = _mm256_loadu_si256((const __m256i *)(above + 16));
+  d45e_w16(&x2, &x0, &x1, &dst, stride);
+
+  x2 = _mm256_loadu_si256((const __m256i *)(above + 17));
+  x2 = _mm256_insert_epi16(x2, above[31], 15);
+  const __m256i y = avg3_epu16(&x0, &x1, &x2);
+  _mm256_storeu_si256((__m256i *)dst, y);
+}
+
+void aom_highbd_d45e_predictor_16x32_avx2(uint16_t *dst, ptrdiff_t stride,
+                                          const uint16_t *above,
+                                          const uint16_t *left, int bd) {
+  (void)left;
+  (void)bd;
+  __m256i x0 = _mm256_loadu_si256((const __m256i *)above);
+  __m256i x1 = _mm256_loadu_si256((const __m256i *)(above + 1));
+  __m256i x2 = _mm256_loadu_si256((const __m256i *)(above + 2));
+
+  d45e_w16(&x0, &x1, &x2, &dst, stride);
+
+  int i = 3;
+  do {
+    x0 = _mm256_loadu_si256((const __m256i *)(above + i++));
+    d45e_w16(&x1, &x2, &x0, &dst, stride);
+
+    x1 = _mm256_loadu_si256((const __m256i *)(above + i++));
+    d45e_w16(&x2, &x0, &x1, &dst, stride);
+
+    x2 = _mm256_loadu_si256((const __m256i *)(above + i++));
+    d45e_w16(&x0, &x1, &x2, &dst, stride);
+  } while (i < 33);
+
+  x0 = _mm256_loadu_si256((const __m256i *)(above + 33));
+  x0 = _mm256_insert_epi16(x0, above[47], 15);
+  const __m256i y = avg3_epu16(&x1, &x2, &x0);
+  _mm256_storeu_si256((__m256i *)dst, y);
+}
+
+void aom_highbd_d45e_predictor_32x16_avx2(uint16_t *dst, ptrdiff_t stride,
+                                          const uint16_t *above,
+                                          const uint16_t *left, int bd) {
+  (void)left;
+  (void)bd;
+  __m256i x0 = _mm256_loadu_si256((const __m256i *)above);
+  __m256i x1 = _mm256_loadu_si256((const __m256i *)(above + 1));
+  __m256i x2 = _mm256_loadu_si256((const __m256i *)(above + 2));
+  __m256i y0 = _mm256_loadu_si256((const __m256i *)(above + 16));
+  __m256i y1 = _mm256_loadu_si256((const __m256i *)(above + 17));
+  __m256i y2 = _mm256_loadu_si256((const __m256i *)(above + 18));
+
+  uint16_t *dst1 = dst;
+  uint16_t *dst2 = dst + 16;
+
+  d45e_w16(&x0, &x1, &x2, &dst1, stride);
+  d45e_w16(&y0, &y1, &y2, &dst2, stride);
+
+  int i = 3;
+  do {
+    x0 = _mm256_loadu_si256((const __m256i *)(above + i));
+    d45e_w16(&x1, &x2, &x0, &dst1, stride);
+    y0 = _mm256_loadu_si256((const __m256i *)(above + 16 + i++));
+    d45e_w16(&y1, &y2, &y0, &dst2, stride);
+
+    x1 = _mm256_loadu_si256((const __m256i *)(above + i));
+    d45e_w16(&x2, &x0, &x1, &dst1, stride);
+    y1 = _mm256_loadu_si256((const __m256i *)(above + 16 + i++));
+    d45e_w16(&y2, &y0, &y1, &dst2, stride);
+
+    x2 = _mm256_loadu_si256((const __m256i *)(above + i));
+    d45e_w16(&x0, &x1, &x2, &dst1, stride);
+    y2 = _mm256_loadu_si256((const __m256i *)(above + 16 + i++));
+    d45e_w16(&y0, &y1, &y2, &dst2, stride);
+  } while (i < 15);
+
+  x0 = _mm256_loadu_si256((const __m256i *)(above + 15));
+  d45e_w16(&x1, &x2, &x0, &dst1, stride);
+  y0 = _mm256_loadu_si256((const __m256i *)(above + 16 + 15));
+  d45e_w16(&y1, &y2, &y0, &dst2, stride);
+
+  x1 = _mm256_loadu_si256((const __m256i *)(above + 16));
+  d45e_w16(&x2, &x0, &x1, &dst1, stride);
+  y1 = _mm256_loadu_si256((const __m256i *)(above + 16 + 16));
+  d45e_w16(&y2, &y0, &y1, &dst2, stride);
+
+  x2 = _mm256_loadu_si256((const __m256i *)(above + 17));
+  __m256i u = avg3_epu16(&x0, &x1, &x2);
+  _mm256_storeu_si256((__m256i *)dst1, u);
+
+  y2 = _mm256_loadu_si256((const __m256i *)(above + 16 + 17));
+  y2 = _mm256_insert_epi16(y2, above[47], 15);
+  u = avg3_epu16(&y0, &y1, &y2);
+  _mm256_storeu_si256((__m256i *)dst2, u);
+}
+
+void aom_highbd_d45e_predictor_32x32_avx2(uint16_t *dst, ptrdiff_t stride,
+                                          const uint16_t *above,
+                                          const uint16_t *left, int bd) {
+  (void)left;
+  (void)bd;
+  __m256i x0 = _mm256_loadu_si256((const __m256i *)above);
+  __m256i x1 = _mm256_loadu_si256((const __m256i *)(above + 1));
+  __m256i x2 = _mm256_loadu_si256((const __m256i *)(above + 2));
+  __m256i y0 = _mm256_loadu_si256((const __m256i *)(above + 16));
+  __m256i y1 = _mm256_loadu_si256((const __m256i *)(above + 17));
+  __m256i y2 = _mm256_loadu_si256((const __m256i *)(above + 18));
+
+  uint16_t *dst1 = dst;
+  uint16_t *dst2 = dst + 16;
+
+  d45e_w16(&x0, &x1, &x2, &dst1, stride);
+  d45e_w16(&y0, &y1, &y2, &dst2, stride);
+
+  int i = 3;
+  do {
+    x0 = _mm256_loadu_si256((const __m256i *)(above + i));
+    d45e_w16(&x1, &x2, &x0, &dst1, stride);
+    y0 = _mm256_loadu_si256((const __m256i *)(above + 16 + i++));
+    d45e_w16(&y1, &y2, &y0, &dst2, stride);
+
+    x1 = _mm256_loadu_si256((const __m256i *)(above + i));
+    d45e_w16(&x2, &x0, &x1, &dst1, stride);
+    y1 = _mm256_loadu_si256((const __m256i *)(above + 16 + i++));
+    d45e_w16(&y2, &y0, &y1, &dst2, stride);
+
+    x2 = _mm256_loadu_si256((const __m256i *)(above + i));
+    d45e_w16(&x0, &x1, &x2, &dst1, stride);
+    y2 = _mm256_loadu_si256((const __m256i *)(above + 16 + i++));
+    d45e_w16(&y0, &y1, &y2, &dst2, stride);
+  } while (i < 33);
+
+  x0 = _mm256_loadu_si256((const __m256i *)(above + 33));
+  __m256i u = avg3_epu16(&x1, &x2, &x0);
+  _mm256_storeu_si256((__m256i *)dst1, u);
+
+  y0 = _mm256_loadu_si256((const __m256i *)(above + 16 + 33));
+  y0 = _mm256_insert_epi16(y0, above[63], 15);
+  u = avg3_epu16(&y1, &y2, &y0);
+  _mm256_storeu_si256((__m256i *)dst2, u);
+}
--- a/third_party/aom/aom_dsp/x86/highbd_intrapred_sse2.asm
+++ b/third_party/aom/aom_dsp/x86/highbd_intrapred_sse2.asm
@ -257,200 +257,3 @@ cglobal highbd_v_predictor_32x32, 3, 4, 4, dst, stride, above
  dec             nlines4d
  jnz .loop
  REP_RET
-
-INIT_XMM sse2
-cglobal highbd_tm_predictor_4x4, 5, 5, 6, dst, stride, above, left, bps
-  movd                  m1, [aboveq-2]
-  movq                  m0, [aboveq]
-  pshuflw               m1, m1, 0x0
-  movlhps               m0, m0         ; t1 t2 t3 t4 t1 t2 t3 t4
-  movlhps               m1, m1         ; tl tl tl tl tl tl tl tl
-  ; Get the values to compute the maximum value at this bit depth
-  pcmpeqw               m3, m3
-  movd                  m4, bpsd
-  psubw                 m0, m1         ; t1-tl t2-tl t3-tl t4-tl
-  psllw                 m3, m4
-  pcmpeqw               m2, m2
-  pxor                  m4, m4         ; min possible value
-  pxor                  m3, m2         ; max possible value
-  mova                  m1, [leftq]
-  pshuflw               m2, m1, 0x0
-  pshuflw               m5, m1, 0x55
-  movlhps               m2, m5         ; l1 l1 l1 l1 l2 l2 l2 l2
-  paddw                 m2, m0
-  ;Clamp to the bit-depth
-  pminsw                m2, m3
-  pmaxsw                m2, m4
-  ;Store the values
-  movq    [dstq          ], m2
-  movhpd  [dstq+strideq*2], m2
-  lea                 dstq, [dstq+strideq*4]
-  pshuflw               m2, m1, 0xaa
-  pshuflw               m5, m1, 0xff
-  movlhps               m2, m5
-  paddw                 m2, m0
-  ;Clamp to the bit-depth
-  pminsw                m2, m3
-  pmaxsw                m2, m4
-  ;Store the values
-  movq    [dstq          ], m2
-  movhpd  [dstq+strideq*2], m2
-  RET
-
-INIT_XMM sse2
-cglobal highbd_tm_predictor_8x8, 5, 6, 5, dst, stride, above, left, bps, one
-  movd                  m1, [aboveq-2]
-  mova                  m0, [aboveq]
-  pshuflw               m1, m1, 0x0
-  ; Get the values to compute the maximum value at this bit depth
-  mov                 oned, 1
-  pxor                  m3, m3
-  pxor                  m4, m4
-  pinsrw                m3, oned, 0
-  pinsrw                m4, bpsd, 0
-  pshuflw               m3, m3, 0x0
-  DEFINE_ARGS dst, stride, line, left
-  punpcklqdq            m3, m3
-  mov                lineq, -4
-  mova                  m2, m3
-  punpcklqdq            m1, m1
-  psllw                 m3, m4
-  add                leftq, 16
-  psubw                 m3, m2 ; max possible value
-  pxor                  m4, m4 ; min possible value
-  psubw                 m0, m1
-.loop:
-  movd                  m1, [leftq+lineq*4]
-  movd                  m2, [leftq+lineq*4+2]
-  pshuflw               m1, m1, 0x0
-  pshuflw               m2, m2, 0x0
-  punpcklqdq            m1, m1
-  punpcklqdq            m2, m2
-  paddw                 m1, m0
-  paddw                 m2, m0
-  ;Clamp to the bit-depth
-  pminsw                m1, m3
-  pminsw                m2, m3
-  pmaxsw                m1, m4
-  pmaxsw                m2, m4
-  ;Store the values
-  mova      [dstq          ], m1
-  mova      [dstq+strideq*2], m2
-  lea                 dstq, [dstq+strideq*4]
-  inc                lineq
-  jnz .loop
-  REP_RET
-
-INIT_XMM sse2
-cglobal highbd_tm_predictor_16x16, 5, 5, 8, dst, stride, above, left, bps
-  movd                  m2, [aboveq-2]
-  mova                  m0, [aboveq]
-  mova                  m1, [aboveq+16]
-  pshuflw               m2, m2, 0x0
-  ; Get the values to compute the maximum value at this bit depth
-  pcmpeqw               m3, m3
-  movd                  m4, bpsd
-  punpcklqdq            m2, m2
-  psllw                 m3, m4
-  pcmpeqw               m5, m5
-  pxor                  m4, m4         ; min possible value
-  pxor                  m3, m5         ; max possible value
-  DEFINE_ARGS dst, stride, line, left
-  mov                lineq, -8
-  psubw                 m0, m2
-  psubw                 m1, m2
-.loop:
-  movd                  m7, [leftq]
-  pshuflw               m5, m7, 0x0
-  pshuflw               m2, m7, 0x55
-  punpcklqdq            m5, m5         ; l1 l1 l1 l1 l1 l1 l1 l1
-  punpcklqdq            m2, m2         ; l2 l2 l2 l2 l2 l2 l2 l2
-  paddw                 m6, m5, m0     ; t1-tl+l1 to t4-tl+l1
-  paddw                 m5, m1         ; t5-tl+l1 to t8-tl+l1
-  pminsw                m6, m3
-  pminsw                m5, m3
-  pmaxsw                m6, m4         ; Clamp to the bit-depth
-  pmaxsw                m5, m4
-  mova   [dstq           ], m6
-  mova   [dstq        +16], m5
-  paddw                 m6, m2, m0
-  paddw                 m2, m1
-  pminsw                m6, m3
-  pminsw                m2, m3
-  pmaxsw                m6, m4
-  pmaxsw                m2, m4
-  mova   [dstq+strideq*2 ], m6
-  mova [dstq+strideq*2+16], m2
-  lea                 dstq, [dstq+strideq*4]
-  inc                lineq
-  lea                leftq, [leftq+4]
-
-  jnz .loop
-  REP_RET
-
-INIT_XMM sse2
-cglobal highbd_tm_predictor_32x32, 5, 5, 8, dst, stride, above, left, bps
-  movd                  m0, [aboveq-2]
-  mova                  m1, [aboveq]
-  mova                  m2, [aboveq+16]
-  mova                  m3, [aboveq+32]
-  mova                  m4, [aboveq+48]
-  pshuflw               m0, m0, 0x0
-  ; Get the values to compute the maximum value at this bit depth
-  pcmpeqw               m5, m5
-  movd                  m6, bpsd
-  psllw                 m5, m6
-  pcmpeqw               m7, m7
-  pxor                  m6, m6         ; min possible value
-  pxor                  m5, m7         ; max possible value
-  punpcklqdq            m0, m0
-  DEFINE_ARGS dst, stride, line, left
-  mov                lineq, -16
-  psubw                 m1, m0
-  psubw                 m2, m0
-  psubw                 m3, m0
-  psubw                 m4, m0
-.loop:
-  movd                  m7, [leftq]
-  pshuflw               m7, m7, 0x0
-  punpcklqdq            m7, m7         ; l1 l1 l1 l1 l1 l1 l1 l1
-  paddw                 m0, m7, m1
-  pminsw                m0, m5
-  pmaxsw                m0, m6
-  mova   [dstq           ], m0
-  paddw                 m0, m7, m2
-  pminsw                m0, m5
-  pmaxsw                m0, m6
-  mova   [dstq        +16], m0
-  paddw                 m0, m7, m3
-  pminsw                m0, m5
-  pmaxsw                m0, m6
-  mova   [dstq        +32], m0
-  paddw                 m0, m7, m4
-  pminsw                m0, m5
-  pmaxsw                m0, m6
-  mova   [dstq        +48], m0
-  movd                  m7, [leftq+2]
-  pshuflw               m7, m7, 0x0
-  punpcklqdq            m7, m7         ; l2 l2 l2 l2 l2 l2 l2 l2
-  paddw                 m0, m7, m1
-  pminsw                m0, m5
-  pmaxsw                m0, m6
-  mova   [dstq+strideq*2 ], m0
-  paddw                 m0, m7, m2
-  pminsw                m0, m5
-  pmaxsw                m0, m6
-  mova   [dstq+strideq*2+16], m0
-  paddw                 m0, m7, m3
-  pminsw                m0, m5
-  pmaxsw                m0, m6
-  mova   [dstq+strideq*2+32], m0
-  paddw                 m0, m7, m4
-  pminsw                m0, m5
-  pmaxsw                m0, m6
-  mova   [dstq+strideq*2+48], m0
-  lea                 dstq, [dstq+strideq*4]
-  lea                leftq, [leftq+4]
-  inc                lineq
-  jnz .loop
-  REP_RET
--- a/third_party/aom/aom_dsp/x86/highbd_intrapred_sse2.c
+++ b/third_party/aom/aom_dsp/x86/highbd_intrapred_sse2.c
--- a/third_party/aom/aom_dsp/x86/highbd_intrapred_ssse3.c
+++ b/third_party/aom/aom_dsp/x86/highbd_intrapred_ssse3.c
@ -0,0 +1,521 @@
+/*
+ * Copyright (c) 2017, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <tmmintrin.h>
+
+#include "./aom_dsp_rtcd.h"
+
+// -----------------------------------------------------------------------------
+/*
+; ------------------------------------------
+; input: x, y, z (eight unsigned 16-bit lanes each); the result is returned
+;
+; trick from pascal
+; (x+2y+z+2)>>2 can be calculated as:
+; result = avg(x,z)          ; avg rounds up: (x+z+1)>>1
+; result -= xor(x,z) & 1     ; cancels that round-up, leaving (x+z)>>1
+; result = avg(result,y)     ; (result+y+1)>>1
+; ------------------------------------------
+*/
+static INLINE __m128i avg3_epu16(const __m128i *x, const __m128i *y,
+                                 const __m128i *z) {  // y is the centre tap (weight 2)
+  const __m128i one = _mm_set1_epi16(1);
+  const __m128i a = _mm_avg_epu16(*x, *z);  // (x + z + 1) >> 1
+  const __m128i b =
+      _mm_subs_epu16(a, _mm_and_si128(_mm_xor_si128(*x, *z), one));  // (x + z) >> 1
+  return _mm_avg_epu16(b, *y);
+}
+
+DECLARE_ALIGNED(16, static const uint8_t, rotate_right_epu16[16]) = {  // byte-shuffle control used by rotr_epu16()
+  2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1  // output word i = input word (i + 1) % 8
+};
+
+static INLINE __m128i rotr_epu16(__m128i *a, const __m128i *rotrw) {  // shuffles *a in place by *rotrw
+  *a = _mm_shuffle_epi8(*a, *rotrw);  // caller's register is updated, so repeated calls keep rotating
+  return *a;  // also hand back the shuffled value
+}
+
+void aom_highbd_d117_predictor_8x8_ssse3(uint16_t *dst, ptrdiff_t stride,
+                                         const uint16_t *above,
+                                         const uint16_t *left, int bd) {  // writes 8 rows of 8 uint16 pixels; bd is unused
+  const __m128i rotrw = _mm_load_si128((const __m128i *)rotate_right_epu16);
+  const __m128i XABCDEFG = _mm_loadu_si128((const __m128i *)(above - 1));  // above[-1..6]
+  const __m128i ABCDEFGH = _mm_load_si128((const __m128i *)above);  // aligned load: above[0..7]
+  const __m128i IJKLMNOP = _mm_load_si128((const __m128i *)left);  // aligned load: left[0..7]
+  const __m128i IXABCDEF =
+      _mm_alignr_epi8(XABCDEFG, _mm_slli_si128(IJKLMNOP, 14), 14);  // left[0], above[-1..5]
+  const __m128i avg3 = avg3_epu16(&ABCDEFGH, &XABCDEFG, &IXABCDEF);
+  const __m128i avg2 = _mm_avg_epu16(ABCDEFGH, XABCDEFG);
+  const __m128i XIJKLMNO =
+      _mm_alignr_epi8(IJKLMNOP, _mm_slli_si128(XABCDEFG, 14), 14);  // above[-1], left[0..6]
+  const __m128i JKLMNOP0 = _mm_srli_si128(IJKLMNOP, 2);  // left[1..7], 0
+  __m128i avg3_left = avg3_epu16(&XIJKLMNO, &IJKLMNOP, &JKLMNOP0);
+  __m128i rowa = avg2;  // rows 0, 2, 4, 6 start from the 2-tap average
+  __m128i rowb = avg3;  // rows 1, 3, 5, 7 start from the 3-tap average
+  int i;
+  (void)bd;
+  for (i = 0; i < 8; i += 2) {
+    _mm_store_si128((__m128i *)dst, rowa);  // aligned store: dst row must be 16-byte aligned
+    dst += stride;
+    _mm_store_si128((__m128i *)dst, rowb);
+    dst += stride;
+    rowa = _mm_alignr_epi8(rowa, rotr_epu16(&avg3_left, &rotrw), 14);  // move up one lane; lane 0 takes the next avg3_left value
+    rowb = _mm_alignr_epi8(rowb, rotr_epu16(&avg3_left, &rotrw), 14);  // second rotation, so rowb gets the following value
+  }
+}
+
+void aom_highbd_d117_predictor_16x16_ssse3(uint16_t *dst, ptrdiff_t stride,
+                                           const uint16_t *above,
+                                           const uint16_t *left, int bd) {  // writes 16 rows of 16 uint16 pixels; bd is unused
+  const __m128i rotrw = _mm_load_si128((const __m128i *)rotate_right_epu16);
+  const __m128i B0 = _mm_loadu_si128((const __m128i *)(above - 1));  // above[-1..6]
+  const __m128i A0 = _mm_load_si128((const __m128i *)above);  // aligned load: above[0..7]
+  const __m128i B1 = _mm_loadu_si128((const __m128i *)(above + 7));
+  const __m128i A1 = _mm_load_si128((const __m128i *)(above + 8));
+  const __m128i avg2_0 = _mm_avg_epu16(A0, B0);
+  const __m128i avg2_1 = _mm_avg_epu16(A1, B1);
+  const __m128i L0 = _mm_load_si128((const __m128i *)left);  // aligned load: left[0..7]
+  const __m128i L1 = _mm_load_si128((const __m128i *)(left + 8));
+  const __m128i C0 = _mm_alignr_epi8(B0, _mm_slli_si128(L0, 14), 14);  // left[0], above[-1..5]
+  const __m128i C1 = _mm_alignr_epi8(B1, B0, 14);  // above[6..13]
+  const __m128i avg3_0 = avg3_epu16(&A0, &B0, &C0);
+  const __m128i avg3_1 = avg3_epu16(&A1, &B1, &C1);
+  const __m128i XL0 = _mm_alignr_epi8(L0, _mm_slli_si128(B0, 14), 14);  // above[-1], left[0..6]
+  const __m128i XL1 = _mm_alignr_epi8(L1, L0, 14);  // left[7..14]
+  const __m128i L0_ = _mm_alignr_epi8(L1, L0, 2);  // left[1..8]
+  const __m128i L1_ = _mm_srli_si128(L1, 2);  // left[9..15], 0
+  __m128i rowa_0 = avg2_0;  // even rows: 2-tap average
+  __m128i rowa_1 = avg2_1;
+  __m128i rowb_0 = avg3_0;  // odd rows: 3-tap average
+  __m128i rowb_1 = avg3_1;
+  __m128i avg3_left[2];
+  int i, j;
+  (void)bd;
+  avg3_left[0] = avg3_epu16(&XL0, &L0, &L0_);
+  avg3_left[1] = avg3_epu16(&XL1, &L1, &L1_);
+  for (i = 0; i < 2; ++i) {  // one pass per 8-lane half of the left column
+    __m128i avg_left = avg3_left[i];
+    for (j = 0; j < 8; j += 2) {
+      _mm_store_si128((__m128i *)dst, rowa_0);
+      _mm_store_si128((__m128i *)(dst + 8), rowa_1);
+      dst += stride;
+      _mm_store_si128((__m128i *)dst, rowb_0);
+      _mm_store_si128((__m128i *)(dst + 8), rowb_1);
+      dst += stride;
+      rowa_1 = _mm_alignr_epi8(rowa_1, rowa_0, 14);  // high half first: it needs rowa_0's old top lane
+      rowa_0 = _mm_alignr_epi8(rowa_0, rotr_epu16(&avg_left, &rotrw), 14);  // lane 0 takes the next avg_left value
+      rowb_1 = _mm_alignr_epi8(rowb_1, rowb_0, 14);
+      rowb_0 = _mm_alignr_epi8(rowb_0, rotr_epu16(&avg_left, &rotrw), 14);
+    }
+  }
+}
+
+void aom_highbd_d117_predictor_32x32_ssse3(uint16_t *dst, ptrdiff_t stride,
+                                           const uint16_t *above,
+                                           const uint16_t *left, int bd) {  // writes 32 rows of 32 uint16 pixels; bd is unused
+  const __m128i rotrw = _mm_load_si128((const __m128i *)rotate_right_epu16);
+  const __m128i A0 = _mm_load_si128((const __m128i *)above);  // aligned loads: above[0..31]
+  const __m128i A1 = _mm_load_si128((const __m128i *)(above + 8));
+  const __m128i A2 = _mm_load_si128((const __m128i *)(above + 16));
+  const __m128i A3 = _mm_load_si128((const __m128i *)(above + 24));
+  const __m128i B0 = _mm_loadu_si128((const __m128i *)(above - 1));  // unaligned loads: above[-1..30]
+  const __m128i B1 = _mm_loadu_si128((const __m128i *)(above + 7));
+  const __m128i B2 = _mm_loadu_si128((const __m128i *)(above + 15));
+  const __m128i B3 = _mm_loadu_si128((const __m128i *)(above + 23));
+  const __m128i avg2_0 = _mm_avg_epu16(A0, B0);
+  const __m128i avg2_1 = _mm_avg_epu16(A1, B1);
+  const __m128i avg2_2 = _mm_avg_epu16(A2, B2);
+  const __m128i avg2_3 = _mm_avg_epu16(A3, B3);
+  const __m128i L0 = _mm_load_si128((const __m128i *)left);  // aligned loads: left[0..31]
+  const __m128i L1 = _mm_load_si128((const __m128i *)(left + 8));
+  const __m128i L2 = _mm_load_si128((const __m128i *)(left + 16));
+  const __m128i L3 = _mm_load_si128((const __m128i *)(left + 24));
+  const __m128i C0 = _mm_alignr_epi8(B0, _mm_slli_si128(L0, 14), 14);  // left[0], above[-1..5]
+  const __m128i C1 = _mm_alignr_epi8(B1, B0, 14);  // C1..C3 continue with above[6..29]
+  const __m128i C2 = _mm_alignr_epi8(B2, B1, 14);
+  const __m128i C3 = _mm_alignr_epi8(B3, B2, 14);
+  const __m128i avg3_0 = avg3_epu16(&A0, &B0, &C0);
+  const __m128i avg3_1 = avg3_epu16(&A1, &B1, &C1);
+  const __m128i avg3_2 = avg3_epu16(&A2, &B2, &C2);
+  const __m128i avg3_3 = avg3_epu16(&A3, &B3, &C3);
+  const __m128i XL0 = _mm_alignr_epi8(L0, _mm_slli_si128(B0, 14), 14);  // above[-1], left[0..6]
+  const __m128i XL1 = _mm_alignr_epi8(L1, L0, 14);  // XL1..XL3 continue with left[7..30]
+  const __m128i XL2 = _mm_alignr_epi8(L2, L1, 14);
+  const __m128i XL3 = _mm_alignr_epi8(L3, L2, 14);
+  const __m128i L0_ = _mm_alignr_epi8(L1, L0, 2);  // left[1..8]
+  const __m128i L1_ = _mm_alignr_epi8(L2, L1, 2);
+  const __m128i L2_ = _mm_alignr_epi8(L3, L2, 2);
+  const __m128i L3_ = _mm_srli_si128(L3, 2);  // left[25..31], 0
+  __m128i rowa_0 = avg2_0;  // even rows: 2-tap average
+  __m128i rowa_1 = avg2_1;
+  __m128i rowa_2 = avg2_2;
+  __m128i rowa_3 = avg2_3;
+  __m128i rowb_0 = avg3_0;  // odd rows: 3-tap average
+  __m128i rowb_1 = avg3_1;
+  __m128i rowb_2 = avg3_2;
+  __m128i rowb_3 = avg3_3;
+  __m128i avg3_left[4];
+  int i, j;
+  (void)bd;
+  avg3_left[0] = avg3_epu16(&XL0, &L0, &L0_);
+  avg3_left[1] = avg3_epu16(&XL1, &L1, &L1_);
+  avg3_left[2] = avg3_epu16(&XL2, &L2, &L2_);
+  avg3_left[3] = avg3_epu16(&XL3, &L3, &L3_);
+  for (i = 0; i < 4; ++i) {  // one pass per 8-lane quarter of the left column
+    __m128i avg_left = avg3_left[i];
+    for (j = 0; j < 8; j += 2) {
+      _mm_store_si128((__m128i *)dst, rowa_0);
+      _mm_store_si128((__m128i *)(dst + 8), rowa_1);
+      _mm_store_si128((__m128i *)(dst + 16), rowa_2);
+      _mm_store_si128((__m128i *)(dst + 24), rowa_3);
+      dst += stride;
+      _mm_store_si128((__m128i *)dst, rowb_0);
+      _mm_store_si128((__m128i *)(dst + 8), rowb_1);
+      _mm_store_si128((__m128i *)(dst + 16), rowb_2);
+      _mm_store_si128((__m128i *)(dst + 24), rowb_3);
+      dst += stride;
+      rowa_3 = _mm_alignr_epi8(rowa_3, rowa_2, 14);  // update from the top segment down so each reads its neighbour's old value
+      rowa_2 = _mm_alignr_epi8(rowa_2, rowa_1, 14);
+      rowa_1 = _mm_alignr_epi8(rowa_1, rowa_0, 14);
+      rowa_0 = _mm_alignr_epi8(rowa_0, rotr_epu16(&avg_left, &rotrw), 14);  // lane 0 takes the next avg_left value
+      rowb_3 = _mm_alignr_epi8(rowb_3, rowb_2, 14);
+      rowb_2 = _mm_alignr_epi8(rowb_2, rowb_1, 14);
+      rowb_1 = _mm_alignr_epi8(rowb_1, rowb_0, 14);
+      rowb_0 = _mm_alignr_epi8(rowb_0, rotr_epu16(&avg_left, &rotrw), 14);
+    }
+  }
+}
+
+void aom_highbd_d135_predictor_8x8_ssse3(uint16_t *dst, ptrdiff_t stride,
+                                         const uint16_t *above,
+                                         const uint16_t *left, int bd) {  // writes 8 rows of 8 uint16 pixels; bd is unused
+  const __m128i rotrw = _mm_load_si128((const __m128i *)rotate_right_epu16);
+  const __m128i XABCDEFG = _mm_loadu_si128((const __m128i *)(above - 1));  // above[-1..6]
+  const __m128i ABCDEFGH = _mm_load_si128((const __m128i *)above);  // aligned load: above[0..7]
+  const __m128i BCDEFGH0 = _mm_srli_si128(ABCDEFGH, 2);  // above[1..7], 0
+  const __m128i IJKLMNOP = _mm_load_si128((const __m128i *)left);  // aligned load: left[0..7]
+  const __m128i XIJKLMNO =
+      _mm_alignr_epi8(IJKLMNOP, _mm_slli_si128(XABCDEFG, 14), 14);  // above[-1], left[0..6]
+  const __m128i AXIJKLMN =
+      _mm_alignr_epi8(XIJKLMNO, _mm_slli_si128(ABCDEFGH, 14), 14);  // above[0], above[-1], left[0..5]
+  const __m128i avg3 = avg3_epu16(&XABCDEFG, &ABCDEFGH, &BCDEFGH0);
+  __m128i avg3_left = avg3_epu16(&IJKLMNOP, &XIJKLMNO, &AXIJKLMN);
+  __m128i rowa = avg3;
+  int i;
+  (void)bd;
+  for (i = 0; i < 8; ++i) {
+    rowa = _mm_alignr_epi8(rowa, rotr_epu16(&avg3_left, &rotrw), 14);  // shift before storing: row 0 already holds avg3_left lane 0
+    _mm_store_si128((__m128i *)dst, rowa);  // aligned store: dst row must be 16-byte aligned
+    dst += stride;
+  }
+}
+
+void aom_highbd_d135_predictor_16x16_ssse3(uint16_t *dst, ptrdiff_t stride,
+                                           const uint16_t *above,
+                                           const uint16_t *left, int bd) {  // writes 16 rows of 16 uint16 pixels; bd is unused
+  const __m128i rotrw = _mm_load_si128((const __m128i *)rotate_right_epu16);
+  const __m128i A0 = _mm_loadu_si128((const __m128i *)(above - 1));  // above[-1..6]
+  const __m128i B0 = _mm_load_si128((const __m128i *)above);  // aligned load: above[0..7]
+  const __m128i A1 = _mm_loadu_si128((const __m128i *)(above + 7));
+  const __m128i B1 = _mm_load_si128((const __m128i *)(above + 8));
+  const __m128i L0 = _mm_load_si128((const __m128i *)left);  // aligned load: left[0..7]
+  const __m128i L1 = _mm_load_si128((const __m128i *)(left + 8));
+  const __m128i C0 = _mm_alignr_epi8(B1, B0, 2);  // above[1..8]
+  const __m128i C1 = _mm_srli_si128(B1, 2);  // above[9..15], 0
+  const __m128i avg3_0 = avg3_epu16(&A0, &B0, &C0);
+  const __m128i avg3_1 = avg3_epu16(&A1, &B1, &C1);
+  const __m128i XL0 = _mm_alignr_epi8(L0, _mm_slli_si128(A0, 14), 14);  // above[-1], left[0..6]
+  const __m128i XL1 = _mm_alignr_epi8(L1, L0, 14);  // left[7..14]
+  const __m128i L0_ = _mm_alignr_epi8(XL0, _mm_slli_si128(B0, 14), 14);  // above[0], above[-1], left[0..5]
+  const __m128i L1_ = _mm_alignr_epi8(XL1, XL0, 14);  // left[6..13]
+  __m128i rowa_0 = avg3_0;
+  __m128i rowa_1 = avg3_1;
+  __m128i avg3_left[2];
+  int i, j;
+  (void)bd;
+  avg3_left[0] = avg3_epu16(&L0, &XL0, &L0_);
+  avg3_left[1] = avg3_epu16(&L1, &XL1, &L1_);
+  for (i = 0; i < 2; ++i) {  // one pass per 8-lane half of the left column
+    __m128i avg_left = avg3_left[i];
+    for (j = 0; j < 8; ++j) {
+      rowa_1 = _mm_alignr_epi8(rowa_1, rowa_0, 14);  // high half first: it needs rowa_0's old top lane
+      rowa_0 = _mm_alignr_epi8(rowa_0, rotr_epu16(&avg_left, &rotrw), 14);  // lane 0 takes the next avg_left value
+      _mm_store_si128((__m128i *)dst, rowa_0);
+      _mm_store_si128((__m128i *)(dst + 8), rowa_1);
+      dst += stride;
+    }
+  }
+}
+
+void aom_highbd_d135_predictor_32x32_ssse3(uint16_t *dst, ptrdiff_t stride,
+                                           const uint16_t *above,
+                                           const uint16_t *left, int bd) {  // writes 32 rows of 32 uint16 pixels; bd is unused
+  const __m128i rotrw = _mm_load_si128((const __m128i *)rotate_right_epu16);
+  const __m128i A0 = _mm_loadu_si128((const __m128i *)(above - 1));  // unaligned loads: above[-1..30]
+  const __m128i A1 = _mm_loadu_si128((const __m128i *)(above + 7));
+  const __m128i A2 = _mm_loadu_si128((const __m128i *)(above + 15));
+  const __m128i A3 = _mm_loadu_si128((const __m128i *)(above + 23));
+  const __m128i B0 = _mm_load_si128((const __m128i *)above);  // aligned loads: above[0..31]
+  const __m128i B1 = _mm_load_si128((const __m128i *)(above + 8));
+  const __m128i B2 = _mm_load_si128((const __m128i *)(above + 16));
+  const __m128i B3 = _mm_load_si128((const __m128i *)(above + 24));
+  const __m128i L0 = _mm_load_si128((const __m128i *)left);  // aligned loads: left[0..31]
+  const __m128i L1 = _mm_load_si128((const __m128i *)(left + 8));
+  const __m128i L2 = _mm_load_si128((const __m128i *)(left + 16));
+  const __m128i L3 = _mm_load_si128((const __m128i *)(left + 24));
+  const __m128i C0 = _mm_alignr_epi8(B1, B0, 2);  // above[1..8]
+  const __m128i C1 = _mm_alignr_epi8(B2, B1, 2);
+  const __m128i C2 = _mm_alignr_epi8(B3, B2, 2);
+  const __m128i C3 = _mm_srli_si128(B3, 2);  // above[25..31], 0
+  const __m128i avg3_0 = avg3_epu16(&A0, &B0, &C0);
+  const __m128i avg3_1 = avg3_epu16(&A1, &B1, &C1);
+  const __m128i avg3_2 = avg3_epu16(&A2, &B2, &C2);
+  const __m128i avg3_3 = avg3_epu16(&A3, &B3, &C3);
+  const __m128i XL0 = _mm_alignr_epi8(L0, _mm_slli_si128(A0, 14), 14);  // above[-1], left[0..6]
+  const __m128i XL1 = _mm_alignr_epi8(L1, L0, 14);  // XL1..XL3 continue with left[7..30]
+  const __m128i XL2 = _mm_alignr_epi8(L2, L1, 14);
+  const __m128i XL3 = _mm_alignr_epi8(L3, L2, 14);
+  const __m128i L0_ = _mm_alignr_epi8(XL0, _mm_slli_si128(B0, 14), 14);  // above[0], above[-1], left[0..5]
+  const __m128i L1_ = _mm_alignr_epi8(XL1, XL0, 14);  // L1_..L3_ continue with left[6..29]
+  const __m128i L2_ = _mm_alignr_epi8(XL2, XL1, 14);
+  const __m128i L3_ = _mm_alignr_epi8(XL3, XL2, 14);
+  __m128i rowa_0 = avg3_0;
+  __m128i rowa_1 = avg3_1;
+  __m128i rowa_2 = avg3_2;
+  __m128i rowa_3 = avg3_3;
+  __m128i avg3_left[4];
+  int i, j;
+  (void)bd;
+  avg3_left[0] = avg3_epu16(&L0, &XL0, &L0_);
+  avg3_left[1] = avg3_epu16(&L1, &XL1, &L1_);
+  avg3_left[2] = avg3_epu16(&L2, &XL2, &L2_);
+  avg3_left[3] = avg3_epu16(&L3, &XL3, &L3_);
+  for (i = 0; i < 4; ++i) {  // one pass per 8-lane quarter of the left column
+    __m128i avg_left = avg3_left[i];
+    for (j = 0; j < 8; ++j) {
+      rowa_3 = _mm_alignr_epi8(rowa_3, rowa_2, 14);  // update from the top segment down so each reads its neighbour's old value
+      rowa_2 = _mm_alignr_epi8(rowa_2, rowa_1, 14);
+      rowa_1 = _mm_alignr_epi8(rowa_1, rowa_0, 14);
+      rowa_0 = _mm_alignr_epi8(rowa_0, rotr_epu16(&avg_left, &rotrw), 14);  // lane 0 takes the next avg_left value
+      _mm_store_si128((__m128i *)dst, rowa_0);
+      _mm_store_si128((__m128i *)(dst + 8), rowa_1);
+      _mm_store_si128((__m128i *)(dst + 16), rowa_2);
+      _mm_store_si128((__m128i *)(dst + 24), rowa_3);
+      dst += stride;
+    }
+  }
+}
+
+void aom_highbd_d153_predictor_8x8_ssse3(uint16_t *dst, ptrdiff_t stride,
+                                         const uint16_t *above,
+                                         const uint16_t *left, int bd) {  // writes 8 rows of 8 uint16 pixels; bd is unused
+  const __m128i XABCDEFG = _mm_loadu_si128((const __m128i *)(above - 1));  // above[-1..6]
+  const __m128i ABCDEFG0 = _mm_srli_si128(XABCDEFG, 2);  // above[0..6], 0
+  const __m128i BCDEFG00 = _mm_srli_si128(XABCDEFG, 4);  // above[1..6], 0, 0
+  const __m128i avg3 = avg3_epu16(&BCDEFG00, &ABCDEFG0, &XABCDEFG);
+  const __m128i IJKLMNOP = _mm_load_si128((const __m128i *)left);  // aligned load: left[0..7]
+  const __m128i XIJKLMNO =
+      _mm_alignr_epi8(IJKLMNOP, _mm_slli_si128(XABCDEFG, 14), 14);  // above[-1], left[0..6]
+  const __m128i AXIJKLMN =
+      _mm_alignr_epi8(XIJKLMNO, _mm_slli_si128(XABCDEFG, 12), 14);  // above[0], above[-1], left[0..5]
+  const __m128i avg3_left = avg3_epu16(&IJKLMNOP, &XIJKLMNO, &AXIJKLMN);
+  const __m128i avg2_left = _mm_avg_epu16(IJKLMNOP, XIJKLMNO);
+  const __m128i avg2_avg3_lo = _mm_unpacklo_epi16(avg2_left, avg3_left);  // (avg2, avg3) pairs for lanes 0..3
+  const __m128i avg2_avg3_hi = _mm_unpackhi_epi16(avg2_left, avg3_left);  // (avg2, avg3) pairs for lanes 4..7
+  const __m128i row0 =
+      _mm_alignr_epi8(avg3, _mm_slli_si128(avg2_avg3_lo, 12), 12);  // pair 0 followed by avg3 lanes 0..5
+  const __m128i row1 =
+      _mm_alignr_epi8(row0, _mm_slli_si128(avg2_avg3_lo, 8), 12);  // each later row: next pair, then previous row moved up 2 lanes
+  const __m128i row2 =
+      _mm_alignr_epi8(row1, _mm_slli_si128(avg2_avg3_lo, 4), 12);
+  const __m128i row3 = _mm_alignr_epi8(row2, avg2_avg3_lo, 12);
+  const __m128i row4 =
+      _mm_alignr_epi8(row3, _mm_slli_si128(avg2_avg3_hi, 12), 12);
+  const __m128i row5 =
+      _mm_alignr_epi8(row4, _mm_slli_si128(avg2_avg3_hi, 8), 12);
+  const __m128i row6 =
+      _mm_alignr_epi8(row5, _mm_slli_si128(avg2_avg3_hi, 4), 12);
+  const __m128i row7 = _mm_alignr_epi8(row6, avg2_avg3_hi, 12);
+  (void)bd;
+  _mm_store_si128((__m128i *)dst, row0);  // aligned stores: each dst row must be 16-byte aligned
+  dst += stride;
+  _mm_store_si128((__m128i *)dst, row1);
+  dst += stride;
+  _mm_store_si128((__m128i *)dst, row2);
+  dst += stride;
+  _mm_store_si128((__m128i *)dst, row3);
+  dst += stride;
+  _mm_store_si128((__m128i *)dst, row4);
+  dst += stride;
+  _mm_store_si128((__m128i *)dst, row5);
+  dst += stride;
+  _mm_store_si128((__m128i *)dst, row6);
+  dst += stride;
+  _mm_store_si128((__m128i *)dst, row7);
+}
+
+void aom_highbd_d153_predictor_16x16_ssse3(uint16_t *dst, ptrdiff_t stride,
+                                           const uint16_t *above,
+                                           const uint16_t *left, int bd) {  // writes 16 rows of 16 uint16 pixels; bd is unused
+  const __m128i A0 = _mm_loadu_si128((const __m128i *)(above - 1));  // above[-1..6]
+  const __m128i A1 = _mm_loadu_si128((const __m128i *)(above + 7));  // above[7..14]
+  const __m128i B0 = _mm_alignr_epi8(A1, A0, 2);  // above[0..7]
+  const __m128i B1 = _mm_srli_si128(A1, 2);  // above[8..14], 0
+  const __m128i C0 = _mm_alignr_epi8(A1, A0, 4);  // above[1..8]
+  const __m128i C1 = _mm_srli_si128(A1, 4);  // above[9..14], 0, 0
+  const __m128i avg3_0 = avg3_epu16(&A0, &B0, &C0);
+  const __m128i avg3_1 = avg3_epu16(&A1, &B1, &C1);
+  const __m128i L0 = _mm_load_si128((const __m128i *)left);  // aligned load: left[0..7]
+  const __m128i L1 = _mm_load_si128((const __m128i *)(left + 8));
+  const __m128i XL0 = _mm_alignr_epi8(L0, _mm_slli_si128(A0, 14), 14);  // above[-1], left[0..6]
+  const __m128i AXL0 = _mm_alignr_epi8(XL0, _mm_slli_si128(A0, 12), 14);  // above[0], above[-1], left[0..5]
+  const __m128i XL1 = _mm_alignr_epi8(L1, L0, 14);  // left[7..14]
+  const __m128i AXL1 = _mm_alignr_epi8(L1, L0, 12);  // left[6..13]
+  const __m128i avg3_left_0 = avg3_epu16(&L0, &XL0, &AXL0);
+  const __m128i avg2_left_0 = _mm_avg_epu16(L0, XL0);
+  const __m128i avg3_left_1 = avg3_epu16(&L1, &XL1, &AXL1);
+  const __m128i avg2_left_1 = _mm_avg_epu16(L1, XL1);
+  __m128i row_0 = avg3_0;
+  __m128i row_1 = avg3_1;
+  __m128i avg2_avg3_left[2][2];  // [left half][lo/hi interleave] of (avg2, avg3) pairs
+  int i, j;
+  (void)bd;
+
+  avg2_avg3_left[0][0] = _mm_unpacklo_epi16(avg2_left_0, avg3_left_0);
+  avg2_avg3_left[0][1] = _mm_unpackhi_epi16(avg2_left_0, avg3_left_0);
+  avg2_avg3_left[1][0] = _mm_unpacklo_epi16(avg2_left_1, avg3_left_1);
+  avg2_avg3_left[1][1] = _mm_unpackhi_epi16(avg2_left_1, avg3_left_1);
+
+  for (j = 0; j < 2; ++j) {
+    for (i = 0; i < 2; ++i) {  // each inner pass emits four rows, one per (avg2, avg3) pair
+      const __m128i avg2_avg3 = avg2_avg3_left[j][i];
+      row_1 = _mm_alignr_epi8(row_1, row_0, 12);  // high half first: it needs row_0's old top two lanes
+      row_0 = _mm_alignr_epi8(row_0, _mm_slli_si128(avg2_avg3, 12), 12);  // lanes 0..1 take pair 0
+      _mm_store_si128((__m128i *)dst, row_0);
+      _mm_store_si128((__m128i *)(dst + 8), row_1);
+      dst += stride;
+      row_1 = _mm_alignr_epi8(row_1, row_0, 12);
+      row_0 = _mm_alignr_epi8(row_0, _mm_slli_si128(avg2_avg3, 8), 12);  // pair 1
+      _mm_store_si128((__m128i *)dst, row_0);
+      _mm_store_si128((__m128i *)(dst + 8), row_1);
+      dst += stride;
+      row_1 = _mm_alignr_epi8(row_1, row_0, 12);
+      row_0 = _mm_alignr_epi8(row_0, _mm_slli_si128(avg2_avg3, 4), 12);  // pair 2
+      _mm_store_si128((__m128i *)dst, row_0);
+      _mm_store_si128((__m128i *)(dst + 8), row_1);
+      dst += stride;
+      row_1 = _mm_alignr_epi8(row_1, row_0, 12);
+      row_0 = _mm_alignr_epi8(row_0, avg2_avg3, 12);  // pair 3
+      _mm_store_si128((__m128i *)dst, row_0);
+      _mm_store_si128((__m128i *)(dst + 8), row_1);
+      dst += stride;
+    }
+  }
+}
+
+void aom_highbd_d153_predictor_32x32_ssse3(uint16_t *dst, ptrdiff_t stride,
+                                           const uint16_t *above,
+                                           const uint16_t *left, int bd) {  // writes 32 rows of 32 uint16 pixels; bd is unused
+  const __m128i A0 = _mm_loadu_si128((const __m128i *)(above - 1));  // unaligned loads: above[-1..30]
+  const __m128i A1 = _mm_loadu_si128((const __m128i *)(above + 7));
+  const __m128i A2 = _mm_loadu_si128((const __m128i *)(above + 15));
+  const __m128i A3 = _mm_loadu_si128((const __m128i *)(above + 23));
+  const __m128i B0 = _mm_alignr_epi8(A1, A0, 2);  // above[0..7]
+  const __m128i B1 = _mm_alignr_epi8(A2, A1, 2);
+  const __m128i B2 = _mm_alignr_epi8(A3, A2, 2);
+  const __m128i B3 = _mm_srli_si128(A3, 2);  // above[24..30], 0
+  const __m128i C0 = _mm_alignr_epi8(A1, A0, 4);  // above[1..8]
+  const __m128i C1 = _mm_alignr_epi8(A2, A1, 4);
+  const __m128i C2 = _mm_alignr_epi8(A3, A2, 4);
+  const __m128i C3 = _mm_srli_si128(A3, 4);  // above[25..30], 0, 0
+  const __m128i avg3_0 = avg3_epu16(&A0, &B0, &C0);
+  const __m128i avg3_1 = avg3_epu16(&A1, &B1, &C1);
+  const __m128i avg3_2 = avg3_epu16(&A2, &B2, &C2);
+  const __m128i avg3_3 = avg3_epu16(&A3, &B3, &C3);
+  const __m128i L0 = _mm_load_si128((const __m128i *)left);  // aligned loads: left[0..31]
+  const __m128i L1 = _mm_load_si128((const __m128i *)(left + 8));
+  const __m128i L2 = _mm_load_si128((const __m128i *)(left + 16));
+  const __m128i L3 = _mm_load_si128((const __m128i *)(left + 24));
+  const __m128i XL0 = _mm_alignr_epi8(L0, _mm_slli_si128(A0, 14), 14);  // above[-1], left[0..6]
+  const __m128i XL1 = _mm_alignr_epi8(L1, L0, 14);  // XL1..XL3 continue with left[7..30]
+  const __m128i XL2 = _mm_alignr_epi8(L2, L1, 14);
+  const __m128i XL3 = _mm_alignr_epi8(L3, L2, 14);
+  const __m128i AXL0 = _mm_alignr_epi8(XL0, _mm_slli_si128(A0, 12), 14);  // above[0], above[-1], left[0..5]
+  const __m128i AXL1 = _mm_alignr_epi8(L1, L0, 12);  // AXL1..AXL3 continue with left[6..29]
+  const __m128i AXL2 = _mm_alignr_epi8(L2, L1, 12);
+  const __m128i AXL3 = _mm_alignr_epi8(L3, L2, 12);
+  const __m128i avg3_left_0 = avg3_epu16(&L0, &XL0, &AXL0);
+  const __m128i avg3_left_1 = avg3_epu16(&L1, &XL1, &AXL1);
+  const __m128i avg3_left_2 = avg3_epu16(&L2, &XL2, &AXL2);
+  const __m128i avg3_left_3 = avg3_epu16(&L3, &XL3, &AXL3);
+  const __m128i avg2_left_0 = _mm_avg_epu16(L0, XL0);
+  const __m128i avg2_left_1 = _mm_avg_epu16(L1, XL1);
+  const __m128i avg2_left_2 = _mm_avg_epu16(L2, XL2);
+  const __m128i avg2_left_3 = _mm_avg_epu16(L3, XL3);
+  __m128i row_0 = avg3_0;
+  __m128i row_1 = avg3_1;
+  __m128i row_2 = avg3_2;
+  __m128i row_3 = avg3_3;
+  __m128i avg2_avg3_left[4][2];  // [left quarter][lo/hi interleave] of (avg2, avg3) pairs
+  int i, j;
+  (void)bd;
+
+  avg2_avg3_left[0][0] = _mm_unpacklo_epi16(avg2_left_0, avg3_left_0);
+  avg2_avg3_left[0][1] = _mm_unpackhi_epi16(avg2_left_0, avg3_left_0);
+  avg2_avg3_left[1][0] = _mm_unpacklo_epi16(avg2_left_1, avg3_left_1);
+  avg2_avg3_left[1][1] = _mm_unpackhi_epi16(avg2_left_1, avg3_left_1);
+  avg2_avg3_left[2][0] = _mm_unpacklo_epi16(avg2_left_2, avg3_left_2);
+  avg2_avg3_left[2][1] = _mm_unpackhi_epi16(avg2_left_2, avg3_left_2);
+  avg2_avg3_left[3][0] = _mm_unpacklo_epi16(avg2_left_3, avg3_left_3);
+  avg2_avg3_left[3][1] = _mm_unpackhi_epi16(avg2_left_3, avg3_left_3);
+
+  for (j = 0; j < 4; ++j) {
+    for (i = 0; i < 2; ++i) {  // each inner pass emits four rows, one per (avg2, avg3) pair
+      const __m128i avg2_avg3 = avg2_avg3_left[j][i];
+      row_3 = _mm_alignr_epi8(row_3, row_2, 12);  // update from the top segment down so each reads its neighbour's old value
+      row_2 = _mm_alignr_epi8(row_2, row_1, 12);
+      row_1 = _mm_alignr_epi8(row_1, row_0, 12);
+      row_0 = _mm_alignr_epi8(row_0, _mm_slli_si128(avg2_avg3, 12), 12);  // lanes 0..1 take pair 0
+      _mm_store_si128((__m128i *)dst, row_0);
+      _mm_store_si128((__m128i *)(dst + 8), row_1);
+      _mm_store_si128((__m128i *)(dst + 16), row_2);
+      _mm_store_si128((__m128i *)(dst + 24), row_3);
+      dst += stride;
+      row_3 = _mm_alignr_epi8(row_3, row_2, 12);
+      row_2 = _mm_alignr_epi8(row_2, row_1, 12);
+      row_1 = _mm_alignr_epi8(row_1, row_0, 12);
+      row_0 = _mm_alignr_epi8(row_0, _mm_slli_si128(avg2_avg3, 8), 12);  // pair 1
+      _mm_store_si128((__m128i *)dst, row_0);
+      _mm_store_si128((__m128i *)(dst + 8), row_1);
+      _mm_store_si128((__m128i *)(dst + 16), row_2);
+      _mm_store_si128((__m128i *)(dst + 24), row_3);
+      dst += stride;
+      row_3 = _mm_alignr_epi8(row_3, row_2, 12);
+      row_2 = _mm_alignr_epi8(row_2, row_1, 12);
+      row_1 = _mm_alignr_epi8(row_1, row_0, 12);
+      row_0 = _mm_alignr_epi8(row_0, _mm_slli_si128(avg2_avg3, 4), 12);  // pair 2
+      _mm_store_si128((__m128i *)dst, row_0);
+      _mm_store_si128((__m128i *)(dst + 8), row_1);
+      _mm_store_si128((__m128i *)(dst + 16), row_2);
+      _mm_store_si128((__m128i *)(dst + 24), row_3);
+      dst += stride;
+      row_3 = _mm_alignr_epi8(row_3, row_2, 12);
+      row_2 = _mm_alignr_epi8(row_2, row_1, 12);
+      row_1 = _mm_alignr_epi8(row_1, row_0, 12);
+      row_0 = _mm_alignr_epi8(row_0, avg2_avg3, 12);  // pair 3
+      _mm_store_si128((__m128i *)dst, row_0);
+      _mm_store_si128((__m128i *)(dst + 8), row_1);
+      _mm_store_si128((__m128i *)(dst + 16), row_2);
+      _mm_store_si128((__m128i *)(dst + 24), row_3);
+      dst += stride;
+    }
+  }
+}
--- a/third_party/aom/aom_dsp/x86/highbd_loopfilter_avx2.c
+++ b/third_party/aom/aom_dsp/x86/highbd_loopfilter_avx2.c
@ -0,0 +1,873 @@
+/*
+ * Copyright (c) 2017, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <immintrin.h>
+
+#include "./aom_dsp_rtcd.h"
+#include "aom_dsp/x86/common_avx2.h"
+#include "aom_dsp/x86/lpf_common_sse2.h"
+#include "aom/aom_integer.h"
+
+#if !CONFIG_PARALLEL_DEBLOCKING || !CONFIG_CB4X4
+static INLINE void get_limit(const uint8_t *bl, const uint8_t *l,
+                             const uint8_t *t, int bd, __m256i *blt,
+                             __m256i *lt, __m256i *thr) {  // widens three byte tables to 16-bit lanes scaled by bd
+  const int shift = bd - 8;  // scale 8-bit thresholds up to the bd-bit pixel range
+  const __m128i zero = _mm_setzero_si128();
+
+  __m128i x = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)bl), zero);  // aligned 16-byte load; low 8 bytes zero-extended
+  __m256i y = _mm256_inserti128_si256(_mm256_castsi128_si256(x), x, 1);  // same 8 values in both 128-bit halves
+  *blt = _mm256_slli_epi16(y, shift);
+
+  x = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)l), zero);
+  y = _mm256_inserti128_si256(_mm256_castsi128_si256(x), x, 1);
+  *lt = _mm256_slli_epi16(y, shift);
+
+  x = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)t), zero);
+  y = _mm256_inserti128_si256(_mm256_castsi128_si256(x), x, 1);
+  *thr = _mm256_slli_epi16(y, shift);
+}
+
+static INLINE void load_highbd_pixel(const uint16_t *s, int size, int pitch,
+                                     __m256i *p, __m256i *q) {  // loads `size` rows on each side of s, 16 pixels per row
+  int i;
+  for (i = 0; i < size; i++) {
+    p[i] = _mm256_loadu_si256((__m256i *)(s - (i + 1) * pitch));  // p[i]: row i + 1 before s
+    q[i] = _mm256_loadu_si256((__m256i *)(s + i * pitch));  // q[i]: row i starting at s
+  }
+}
+
+static INLINE void highbd_hev_mask(const __m256i *p, const __m256i *q,
+                                   const __m256i *t, __m256i *hev) {  // *hev lane = 0xFFFF where max(|p1-p0|, |q1-q0|) > *t
+  const __m256i abs_p1p0 = _mm256_abs_epi16(_mm256_sub_epi16(p[1], p[0]));
+  const __m256i abs_q1q0 = _mm256_abs_epi16(_mm256_sub_epi16(q[1], q[0]));
+  __m256i h = _mm256_max_epi16(abs_p1p0, abs_q1q0);
+  h = _mm256_subs_epu16(h, *t);  // saturating: zero unless h exceeds the threshold
+
+  const __m256i ffff = _mm256_set1_epi16(0xFFFF);
+  const __m256i zero = _mm256_setzero_si256();
+  *hev = _mm256_xor_si256(_mm256_cmpeq_epi16(h, zero), ffff);  // invert the "equals zero" compare
+}
+
+static INLINE void highbd_filter_mask(const __m256i *p, const __m256i *q,
+                                      const __m256i *l, const __m256i *bl,
+                                      __m256i *mask) {  // reads p[0..3] and q[0..3]; *l and *bl are per-lane limits
+  __m256i abs_p0q0 = _mm256_abs_epi16(_mm256_sub_epi16(p[0], q[0]));
+  __m256i abs_p1q1 = _mm256_abs_epi16(_mm256_sub_epi16(p[1], q[1]));
+  abs_p0q0 = _mm256_adds_epu16(abs_p0q0, abs_p0q0);  // |p0 - q0| * 2, saturating
+  abs_p1q1 = _mm256_srli_epi16(abs_p1q1, 1);  // |p1 - q1| / 2
+
+  const __m256i zero = _mm256_setzero_si256();
+  const __m256i one = _mm256_set1_epi16(1);
+  const __m256i ffff = _mm256_set1_epi16(0xFFFF);
+  __m256i max = _mm256_subs_epu16(_mm256_adds_epu16(abs_p0q0, abs_p1q1), *bl);  // nonzero where the sum exceeds *bl
+  max = _mm256_xor_si256(_mm256_cmpeq_epi16(max, zero), ffff);  // 0xFFFF in those lanes
+  max = _mm256_and_si256(max, _mm256_adds_epu16(*l, one));  // becomes *l + 1 there, so the final test against *l fails
+
+  int i;
+  for (i = 1; i < 4; ++i) {  // fold in neighbouring-row differences on both sides
+    max = _mm256_max_epi16(max,
+                           _mm256_abs_epi16(_mm256_sub_epi16(p[i], p[i - 1])));
+    max = _mm256_max_epi16(max,
+                           _mm256_abs_epi16(_mm256_sub_epi16(q[i], q[i - 1])));
+  }
+  max = _mm256_subs_epu16(max, *l);  // zero where every term is within *l
+  *mask = _mm256_cmpeq_epi16(max, zero);  // return ~mask
+}
+
+static INLINE void flat_mask_internal(const __m256i *th, const __m256i *p,
+                                      const __m256i *q, int bd, int start,
+                                      int end, __m256i *flat) {  // *flat lane = 0xFFFF where all tested rows are within the scaled *th of row 0
+  __m256i max = _mm256_setzero_si256();
+  int i;
+  for (i = start; i < end; ++i) {  // rows start .. end - 1, each compared with row 0
+    max = _mm256_max_epi16(max, _mm256_abs_epi16(_mm256_sub_epi16(p[i], p[0])));
+    max = _mm256_max_epi16(max, _mm256_abs_epi16(_mm256_sub_epi16(q[i], q[0])));
+  }
+
+  __m256i ft;
+  if (bd == 8)
+    ft = _mm256_subs_epu16(max, *th);  // threshold used as-is
+  else if (bd == 10)
+    ft = _mm256_subs_epu16(max, _mm256_slli_epi16(*th, 2));  // threshold << (10 - 8)
+  else  // bd == 12
+    ft = _mm256_subs_epu16(max, _mm256_slli_epi16(*th, 4));  // threshold << (12 - 8)
+
+  const __m256i zero = _mm256_setzero_si256();
+  *flat = _mm256_cmpeq_epi16(ft, zero);  // saturating subtract left zero => max <= scaled threshold
+}
+
+// Note:
+//  Reads p[1..3] and q[1..3], each compared against p[0] / q[0]
+static INLINE void highbd_flat_mask4(const __m256i *th, const __m256i *p,
+                                     const __m256i *q, __m256i *flat, int bd) {
+  // check the distance 1,2,3 against 0
+  flat_mask_internal(th, p, q, bd, 1, 4, flat);  // start = 1, end = 4 (exclusive)
+}
+
+// Note:
+//  Reads p[4..7] and q[4..7], each compared against p[0] / q[0]
+static INLINE void highbd_flat_mask5(const __m256i *th, const __m256i *p,
+                                     const __m256i *q, __m256i *flat, int bd) {
+  flat_mask_internal(th, p, q, bd, 4, 8, flat);  // start = 4, end = 8 (exclusive)
+}
+
+static INLINE void pixel_clamp(const __m256i *min, const __m256i *max,
+                               __m256i *pixel) {  // clamps each signed 16-bit lane of *pixel to [*min, *max] in place
+  __m256i clamped, mask;
+
+  mask = _mm256_cmpgt_epi16(*pixel, *max);  // lanes above the upper bound
+  clamped = _mm256_andnot_si256(mask, *pixel);  // keep in-range lanes, zero the rest
+  mask = _mm256_and_si256(mask, *max);  // *max in the lanes that overflowed
+  clamped = _mm256_or_si256(mask, clamped);
+
+  mask = _mm256_cmpgt_epi16(clamped, *min);  // lanes strictly above the lower bound
+  clamped = _mm256_and_si256(mask, clamped);
+  mask = _mm256_andnot_si256(mask, *min);  // *min everywhere else
+  *pixel = _mm256_or_si256(clamped, mask);
+}
+
+static INLINE void highbd_filter4(__m256i *p, __m256i *q, const __m256i *mask,
+                                  const __m256i *th, int bd, __m256i *ps,
+                                  __m256i *qs) {  // reads p[0..1], q[0..1]; writes filtered rows to ps[0..1], qs[0..1]
+  __m256i t80;
+  if (bd == 8)
+    t80 = _mm256_set1_epi16(0x80);  // 1 << (bd - 1) in each case
+  else if (bd == 10)
+    t80 = _mm256_set1_epi16(0x200);
+  else  // bd == 12
+    t80 = _mm256_set1_epi16(0x800);
+
+  __m256i ps0 = _mm256_subs_epi16(p[0], t80);  // re-centre pixels around zero (signed)
+  __m256i ps1 = _mm256_subs_epi16(p[1], t80);
+  __m256i qs0 = _mm256_subs_epi16(q[0], t80);
+  __m256i qs1 = _mm256_subs_epi16(q[1], t80);
+
+  const __m256i one = _mm256_set1_epi16(1);
+  const __m256i pmax = _mm256_subs_epi16(
+      _mm256_subs_epi16(_mm256_slli_epi16(one, bd), one), t80);  // (1 << bd) - 1 - t80
+  const __m256i zero = _mm256_setzero_si256();
+  const __m256i pmin = _mm256_subs_epi16(zero, t80);  // -t80
+
+  __m256i filter = _mm256_subs_epi16(ps1, qs1);
+  pixel_clamp(&pmin, &pmax, &filter);
+
+  __m256i hev;
+  highbd_hev_mask(p, q, th, &hev);
+  filter = _mm256_and_si256(filter, hev);  // keep the p1 - q1 term only in hev lanes
+
+  const __m256i x = _mm256_subs_epi16(qs0, ps0);
+  filter = _mm256_adds_epi16(filter, x);  // filter += 3 * (q0 - p0), via three saturating adds
+  filter = _mm256_adds_epi16(filter, x);
+  filter = _mm256_adds_epi16(filter, x);
+  pixel_clamp(&pmin, &pmax, &filter);
+  filter = _mm256_and_si256(filter, *mask);  // zero the filter where *mask is clear
+
+  const __m256i t3 = _mm256_set1_epi16(3);
+  const __m256i t4 = _mm256_set1_epi16(4);
+
+  __m256i filter1 = _mm256_adds_epi16(filter, t4);
+  __m256i filter2 = _mm256_adds_epi16(filter, t3);
+  pixel_clamp(&pmin, &pmax, &filter1);
+  pixel_clamp(&pmin, &pmax, &filter2);
+  filter1 = _mm256_srai_epi16(filter1, 3);  // arithmetic shift keeps the sign
+  filter2 = _mm256_srai_epi16(filter2, 3);
+
+  qs0 = _mm256_subs_epi16(qs0, filter1);  // q0 -= filter1
+  pixel_clamp(&pmin, &pmax, &qs0);
+  ps0 = _mm256_adds_epi16(ps0, filter2);  // p0 += filter2
+  pixel_clamp(&pmin, &pmax, &ps0);
+
+  qs[0] = _mm256_adds_epi16(qs0, t80);  // undo the re-centring
+  ps[0] = _mm256_adds_epi16(ps0, t80);
+
+  filter = _mm256_adds_epi16(filter1, one);  // (filter1 + 1) >> 1 for the outer taps
+  filter = _mm256_srai_epi16(filter, 1);
+  filter = _mm256_andnot_si256(hev, filter);  // outer taps change only in non-hev lanes
+
+  qs1 = _mm256_subs_epi16(qs1, filter);
+  pixel_clamp(&pmin, &pmax, &qs1);
+  ps1 = _mm256_adds_epi16(ps1, filter);
+  pixel_clamp(&pmin, &pmax, &ps1);
+
+  qs[1] = _mm256_adds_epi16(qs1, t80);
+  ps[1] = _mm256_adds_epi16(ps1, t80);
+}
+#endif  // #if !CONFIG_PARALLEL_DEBLOCKING || !CONFIG_CB4X4
+
+#if CONFIG_PARALLEL_DEBLOCKING && CONFIG_CB4X4
+void aom_highbd_lpf_horizontal_edge_16_avx2(uint16_t *s, int p,
+                                            const uint8_t *blt,
+                                            const uint8_t *lt,
+                                            const uint8_t *thr, int bd) {  // Variant built when CONFIG_PARALLEL_DEBLOCKING && CONFIG_CB4X4.
+  aom_highbd_lpf_horizontal_edge_16_sse2(s, p, blt, lt, thr, bd);  // Forwards every argument unchanged to the SSE2 function.
+}
+
+void aom_highbd_lpf_vertical_16_dual_avx2(uint16_t *s, int p,
+                                          const uint8_t *blt, const uint8_t *lt,
+                                          const uint8_t *thr, int bd) {  // Variant built when CONFIG_PARALLEL_DEBLOCKING && CONFIG_CB4X4.
+  aom_highbd_lpf_vertical_16_dual_sse2(s, p, blt, lt, thr, bd);  // Forwards every argument unchanged to the SSE2 function.
+}
+
+void aom_highbd_lpf_horizontal_4_dual_avx2(
+    uint16_t *s, int p, const uint8_t *blimit0, const uint8_t *limit0,
+    const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1,
+    const uint8_t *thresh1, int bd) {  // Variant built when CONFIG_PARALLEL_DEBLOCKING && CONFIG_CB4X4.
+  aom_highbd_lpf_horizontal_4_dual_sse2(s, p, blimit0, limit0, thresh0, blimit1,  // Forwards every argument unchanged to the SSE2 function.
+                                        limit1, thresh1, bd);
+}
+
+void aom_highbd_lpf_horizontal_8_dual_avx2(
+    uint16_t *s, int p, const uint8_t *blimit0, const uint8_t *limit0,
+    const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1,
+    const uint8_t *thresh1, int bd) {  // Variant built when CONFIG_PARALLEL_DEBLOCKING && CONFIG_CB4X4.
+  aom_highbd_lpf_horizontal_8_dual_sse2(s, p, blimit0, limit0, thresh0, blimit1,  // Forwards every argument unchanged to the SSE2 function.
+                                        limit1, thresh1, bd);
+}
+
+void aom_highbd_lpf_vertical_4_dual_avx2(
+    uint16_t *s, int p, const uint8_t *blimit0, const uint8_t *limit0,
+    const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1,
+    const uint8_t *thresh1, int bd) {  // Variant built when CONFIG_PARALLEL_DEBLOCKING && CONFIG_CB4X4.
+  aom_highbd_lpf_vertical_4_dual_sse2(s, p, blimit0, limit0, thresh0, blimit1,  // Forwards every argument unchanged to the SSE2 function.
+                                      limit1, thresh1, bd);
+}
+
+void aom_highbd_lpf_vertical_8_dual_avx2(
+    uint16_t *s, int p, const uint8_t *blimit0, const uint8_t *limit0,
+    const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1,
+    const uint8_t *thresh1, int bd) {  // Variant built when CONFIG_PARALLEL_DEBLOCKING && CONFIG_CB4X4.
+  aom_highbd_lpf_vertical_8_dual_sse2(s, p, blimit0, limit0, thresh0, blimit1,  // Forwards every argument unchanged to the SSE2 function.
+                                      limit1, thresh1, bd);
+}
+#else
+void aom_highbd_lpf_horizontal_edge_16_avx2(uint16_t *s, int pitch,
+                                            const uint8_t *blt,
+                                            const uint8_t *lt,
+                                            const uint8_t *thr, int bd) {  // Native AVX2 variant; writes rows s - 7 * pitch .. s + 6 * pitch.
+  __m256i blimit, limit, thresh;
+  get_limit(blt, lt, thr, bd, &blimit, &limit, &thresh);  // Helper defined outside this chunk; fills the three threshold vectors.
+
+  __m256i p[8], q[8];
+  load_highbd_pixel(s, 8, pitch, p, q);  // Helper defined outside this chunk; presumably loads 8 rows on each side of s - confirm.
+
+  __m256i mask;
+  highbd_filter_mask(p, q, &limit, &blimit, &mask);  // Helper defined outside this chunk.
+
+  __m256i flat, flat2;
+  const __m256i one = _mm256_set1_epi16(1);
+  highbd_flat_mask4(&one, p, q, &flat, bd);  // Helpers defined outside this chunk; they fill flat and flat2.
+  highbd_flat_mask5(&one, p, q, &flat2, bd);
+
+  flat = _mm256_and_si256(flat, mask);    // flat  = flat & mask
+  flat2 = _mm256_and_si256(flat2, flat);  // flat2 = flat2 & flat & mask
+
+  __m256i ps[2], qs[2];
+  highbd_filter4(p, q, &mask, &thresh, bd, ps, qs);  // Narrow-filter results for p0/p1 and q0/q1.
+
+  // flat and wide flat calculations
+  __m256i flat_p[3], flat_q[3];
+  __m256i flat2_p[7], flat2_q[7];
+  {
+    const __m256i eight = _mm256_set1_epi16(8);  // Rounding term for the >> 4 outputs.
+    const __m256i four = _mm256_set1_epi16(4);   // Rounding term for the >> 3 outputs.
+
+    __m256i sum_p = _mm256_add_epi16(_mm256_add_epi16(p[6], p[5]),  // p6 + p5 + p4 + p3
+                                     _mm256_add_epi16(p[4], p[3]));
+    __m256i sum_q = _mm256_add_epi16(_mm256_add_epi16(q[6], q[5]),  // q6 + q5 + q4 + q3
+                                     _mm256_add_epi16(q[4], q[3]));
+
+    __m256i sum_lp = _mm256_add_epi16(p[0], _mm256_add_epi16(p[2], p[1]));  // p2 + p1 + p0
+    sum_p = _mm256_add_epi16(sum_p, sum_lp);  // p6 + ... + p0
+
+    __m256i sum_lq = _mm256_add_epi16(q[0], _mm256_add_epi16(q[2], q[1]));  // q2 + q1 + q0
+    sum_q = _mm256_add_epi16(sum_q, sum_lq);  // q6 + ... + q0
+    sum_p = _mm256_add_epi16(eight, _mm256_add_epi16(sum_p, sum_q));     // 8 + (p6..p0) + (q6..q0)
+    sum_lp = _mm256_add_epi16(four, _mm256_add_epi16(sum_lp, sum_lq));  // 4 + (p2..p0) + (q2..q0)
+
+    flat2_p[0] = _mm256_srli_epi16(  // (sum_p + p7 + p0) >> 4
+        _mm256_add_epi16(sum_p, _mm256_add_epi16(p[7], p[0])), 4);
+    flat2_q[0] = _mm256_srli_epi16(  // (sum_p + q7 + q0) >> 4
+        _mm256_add_epi16(sum_p, _mm256_add_epi16(q[7], q[0])), 4);
+    flat_p[0] = _mm256_srli_epi16(  // (sum_lp + p3 + p0) >> 3
+        _mm256_add_epi16(sum_lp, _mm256_add_epi16(p[3], p[0])), 3);
+    flat_q[0] = _mm256_srli_epi16(  // (sum_lp + q3 + q0) >> 3
+        _mm256_add_epi16(sum_lp, _mm256_add_epi16(q[3], q[0])), 3);
+
+    __m256i sum_p7 = _mm256_add_epi16(p[7], p[7]);  // Running multiples of the outermost taps: 2 * p7, 2 * q7, 2 * p3, 2 * q3.
+    __m256i sum_q7 = _mm256_add_epi16(q[7], q[7]);
+    __m256i sum_p3 = _mm256_add_epi16(p[3], p[3]);
+    __m256i sum_q3 = _mm256_add_epi16(q[3], q[3]);
+
+    sum_q = _mm256_sub_epi16(sum_p, p[6]);  // From here sum_q feeds the q-side outputs (far p taps removed one by one).
+    sum_p = _mm256_sub_epi16(sum_p, q[6]);  // From here sum_p feeds the p-side outputs (far q taps removed one by one).
+    flat2_p[1] = _mm256_srli_epi16(
+        _mm256_add_epi16(sum_p, _mm256_add_epi16(sum_p7, p[1])), 4);
+    flat2_q[1] = _mm256_srli_epi16(
+        _mm256_add_epi16(sum_q, _mm256_add_epi16(sum_q7, q[1])), 4);
+
+    sum_lq = _mm256_sub_epi16(sum_lp, p[2]);  // Same split for the short sums: sum_lq is used for q outputs, sum_lp for p outputs.
+    sum_lp = _mm256_sub_epi16(sum_lp, q[2]);
+    flat_p[1] = _mm256_srli_epi16(
+        _mm256_add_epi16(sum_lp, _mm256_add_epi16(sum_p3, p[1])), 3);
+    flat_q[1] = _mm256_srli_epi16(
+        _mm256_add_epi16(sum_lq, _mm256_add_epi16(sum_q3, q[1])), 3);
+
+    sum_p7 = _mm256_add_epi16(sum_p7, p[7]);  // Now 3 * p7, 3 * q7, 3 * p3, 3 * q3.
+    sum_q7 = _mm256_add_epi16(sum_q7, q[7]);
+    sum_p3 = _mm256_add_epi16(sum_p3, p[3]);
+    sum_q3 = _mm256_add_epi16(sum_q3, q[3]);
+
+    sum_p = _mm256_sub_epi16(sum_p, q[5]);
+    sum_q = _mm256_sub_epi16(sum_q, p[5]);
+    flat2_p[2] = _mm256_srli_epi16(
+        _mm256_add_epi16(sum_p, _mm256_add_epi16(sum_p7, p[2])), 4);
+    flat2_q[2] = _mm256_srli_epi16(
+        _mm256_add_epi16(sum_q, _mm256_add_epi16(sum_q7, q[2])), 4);
+
+    sum_lp = _mm256_sub_epi16(sum_lp, q[1]);
+    sum_lq = _mm256_sub_epi16(sum_lq, p[1]);
+    flat_p[2] = _mm256_srli_epi16(
+        _mm256_add_epi16(sum_lp, _mm256_add_epi16(sum_p3, p[2])), 3);
+    flat_q[2] = _mm256_srli_epi16(
+        _mm256_add_epi16(sum_lq, _mm256_add_epi16(sum_q3, q[2])), 3);
+
+    int i;
+    for (i = 3; i < 7; ++i) {  // Remaining wide outputs 3..6 follow the same update pattern.
+      sum_p7 = _mm256_add_epi16(sum_p7, p[7]);  // One more copy of p7 / q7 per step.
+      sum_q7 = _mm256_add_epi16(sum_q7, q[7]);
+      sum_p = _mm256_sub_epi16(sum_p, q[7 - i]);  // Drop the next far-side tap.
+      sum_q = _mm256_sub_epi16(sum_q, p[7 - i]);
+      flat2_p[i] = _mm256_srli_epi16(
+          _mm256_add_epi16(sum_p, _mm256_add_epi16(sum_p7, p[i])), 4);
+      flat2_q[i] = _mm256_srli_epi16(
+          _mm256_add_epi16(sum_q, _mm256_add_epi16(sum_q7, q[i])), 4);
+    }
+  }
+
+  // highbd_filter8
+  p[2] = _mm256_andnot_si256(flat, p[2]);
+  //  p2 remains unchanged if !(flat && mask)
+  flat_p[2] = _mm256_and_si256(flat, flat_p[2]);
+  //  when (flat && mask)
+  p[2] = _mm256_or_si256(p[2], flat_p[2]);  // full list of p2 values
+  q[2] = _mm256_andnot_si256(flat, q[2]);
+  flat_q[2] = _mm256_and_si256(flat, flat_q[2]);
+  q[2] = _mm256_or_si256(q[2], flat_q[2]);  // full list of q2 values
+
+  int i;
+  for (i = 1; i >= 0; i--) {  // p1/p0 and q1/q0: highbd_filter4 result where flat is clear, flat_p/flat_q where it is set.
+    ps[i] = _mm256_andnot_si256(flat, ps[i]);
+    flat_p[i] = _mm256_and_si256(flat, flat_p[i]);
+    p[i] = _mm256_or_si256(ps[i], flat_p[i]);
+    qs[i] = _mm256_andnot_si256(flat, qs[i]);
+    flat_q[i] = _mm256_and_si256(flat, flat_q[i]);
+    q[i] = _mm256_or_si256(qs[i], flat_q[i]);
+  }
+
+  // highbd_filter16
+
+  for (i = 6; i >= 0; i--) {
+    //  p[i] remains unchanged if !(flat2 && flat && mask)
+    p[i] = _mm256_andnot_si256(flat2, p[i]);
+    flat2_p[i] = _mm256_and_si256(flat2, flat2_p[i]);
+    //  get values for when (flat2 && flat && mask)
+    p[i] = _mm256_or_si256(p[i], flat2_p[i]);  // full list of p values
+
+    q[i] = _mm256_andnot_si256(flat2, q[i]);
+    flat2_q[i] = _mm256_and_si256(flat2, flat2_q[i]);
+    q[i] = _mm256_or_si256(q[i], flat2_q[i]);
+    _mm256_storeu_si256((__m256i *)(s - (i + 1) * pitch), p[i]);  // p[i] is the row (i + 1) * pitch before s.
+    _mm256_storeu_si256((__m256i *)(s + i * pitch), q[i]);        // q[i] is the row i * pitch after s.
+  }
+}
+
+static INLINE void highbd_transpose16x16(uint16_t *src, int src_p,
+                                         uint16_t *dst, int dst_p) {  // Loads 16 rows from src, transposes them, stores 16 rows to dst.
+  __m256i x[16];
+  int i;
+  for (i = 0; i < 16; ++i) {
+    x[i] = _mm256_loadu_si256((const __m256i *)src);  // One 256-bit row (16 uint16_t values), unaligned load.
+    src += src_p;  // src_p is the row stride in uint16_t elements.
+  }
+  mm256_transpose_16x16(x, x);  // Helper defined outside this chunk; called with the same array as input and output.
+  for (i = 0; i < 16; ++i) {
+    _mm256_storeu_si256((__m256i *)dst, x[i]);
+    dst += dst_p;  // dst_p is the row stride in uint16_t elements.
+  }
+}
+
+void aom_highbd_lpf_vertical_16_dual_avx2(uint16_t *s, int p,
+                                          const uint8_t *blimit,
+                                          const uint8_t *limit,
+                                          const uint8_t *thresh, int bd) {  // Vertical edge handled as transpose -> horizontal filter -> transpose back.
+  DECLARE_ALIGNED(16, uint16_t, t_dst[256]);  // 16x16 scratch block with a row stride of 16.
+
+  //  Transpose 16x16
+  highbd_transpose16x16(s - 8, p, t_dst, 16);  // Source block starts 8 columns to the left of s.
+
+  //  Loop filtering
+  aom_highbd_lpf_horizontal_edge_16_avx2(t_dst + 8 * 16, 16, blimit, limit,  // Edge sits at row 8 of the scratch block.
+                                         thresh, bd);
+
+  //  Transpose back
+  highbd_transpose16x16(t_dst, 16, s - 8, p);
+}
+
+static INLINE void get_dual_limit(const uint8_t *b0, const uint8_t *l0,
+                                  const uint8_t *t0, const uint8_t *b1,
+                                  const uint8_t *l1, const uint8_t *t1, int bd,
+                                  __m256i *blt, __m256i *lt, __m256i *thr) {  // Builds 16-bit blimit/limit/thresh vectors: set 0 in the low 128 bits, set 1 in the high 128 bits.
+  const __m128i z128 = _mm_setzero_si128();
+  const __m128i blimit0 =  // Aligned 16-byte load, then the low 8 bytes are zero-extended to 16-bit lanes.
+      _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)b0), z128);
+  const __m128i limit0 =
+      _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)l0), z128);
+  const __m128i thresh0 =
+      _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)t0), z128);
+  const __m128i blimit1 =
+      _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)b1), z128);
+  const __m128i limit1 =
+      _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)l1), z128);
+  const __m128i thresh1 =
+      _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)t1), z128);
+
+  *blt = _mm256_inserti128_si256(_mm256_castsi128_si256(blimit0), blimit1, 1);  // Low half = set 0, high half = set 1.
+  *lt = _mm256_inserti128_si256(_mm256_castsi128_si256(limit0), limit1, 1);
+  *thr = _mm256_inserti128_si256(_mm256_castsi128_si256(thresh0), thresh1, 1);
+
+  int shift = bd - 8;  // Scale the 8-bit thresholds up to the bd-bit pixel range.
+  *blt = _mm256_slli_epi16(*blt, shift);
+  *lt = _mm256_slli_epi16(*lt, shift);
+  *thr = _mm256_slli_epi16(*thr, shift);
+}
+
+void aom_highbd_lpf_horizontal_4_dual_avx2(
+    uint16_t *s, int p, const uint8_t *_blimit0, const uint8_t *_limit0,
+    const uint8_t *_thresh0, const uint8_t *_blimit1, const uint8_t *_limit1,
+    const uint8_t *_thresh1, int bd) {  // Native AVX2 variant; reads rows s - 4 * p .. s + 3 * p, writes rows s - 2 * p .. s + 1 * p.
+  __m256i p3 = _mm256_loadu_si256((__m256i *)(s - 4 * p));
+  __m256i p2 = _mm256_loadu_si256((__m256i *)(s - 3 * p));
+  __m256i p1 = _mm256_loadu_si256((__m256i *)(s - 2 * p));
+  __m256i p0 = _mm256_loadu_si256((__m256i *)(s - 1 * p));
+  __m256i q0 = _mm256_loadu_si256((__m256i *)(s - 0 * p));
+  __m256i q1 = _mm256_loadu_si256((__m256i *)(s + 1 * p));
+  __m256i q2 = _mm256_loadu_si256((__m256i *)(s + 2 * p));
+  __m256i q3 = _mm256_loadu_si256((__m256i *)(s + 3 * p));
+
+  const __m256i abs_p1p0 = _mm256_abs_epi16(_mm256_sub_epi16(p1, p0));
+  const __m256i abs_q1q0 = _mm256_abs_epi16(_mm256_sub_epi16(q1, q0));
+
+  __m256i abs_p0q0 = _mm256_abs_epi16(_mm256_sub_epi16(p0, q0));
+  __m256i abs_p1q1 = _mm256_abs_epi16(_mm256_sub_epi16(p1, q1));
+
+  __m256i blimit, limit, thresh;
+  get_dual_limit(_blimit0, _limit0, _thresh0, _blimit1, _limit1, _thresh1, bd,  // Set 0 goes to the low 128 bits, set 1 to the high 128 bits.
+                 &blimit, &limit, &thresh);
+
+  __m256i t80, tff80, tffe0, t1f, t7f;  // Offset plus the masks used below to emulate arithmetic right shifts.
+  if (bd == 8) {
+    t80 = _mm256_set1_epi16(0x80);
+    tff80 = _mm256_set1_epi16(0xff80);
+    tffe0 = _mm256_set1_epi16(0xffe0);
+    t1f = _mm256_srli_epi16(_mm256_set1_epi16(0x1fff), 8);
+    t7f = _mm256_srli_epi16(_mm256_set1_epi16(0x7fff), 8);
+  } else if (bd == 10) {
+    t80 = _mm256_slli_epi16(_mm256_set1_epi16(0x80), 2);
+    tff80 = _mm256_slli_epi16(_mm256_set1_epi16(0xff80), 2);
+    tffe0 = _mm256_slli_epi16(_mm256_set1_epi16(0xffe0), 2);
+    t1f = _mm256_srli_epi16(_mm256_set1_epi16(0x1fff), 6);
+    t7f = _mm256_srli_epi16(_mm256_set1_epi16(0x7fff), 6);
+  } else {  // bd == 12
+    t80 = _mm256_slli_epi16(_mm256_set1_epi16(0x80), 4);
+    tff80 = _mm256_slli_epi16(_mm256_set1_epi16(0xff80), 4);
+    tffe0 = _mm256_slli_epi16(_mm256_set1_epi16(0xffe0), 4);
+    t1f = _mm256_srli_epi16(_mm256_set1_epi16(0x1fff), 4);
+    t7f = _mm256_srli_epi16(_mm256_set1_epi16(0x7fff), 4);
+  }
+
+  __m256i ps1 =  // NOTE(review): these four loads re-read the rows already held in p1, p0, q0 and q1.
+      _mm256_subs_epi16(_mm256_loadu_si256((__m256i *)(s - 2 * p)), t80);
+  __m256i ps0 =
+      _mm256_subs_epi16(_mm256_loadu_si256((__m256i *)(s - 1 * p)), t80);
+  __m256i qs0 =
+      _mm256_subs_epi16(_mm256_loadu_si256((__m256i *)(s + 0 * p)), t80);
+  __m256i qs1 =
+      _mm256_subs_epi16(_mm256_loadu_si256((__m256i *)(s + 1 * p)), t80);
+
+  // filter_mask and hev_mask
+  const __m256i zero = _mm256_setzero_si256();
+  __m256i flat = _mm256_max_epi16(abs_p1p0, abs_q1q0);  // max(|p1 - p0|, |q1 - q0|); not a flatness mask in this function.
+  __m256i hev = _mm256_subs_epu16(flat, thresh);
+  const __m256i ffff = _mm256_set1_epi16(0xFFFF);
+  hev = _mm256_xor_si256(_mm256_cmpeq_epi16(hev, zero), ffff);  // All ones where flat > thresh.
+
+  abs_p0q0 = _mm256_adds_epu16(abs_p0q0, abs_p0q0);  // |p0 - q0| * 2 (saturating)
+  abs_p1q1 = _mm256_srli_epi16(abs_p1q1, 1);         // |p1 - q1| / 2
+  __m256i mask =
+      _mm256_subs_epu16(_mm256_adds_epu16(abs_p0q0, abs_p1q1), blimit);
+  mask = _mm256_xor_si256(_mm256_cmpeq_epi16(mask, zero), ffff);
+  // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2  > blimit) * -1;
+  // So taking maximums continues to work:
+  const __m256i one = _mm256_set1_epi16(1);
+  mask = _mm256_and_si256(mask, _mm256_adds_epu16(limit, one));  // Lanes over blimit now hold limit + 1, so they fail the final test.
+  mask = _mm256_max_epi16(flat, mask);
+  // mask |= (abs(p1 - p0) > limit) * -1;
+  // mask |= (abs(q1 - q0) > limit) * -1;
+  __m256i work = _mm256_max_epi16(  // max(|p2 - p1|, |p3 - p2|); each absolute difference is built from two saturating subtractions.
+      _mm256_or_si256(_mm256_subs_epu16(p2, p1), _mm256_subs_epu16(p1, p2)),
+      _mm256_or_si256(_mm256_subs_epu16(p3, p2), _mm256_subs_epu16(p2, p3)));
+  mask = _mm256_max_epi16(work, mask);
+  work = _mm256_max_epi16(  // max(|q2 - q1|, |q3 - q2|)
+      _mm256_or_si256(_mm256_subs_epu16(q2, q1), _mm256_subs_epu16(q1, q2)),
+      _mm256_or_si256(_mm256_subs_epu16(q3, q2), _mm256_subs_epu16(q2, q3)));
+  mask = _mm256_max_epi16(work, mask);
+  mask = _mm256_subs_epu16(mask, limit);
+  mask = _mm256_cmpeq_epi16(mask, zero);  // All ones where every accumulated difference is <= limit.
+
+  // filter4
+  const __m256i pmax = _mm256_subs_epi16(  // pmax = ((1 << bd) - 1) - t80
+      _mm256_subs_epi16(_mm256_slli_epi16(one, bd), one), t80);
+  const __m256i pmin = _mm256_subs_epi16(zero, t80);  // pmin = 0 - t80
+
+  __m256i filt = _mm256_subs_epi16(ps1, qs1);
+  pixel_clamp(&pmin, &pmax, &filt);  // Helper defined outside this chunk; presumably clamps to [pmin, pmax] - confirm.
+  filt = _mm256_and_si256(filt, hev);
+  __m256i work_a = _mm256_subs_epi16(qs0, ps0);
+  filt = _mm256_adds_epi16(filt, work_a);  // Three saturating adds: filt += 3 * (qs0 - ps0).
+  filt = _mm256_adds_epi16(filt, work_a);
+  filt = _mm256_adds_epi16(filt, work_a);
+  pixel_clamp(&pmin, &pmax, &filt);
+
+  // (aom_filter + 3 * (qs0 - ps0)) & mask
+  filt = _mm256_and_si256(filt, mask);
+
+  const __m256i t4 = _mm256_set1_epi16(4);
+  const __m256i t3 = _mm256_set1_epi16(3);
+
+  __m256i filter1 = _mm256_adds_epi16(filt, t4);
+  pixel_clamp(&pmin, &pmax, &filter1);
+  __m256i filter2 = _mm256_adds_epi16(filt, t3);
+  pixel_clamp(&pmin, &pmax, &filter2);
+
+  // Filter1 >> 3
+  // NOTE(review): logical shift + masks emulate an arithmetic shift; aom_highbd_lpf_horizontal_8_dual_avx2 uses _mm256_srai_epi16 for this step.
+  work_a = _mm256_cmpgt_epi16(zero, filter1);  // get the values that are <0
+  filter1 = _mm256_srli_epi16(filter1, 3);
+  work_a = _mm256_and_si256(work_a, tffe0);    // sign bits for the values < 0
+  filter1 = _mm256_and_si256(filter1, t1f);    // clamp the range
+  filter1 = _mm256_or_si256(filter1, work_a);  // reinsert the sign bits
+
+  // Filter2 >> 3
+  work_a = _mm256_cmpgt_epi16(zero, filter2);
+  filter2 = _mm256_srli_epi16(filter2, 3);
+  work_a = _mm256_and_si256(work_a, tffe0);
+  filter2 = _mm256_and_si256(filter2, t1f);
+  filter2 = _mm256_or_si256(filter2, work_a);
+
+  // filt >> 1
+  // equivalent to shifting 0x1f left by bitdepth - 8
+  // and setting new bits to 1
+  filt = _mm256_adds_epi16(filter1, one);
+  work_a = _mm256_cmpgt_epi16(zero, filt);
+  filt = _mm256_srli_epi16(filt, 1);
+  work_a = _mm256_and_si256(work_a, tff80);
+  filt = _mm256_and_si256(filt, t7f);
+  filt = _mm256_or_si256(filt, work_a);
+
+  filt = _mm256_andnot_si256(hev, filt);  // filt = ((filter1 + 1) >> 1) & ~hev
+
+  filter1 = _mm256_subs_epi16(qs0, filter1);  // filter1/filter2 are reused below as scratch for the output rows.
+  pixel_clamp(&pmin, &pmax, &filter1);
+  q0 = _mm256_adds_epi16(filter1, t80);
+
+  filter1 = _mm256_subs_epi16(qs1, filt);
+  pixel_clamp(&pmin, &pmax, &filter1);
+  q1 = _mm256_adds_epi16(filter1, t80);
+
+  filter2 = _mm256_adds_epi16(ps0, filter2);
+  pixel_clamp(&pmin, &pmax, &filter2);
+  p0 = _mm256_adds_epi16(filter2, t80);
+
+  filter2 = _mm256_adds_epi16(ps1, filt);
+  pixel_clamp(&pmin, &pmax, &filter2);
+  p1 = _mm256_adds_epi16(filter2, t80);
+
+  _mm256_storeu_si256((__m256i *)(s - 2 * p), p1);
+  _mm256_storeu_si256((__m256i *)(s - 1 * p), p0);
+  _mm256_storeu_si256((__m256i *)(s + 0 * p), q0);
+  _mm256_storeu_si256((__m256i *)(s + 1 * p), q1);
+}
+
+void aom_highbd_lpf_horizontal_8_dual_avx2(
+    uint16_t *s, int p, const uint8_t *_blimit0, const uint8_t *_limit0,
+    const uint8_t *_thresh0, const uint8_t *_blimit1, const uint8_t *_limit1,
+    const uint8_t *_thresh1, int bd) {  // Native AVX2 variant; reads rows s - 4 * p .. s + 3 * p, writes rows s - 3 * p .. s + 2 * p.
+  DECLARE_ALIGNED(16, uint16_t, flat_op2[16]);  // Scratch rows holding the wide-filter outputs until the final blend.
+  DECLARE_ALIGNED(16, uint16_t, flat_op1[16]);
+  DECLARE_ALIGNED(16, uint16_t, flat_op0[16]);
+  DECLARE_ALIGNED(16, uint16_t, flat_oq2[16]);
+  DECLARE_ALIGNED(16, uint16_t, flat_oq1[16]);
+  DECLARE_ALIGNED(16, uint16_t, flat_oq0[16]);
+
+  __m256i p3 = _mm256_loadu_si256((__m256i *)(s - 4 * p));
+  __m256i q3 = _mm256_loadu_si256((__m256i *)(s + 3 * p));
+  __m256i p2 = _mm256_loadu_si256((__m256i *)(s - 3 * p));
+  __m256i q2 = _mm256_loadu_si256((__m256i *)(s + 2 * p));
+  __m256i p1 = _mm256_loadu_si256((__m256i *)(s - 2 * p));
+  __m256i q1 = _mm256_loadu_si256((__m256i *)(s + 1 * p));
+  __m256i p0 = _mm256_loadu_si256((__m256i *)(s - 1 * p));
+  __m256i q0 = _mm256_loadu_si256((__m256i *)(s + 0 * p));
+
+  __m256i blimit, limit, thresh;
+  get_dual_limit(_blimit0, _limit0, _thresh0, _blimit1, _limit1, _thresh1, bd,  // Set 0 goes to the low 128 bits, set 1 to the high 128 bits.
+                 &blimit, &limit, &thresh);
+
+  __m256i t80;  // Offset 0x80 / 0x200 / 0x800 for bd 8 / 10 / other (expected 12).
+  if (bd == 8) {
+    t80 = _mm256_set1_epi16(0x80);
+  } else if (bd == 10) {
+    t80 = _mm256_set1_epi16(0x200);
+  } else {  // bd == 12
+    t80 = _mm256_set1_epi16(0x800);
+  }
+
+  __m256i ps1, ps0, qs0, qs1;
+  ps1 = _mm256_subs_epi16(p1, t80);  // Subtract the offset so the values are centred on zero.
+  ps0 = _mm256_subs_epi16(p0, t80);
+  qs0 = _mm256_subs_epi16(q0, t80);
+  qs1 = _mm256_subs_epi16(q1, t80);
+
+  // filter_mask and hev_mask
+  __m256i abs_p1q1, abs_p0q0, abs_q1q0, abs_p1p0, work;
+  abs_p1p0 = _mm256_abs_epi16(_mm256_sub_epi16(p1, p0));
+  abs_q1q0 = _mm256_abs_epi16(_mm256_sub_epi16(q1, q0));
+
+  abs_p0q0 = _mm256_abs_epi16(_mm256_sub_epi16(p0, q0));
+  abs_p1q1 = _mm256_abs_epi16(_mm256_sub_epi16(p1, q1));
+  __m256i flat = _mm256_max_epi16(abs_p1p0, abs_q1q0);  // Temporarily max(|p1 - p0|, |q1 - q0|); recomputed as the flat mask below.
+  __m256i hev = _mm256_subs_epu16(flat, thresh);
+  const __m256i zero = _mm256_set1_epi16(0);
+  const __m256i ffff = _mm256_set1_epi16(0xFFFF);
+  hev = _mm256_xor_si256(_mm256_cmpeq_epi16(hev, zero), ffff);  // All ones where flat > thresh.
+
+  abs_p0q0 = _mm256_adds_epu16(abs_p0q0, abs_p0q0);  // |p0 - q0| * 2 (saturating)
+  abs_p1q1 = _mm256_srli_epi16(abs_p1q1, 1);         // |p1 - q1| / 2
+  __m256i mask =
+      _mm256_subs_epu16(_mm256_adds_epu16(abs_p0q0, abs_p1q1), blimit);
+  mask = _mm256_xor_si256(_mm256_cmpeq_epi16(mask, zero), ffff);
+  // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2  > blimit) * -1;
+  // So taking maximums continues to work:
+
+  const __m256i one = _mm256_set1_epi16(1);
+  mask = _mm256_and_si256(mask, _mm256_adds_epu16(limit, one));  // Lanes over blimit now hold limit + 1, so they fail the final test.
+  mask = _mm256_max_epi16(abs_p1p0, mask);
+  // mask |= (abs(p1 - p0) > limit) * -1;
+  mask = _mm256_max_epi16(abs_q1q0, mask);
+  // mask |= (abs(q1 - q0) > limit) * -1;
+
+  work = _mm256_max_epi16(_mm256_abs_epi16(_mm256_sub_epi16(p2, p1)),
+                          _mm256_abs_epi16(_mm256_sub_epi16(q2, q1)));
+  mask = _mm256_max_epi16(work, mask);
+  work = _mm256_max_epi16(_mm256_abs_epi16(_mm256_sub_epi16(p3, p2)),
+                          _mm256_abs_epi16(_mm256_sub_epi16(q3, q2)));
+  mask = _mm256_max_epi16(work, mask);
+  mask = _mm256_subs_epu16(mask, limit);
+  mask = _mm256_cmpeq_epi16(mask, zero);  // All ones where every accumulated difference is <= limit.
+
+  // flat_mask4
+  flat = _mm256_max_epi16(_mm256_abs_epi16(_mm256_sub_epi16(p2, p0)),
+                          _mm256_abs_epi16(_mm256_sub_epi16(q2, q0)));
+  work = _mm256_max_epi16(_mm256_abs_epi16(_mm256_sub_epi16(p3, p0)),
+                          _mm256_abs_epi16(_mm256_sub_epi16(q3, q0)));
+  flat = _mm256_max_epi16(work, flat);
+  flat = _mm256_max_epi16(abs_p1p0, flat);
+  flat = _mm256_max_epi16(abs_q1q0, flat);
+
+  if (bd == 8)  // Flatness threshold is 1 scaled by the bit depth: 1, 4 or 16.
+    flat = _mm256_subs_epu16(flat, one);
+  else if (bd == 10)
+    flat = _mm256_subs_epu16(flat, _mm256_slli_epi16(one, 2));
+  else  // bd == 12
+    flat = _mm256_subs_epu16(flat, _mm256_slli_epi16(one, 4));
+
+  flat = _mm256_cmpeq_epi16(flat, zero);
+  flat = _mm256_and_si256(flat, mask);  // flat & mask
+
+  // Added before shift for rounding part of ROUND_POWER_OF_TWO
+  __m256i workp_a, workp_b, workp_shft;
+  workp_a =
+      _mm256_add_epi16(_mm256_add_epi16(p3, p3), _mm256_add_epi16(p2, p1));
+  const __m256i four = _mm256_set1_epi16(4);
+  workp_a = _mm256_add_epi16(_mm256_add_epi16(workp_a, four), p0);
+  workp_b = _mm256_add_epi16(_mm256_add_epi16(q0, p2), p3);
+  workp_shft = _mm256_srli_epi16(_mm256_add_epi16(workp_a, workp_b), 3);
+  _mm256_storeu_si256((__m256i *)&flat_op2[0], workp_shft);  // (3*p3 + 2*p2 + p1 + p0 + q0 + 4) >> 3
+
+  workp_b = _mm256_add_epi16(_mm256_add_epi16(q0, q1), p1);
+  workp_shft = _mm256_srli_epi16(_mm256_add_epi16(workp_a, workp_b), 3);
+  _mm256_storeu_si256((__m256i *)&flat_op1[0], workp_shft);  // (2*p3 + p2 + 2*p1 + p0 + q0 + q1 + 4) >> 3
+
+  workp_a = _mm256_add_epi16(_mm256_sub_epi16(workp_a, p3), q2);
+  workp_b = _mm256_add_epi16(_mm256_sub_epi16(workp_b, p1), p0);
+  workp_shft = _mm256_srli_epi16(_mm256_add_epi16(workp_a, workp_b), 3);
+  _mm256_storeu_si256((__m256i *)&flat_op0[0], workp_shft);  // (p3 + p2 + p1 + 2*p0 + q0 + q1 + q2 + 4) >> 3
+
+  workp_a = _mm256_add_epi16(_mm256_sub_epi16(workp_a, p3), q3);
+  workp_b = _mm256_add_epi16(_mm256_sub_epi16(workp_b, p0), q0);
+  workp_shft = _mm256_srli_epi16(_mm256_add_epi16(workp_a, workp_b), 3);
+  _mm256_storeu_si256((__m256i *)&flat_oq0[0], workp_shft);  // (p2 + p1 + p0 + 2*q0 + q1 + q2 + q3 + 4) >> 3
+
+  workp_a = _mm256_add_epi16(_mm256_sub_epi16(workp_a, p2), q3);
+  workp_b = _mm256_add_epi16(_mm256_sub_epi16(workp_b, q0), q1);
+  workp_shft = _mm256_srli_epi16(_mm256_add_epi16(workp_a, workp_b), 3);
+  _mm256_storeu_si256((__m256i *)&flat_oq1[0], workp_shft);  // (p1 + p0 + q0 + 2*q1 + q2 + 2*q3 + 4) >> 3
+
+  workp_a = _mm256_add_epi16(_mm256_sub_epi16(workp_a, p1), q3);
+  workp_b = _mm256_add_epi16(_mm256_sub_epi16(workp_b, q1), q2);
+  workp_shft = _mm256_srli_epi16(_mm256_add_epi16(workp_a, workp_b), 3);
+  _mm256_storeu_si256((__m256i *)&flat_oq2[0], workp_shft);  // (p0 + q0 + q1 + 2*q2 + 3*q3 + 4) >> 3
+
+  // lp filter
+  const __m256i pmax = _mm256_subs_epi16(  // pmax = ((1 << bd) - 1) - t80
+      _mm256_subs_epi16(_mm256_slli_epi16(one, bd), one), t80);
+  const __m256i pmin = _mm256_subs_epi16(zero, t80);  // pmin = 0 - t80
+
+  __m256i filt, filter1, filter2, work_a;
+  filt = _mm256_subs_epi16(ps1, qs1);
+  pixel_clamp(&pmin, &pmax, &filt);  // Helper defined outside this chunk; presumably clamps to [pmin, pmax] - confirm.
+  filt = _mm256_and_si256(filt, hev);
+  work_a = _mm256_subs_epi16(qs0, ps0);
+  filt = _mm256_adds_epi16(filt, work_a);
+  filt = _mm256_adds_epi16(filt, work_a);
+  filt = _mm256_adds_epi16(filt, work_a);
+  // (aom_filter + 3 * (qs0 - ps0)) & mask
+  pixel_clamp(&pmin, &pmax, &filt);
+  filt = _mm256_and_si256(filt, mask);
+
+  const __m256i t4 = _mm256_set1_epi16(4);
+  const __m256i t3 = _mm256_set1_epi16(3);
+
+  filter1 = _mm256_adds_epi16(filt, t4);
+  filter2 = _mm256_adds_epi16(filt, t3);
+
+  // Filter1 >> 3
+  pixel_clamp(&pmin, &pmax, &filter1);
+  filter1 = _mm256_srai_epi16(filter1, 3);
+
+  // Filter2 >> 3
+  pixel_clamp(&pmin, &pmax, &filter2);
+  filter2 = _mm256_srai_epi16(filter2, 3);
+
+  // filt >> 1
+  filt = _mm256_adds_epi16(filter1, one);
+  filt = _mm256_srai_epi16(filt, 1);
+  // filter = ROUND_POWER_OF_TWO(filter1, 1) & ~hev;
+  filt = _mm256_andnot_si256(hev, filt);
+
+  work_a = _mm256_subs_epi16(qs0, filter1);  // Narrow-filter q0, blended below with flat_oq0 under the flat mask.
+  pixel_clamp(&pmin, &pmax, &work_a);
+  work_a = _mm256_adds_epi16(work_a, t80);
+  q0 = _mm256_loadu_si256((__m256i *)flat_oq0);
+  work_a = _mm256_andnot_si256(flat, work_a);
+  q0 = _mm256_and_si256(flat, q0);
+  q0 = _mm256_or_si256(work_a, q0);
+
+  work_a = _mm256_subs_epi16(qs1, filt);  // Same blend for q1.
+  pixel_clamp(&pmin, &pmax, &work_a);
+  work_a = _mm256_adds_epi16(work_a, t80);
+  q1 = _mm256_loadu_si256((__m256i *)flat_oq1);
+  work_a = _mm256_andnot_si256(flat, work_a);
+  q1 = _mm256_and_si256(flat, q1);
+  q1 = _mm256_or_si256(work_a, q1);
+
+  work_a = _mm256_loadu_si256((__m256i *)(s + 2 * p));  // q2 has no narrow-filter result: blend the unfiltered row with flat_oq2.
+  q2 = _mm256_loadu_si256((__m256i *)flat_oq2);
+  work_a = _mm256_andnot_si256(flat, work_a);
+  q2 = _mm256_and_si256(flat, q2);
+  q2 = _mm256_or_si256(work_a, q2);
+
+  work_a = _mm256_adds_epi16(ps0, filter2);  // Same blend for p0.
+  pixel_clamp(&pmin, &pmax, &work_a);
+  work_a = _mm256_adds_epi16(work_a, t80);
+  p0 = _mm256_loadu_si256((__m256i *)flat_op0);
+  work_a = _mm256_andnot_si256(flat, work_a);
+  p0 = _mm256_and_si256(flat, p0);
+  p0 = _mm256_or_si256(work_a, p0);
+
+  work_a = _mm256_adds_epi16(ps1, filt);  // Same blend for p1.
+  pixel_clamp(&pmin, &pmax, &work_a);
+  work_a = _mm256_adds_epi16(work_a, t80);
+  p1 = _mm256_loadu_si256((__m256i *)flat_op1);
+  work_a = _mm256_andnot_si256(flat, work_a);
+  p1 = _mm256_and_si256(flat, p1);
+  p1 = _mm256_or_si256(work_a, p1);
+
+  work_a = _mm256_loadu_si256((__m256i *)(s - 3 * p));  // p2 has no narrow-filter result: blend the unfiltered row with flat_op2.
+  p2 = _mm256_loadu_si256((__m256i *)flat_op2);
+  work_a = _mm256_andnot_si256(flat, work_a);
+  p2 = _mm256_and_si256(flat, p2);
+  p2 = _mm256_or_si256(work_a, p2);
+
+  _mm256_storeu_si256((__m256i *)(s - 3 * p), p2);
+  _mm256_storeu_si256((__m256i *)(s - 2 * p), p1);
+  _mm256_storeu_si256((__m256i *)(s - 1 * p), p0);
+  _mm256_storeu_si256((__m256i *)(s + 0 * p), q0);
+  _mm256_storeu_si256((__m256i *)(s + 1 * p), q1);
+  _mm256_storeu_si256((__m256i *)(s + 2 * p), q2);
+}
+
+void aom_highbd_lpf_vertical_4_dual_avx2(
+    uint16_t *s, int p, const uint8_t *blimit0, const uint8_t *limit0,
+    const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1,
+    const uint8_t *thresh1, int bd) {  // Vertical edge handled as transpose -> horizontal_4_dual filter -> transpose back.
+  DECLARE_ALIGNED(16, uint16_t, t_dst[16 * 8]);  // Scratch: 8 rows with a stride of 16.
+  uint16_t *src[2];
+  uint16_t *dst[2];
+
+  // Transpose 8x16
+  highbd_transpose8x16(s - 4, s - 4 + p * 8, p, t_dst, 16);  // Helper defined outside this chunk; second source starts 8 rows below the first.
+
+  // Loop filtering
+  aom_highbd_lpf_horizontal_4_dual_avx2(t_dst + 4 * 16, 16, blimit0, limit0,  // Edge sits at row 4 of the scratch block.
+                                        thresh0, blimit1, limit1, thresh1, bd);
+  src[0] = t_dst;
+  src[1] = t_dst + 8;
+  dst[0] = s - 4;
+  dst[1] = s - 4 + p * 8;
+
+  // Transpose back
+  highbd_transpose(src, 16, dst, p, 2);  // Helper defined outside this chunk; last argument is the number of src/dst pairs passed here.
+}
+
+void aom_highbd_lpf_vertical_8_dual_avx2(
+    uint16_t *s, int p, const uint8_t *blimit0, const uint8_t *limit0,
+    const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1,
+    const uint8_t *thresh1, int bd) {  // Vertical edge handled as transpose -> horizontal_8_dual filter -> transpose back.
+  DECLARE_ALIGNED(16, uint16_t, t_dst[16 * 8]);  // Scratch: 8 rows with a stride of 16.
+  uint16_t *src[2];
+  uint16_t *dst[2];
+
+  // Transpose 8x16
+  highbd_transpose8x16(s - 4, s - 4 + p * 8, p, t_dst, 16);  // Helper defined outside this chunk; second source starts 8 rows below the first.
+
+  // Loop filtering
+  aom_highbd_lpf_horizontal_8_dual_avx2(t_dst + 4 * 16, 16, blimit0, limit0,  // Edge sits at row 4 of the scratch block.
+                                        thresh0, blimit1, limit1, thresh1, bd);
+  src[0] = t_dst;
+  src[1] = t_dst + 8;
+
+  dst[0] = s - 4;
+  dst[1] = s - 4 + p * 8;
+
+  // Transpose back
+  highbd_transpose(src, 16, dst, p, 2);  // Helper defined outside this chunk; last argument is the number of src/dst pairs passed here.
+}
+#endif  // CONFIG_PARALLEL_DEBLOCKING && CONFIG_CB4X4
--- a/third_party/aom/aom_dsp/x86/highbd_loopfilter_sse2.c
+++ b/third_party/aom/aom_dsp/x86/highbd_loopfilter_sse2.c
--- a/Показать больше
+++ b/Показать больше